github.com/devdivbcp/moby@v17.12.0-ce-rc1.0.20200726071732-2d4bfdc789ad+incompatible/daemon/oci_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"os/exec"
     9  	"path/filepath"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  
    14  	"github.com/containerd/containerd/containers"
    15  	coci "github.com/containerd/containerd/oci"
    16  	containertypes "github.com/docker/docker/api/types/container"
    17  	"github.com/docker/docker/container"
    18  	daemonconfig "github.com/docker/docker/daemon/config"
    19  	"github.com/docker/docker/oci"
    20  	"github.com/docker/docker/oci/caps"
    21  	"github.com/docker/docker/pkg/idtools"
    22  	"github.com/docker/docker/pkg/mount"
    23  	"github.com/docker/docker/pkg/stringid"
    24  	"github.com/docker/docker/rootless/specconv"
    25  	volumemounts "github.com/docker/docker/volume/mounts"
    26  	"github.com/opencontainers/runc/libcontainer/apparmor"
    27  	"github.com/opencontainers/runc/libcontainer/cgroups"
    28  	"github.com/opencontainers/runc/libcontainer/devices"
    29  	rsystem "github.com/opencontainers/runc/libcontainer/system"
    30  	"github.com/opencontainers/runc/libcontainer/user"
    31  	"github.com/opencontainers/runtime-spec/specs-go"
    32  	"github.com/pkg/errors"
    33  	"github.com/sirupsen/logrus"
    34  	"golang.org/x/sys/unix"
    35  )
    36  
// inContainerInitPath is the fixed path inside the container's rootfs at
// which the docker-init binary is bind-mounted when init is enabled.
const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
    38  
    39  // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
    40  func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    41  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    42  		var rlimits []specs.POSIXRlimit
    43  
    44  		// We want to leave the original HostConfig alone so make a copy here
    45  		hostConfig := *c.HostConfig
    46  		// Merge with the daemon defaults
    47  		daemon.mergeUlimits(&hostConfig)
    48  		for _, ul := range hostConfig.Ulimits {
    49  			rlimits = append(rlimits, specs.POSIXRlimit{
    50  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    51  				Soft: uint64(ul.Soft),
    52  				Hard: uint64(ul.Hard),
    53  			})
    54  		}
    55  
    56  		s.Process.Rlimits = rlimits
    57  		return nil
    58  	}
    59  }
    60  
    61  // WithLibnetwork sets the libnetwork hook
    62  func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    63  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    64  		if s.Hooks == nil {
    65  			s.Hooks = &specs.Hooks{}
    66  		}
    67  		for _, ns := range s.Linux.Namespaces {
    68  			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
    69  				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
    70  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    71  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    72  					Path: target,
    73  					Args: []string{
    74  						"libnetwork-setkey",
    75  						"-exec-root=" + daemon.configStore.GetExecRoot(),
    76  						c.ID,
    77  						shortNetCtlrID,
    78  					},
    79  				})
    80  			}
    81  		}
    82  		return nil
    83  	}
    84  }
    85  
// WithRootless sets the spec to the rootless configuration; the actual
// conversion is delegated to the rootless/specconv package.
func WithRootless(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
	return specconv.ToRootless(s)
}
    90  
// WithOOMScore sets the oom score adjustment for the container's init
// process in the spec.
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.OOMScoreAdj = score
		return nil
	}
}
    98  
    99  // WithSelinux sets the selinux labels
   100  func WithSelinux(c *container.Container) coci.SpecOpts {
   101  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   102  		s.Process.SelinuxLabel = c.GetProcessLabel()
   103  		s.Linux.MountLabel = c.MountLabel
   104  		return nil
   105  	}
   106  }
   107  
   108  // WithApparmor sets the apparmor profile
   109  func WithApparmor(c *container.Container) coci.SpecOpts {
   110  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   111  		if apparmor.IsEnabled() {
   112  			var appArmorProfile string
   113  			if c.AppArmorProfile != "" {
   114  				appArmorProfile = c.AppArmorProfile
   115  			} else if c.HostConfig.Privileged {
   116  				appArmorProfile = "unconfined"
   117  			} else {
   118  				appArmorProfile = "docker-default"
   119  			}
   120  
   121  			if appArmorProfile == "docker-default" {
   122  				// Unattended upgrades and other fun services can unload AppArmor
   123  				// profiles inadvertently. Since we cannot store our profile in
   124  				// /etc/apparmor.d, nor can we practically add other ways of
   125  				// telling the system to keep our profile loaded, in order to make
   126  				// sure that we keep the default profile enabled we dynamically
   127  				// reload it if necessary.
   128  				if err := ensureDefaultAppArmorProfile(); err != nil {
   129  					return err
   130  				}
   131  			}
   132  			s.Process.ApparmorProfile = appArmorProfile
   133  		}
   134  		return nil
   135  	}
   136  }
   137  
   138  // WithCapabilities sets the container's capabilties
   139  func WithCapabilities(c *container.Container) coci.SpecOpts {
   140  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   141  		capabilities, err := caps.TweakCapabilities(
   142  			oci.DefaultCapabilities(),
   143  			c.HostConfig.CapAdd,
   144  			c.HostConfig.CapDrop,
   145  			c.HostConfig.Capabilities,
   146  			c.HostConfig.Privileged,
   147  		)
   148  		if err != nil {
   149  			return err
   150  		}
   151  		return oci.SetCapabilities(s, capabilities)
   152  	}
   153  }
   154  
   155  func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
   156  	fp, err := c.GetResourcePath(p)
   157  	if err != nil {
   158  		return nil, err
   159  	}
   160  	return os.Open(fp)
   161  }
   162  
// getUser resolves username (as accepted by libcontainer/user.GetExecUser:
// a name, uid, "uid:gid", etc.) against the container's own /etc/passwd
// and /etc/group files, returning the numeric uid, gid and supplementary
// gids for the container process.
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return 0, 0, nil, err
	}
	groupPath, err := user.GetGroupPath()
	if err != nil {
		return 0, 0, nil, err
	}
	// Open errors are deliberately ignored: a nil reader is passed on to
	// GetExecUser/GetAdditionalGroups when the container image has no
	// /etc/passwd or /etc/group file.
	passwdFile, err := readUserFile(c, passwdPath)
	if err == nil {
		defer passwdFile.Close()
	}
	groupFile, err := readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}

	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
	if err != nil {
		return 0, 0, nil, err
	}

	// Re-open the group file: GetExecUser has already consumed the first
	// reader.
	// todo: fix this double read by a change to libcontainer/user pkg
	groupFile, err = readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}
	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
		if err != nil {
			return 0, 0, nil, err
		}
	}
	uid := uint32(execUser.Uid)
	gid := uint32(execUser.Gid)
	// Supplementary groups are the user's own sgids plus any --group-add
	// groups, converted to uint32 for the spec.
	sgids := append(execUser.Sgids, addGroups...)
	var additionalGids []uint32
	for _, g := range sgids {
		additionalGids = append(additionalGids, uint32(g))
	}
	return uid, gid, additionalGids, nil
}
   207  
   208  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   209  	for i, n := range s.Linux.Namespaces {
   210  		if n.Type == ns.Type {
   211  			s.Linux.Namespaces[i] = ns
   212  			return
   213  		}
   214  	}
   215  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   216  }
   217  
// WithNamespaces sets the container's namespaces (user, network, ipc, pid
// and uts) in the spec according to the HostConfig's mode settings
// (--userns/--net/--ipc/--pid/--uts).
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user: only set up for the daemon's private userns-remap mode,
		// copying the daemon's uid/gid mappings into the spec.
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDs()
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			// --net=container:<id>: join the other container's netns via
			// its /proc path.
			if parts[0] == "container" {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				// NOTE(review): for host networking the sandbox key is used
				// as the netns path — confirm it resolves to the host netns.
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		switch {
		case ipcMode.IsContainer():
			// --ipc=container:<id>: join the other container's ipc ns.
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		default:
			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
		}

		// pid
		if c.HostConfig.PidMode.IsContainer() {
			// --pid=container:<id>: join the other container's pid ns.
			ns := specs.LinuxNamespace{Type: "pid"}
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
		} else {
			// Default: a fresh private pid namespace.
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts: with host UTS the hostname must be left unset so the host's
		// hostname is kept.
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
			s.Hostname = ""
		}

		return nil
	}
}
   315  
   316  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   317  	var ids []specs.LinuxIDMapping
   318  	for _, item := range s {
   319  		ids = append(ids, specs.LinuxIDMapping{
   320  			HostID:      uint32(item.HostID),
   321  			ContainerID: uint32(item.ContainerID),
   322  			Size:        uint32(item.Size),
   323  		})
   324  	}
   325  	return ids
   326  }
   327  
   328  // Get the source mount point of directory passed in as argument. Also return
   329  // optional fields.
   330  func getSourceMount(source string) (string, string, error) {
   331  	// Ensure any symlinks are resolved.
   332  	sourcePath, err := filepath.EvalSymlinks(source)
   333  	if err != nil {
   334  		return "", "", err
   335  	}
   336  
   337  	mi, err := mount.GetMounts(mount.ParentsFilter(sourcePath))
   338  	if err != nil {
   339  		return "", "", err
   340  	}
   341  	if len(mi) < 1 {
   342  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   343  	}
   344  
   345  	// find the longest mount point
   346  	var idx, maxlen int
   347  	for i := range mi {
   348  		if len(mi[i].Mountpoint) > maxlen {
   349  			maxlen = len(mi[i].Mountpoint)
   350  			idx = i
   351  		}
   352  	}
   353  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   354  }
   355  
const (
	// Prefixes of the mountinfo "optional fields" that mark a mount as
	// shared ("shared:N") or as a slave of a shared peer group ("master:N").
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
   360  
   361  // hasMountinfoOption checks if any of the passed any of the given option values
   362  // are set in the passed in option string.
   363  func hasMountinfoOption(opts string, vals ...string) bool {
   364  	for _, opt := range strings.Split(opts, " ") {
   365  		for _, val := range vals {
   366  			if strings.HasPrefix(opt, val) {
   367  				return true
   368  			}
   369  		}
   370  	}
   371  	return false
   372  }
   373  
   374  // Ensure mount point on which path is mounted, is shared.
   375  func ensureShared(path string) error {
   376  	sourceMount, optionalOpts, err := getSourceMount(path)
   377  	if err != nil {
   378  		return err
   379  	}
   380  	// Make sure source mount point is shared.
   381  	if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
   382  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   383  	}
   384  	return nil
   385  }
   386  
   387  // Ensure mount point on which path is mounted, is either shared or slave.
   388  func ensureSharedOrSlave(path string) error {
   389  	sourceMount, optionalOpts, err := getSourceMount(path)
   390  	if err != nil {
   391  		return err
   392  	}
   393  
   394  	if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   395  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   396  	}
   397  	return nil
   398  }
   399  
   400  // Get the set of mount flags that are set on the mount that contains the given
   401  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   402  // bind-mounting "with options" will not fail with user namespaces, due to
   403  // kernel restrictions that require user namespace mounts to preserve
   404  // CL_UNPRIVILEGED locked flags.
   405  func getUnprivilegedMountFlags(path string) ([]string, error) {
   406  	var statfs unix.Statfs_t
   407  	if err := unix.Statfs(path, &statfs); err != nil {
   408  		return nil, err
   409  	}
   410  
   411  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   412  	unprivilegedFlags := map[uint64]string{
   413  		unix.MS_RDONLY:     "ro",
   414  		unix.MS_NODEV:      "nodev",
   415  		unix.MS_NOEXEC:     "noexec",
   416  		unix.MS_NOSUID:     "nosuid",
   417  		unix.MS_NOATIME:    "noatime",
   418  		unix.MS_RELATIME:   "relatime",
   419  		unix.MS_NODIRATIME: "nodiratime",
   420  	}
   421  
   422  	var flags []string
   423  	for mask, flag := range unprivilegedFlags {
   424  		if uint64(statfs.Flags)&mask == mask {
   425  			flags = append(flags, flag)
   426  		}
   427  	}
   428  
   429  	return flags, nil
   430  }
   431  
var (
	// mountPropagationMap translates user-facing propagation-mode strings
	// (as used in volume/mount options) to the mount package's flag values.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse mapping, used when writing
	// a propagation option back into the OCI spec.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   451  
   452  // inSlice tests whether a string is contained in a slice of strings or not.
   453  // Comparison is case sensitive
   454  func inSlice(slice []string, s string) bool {
   455  	for _, ss := range slice {
   456  		if s == ss {
   457  			return true
   458  		}
   459  	}
   460  	return false
   461  }
   462  
// WithMounts sets the container's mounts: user volumes, ipc, tmpfs and
// secret mounts are gathered, merged with the spec's default mounts, and
// then mount propagation, user-namespace locked flags, and read-only
// handling are applied.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// The named result err lets this deferred cleanup fire for any
		// failure below (return statements assign the named result, even
		// from if-scoped shadowed errors).
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		// Private/empty ipc modes get their /dev/shm handled via the
		// default spec mounts below instead.
		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		// Deterministic order via the package's mounts sort type.
		sort.Sort(mounts(ms))

		mounts := ms

		// Destinations claimed by user-supplied mounts; used to filter the
		// spec's default mounts.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		// Note: s.Mounts[:0] filters in place, reusing the backing array.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			// tmpfs mounts: merge the default options with the
			// user-supplied data string and emit a tmpfs spec mount.
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser("linux")
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		// For a read-only rootfs, force "ro" onto every non-user mount
		// except the special pseudo-filesystem destinations.
		if s.Root.Readonly {
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			// Privileged containers get unrestricted access to paths the
			// default spec masks or makes read-only.
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil

	}
}
   682  
// WithCommonOptions sets common docker options on the spec: the rootfs
// path, working directory, process args/env/terminal, hostname, and the
// optional docker-init wrapper process.
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == nil {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS.Path(),
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace.  It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			// Per-container Init setting wins; nil falls back to the
			// daemon-wide default.
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				// Prepend docker-init and bind-mount the binary read-only
				// into the container at its fixed in-container path.
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		return nil
	}
}
   738  
// WithCgroups sets the container's cgroups path. The parent is taken from,
// in order of precedence: the container's CgroupParent, the daemon's
// CgroupParent, or the built-in default ("/docker", or "system.slice"
// under the systemd cgroup driver). The parent hierarchy is initialized
// before returning.
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
		}

		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			// systemd driver uses the "parent:scopePrefix:id" notation.
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath
		// p is the filesystem path used to pre-create the parent cgroup;
		// under systemd it is anchored at the init process's cpu cgroup.
		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return err
			}
			// NOTE(review): the own-cgroup value is discarded — this call
			// appears to serve only as a check that the daemon's cpu cgroup
			// is resolvable; confirm before removing.
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return err
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		if err := daemon.initCgroupsPath(parentPath); err != nil {
			return fmt.Errorf("linux init cgroups path: %v", err)
		}
		return nil
	}
}
   788  
// WithDevices sets the container's devices. Privileged containers (outside
// a user namespace) get every host device and an allow-all device cgroup
// rule; otherwise only the explicitly mapped devices plus any device
// cgroup rules are granted. DeviceRequests are delegated to the daemon's
// device handlers.
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		devPermissions := s.Linux.Resources.Devices
		if c.HostConfig.Privileged && !rsystem.RunningInUserNS() {
			hostDevices, err := devices.HostDevices()
			if err != nil {
				return err
			}
			for _, d := range hostDevices {
				devs = append(devs, oci.Device(d))
			}
			// Privileged: replace the default rules with a single
			// allow-everything ("rwm") rule.
			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			// Additional raw device cgroup rules from --device-cgroup-rule.
			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = devPermissions

		// Device requests (e.g. --gpus) are resolved by registered drivers.
		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}
   837  
// WithResources applies the container's resource limits to the spec:
// block-IO weight and per-device throttles, memory, CPU and pids limits.
// Device cgroup rules already present in the spec are preserved.
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}
		// Copy the weight so the spec holds a pointer to a stable value.
		blkioWeight := r.BlkioWeight

		specResources := &specs.LinuxResources{
			Memory: memoryRes,
			CPU:    cpuRes,
			BlockIO: &specs.LinuxBlockIO{
				Weight:                  &blkioWeight,
				WeightDevice:            weightDevices,
				ThrottleReadBpsDevice:   readBpsDevice,
				ThrottleWriteBpsDevice:  writeBpsDevice,
				ThrottleReadIOPSDevice:  readIOpsDevice,
				ThrottleWriteIOPSDevice: writeIOpsDevice,
			},
			Pids: getPidsLimit(r),
		}

		// Carry over device cgroup rules set earlier (e.g. by WithDevices)
		// since this replaces s.Linux.Resources wholesale.
		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
			specResources.Devices = s.Linux.Resources.Devices
		}

		s.Linux.Resources = specResources
		return nil
	}
}
   892  
   893  // WithSysctls sets the container's sysctls
   894  func WithSysctls(c *container.Container) coci.SpecOpts {
   895  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   896  		// We merge the sysctls injected above with the HostConfig (latter takes
   897  		// precedence for backwards-compatibility reasons).
   898  		for k, v := range c.HostConfig.Sysctls {
   899  			s.Linux.Sysctl[k] = v
   900  		}
   901  		return nil
   902  	}
   903  }
   904  
   905  // WithUser sets the container's user
   906  func WithUser(c *container.Container) coci.SpecOpts {
   907  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   908  		uid, gid, additionalGids, err := getUser(c, c.Config.User)
   909  		if err != nil {
   910  			return err
   911  		}
   912  		s.Process.User.UID = uid
   913  		s.Process.User.GID = gid
   914  		s.Process.User.AdditionalGids = additionalGids
   915  		return nil
   916  	}
   917  }
   918  
   919  func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
   920  	var (
   921  		opts []coci.SpecOpts
   922  		s    = oci.DefaultSpec()
   923  	)
   924  	opts = append(opts,
   925  		WithCommonOptions(daemon, c),
   926  		WithCgroups(daemon, c),
   927  		WithResources(c),
   928  		WithSysctls(c),
   929  		WithDevices(daemon, c),
   930  		WithUser(c),
   931  		WithRlimits(daemon, c),
   932  		WithNamespaces(daemon, c),
   933  		WithCapabilities(c),
   934  		WithSeccomp(daemon, c),
   935  		WithMounts(daemon, c),
   936  		WithLibnetwork(daemon, c),
   937  		WithApparmor(c),
   938  		WithSelinux(c),
   939  		WithOOMScore(&c.HostConfig.OomScoreAdj),
   940  	)
   941  	if c.NoNewPrivileges {
   942  		opts = append(opts, coci.WithNoNewPrivileges)
   943  	}
   944  
   945  	// Set the masked and readonly paths with regard to the host config options if they are set.
   946  	if c.HostConfig.MaskedPaths != nil {
   947  		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
   948  	}
   949  	if c.HostConfig.ReadonlyPaths != nil {
   950  		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
   951  	}
   952  	if daemon.configStore.Rootless {
   953  		opts = append(opts, WithRootless)
   954  	}
   955  	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
   956  		ID: c.ID,
   957  	}, &s, opts...)
   958  }
   959  
   960  func clearReadOnly(m *specs.Mount) {
   961  	var opt []string
   962  	for _, o := range m.Options {
   963  		if o != "ro" {
   964  			opt = append(opt, o)
   965  		}
   966  	}
   967  	m.Options = opt
   968  }
   969  
   970  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
   971  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
   972  	ulimits := c.Ulimits
   973  	// Merge ulimits with daemon defaults
   974  	ulIdx := make(map[string]struct{})
   975  	for _, ul := range ulimits {
   976  		ulIdx[ul.Name] = struct{}{}
   977  	}
   978  	for name, ul := range daemon.configStore.Ulimits {
   979  		if _, exists := ulIdx[name]; !exists {
   980  			ulimits = append(ulimits, ul)
   981  		}
   982  	}
   983  	c.Ulimits = ulimits
   984  }