github.com/afbjorklund/moby@v20.10.5+incompatible/daemon/oci_linux.go

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"os"
     8  	"os/exec"
     9  	"path/filepath"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  
    14  	cdcgroups "github.com/containerd/cgroups"
    15  	"github.com/containerd/containerd/containers"
    16  	coci "github.com/containerd/containerd/oci"
    17  	"github.com/containerd/containerd/sys"
    18  	containertypes "github.com/docker/docker/api/types/container"
    19  	"github.com/docker/docker/container"
    20  	daemonconfig "github.com/docker/docker/daemon/config"
    21  	"github.com/docker/docker/oci"
    22  	"github.com/docker/docker/oci/caps"
    23  	"github.com/docker/docker/pkg/idtools"
    24  	"github.com/docker/docker/pkg/stringid"
    25  	"github.com/docker/docker/rootless/specconv"
    26  	volumemounts "github.com/docker/docker/volume/mounts"
    27  	"github.com/moby/sys/mount"
    28  	"github.com/moby/sys/mountinfo"
    29  	"github.com/opencontainers/runc/libcontainer/apparmor"
    30  	"github.com/opencontainers/runc/libcontainer/cgroups"
    31  	"github.com/opencontainers/runc/libcontainer/devices"
    32  	"github.com/opencontainers/runc/libcontainer/user"
    33  	specs "github.com/opencontainers/runtime-spec/specs-go"
    34  	"github.com/pkg/errors"
    35  	"github.com/sirupsen/logrus"
    36  	"golang.org/x/sys/unix"
    37  )
    38  
    39  const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
    40  
     41  // WithRlimits sets the container's rlimits, merging in the daemon's default rlimits
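         // (for example, an ulimit named "nofile" becomes the spec rlimit RLIMIT_NOFILE
         // with the same soft and hard limits)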
    42  func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    43  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    44  		var rlimits []specs.POSIXRlimit
    45  
    46  		// We want to leave the original HostConfig alone so make a copy here
    47  		hostConfig := *c.HostConfig
    48  		// Merge with the daemon defaults
    49  		daemon.mergeUlimits(&hostConfig)
    50  		for _, ul := range hostConfig.Ulimits {
    51  			rlimits = append(rlimits, specs.POSIXRlimit{
    52  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    53  				Soft: uint64(ul.Soft),
    54  				Hard: uint64(ul.Hard),
    55  			})
    56  		}
    57  
    58  		s.Process.Rlimits = rlimits
    59  		return nil
    60  	}
    61  }
    62  
    63  // WithLibnetwork sets the libnetwork hook
    64  func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    65  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    66  		if s.Hooks == nil {
    67  			s.Hooks = &specs.Hooks{}
    68  		}
    69  		for _, ns := range s.Linux.Namespaces {
    70  			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
    71  				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
    72  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    73  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    74  					Path: target,
    75  					Args: []string{
    76  						"libnetwork-setkey",
    77  						"-exec-root=" + daemon.configStore.GetExecRoot(),
    78  						c.ID,
    79  						shortNetCtlrID,
    80  					},
    81  				})
    82  			}
    83  		}
    84  		return nil
    85  	}
    86  }
    87  
    88  // WithRootless sets the spec to the rootless configuration
    89  func WithRootless(daemon *Daemon) coci.SpecOpts {
    90  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    91  		var v2Controllers []string
    92  		if daemon.getCgroupDriver() == cgroupSystemdDriver {
    93  			if cdcgroups.Mode() != cdcgroups.Unified {
    94  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    95  			}
    96  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    97  			if rootlesskitParentEUID == "" {
    98  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    99  			}
   100  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%s.slice/cgroup.controllers", rootlesskitParentEUID)
   101  			controllersFile, err := ioutil.ReadFile(controllersPath)
   102  			if err != nil {
   103  				return err
   104  			}
   105  			v2Controllers = strings.Fields(string(controllersFile))
   106  		}
   107  		return specconv.ToRootless(s, v2Controllers)
   108  	}
   109  }
   110  
   111  // WithOOMScore sets the oom score
   112  func WithOOMScore(score *int) coci.SpecOpts {
   113  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   114  		s.Process.OOMScoreAdj = score
   115  		return nil
   116  	}
   117  }
   118  
   119  // WithSelinux sets the selinux labels
   120  func WithSelinux(c *container.Container) coci.SpecOpts {
   121  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   122  		s.Process.SelinuxLabel = c.GetProcessLabel()
   123  		s.Linux.MountLabel = c.MountLabel
   124  		return nil
   125  	}
   126  }
   127  
   128  // WithApparmor sets the apparmor profile
   129  func WithApparmor(c *container.Container) coci.SpecOpts {
   130  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   131  		if apparmor.IsEnabled() {
   132  			var appArmorProfile string
   133  			if c.AppArmorProfile != "" {
   134  				appArmorProfile = c.AppArmorProfile
   135  			} else if c.HostConfig.Privileged {
   136  				appArmorProfile = unconfinedAppArmorProfile
   137  			} else {
   138  				appArmorProfile = defaultAppArmorProfile
   139  			}
   140  
   141  			if appArmorProfile == defaultAppArmorProfile {
   142  				// Unattended upgrades and other fun services can unload AppArmor
   143  				// profiles inadvertently. Since we cannot store our profile in
   144  				// /etc/apparmor.d, nor can we practically add other ways of
   145  				// telling the system to keep our profile loaded, in order to make
   146  				// sure that we keep the default profile enabled we dynamically
   147  				// reload it if necessary.
   148  				if err := ensureDefaultAppArmorProfile(); err != nil {
   149  					return err
   150  				}
   151  			}
   152  			s.Process.ApparmorProfile = appArmorProfile
   153  		}
   154  		return nil
   155  	}
   156  }
   157  
    158  // WithCapabilities sets the container's capabilities
   159  func WithCapabilities(c *container.Container) coci.SpecOpts {
   160  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   161  		capabilities, err := caps.TweakCapabilities(
   162  			caps.DefaultCapabilities(),
   163  			c.HostConfig.CapAdd,
   164  			c.HostConfig.CapDrop,
   165  			c.HostConfig.Privileged,
   166  		)
   167  		if err != nil {
   168  			return err
   169  		}
   170  		return oci.SetCapabilities(s, capabilities)
   171  	}
   172  }
   173  
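         // resourcePath resolves the path returned by getPath inside the container's rootfs.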
   174  func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
   175  	p, err := getPath()
   176  	if err != nil {
   177  		return "", err
   178  	}
   179  	return c.GetResourcePath(p)
   180  }
   181  
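         // getUser resolves username against the container's passwd and group files and returns
         // the UID, GID, and additional GIDs (including groups from HostConfig.GroupAdd).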
   182  func getUser(c *container.Container, username string) (specs.User, error) {
   183  	var usr specs.User
   184  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   185  	if err != nil {
   186  		return usr, err
   187  	}
   188  	groupPath, err := resourcePath(c, user.GetGroupPath)
   189  	if err != nil {
   190  		return usr, err
   191  	}
   192  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   193  	if err != nil {
   194  		return usr, err
   195  	}
   196  	usr.UID = uint32(execUser.Uid)
   197  	usr.GID = uint32(execUser.Gid)
   198  
   199  	var addGroups []int
   200  	if len(c.HostConfig.GroupAdd) > 0 {
   201  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   202  		if err != nil {
   203  			return usr, err
   204  		}
   205  	}
   206  	for _, g := range append(execUser.Sgids, addGroups...) {
   207  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   208  	}
   209  	return usr, nil
   210  }
   211  
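         // setNamespace replaces the namespace of the same type in the spec if one exists,
         // or appends it otherwise.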
   212  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   213  	for i, n := range s.Linux.Namespaces {
   214  		if n.Type == ns.Type {
   215  			s.Linux.Namespaces[i] = ns
   216  			return
   217  		}
   218  	}
   219  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   220  }
   221  
   222  // WithNamespaces sets the container's namespaces
   223  func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
   224  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   225  		userNS := false
   226  		// user
   227  		if c.HostConfig.UsernsMode.IsPrivate() {
   228  			uidMap := daemon.idMapping.UIDs()
   229  			if uidMap != nil {
   230  				userNS = true
   231  				ns := specs.LinuxNamespace{Type: "user"}
   232  				setNamespace(s, ns)
   233  				s.Linux.UIDMappings = specMapping(uidMap)
   234  				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
   235  			}
   236  		}
   237  		// network
   238  		if !c.Config.NetworkDisabled {
   239  			ns := specs.LinuxNamespace{Type: "network"}
   240  			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
   241  			if parts[0] == "container" {
   242  				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
   243  				if err != nil {
   244  					return err
   245  				}
   246  				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
   247  				if userNS {
   248  					// to share a net namespace, they must also share a user namespace
   249  					nsUser := specs.LinuxNamespace{Type: "user"}
   250  					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
   251  					setNamespace(s, nsUser)
   252  				}
   253  			} else if c.HostConfig.NetworkMode.IsHost() {
   254  				ns.Path = c.NetworkSettings.SandboxKey
   255  			}
   256  			setNamespace(s, ns)
   257  		}
   258  
   259  		// ipc
   260  		ipcMode := c.HostConfig.IpcMode
   261  		switch {
   262  		case ipcMode.IsContainer():
   263  			ns := specs.LinuxNamespace{Type: "ipc"}
   264  			ic, err := daemon.getIpcContainer(ipcMode.Container())
   265  			if err != nil {
   266  				return err
   267  			}
   268  			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
   269  			setNamespace(s, ns)
   270  			if userNS {
   271  				// to share an IPC namespace, they must also share a user namespace
   272  				nsUser := specs.LinuxNamespace{Type: "user"}
   273  				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
   274  				setNamespace(s, nsUser)
   275  			}
   276  		case ipcMode.IsHost():
   277  			oci.RemoveNamespace(s, "ipc")
   278  		case ipcMode.IsEmpty():
   279  			// A container was created by an older version of the daemon.
   280  			// The default behavior used to be what is now called "shareable".
   281  			fallthrough
   282  		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
   283  			ns := specs.LinuxNamespace{Type: "ipc"}
   284  			setNamespace(s, ns)
   285  		default:
   286  			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
   287  		}
   288  
   289  		// pid
   290  		if c.HostConfig.PidMode.IsContainer() {
   291  			pc, err := daemon.getPidContainer(c)
   292  			if err != nil {
   293  				return err
   294  			}
   295  			ns := specs.LinuxNamespace{
   296  				Type: "pid",
   297  				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
   298  			}
   299  			setNamespace(s, ns)
   300  			if userNS {
   301  				// to share a PID namespace, they must also share a user namespace
   302  				nsUser := specs.LinuxNamespace{
   303  					Type: "user",
   304  					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
   305  				}
   306  				setNamespace(s, nsUser)
   307  			}
   308  		} else if c.HostConfig.PidMode.IsHost() {
   309  			oci.RemoveNamespace(s, "pid")
   310  		} else {
   311  			ns := specs.LinuxNamespace{Type: "pid"}
   312  			setNamespace(s, ns)
   313  		}
   314  		// uts
   315  		if c.HostConfig.UTSMode.IsHost() {
   316  			oci.RemoveNamespace(s, "uts")
   317  			s.Hostname = ""
   318  		}
   319  
   320  		// cgroup
   321  		if !c.HostConfig.CgroupnsMode.IsEmpty() {
   322  			cgroupNsMode := c.HostConfig.CgroupnsMode
   323  			if !cgroupNsMode.Valid() {
   324  				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
   325  			}
   326  			if cgroupNsMode.IsPrivate() {
   327  				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
   328  				setNamespace(s, nsCgroup)
   329  			}
   330  		}
   331  
   332  		return nil
   333  	}
   334  }
   335  
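         // specMapping converts idtools ID mappings to the OCI runtime-spec LinuxIDMapping form.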
   336  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   337  	var ids []specs.LinuxIDMapping
   338  	for _, item := range s {
   339  		ids = append(ids, specs.LinuxIDMapping{
   340  			HostID:      uint32(item.HostID),
   341  			ContainerID: uint32(item.ContainerID),
   342  			Size:        uint32(item.Size),
   343  		})
   344  	}
   345  	return ids
   346  }
   347  
    348  // Get the source mount point of the directory passed in as an argument.
    349  // Also return the mount's optional fields.
   350  func getSourceMount(source string) (string, string, error) {
   351  	// Ensure any symlinks are resolved.
   352  	sourcePath, err := filepath.EvalSymlinks(source)
   353  	if err != nil {
   354  		return "", "", err
   355  	}
   356  
   357  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   358  	if err != nil {
   359  		return "", "", err
   360  	}
   361  	if len(mi) < 1 {
   362  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   363  	}
   364  
   365  	// find the longest mount point
   366  	var idx, maxlen int
   367  	for i := range mi {
   368  		if len(mi[i].Mountpoint) > maxlen {
   369  			maxlen = len(mi[i].Mountpoint)
   370  			idx = i
   371  		}
   372  	}
   373  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   374  }
   375  
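         // Optional-field prefixes from /proc/self/mountinfo: "shared:N" marks a mount that is
         // part of shared peer group N, and "master:N" marks a mount that is a slave of peer group N.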
   376  const (
   377  	sharedPropagationOption = "shared:"
   378  	slavePropagationOption  = "master:"
   379  )
   380  
    381  // hasMountInfoOption checks if any of the given option values are set in the
    382  // passed-in options string.
   383  func hasMountInfoOption(opts string, vals ...string) bool {
   384  	for _, opt := range strings.Split(opts, " ") {
   385  		for _, val := range vals {
   386  			if strings.HasPrefix(opt, val) {
   387  				return true
   388  			}
   389  		}
   390  	}
   391  	return false
   392  }
   393  
    394  // Ensure that the mount point on which path is mounted is shared.
   395  func ensureShared(path string) error {
   396  	sourceMount, optionalOpts, err := getSourceMount(path)
   397  	if err != nil {
   398  		return err
   399  	}
   400  	// Make sure source mount point is shared.
   401  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   402  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   403  	}
   404  	return nil
   405  }
   406  
    407  // Ensure that the mount point on which path is mounted is either shared or slave.
   408  func ensureSharedOrSlave(path string) error {
   409  	sourceMount, optionalOpts, err := getSourceMount(path)
   410  	if err != nil {
   411  		return err
   412  	}
   413  
   414  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   415  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   416  	}
   417  	return nil
   418  }
   419  
   420  // Get the set of mount flags that are set on the mount that contains the given
   421  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   422  // bind-mounting "with options" will not fail with user namespaces, due to
   423  // kernel restrictions that require user namespace mounts to preserve
   424  // CL_UNPRIVILEGED locked flags.
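         // For example, if the source mount is mounted nosuid or nodev, those flags must be
         // re-applied to the bind mount or the kernel will reject it inside the user namespace.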
   425  func getUnprivilegedMountFlags(path string) ([]string, error) {
   426  	var statfs unix.Statfs_t
   427  	if err := unix.Statfs(path, &statfs); err != nil {
   428  		return nil, err
   429  	}
   430  
   431  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   432  	unprivilegedFlags := map[uint64]string{
   433  		unix.MS_RDONLY:     "ro",
   434  		unix.MS_NODEV:      "nodev",
   435  		unix.MS_NOEXEC:     "noexec",
   436  		unix.MS_NOSUID:     "nosuid",
   437  		unix.MS_NOATIME:    "noatime",
   438  		unix.MS_RELATIME:   "relatime",
   439  		unix.MS_NODIRATIME: "nodiratime",
   440  	}
   441  
   442  	var flags []string
   443  	for mask, flag := range unprivilegedFlags {
   444  		if uint64(statfs.Flags)&mask == mask {
   445  			flags = append(flags, flag)
   446  		}
   447  	}
   448  
   449  	return flags, nil
   450  }
   451  
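         // mountPropagationMap and mountPropagationReverseMap translate between the propagation
         // names used in the API ("shared", "rslave", ...) and the mount package's propagation flags.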
   452  var (
   453  	mountPropagationMap = map[string]int{
   454  		"private":  mount.PRIVATE,
   455  		"rprivate": mount.RPRIVATE,
   456  		"shared":   mount.SHARED,
   457  		"rshared":  mount.RSHARED,
   458  		"slave":    mount.SLAVE,
   459  		"rslave":   mount.RSLAVE,
   460  	}
   461  
   462  	mountPropagationReverseMap = map[int]string{
   463  		mount.PRIVATE:  "private",
   464  		mount.RPRIVATE: "rprivate",
   465  		mount.SHARED:   "shared",
   466  		mount.RSHARED:  "rshared",
   467  		mount.SLAVE:    "slave",
   468  		mount.RSLAVE:   "rslave",
   469  	}
   470  )
   471  
    472  // inSlice tests whether a string is contained in a slice of strings.
    473  // The comparison is case-sensitive.
   474  func inSlice(slice []string, s string) bool {
   475  	for _, ss := range slice {
   476  		if s == ss {
   477  			return true
   478  		}
   479  	}
   480  	return false
   481  }
   482  
   483  // WithMounts sets the container's mounts
   484  func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
   485  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
   486  		if err := daemon.setupContainerMountsRoot(c); err != nil {
   487  			return err
   488  		}
   489  
   490  		if err := daemon.setupIpcDirs(c); err != nil {
   491  			return err
   492  		}
   493  
   494  		defer func() {
   495  			if err != nil {
   496  				daemon.cleanupSecretDir(c)
   497  			}
   498  		}()
   499  
   500  		if err := daemon.setupSecretDir(c); err != nil {
   501  			return err
   502  		}
   503  
   504  		ms, err := daemon.setupMounts(c)
   505  		if err != nil {
   506  			return err
   507  		}
   508  
   509  		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
   510  			ms = append(ms, c.IpcMounts()...)
   511  		}
   512  
   513  		tmpfsMounts, err := c.TmpfsMounts()
   514  		if err != nil {
   515  			return err
   516  		}
   517  		ms = append(ms, tmpfsMounts...)
   518  
   519  		secretMounts, err := c.SecretMounts()
   520  		if err != nil {
   521  			return err
   522  		}
   523  		ms = append(ms, secretMounts...)
   524  
   525  		sort.Sort(mounts(ms))
   526  
   527  		mounts := ms
   528  
   529  		userMounts := make(map[string]struct{})
   530  		for _, m := range mounts {
   531  			userMounts[m.Destination] = struct{}{}
   532  		}
   533  
   534  		// Copy all mounts from spec to defaultMounts, except for
   535  		//  - mounts overridden by a user supplied mount;
   536  		//  - all mounts under /dev if a user supplied /dev is present;
   537  		//  - /dev/shm, in case IpcMode is none.
   538  		// While at it, also
   539  		//  - set size for /dev/shm from shmsize.
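         		// defaultMounts reuses the backing array of s.Mounts, filtering it in place.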
   540  		defaultMounts := s.Mounts[:0]
   541  		_, mountDev := userMounts["/dev"]
   542  		for _, m := range s.Mounts {
   543  			if _, ok := userMounts[m.Destination]; ok {
   544  				// filter out mount overridden by a user supplied mount
   545  				continue
   546  			}
   547  			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
   548  				// filter out everything under /dev if /dev is user-mounted
   549  				continue
   550  			}
   551  
   552  			if m.Destination == "/dev/shm" {
   553  				if c.HostConfig.IpcMode.IsNone() {
   554  					// filter out /dev/shm for "none" IpcMode
   555  					continue
   556  				}
   557  				// set size for /dev/shm mount from spec
   558  				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
   559  				m.Options = append(m.Options, sizeOpt)
   560  			}
   561  
   562  			defaultMounts = append(defaultMounts, m)
   563  		}
   564  
   565  		s.Mounts = defaultMounts
   566  		for _, m := range mounts {
   567  			if m.Source == "tmpfs" {
   568  				data := m.Data
   569  				parser := volumemounts.NewParser("linux")
   570  				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
   571  				if data != "" {
   572  					options = append(options, strings.Split(data, ",")...)
   573  				}
   574  
   575  				merged, err := mount.MergeTmpfsOptions(options)
   576  				if err != nil {
   577  					return err
   578  				}
   579  
   580  				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
   581  				continue
   582  			}
   583  
   584  			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
   585  
   586  			// Determine property of RootPropagation based on volume
   587  			// properties. If a volume is shared, then keep root propagation
   588  			// shared. This should work for slave and private volumes too.
   589  			//
   590  			// For slave volumes, it can be either [r]shared/[r]slave.
   591  			//
   592  			// For private volumes any root propagation value should work.
   593  			pFlag := mountPropagationMap[m.Propagation]
   594  			switch pFlag {
   595  			case mount.SHARED, mount.RSHARED:
   596  				if err := ensureShared(m.Source); err != nil {
   597  					return err
   598  				}
   599  				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   600  				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
   601  					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
   602  				}
   603  			case mount.SLAVE, mount.RSLAVE:
   604  				var fallback bool
   605  				if err := ensureSharedOrSlave(m.Source); err != nil {
    606  					// For backwards-compatibility purposes, treat mounts from the daemon root
    607  					// as special, since we automatically add rslave propagation to these
    608  					// mounts when the user did not set anything; in that case we should
    609  					// fall back to the old behavior, which is to use private propagation
    610  					// (normally the default).
   611  					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
   612  						return err
   613  					}
   614  
   615  					cm, ok := c.MountPoints[m.Destination]
   616  					if !ok {
   617  						return err
   618  					}
   619  					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
   620  						// This means the user explicitly set a propagation, do not fallback in that case.
   621  						return err
   622  					}
   623  					fallback = true
   624  					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
   625  				}
   626  				if !fallback {
   627  					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   628  					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
   629  						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
   630  					}
   631  				}
   632  			}
   633  
   634  			bindMode := "rbind"
   635  			if m.NonRecursive {
   636  				bindMode = "bind"
   637  			}
   638  			opts := []string{bindMode}
   639  			if !m.Writable {
   640  				opts = append(opts, "ro")
   641  			}
   642  			if pFlag != 0 {
   643  				opts = append(opts, mountPropagationReverseMap[pFlag])
   644  			}
   645  
   646  			// If we are using user namespaces, then we must make sure that we
   647  			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
   648  			// "mount" when we bind-mount. The reason for this is that at the point
   649  			// when runc sets up the root filesystem, it is already inside a user
   650  			// namespace, and thus cannot change any flags that are locked.
   651  			if daemon.configStore.RemappedRoot != "" {
   652  				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
   653  				if err != nil {
   654  					return err
   655  				}
   656  				opts = append(opts, unprivOpts...)
   657  			}
   658  
   659  			mt.Options = opts
   660  			s.Mounts = append(s.Mounts, mt)
   661  		}
   662  
   663  		if s.Root.Readonly {
   664  			for i, m := range s.Mounts {
   665  				switch m.Destination {
   666  				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
   667  					continue
   668  				}
   669  				if _, ok := userMounts[m.Destination]; !ok {
   670  					if !inSlice(m.Options, "ro") {
   671  						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
   672  					}
   673  				}
   674  			}
   675  		}
   676  
   677  		if c.HostConfig.Privileged {
   678  			// clear readonly for /sys
   679  			for i := range s.Mounts {
   680  				if s.Mounts[i].Destination == "/sys" {
   681  					clearReadOnly(&s.Mounts[i])
   682  				}
   683  			}
   684  			s.Linux.ReadonlyPaths = nil
   685  			s.Linux.MaskedPaths = nil
   686  		}
   687  
   688  		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
   689  		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
   690  		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
   691  			for i, m := range s.Mounts {
   692  				if m.Type == "cgroup" {
   693  					clearReadOnly(&s.Mounts[i])
   694  				}
   695  			}
   696  		}
   697  
   698  		return nil
   699  
   700  	}
   701  }
   702  
   703  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   704  // exist, so do not add the default ones if running on an old kernel.
   705  func sysctlExists(s string) bool {
   706  	f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
   707  	_, err := os.Stat(f)
   708  	return err == nil
   709  }
   710  
   711  // WithCommonOptions sets common docker options
   712  func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
   713  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   714  		if c.BaseFS == nil {
   715  			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
   716  		}
   717  		linkedEnv, err := daemon.setupLinkedContainers(c)
   718  		if err != nil {
   719  			return err
   720  		}
   721  		s.Root = &specs.Root{
   722  			Path:     c.BaseFS.Path(),
   723  			Readonly: c.HostConfig.ReadonlyRootfs,
   724  		}
   725  		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
   726  			return err
   727  		}
   728  		cwd := c.Config.WorkingDir
   729  		if len(cwd) == 0 {
   730  			cwd = "/"
   731  		}
   732  		s.Process.Args = append([]string{c.Path}, c.Args...)
   733  
    734  		// Only add the custom init if it is specified and the container is running in its
    735  		// own private PID namespace. It does not make sense to add one if the container runs
    736  		// in the host's PID namespace or another container's PID namespace, where an init already exists.
   737  		if c.HostConfig.PidMode.IsPrivate() {
   738  			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
   739  				(c.HostConfig.Init == nil && daemon.configStore.Init) {
   740  				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
   741  				path := daemon.configStore.InitPath
   742  				if path == "" {
   743  					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
   744  					if err != nil {
   745  						return err
   746  					}
   747  				}
   748  				s.Mounts = append(s.Mounts, specs.Mount{
   749  					Destination: inContainerInitPath,
   750  					Type:        "bind",
   751  					Source:      path,
   752  					Options:     []string{"bind", "ro"},
   753  				})
   754  			}
   755  		}
   756  		s.Process.Cwd = cwd
   757  		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
   758  		s.Process.Terminal = c.Config.Tty
   759  
   760  		s.Hostname = c.Config.Hostname
   761  		setLinuxDomainname(c, s)
   762  
   763  		// Add default sysctls that are generally safe and useful; currently we
   764  		// grant the capabilities to allow these anyway. You can override if
   765  		// you want to restore the original behaviour.
   766  		// We do not set network sysctls if network namespace is host, or if we are
   767  		// joining an existing namespace, only if we create a new net namespace.
   768  		if c.HostConfig.NetworkMode.IsPrivate() {
   769  			// We cannot set up ping socket support in a user namespace
   770  			if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") {
   771  				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
   772  				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
   773  			}
   774  			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
   775  			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
   776  				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
   777  			}
   778  		}
   779  
   780  		return nil
   781  	}
   782  }
   783  
   784  // WithCgroups sets the container's cgroups
   785  func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
   786  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   787  		var cgroupsPath string
   788  		scopePrefix := "docker"
   789  		parent := "/docker"
   790  		useSystemd := UsingSystemd(daemon.configStore)
   791  		if useSystemd {
   792  			parent = "system.slice"
   793  			if daemon.configStore.Rootless {
   794  				parent = "user.slice"
   795  			}
   796  		}
   797  
   798  		if c.HostConfig.CgroupParent != "" {
   799  			parent = c.HostConfig.CgroupParent
   800  		} else if daemon.configStore.CgroupParent != "" {
   801  			parent = daemon.configStore.CgroupParent
   802  		}
   803  
   804  		if useSystemd {
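         			// systemd cgroup paths take the form "slice:prefix:name", e.g. "system.slice:docker:<container-id>".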
   805  			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
   806  			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
   807  		} else {
   808  			cgroupsPath = filepath.Join(parent, c.ID)
   809  		}
   810  		s.Linux.CgroupsPath = cgroupsPath
   811  
   812  		// the rest is only needed for CPU RT controller
   813  
   814  		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
   815  			return nil
   816  		}
   817  
   818  		if cdcgroups.Mode() == cdcgroups.Unified {
   819  			return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2")
   820  		}
   821  
    822  		// FIXME this is a very expensive way to check whether CPU RT is supported
   823  		sysInfo := daemon.RawSysInfo(true)
   824  		if !sysInfo.CPURealtime {
   825  			return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not supported by the kernel")
   826  		}
   827  
   828  		p := cgroupsPath
   829  		if useSystemd {
   830  			initPath, err := cgroups.GetInitCgroup("cpu")
   831  			if err != nil {
   832  				return errors.Wrap(err, "unable to init CPU RT controller")
   833  			}
   834  			_, err = cgroups.GetOwnCgroup("cpu")
   835  			if err != nil {
   836  				return errors.Wrap(err, "unable to init CPU RT controller")
   837  			}
   838  			p = filepath.Join(initPath, s.Linux.CgroupsPath)
   839  		}
   840  
   841  		// Clean path to guard against things like ../../../BAD
   842  		parentPath := filepath.Dir(p)
   843  		if !filepath.IsAbs(parentPath) {
   844  			parentPath = filepath.Clean("/" + parentPath)
   845  		}
   846  
   847  		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
   848  		if err != nil {
   849  			return errors.Wrap(err, "unable to init CPU RT controller")
   850  		}
    851  		// When docker is run inside docker, the root is based on the host cgroup.
    852  		// Should this be handled in runc/libcontainer/cgroups?
   853  		if strings.HasPrefix(root, "/docker/") {
   854  			root = "/"
   855  		}
   856  		mnt = filepath.Join(mnt, root)
   857  
   858  		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
   859  			return errors.Wrap(err, "unable to init CPU RT controller")
   860  		}
   861  		return nil
   862  	}
   863  }
   864  
   865  // WithDevices sets the container's devices
   866  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   867  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   868  		// Build lists of devices allowed and created within the container.
   869  		var devs []specs.LinuxDevice
   870  		devPermissions := s.Linux.Resources.Devices
   871  
   872  		if c.HostConfig.Privileged && !sys.RunningInUserNS() {
   873  			hostDevices, err := devices.HostDevices()
   874  			if err != nil {
   875  				return err
   876  			}
   877  			for _, d := range hostDevices {
   878  				devs = append(devs, oci.Device(d))
   879  			}
   880  
   881  			// adding device mappings in privileged containers
   882  			for _, deviceMapping := range c.HostConfig.Devices {
   883  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   884  				if deviceMapping.CgroupPermissions != "rwm" {
   885  					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   886  				}
   887  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   888  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   889  					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   890  					continue
   891  				}
   892  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   893  				if err != nil {
   894  					return err
   895  				}
   896  				devs = append(devs, d...)
   897  			}
   898  
   899  			devPermissions = []specs.LinuxDeviceCgroup{
   900  				{
   901  					Allow:  true,
   902  					Access: "rwm",
   903  				},
   904  			}
   905  		} else {
   906  			for _, deviceMapping := range c.HostConfig.Devices {
   907  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   908  				if err != nil {
   909  					return err
   910  				}
   911  				devs = append(devs, d...)
   912  				devPermissions = append(devPermissions, dPermissions...)
   913  			}
   914  
   915  			var err error
   916  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   917  			if err != nil {
   918  				return err
   919  			}
   920  		}
   921  
   922  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   923  		s.Linux.Resources.Devices = devPermissions
   924  
   925  		for _, req := range c.HostConfig.DeviceRequests {
   926  			if err := daemon.handleDevice(req, s); err != nil {
   927  				return err
   928  			}
   929  		}
   930  		return nil
   931  	}
   932  }
   933  
   934  // WithResources applies the container resources
   935  func WithResources(c *container.Container) coci.SpecOpts {
   936  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   937  		r := c.HostConfig.Resources
   938  		weightDevices, err := getBlkioWeightDevices(r)
   939  		if err != nil {
   940  			return err
   941  		}
   942  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   943  		if err != nil {
   944  			return err
   945  		}
   946  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   947  		if err != nil {
   948  			return err
   949  		}
   950  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   951  		if err != nil {
   952  			return err
   953  		}
   954  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   955  		if err != nil {
   956  			return err
   957  		}
   958  
   959  		memoryRes := getMemoryResources(r)
   960  		cpuRes, err := getCPUResources(r)
   961  		if err != nil {
   962  			return err
   963  		}
   964  		blkioWeight := r.BlkioWeight
   965  
   966  		specResources := &specs.LinuxResources{
   967  			Memory: memoryRes,
   968  			CPU:    cpuRes,
   969  			BlockIO: &specs.LinuxBlockIO{
   970  				Weight:                  &blkioWeight,
   971  				WeightDevice:            weightDevices,
   972  				ThrottleReadBpsDevice:   readBpsDevice,
   973  				ThrottleWriteBpsDevice:  writeBpsDevice,
   974  				ThrottleReadIOPSDevice:  readIOpsDevice,
   975  				ThrottleWriteIOPSDevice: writeIOpsDevice,
   976  			},
   977  			Pids: getPidsLimit(r),
   978  		}
   979  
   980  		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
   981  			specResources.Devices = s.Linux.Resources.Devices
   982  		}
   983  
   984  		s.Linux.Resources = specResources
   985  		return nil
   986  	}
   987  }
   988  
   989  // WithSysctls sets the container's sysctls
   990  func WithSysctls(c *container.Container) coci.SpecOpts {
   991  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    992  		// We merge the sysctls injected above with the HostConfig (the latter takes
    993  		// precedence for backwards-compatibility reasons).
   994  		for k, v := range c.HostConfig.Sysctls {
   995  			s.Linux.Sysctl[k] = v
   996  		}
   997  		return nil
   998  	}
   999  }
  1000  
  1001  // WithUser sets the container's user
  1002  func WithUser(c *container.Container) coci.SpecOpts {
  1003  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1004  		var err error
  1005  		s.Process.User, err = getUser(c, c.Config.User)
  1006  		return err
  1007  	}
  1008  }
  1009  
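         // createSpec assembles the OCI runtime spec for the container by applying the spec
         // options above, in order, to the default spec.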
  1010  func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  1011  	var (
  1012  		opts []coci.SpecOpts
  1013  		s    = oci.DefaultSpec()
  1014  	)
  1015  	opts = append(opts,
  1016  		WithCommonOptions(daemon, c),
  1017  		WithCgroups(daemon, c),
  1018  		WithResources(c),
  1019  		WithSysctls(c),
  1020  		WithDevices(daemon, c),
  1021  		WithUser(c),
  1022  		WithRlimits(daemon, c),
  1023  		WithNamespaces(daemon, c),
  1024  		WithCapabilities(c),
  1025  		WithSeccomp(daemon, c),
  1026  		WithMounts(daemon, c),
  1027  		WithLibnetwork(daemon, c),
  1028  		WithApparmor(c),
  1029  		WithSelinux(c),
  1030  		WithOOMScore(&c.HostConfig.OomScoreAdj),
  1031  	)
  1032  	if c.NoNewPrivileges {
  1033  		opts = append(opts, coci.WithNoNewPrivileges)
  1034  	}
  1035  
  1036  	// Set the masked and readonly paths with regard to the host config options if they are set.
  1037  	if c.HostConfig.MaskedPaths != nil {
  1038  		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1039  	}
  1040  	if c.HostConfig.ReadonlyPaths != nil {
  1041  		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1042  	}
  1043  	if daemon.configStore.Rootless {
  1044  		opts = append(opts, WithRootless(daemon))
  1045  	}
  1046  	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
  1047  		ID: c.ID,
  1048  	}, &s, opts...)
  1049  }
  1050  
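         // clearReadOnly removes the "ro" option from a mount.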
  1051  func clearReadOnly(m *specs.Mount) {
  1052  	var opt []string
  1053  	for _, o := range m.Options {
  1054  		if o != "ro" {
  1055  			opt = append(opt, o)
  1056  		}
  1057  	}
  1058  	m.Options = opt
  1059  }
  1060  
   1061  // mergeUlimits merges the Ulimits from HostConfig with the daemon defaults, and updates HostConfig
  1062  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  1063  	ulimits := c.Ulimits
  1064  	// Merge ulimits with daemon defaults
  1065  	ulIdx := make(map[string]struct{})
  1066  	for _, ul := range ulimits {
  1067  		ulIdx[ul.Name] = struct{}{}
  1068  	}
  1069  	for name, ul := range daemon.configStore.Ulimits {
  1070  		if _, exists := ulIdx[name]; !exists {
  1071  			ulimits = append(ulimits, ul)
  1072  		}
  1073  	}
  1074  	c.Ulimits = ulimits
  1075  }