github.com/Heebron/moby@v0.0.0-20221111184709-6eab4f55faf7/daemon/oci_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"os/exec"
     8  	"path/filepath"
     9  	"sort"
    10  	"strconv"
    11  	"strings"
    12  
    13  	cdcgroups "github.com/containerd/cgroups"
    14  	"github.com/containerd/containerd/containers"
    15  	coci "github.com/containerd/containerd/oci"
    16  	"github.com/containerd/containerd/pkg/apparmor"
    17  	"github.com/containerd/containerd/pkg/userns"
    18  	containertypes "github.com/docker/docker/api/types/container"
    19  	"github.com/docker/docker/container"
    20  	dconfig "github.com/docker/docker/daemon/config"
    21  	"github.com/docker/docker/errdefs"
    22  	"github.com/docker/docker/oci"
    23  	"github.com/docker/docker/oci/caps"
    24  	"github.com/docker/docker/pkg/idtools"
    25  	"github.com/docker/docker/pkg/stringid"
    26  	"github.com/docker/docker/rootless/specconv"
    27  	volumemounts "github.com/docker/docker/volume/mounts"
    28  	"github.com/moby/sys/mount"
    29  	"github.com/moby/sys/mountinfo"
    30  	"github.com/opencontainers/runc/libcontainer/cgroups"
    31  	"github.com/opencontainers/runc/libcontainer/user"
    32  	specs "github.com/opencontainers/runtime-spec/specs-go"
    33  	"github.com/pkg/errors"
    34  	"github.com/sirupsen/logrus"
    35  	"golang.org/x/sys/unix"
    36  )
    37  
// inContainerInitPath is the in-container path at which the docker-init
// binary is bind-mounted when init is enabled (see WithCommonOptions).
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
    39  
    40  // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
    41  func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    42  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    43  		var rlimits []specs.POSIXRlimit
    44  
    45  		// We want to leave the original HostConfig alone so make a copy here
    46  		hostConfig := *c.HostConfig
    47  		// Merge with the daemon defaults
    48  		daemon.mergeUlimits(&hostConfig)
    49  		for _, ul := range hostConfig.Ulimits {
    50  			rlimits = append(rlimits, specs.POSIXRlimit{
    51  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    52  				Soft: uint64(ul.Soft),
    53  				Hard: uint64(ul.Hard),
    54  			})
    55  		}
    56  
    57  		s.Process.Rlimits = rlimits
    58  		return nil
    59  	}
    60  }
    61  
    62  // WithLibnetwork sets the libnetwork hook
    63  func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    64  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    65  		if s.Hooks == nil {
    66  			s.Hooks = &specs.Hooks{}
    67  		}
    68  		for _, ns := range s.Linux.Namespaces {
    69  			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
    70  				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
    71  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    72  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    73  					Path: target,
    74  					Args: []string{
    75  						"libnetwork-setkey",
    76  						"-exec-root=" + daemon.configStore.GetExecRoot(),
    77  						c.ID,
    78  						shortNetCtlrID,
    79  					},
    80  				})
    81  			}
    82  		}
    83  		return nil
    84  	}
    85  }
    86  
    87  // WithRootless sets the spec to the rootless configuration
    88  func WithRootless(daemon *Daemon) coci.SpecOpts {
    89  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    90  		var v2Controllers []string
    91  		if daemon.getCgroupDriver() == cgroupSystemdDriver {
    92  			if cdcgroups.Mode() != cdcgroups.Unified {
    93  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    94  			}
    95  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    96  			if rootlesskitParentEUID == "" {
    97  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    98  			}
    99  			euid, err := strconv.Atoi(rootlesskitParentEUID)
   100  			if err != nil {
   101  				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
   102  			}
   103  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
   104  			controllersFile, err := os.ReadFile(controllersPath)
   105  			if err != nil {
   106  				return err
   107  			}
   108  			v2Controllers = strings.Fields(string(controllersFile))
   109  		}
   110  		return specconv.ToRootless(s, v2Controllers)
   111  	}
   112  }
   113  
   114  // WithOOMScore sets the oom score
   115  func WithOOMScore(score *int) coci.SpecOpts {
   116  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   117  		s.Process.OOMScoreAdj = score
   118  		return nil
   119  	}
   120  }
   121  
   122  // WithSelinux sets the selinux labels
   123  func WithSelinux(c *container.Container) coci.SpecOpts {
   124  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   125  		s.Process.SelinuxLabel = c.GetProcessLabel()
   126  		s.Linux.MountLabel = c.MountLabel
   127  		return nil
   128  	}
   129  }
   130  
   131  // WithApparmor sets the apparmor profile
   132  func WithApparmor(c *container.Container) coci.SpecOpts {
   133  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   134  		if apparmor.HostSupports() {
   135  			var appArmorProfile string
   136  			if c.AppArmorProfile != "" {
   137  				appArmorProfile = c.AppArmorProfile
   138  			} else if c.HostConfig.Privileged {
   139  				appArmorProfile = unconfinedAppArmorProfile
   140  			} else {
   141  				appArmorProfile = defaultAppArmorProfile
   142  			}
   143  
   144  			if appArmorProfile == defaultAppArmorProfile {
   145  				// Unattended upgrades and other fun services can unload AppArmor
   146  				// profiles inadvertently. Since we cannot store our profile in
   147  				// /etc/apparmor.d, nor can we practically add other ways of
   148  				// telling the system to keep our profile loaded, in order to make
   149  				// sure that we keep the default profile enabled we dynamically
   150  				// reload it if necessary.
   151  				if err := ensureDefaultAppArmorProfile(); err != nil {
   152  					return err
   153  				}
   154  			}
   155  			s.Process.ApparmorProfile = appArmorProfile
   156  		}
   157  		return nil
   158  	}
   159  }
   160  
   161  // WithCapabilities sets the container's capabilties
   162  func WithCapabilities(c *container.Container) coci.SpecOpts {
   163  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   164  		capabilities, err := caps.TweakCapabilities(
   165  			caps.DefaultCapabilities(),
   166  			c.HostConfig.CapAdd,
   167  			c.HostConfig.CapDrop,
   168  			c.HostConfig.Privileged,
   169  		)
   170  		if err != nil {
   171  			return err
   172  		}
   173  		return oci.SetCapabilities(s, capabilities)
   174  	}
   175  }
   176  
// resourcePath resolves the path returned by getPath (e.g. the location of
// /etc/passwd or /etc/group) to its location inside the container's rootfs.
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}
   184  
   185  func getUser(c *container.Container, username string) (specs.User, error) {
   186  	var usr specs.User
   187  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   188  	if err != nil {
   189  		return usr, err
   190  	}
   191  	groupPath, err := resourcePath(c, user.GetGroupPath)
   192  	if err != nil {
   193  		return usr, err
   194  	}
   195  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   196  	if err != nil {
   197  		return usr, err
   198  	}
   199  	usr.UID = uint32(execUser.Uid)
   200  	usr.GID = uint32(execUser.Gid)
   201  	usr.AdditionalGids = []uint32{usr.GID}
   202  
   203  	var addGroups []int
   204  	if len(c.HostConfig.GroupAdd) > 0 {
   205  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   206  		if err != nil {
   207  			return usr, err
   208  		}
   209  	}
   210  	for _, g := range append(execUser.Sgids, addGroups...) {
   211  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   212  	}
   213  	return usr, nil
   214  }
   215  
   216  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   217  	for i, n := range s.Linux.Namespaces {
   218  		if n.Type == ns.Type {
   219  			s.Linux.Namespaces[i] = ns
   220  			return
   221  		}
   222  	}
   223  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   224  }
   225  
// WithNamespaces sets the container's namespaces (user, network, ipc, pid,
// uts, and cgroup), validating the corresponding HostConfig modes and
// resolving "container:"-style modes to the target container's /proc paths.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDMaps
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				// Join the network namespace of another container.
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			// Join the IPC namespace of another container.
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		}

		// pid
		if !c.HostConfig.PidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
		}
		if c.HostConfig.PidMode.IsContainer() {
			// Join the PID namespace of another container.
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			// Host UTS namespace: drop the namespace entry and the hostname,
			// since the container shares the host's.
			oci.RemoveNamespace(s, "uts")
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			if c.HostConfig.CgroupnsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}
   345  
   346  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   347  	var ids []specs.LinuxIDMapping
   348  	for _, item := range s {
   349  		ids = append(ids, specs.LinuxIDMapping{
   350  			HostID:      uint32(item.HostID),
   351  			ContainerID: uint32(item.ContainerID),
   352  			Size:        uint32(item.Size),
   353  		})
   354  	}
   355  	return ids
   356  }
   357  
   358  // Get the source mount point of directory passed in as argument. Also return
   359  // optional fields.
   360  func getSourceMount(source string) (string, string, error) {
   361  	// Ensure any symlinks are resolved.
   362  	sourcePath, err := filepath.EvalSymlinks(source)
   363  	if err != nil {
   364  		return "", "", err
   365  	}
   366  
   367  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   368  	if err != nil {
   369  		return "", "", err
   370  	}
   371  	if len(mi) < 1 {
   372  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   373  	}
   374  
   375  	// find the longest mount point
   376  	var idx, maxlen int
   377  	for i := range mi {
   378  		if len(mi[i].Mountpoint) > maxlen {
   379  			maxlen = len(mi[i].Mountpoint)
   380  			idx = i
   381  		}
   382  	}
   383  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   384  }
   385  
const (
	// Prefixes of the optional fields in /proc/self/mountinfo that mark a
	// mount's propagation: "shared:N" for shared mounts, "master:N" for
	// slave mounts.
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
   390  
   391  // hasMountInfoOption checks if any of the passed any of the given option values
   392  // are set in the passed in option string.
   393  func hasMountInfoOption(opts string, vals ...string) bool {
   394  	for _, opt := range strings.Split(opts, " ") {
   395  		for _, val := range vals {
   396  			if strings.HasPrefix(opt, val) {
   397  				return true
   398  			}
   399  		}
   400  	}
   401  	return false
   402  }
   403  
   404  // Ensure mount point on which path is mounted, is shared.
   405  func ensureShared(path string) error {
   406  	sourceMount, optionalOpts, err := getSourceMount(path)
   407  	if err != nil {
   408  		return err
   409  	}
   410  	// Make sure source mount point is shared.
   411  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   412  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   413  	}
   414  	return nil
   415  }
   416  
   417  // Ensure mount point on which path is mounted, is either shared or slave.
   418  func ensureSharedOrSlave(path string) error {
   419  	sourceMount, optionalOpts, err := getSourceMount(path)
   420  	if err != nil {
   421  		return err
   422  	}
   423  
   424  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   425  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   426  	}
   427  	return nil
   428  }
   429  
   430  // Get the set of mount flags that are set on the mount that contains the given
   431  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   432  // bind-mounting "with options" will not fail with user namespaces, due to
   433  // kernel restrictions that require user namespace mounts to preserve
   434  // CL_UNPRIVILEGED locked flags.
   435  func getUnprivilegedMountFlags(path string) ([]string, error) {
   436  	var statfs unix.Statfs_t
   437  	if err := unix.Statfs(path, &statfs); err != nil {
   438  		return nil, err
   439  	}
   440  
   441  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   442  	unprivilegedFlags := map[uint64]string{
   443  		unix.MS_RDONLY:     "ro",
   444  		unix.MS_NODEV:      "nodev",
   445  		unix.MS_NOEXEC:     "noexec",
   446  		unix.MS_NOSUID:     "nosuid",
   447  		unix.MS_NOATIME:    "noatime",
   448  		unix.MS_RELATIME:   "relatime",
   449  		unix.MS_NODIRATIME: "nodiratime",
   450  	}
   451  
   452  	var flags []string
   453  	for mask, flag := range unprivilegedFlags {
   454  		if uint64(statfs.Flags)&mask == mask {
   455  			flags = append(flags, flag)
   456  		}
   457  	}
   458  
   459  	return flags, nil
   460  }
   461  
var (
	// mountPropagationMap maps propagation mode names (as they appear in
	// mount options / the API) to the mount package's flag values.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap,
	// used to render flag values back into option strings.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   481  
   482  // inSlice tests whether a string is contained in a slice of strings or not.
   483  // Comparison is case sensitive
   484  func inSlice(slice []string, s string) bool {
   485  	for _, ss := range slice {
   486  		if s == ss {
   487  			return true
   488  		}
   489  	}
   490  	return false
   491  }
   492  
// WithMounts sets the container's mounts: it merges user-supplied, IPC,
// tmpfs, and secret mounts with the spec's default mounts, applies bind
// and tmpfs options, and adjusts rootfs propagation and read-only flags
// as required.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	// NOTE: err is a named return so the deferred cleanup below can observe it.
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// Remove the secret dir again if anything below fails.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		// NOTE: this local shadows the "mounts" slice type used for sorting above.
		mounts := ms

		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		// Translate each daemon mount into an OCI spec mount entry.
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			// With a read-only rootfs, mark all non-user mounts read-only too,
			// except the special filesystems below.
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}
   711  
   712  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   713  // exist, so do not add the default ones if running on an old kernel.
   714  func sysctlExists(s string) bool {
   715  	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
   716  	_, err := os.Stat(f)
   717  	return err == nil
   718  }
   719  
// WithCommonOptions sets common docker options: the rootfs, working
// directory, process args (optionally prefixed with docker-init), env,
// terminal settings, hostname/domainname, and default network sysctls.
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == "" && !daemon.UsesSnapshotter() {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		// With the containerd snapshotter the root is managed elsewhere;
		// only set it from BaseFS for the classic graphdriver path.
		if !daemon.UsesSnapshotter() {
			s.Root = &specs.Root{
				Path:     c.BaseFS,
				Readonly: c.HostConfig.ReadonlyRootfs,
			}
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace.  It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				// Run the user's command under docker-init, bind-mounted
				// into the container at inContainerInitPath.
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					path, err = exec.LookPath(dconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}
   795  
// WithCgroups sets the container's cgroups path (systemd "slice:prefix:id"
// form or a plain filesystem path) and, when CPU real-time limits are
// configured on the daemon, initializes the CPU RT controller hierarchy.
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
			if daemon.configStore.Rootless {
				parent = "user.slice"
			}
		}

		// Container-level cgroup parent overrides the daemon-level one.
		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath

		// the rest is only needed for CPU RT controller

		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
			return nil
		}

		p := cgroupsPath
		if useSystemd {
			// Under systemd, resolve the real cpu-cgroup path from the init cgroup.
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
		if err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		// When docker is run inside docker, the root is based of the host cgroup.
		// Should this be handled in runc/libcontainer/cgroups ?
		if strings.HasPrefix(root, "/docker/") {
			root = "/"
		}
		mnt = filepath.Join(mnt, root)

		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		return nil
	}
}
   866  
   867  // WithDevices sets the container's devices
   868  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   869  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   870  		// Build lists of devices allowed and created within the container.
   871  		var devs []specs.LinuxDevice
   872  		devPermissions := s.Linux.Resources.Devices
   873  
   874  		if c.HostConfig.Privileged {
   875  			hostDevices, err := coci.HostDevices()
   876  			if err != nil {
   877  				return err
   878  			}
   879  			devs = append(devs, hostDevices...)
   880  
   881  			// adding device mappings in privileged containers
   882  			for _, deviceMapping := range c.HostConfig.Devices {
   883  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   884  				if deviceMapping.CgroupPermissions != "rwm" {
   885  					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   886  				}
   887  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   888  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   889  					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   890  					continue
   891  				}
   892  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   893  				if err != nil {
   894  					return err
   895  				}
   896  				devs = append(devs, d...)
   897  			}
   898  
   899  			devPermissions = []specs.LinuxDeviceCgroup{
   900  				{
   901  					Allow:  true,
   902  					Access: "rwm",
   903  				},
   904  			}
   905  		} else {
   906  			for _, deviceMapping := range c.HostConfig.Devices {
   907  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   908  				if err != nil {
   909  					return err
   910  				}
   911  				devs = append(devs, d...)
   912  				devPermissions = append(devPermissions, dPermissions...)
   913  			}
   914  
   915  			var err error
   916  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   917  			if err != nil {
   918  				return err
   919  			}
   920  		}
   921  
   922  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   923  		s.Linux.Resources.Devices = devPermissions
   924  
   925  		for _, req := range c.HostConfig.DeviceRequests {
   926  			if err := daemon.handleDevice(req, s); err != nil {
   927  				return err
   928  			}
   929  		}
   930  		return nil
   931  	}
   932  }
   933  
   934  // WithResources applies the container resources
   935  func WithResources(c *container.Container) coci.SpecOpts {
   936  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   937  		r := c.HostConfig.Resources
   938  		weightDevices, err := getBlkioWeightDevices(r)
   939  		if err != nil {
   940  			return err
   941  		}
   942  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   943  		if err != nil {
   944  			return err
   945  		}
   946  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   947  		if err != nil {
   948  			return err
   949  		}
   950  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   951  		if err != nil {
   952  			return err
   953  		}
   954  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   955  		if err != nil {
   956  			return err
   957  		}
   958  
   959  		memoryRes := getMemoryResources(r)
   960  		cpuRes, err := getCPUResources(r)
   961  		if err != nil {
   962  			return err
   963  		}
   964  		blkioWeight := r.BlkioWeight
   965  
   966  		specResources := &specs.LinuxResources{
   967  			Memory: memoryRes,
   968  			CPU:    cpuRes,
   969  			BlockIO: &specs.LinuxBlockIO{
   970  				Weight:                  &blkioWeight,
   971  				WeightDevice:            weightDevices,
   972  				ThrottleReadBpsDevice:   readBpsDevice,
   973  				ThrottleWriteBpsDevice:  writeBpsDevice,
   974  				ThrottleReadIOPSDevice:  readIOpsDevice,
   975  				ThrottleWriteIOPSDevice: writeIOpsDevice,
   976  			},
   977  			Pids: getPidsLimit(r),
   978  		}
   979  
   980  		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
   981  			specResources.Devices = s.Linux.Resources.Devices
   982  		}
   983  
   984  		s.Linux.Resources = specResources
   985  		return nil
   986  	}
   987  }
   988  
   989  // WithSysctls sets the container's sysctls
   990  func WithSysctls(c *container.Container) coci.SpecOpts {
   991  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   992  		// We merge the sysctls injected above with the HostConfig (latter takes
   993  		// precedence for backwards-compatibility reasons).
   994  		for k, v := range c.HostConfig.Sysctls {
   995  			s.Linux.Sysctl[k] = v
   996  		}
   997  		return nil
   998  	}
   999  }
  1000  
  1001  // WithUser sets the container's user
  1002  func WithUser(c *container.Container) coci.SpecOpts {
  1003  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1004  		var err error
  1005  		s.Process.User, err = getUser(c, c.Config.User)
  1006  		return err
  1007  	}
  1008  }
  1009  
// createSpec builds the OCI runtime spec for container c by starting from
// the daemon's default spec and applying a sequence of SpecOpts. The order
// of the opts matters: later opts may read or override state set by
// earlier ones (e.g. WithSysctls layers HostConfig values over sysctls
// injected earlier).
func (daemon *Daemon) createSpec(ctx context.Context, c *container.Container) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		WithCommonOptions(daemon, c),
		WithCgroups(daemon, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		WithUser(c),
		WithRlimits(daemon, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		WithMounts(daemon, c),
		WithLibnetwork(daemon, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
	)
	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}
	if c.Config.Tty {
		opts = append(opts, WithConsoleSize(c))
	}
	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	if daemon.configStore.Rootless {
		opts = append(opts, WithRootless(daemon))
	}

	// Only populated when the containerd snapshotter backend is in use;
	// left empty for the classic graph-driver path.
	var snapshotter, snapshotKey string
	if daemon.UsesSnapshotter() {
		snapshotter = daemon.imageService.StorageDriver()
		snapshotKey = c.ID
	}

	return &s, coci.ApplyOpts(ctx, nil, &containers.Container{
		ID:          c.ID,
		Snapshotter: snapshotter,
		SnapshotKey: snapshotKey,
	}, &s, opts...)
}
  1061  
  1062  func clearReadOnly(m *specs.Mount) {
  1063  	var opt []string
  1064  	for _, o := range m.Options {
  1065  		if o != "ro" {
  1066  			opt = append(opt, o)
  1067  		}
  1068  	}
  1069  	m.Options = opt
  1070  }
  1071  
  1072  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1073  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  1074  	ulimits := c.Ulimits
  1075  	// Merge ulimits with daemon defaults
  1076  	ulIdx := make(map[string]struct{})
  1077  	for _, ul := range ulimits {
  1078  		ulIdx[ul.Name] = struct{}{}
  1079  	}
  1080  	for name, ul := range daemon.configStore.Ulimits {
  1081  		if _, exists := ulIdx[name]; !exists {
  1082  			ulimits = append(ulimits, ul)
  1083  		}
  1084  	}
  1085  	c.Ulimits = ulimits
  1086  }