github.com/rawahars/moby@v24.0.4+incompatible/daemon/oci_linux.go

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"path/filepath"
     8  	"sort"
     9  	"strconv"
    10  	"strings"
    11  
    12  	cdcgroups "github.com/containerd/cgroups/v3"
    13  	"github.com/containerd/containerd/containers"
    14  	coci "github.com/containerd/containerd/oci"
    15  	"github.com/containerd/containerd/pkg/apparmor"
    16  	"github.com/containerd/containerd/pkg/userns"
    17  	containertypes "github.com/docker/docker/api/types/container"
    18  	"github.com/docker/docker/container"
    19  	dconfig "github.com/docker/docker/daemon/config"
    20  	"github.com/docker/docker/errdefs"
    21  	"github.com/docker/docker/oci"
    22  	"github.com/docker/docker/oci/caps"
    23  	"github.com/docker/docker/pkg/idtools"
    24  	"github.com/docker/docker/pkg/rootless/specconv"
    25  	"github.com/docker/docker/pkg/stringid"
    26  	volumemounts "github.com/docker/docker/volume/mounts"
    27  	"github.com/moby/sys/mount"
    28  	"github.com/moby/sys/mountinfo"
    29  	"github.com/opencontainers/runc/libcontainer/cgroups"
    30  	"github.com/opencontainers/runc/libcontainer/user"
    31  	specs "github.com/opencontainers/runtime-spec/specs-go"
    32  	"github.com/pkg/errors"
    33  	"github.com/sirupsen/logrus"
    34  	"golang.org/x/sys/unix"
    35  )
    36  
    37  const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
    38  
     39  // WithRlimits sets the container's rlimits, merging them with the daemon's default rlimits
    40  func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    41  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    42  		var rlimits []specs.POSIXRlimit
    43  
    44  		// We want to leave the original HostConfig alone so make a copy here
    45  		hostConfig := *c.HostConfig
    46  		// Merge with the daemon defaults
    47  		daemon.mergeUlimits(&hostConfig)
    48  		for _, ul := range hostConfig.Ulimits {
    49  			rlimits = append(rlimits, specs.POSIXRlimit{
    50  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    51  				Soft: uint64(ul.Soft),
    52  				Hard: uint64(ul.Hard),
    53  			})
    54  		}
    55  
    56  		if s.Process == nil {
    57  			s.Process = &specs.Process{}
    58  		}
    59  		s.Process.Rlimits = rlimits
    60  		return nil
    61  	}
    62  }
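
// For illustration (hypothetical values): a `--ulimit nofile=1024:2048` entry on
// the container is rendered by the loop above as
//
//	specs.POSIXRlimit{Type: "RLIMIT_NOFILE", Soft: 1024, Hard: 2048}
//
// Daemon defaults are merged in first by mergeUlimits, so a container-level
// limit always overrides the daemon default of the same name.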
    63  
    64  // WithLibnetwork sets the libnetwork hook
    65  func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    66  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    67  		if s.Hooks == nil {
    68  			s.Hooks = &specs.Hooks{}
    69  		}
    70  		for _, ns := range s.Linux.Namespaces {
    71  			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
    72  				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
    73  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    74  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    75  					Path: target,
    76  					Args: []string{
    77  						"libnetwork-setkey",
    78  						"-exec-root=" + daemon.configStore.GetExecRoot(),
    79  						c.ID,
    80  						shortNetCtlrID,
    81  					},
    82  				})
    83  			}
    84  		}
    85  		return nil
    86  	}
    87  }
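
// For illustration (PID, paths and IDs are hypothetical): for a container with a
// private network namespace, the prestart hook appended above looks roughly like
//
//	specs.Hook{
//		Path: "/proc/1234/exe", // re-exec the running dockerd binary
//		Args: []string{"libnetwork-setkey", "-exec-root=/var/run/docker", "<container-id>", "<short-controller-id>"},
//	}
//
// so that libnetwork is told about the container's network namespace before the
// container's init process starts.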
    88  
    89  // WithRootless sets the spec to the rootless configuration
    90  func WithRootless(daemon *Daemon) coci.SpecOpts {
    91  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    92  		var v2Controllers []string
    93  		if daemon.getCgroupDriver() == cgroupSystemdDriver {
    94  			if cdcgroups.Mode() != cdcgroups.Unified {
    95  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    96  			}
    97  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    98  			if rootlesskitParentEUID == "" {
    99  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
   100  			}
   101  			euid, err := strconv.Atoi(rootlesskitParentEUID)
   102  			if err != nil {
   103  				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
   104  			}
   105  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
   106  			controllersFile, err := os.ReadFile(controllersPath)
   107  			if err != nil {
   108  				return err
   109  			}
   110  			v2Controllers = strings.Fields(string(controllersFile))
   111  		}
   112  		return specconv.ToRootless(s, v2Controllers)
   113  	}
   114  }
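
// For illustration (EUID and file contents are hypothetical): with the systemd
// cgroup driver and ROOTLESSKIT_PARENT_EUID=1000, the delegated controllers are
// read from
//
//	/sys/fs/cgroup/user.slice/user-1000.slice/cgroup.controllers
//
// and a file containing "cpuset cpu io memory pids" yields
//
//	v2Controllers = []string{"cpuset", "cpu", "io", "memory", "pids"}
//
// so specconv.ToRootless can adjust the spec for the controllers that are
// actually delegated to the user.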
   115  
   116  // WithOOMScore sets the oom score
   117  func WithOOMScore(score *int) coci.SpecOpts {
   118  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   119  		if s.Process == nil {
   120  			s.Process = &specs.Process{}
   121  		}
   122  		s.Process.OOMScoreAdj = score
   123  		return nil
   124  	}
   125  }
   126  
   127  // WithSelinux sets the selinux labels
   128  func WithSelinux(c *container.Container) coci.SpecOpts {
   129  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   130  		if s.Process == nil {
   131  			s.Process = &specs.Process{}
   132  		}
   133  		if s.Linux == nil {
   134  			s.Linux = &specs.Linux{}
   135  		}
   136  		s.Process.SelinuxLabel = c.GetProcessLabel()
   137  		s.Linux.MountLabel = c.MountLabel
   138  		return nil
   139  	}
   140  }
   141  
   142  // WithApparmor sets the apparmor profile
   143  func WithApparmor(c *container.Container) coci.SpecOpts {
   144  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   145  		if apparmor.HostSupports() {
   146  			var appArmorProfile string
   147  			if c.AppArmorProfile != "" {
   148  				appArmorProfile = c.AppArmorProfile
   149  			} else if c.HostConfig.Privileged {
   150  				appArmorProfile = unconfinedAppArmorProfile
   151  			} else {
   152  				appArmorProfile = defaultAppArmorProfile
   153  			}
   154  
   155  			if appArmorProfile == defaultAppArmorProfile {
   156  				// Unattended upgrades and other fun services can unload AppArmor
   157  				// profiles inadvertently. Since we cannot store our profile in
   158  				// /etc/apparmor.d, nor can we practically add other ways of
   159  				// telling the system to keep our profile loaded, in order to make
   160  				// sure that we keep the default profile enabled we dynamically
   161  				// reload it if necessary.
   162  				if err := ensureDefaultAppArmorProfile(); err != nil {
   163  					return err
   164  				}
   165  			}
   166  			if s.Process == nil {
   167  				s.Process = &specs.Process{}
   168  			}
   169  			s.Process.ApparmorProfile = appArmorProfile
   170  		}
   171  		return nil
   172  	}
   173  }
   174  
    175  // WithCapabilities sets the container's capabilities
   176  func WithCapabilities(c *container.Container) coci.SpecOpts {
   177  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   178  		capabilities, err := caps.TweakCapabilities(
   179  			caps.DefaultCapabilities(),
   180  			c.HostConfig.CapAdd,
   181  			c.HostConfig.CapDrop,
   182  			c.HostConfig.Privileged,
   183  		)
   184  		if err != nil {
   185  			return err
   186  		}
   187  		return oci.SetCapabilities(s, capabilities)
   188  	}
   189  }
   190  
   191  func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
   192  	p, err := getPath()
   193  	if err != nil {
   194  		return "", err
   195  	}
   196  	return c.GetResourcePath(p)
   197  }
   198  
   199  func getUser(c *container.Container, username string) (specs.User, error) {
   200  	var usr specs.User
   201  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   202  	if err != nil {
   203  		return usr, err
   204  	}
   205  	groupPath, err := resourcePath(c, user.GetGroupPath)
   206  	if err != nil {
   207  		return usr, err
   208  	}
   209  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   210  	if err != nil {
   211  		return usr, err
   212  	}
   213  	usr.UID = uint32(execUser.Uid)
   214  	usr.GID = uint32(execUser.Gid)
   215  	usr.AdditionalGids = []uint32{usr.GID}
   216  
   217  	var addGroups []int
   218  	if len(c.HostConfig.GroupAdd) > 0 {
   219  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   220  		if err != nil {
   221  			return usr, err
   222  		}
   223  	}
   224  	for _, g := range append(execUser.Sgids, addGroups...) {
   225  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   226  	}
   227  	return usr, nil
   228  }
   229  
   230  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   231  	if s.Linux == nil {
   232  		s.Linux = &specs.Linux{}
   233  	}
   234  
   235  	for i, n := range s.Linux.Namespaces {
   236  		if n.Type == ns.Type {
   237  			s.Linux.Namespaces[i] = ns
   238  			return
   239  		}
   240  	}
   241  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   242  }
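
// For illustration: setNamespace replaces an existing namespace entry of the
// same type rather than appending a duplicate. Given a spec that already has
// {Type: "network"}, calling
//
//	setNamespace(s, specs.LinuxNamespace{Type: "network", Path: "/proc/1234/ns/net"})
//
// overwrites that entry in place; a type not yet present is appended.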
   243  
   244  // WithNamespaces sets the container's namespaces
   245  func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
   246  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   247  		userNS := false
   248  		// user
   249  		if c.HostConfig.UsernsMode.IsPrivate() {
   250  			uidMap := daemon.idMapping.UIDMaps
   251  			if uidMap != nil {
   252  				userNS = true
   253  				ns := specs.LinuxNamespace{Type: "user"}
   254  				setNamespace(s, ns)
   255  				s.Linux.UIDMappings = specMapping(uidMap)
   256  				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
   257  			}
   258  		}
   259  		// network
   260  		if !c.Config.NetworkDisabled {
   261  			ns := specs.LinuxNamespace{Type: "network"}
   262  			if c.HostConfig.NetworkMode.IsContainer() {
   263  				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
   264  				if err != nil {
   265  					return err
   266  				}
   267  				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
   268  				if userNS {
   269  					// to share a net namespace, they must also share a user namespace
   270  					nsUser := specs.LinuxNamespace{Type: "user"}
   271  					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
   272  					setNamespace(s, nsUser)
   273  				}
   274  			} else if c.HostConfig.NetworkMode.IsHost() {
   275  				ns.Path = c.NetworkSettings.SandboxKey
   276  			}
   277  			setNamespace(s, ns)
   278  		}
   279  
   280  		// ipc
   281  		ipcMode := c.HostConfig.IpcMode
   282  		if !ipcMode.Valid() {
   283  			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
   284  		}
   285  		switch {
   286  		case ipcMode.IsContainer():
   287  			ns := specs.LinuxNamespace{Type: "ipc"}
   288  			ic, err := daemon.getIpcContainer(ipcMode.Container())
   289  			if err != nil {
   290  				return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
   291  			}
   292  			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
   293  			setNamespace(s, ns)
   294  			if userNS {
   295  				// to share an IPC namespace, they must also share a user namespace
   296  				nsUser := specs.LinuxNamespace{Type: "user"}
   297  				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
   298  				setNamespace(s, nsUser)
   299  			}
   300  		case ipcMode.IsHost():
   301  			oci.RemoveNamespace(s, "ipc")
   302  		case ipcMode.IsEmpty():
   303  			// A container was created by an older version of the daemon.
   304  			// The default behavior used to be what is now called "shareable".
   305  			fallthrough
   306  		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
   307  			ns := specs.LinuxNamespace{Type: "ipc"}
   308  			setNamespace(s, ns)
   309  		}
   310  
   311  		// pid
   312  		if !c.HostConfig.PidMode.Valid() {
   313  			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
   314  		}
   315  		if c.HostConfig.PidMode.IsContainer() {
   316  			pc, err := daemon.getPidContainer(c)
   317  			if err != nil {
   318  				return err
   319  			}
   320  			ns := specs.LinuxNamespace{
   321  				Type: "pid",
   322  				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
   323  			}
   324  			setNamespace(s, ns)
   325  			if userNS {
   326  				// to share a PID namespace, they must also share a user namespace
   327  				nsUser := specs.LinuxNamespace{
   328  					Type: "user",
   329  					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
   330  				}
   331  				setNamespace(s, nsUser)
   332  			}
   333  		} else if c.HostConfig.PidMode.IsHost() {
   334  			oci.RemoveNamespace(s, "pid")
   335  		} else {
   336  			ns := specs.LinuxNamespace{Type: "pid"}
   337  			setNamespace(s, ns)
   338  		}
   339  		// uts
   340  		if !c.HostConfig.UTSMode.Valid() {
   341  			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
   342  		}
   343  		if c.HostConfig.UTSMode.IsHost() {
   344  			oci.RemoveNamespace(s, "uts")
   345  			s.Hostname = ""
   346  		}
   347  
   348  		// cgroup
   349  		if !c.HostConfig.CgroupnsMode.Valid() {
   350  			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
   351  		}
   352  		if !c.HostConfig.CgroupnsMode.IsEmpty() {
   353  			if c.HostConfig.CgroupnsMode.IsPrivate() {
   354  				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
   355  				setNamespace(s, nsCgroup)
   356  			}
   357  		}
   358  
   359  		return nil
   360  	}
   361  }
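
// For illustration (the PID value is hypothetical): joining another container's
// namespaces is expressed by pointing the namespace at a /proc path. For
// `--network container:<id>` the spec above ends up with
//
//	specs.LinuxNamespace{Type: "network", Path: "/proc/4321/ns/net"}
//
// and, when user namespaces are enabled, a matching "user" namespace entry for
// the same PID, since a namespace can only be shared between containers that
// also share the owning user namespace.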
   362  
   363  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   364  	var ids []specs.LinuxIDMapping
   365  	for _, item := range s {
   366  		ids = append(ids, specs.LinuxIDMapping{
   367  			HostID:      uint32(item.HostID),
   368  			ContainerID: uint32(item.ContainerID),
   369  			Size:        uint32(item.Size),
   370  		})
   371  	}
   372  	return ids
   373  }
   374  
    375  // Get the source mount point of the directory passed in as an argument. Also
    376  // return its optional fields.
   377  func getSourceMount(source string) (string, string, error) {
   378  	// Ensure any symlinks are resolved.
   379  	sourcePath, err := filepath.EvalSymlinks(source)
   380  	if err != nil {
   381  		return "", "", err
   382  	}
   383  
   384  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   385  	if err != nil {
   386  		return "", "", err
   387  	}
   388  	if len(mi) < 1 {
   389  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   390  	}
   391  
   392  	// find the longest mount point
   393  	var idx, maxlen int
   394  	for i := range mi {
   395  		if len(mi[i].Mountpoint) > maxlen {
   396  			maxlen = len(mi[i].Mountpoint)
   397  			idx = i
   398  		}
   399  	}
   400  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   401  }
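
// For illustration (paths are hypothetical): for a bind source such as
// /var/lib/docker/volumes/foo/_data, ParentsFilter returns every mount that is
// a parent of the path (e.g. "/" and "/var/lib/docker"); the entry with the
// longest mountpoint, here "/var/lib/docker", is the closest enclosing mount,
// and its optional fields (e.g. "shared:1") describe its propagation.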
   402  
   403  const (
   404  	sharedPropagationOption = "shared:"
   405  	slavePropagationOption  = "master:"
   406  )
   407  
    408  // hasMountInfoOption checks if any of the given option values are set in the
    409  // passed in option string.
   410  func hasMountInfoOption(opts string, vals ...string) bool {
   411  	for _, opt := range strings.Split(opts, " ") {
   412  		for _, val := range vals {
   413  			if strings.HasPrefix(opt, val) {
   414  				return true
   415  			}
   416  		}
   417  	}
   418  	return false
   419  }
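
// For illustration: the optional-fields string from mountinfo looks like
// "shared:1" or "master:2 shared:3", so
//
//	hasMountInfoOption("shared:1", sharedPropagationOption)                         // true
//	hasMountInfoOption("master:2", sharedPropagationOption)                         // false
//	hasMountInfoOption("master:2", sharedPropagationOption, slavePropagationOption) // true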
   420  
    421  // Ensure that the mount point on which path is mounted is shared.
   422  func ensureShared(path string) error {
   423  	sourceMount, optionalOpts, err := getSourceMount(path)
   424  	if err != nil {
   425  		return err
   426  	}
   427  	// Make sure source mount point is shared.
   428  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   429  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   430  	}
   431  	return nil
   432  }
   433  
    434  // Ensure that the mount point on which path is mounted is either shared or slave.
   435  func ensureSharedOrSlave(path string) error {
   436  	sourceMount, optionalOpts, err := getSourceMount(path)
   437  	if err != nil {
   438  		return err
   439  	}
   440  
   441  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   442  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   443  	}
   444  	return nil
   445  }
   446  
   447  // Get the set of mount flags that are set on the mount that contains the given
   448  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   449  // bind-mounting "with options" will not fail with user namespaces, due to
   450  // kernel restrictions that require user namespace mounts to preserve
   451  // CL_UNPRIVILEGED locked flags.
   452  func getUnprivilegedMountFlags(path string) ([]string, error) {
   453  	var statfs unix.Statfs_t
   454  	if err := unix.Statfs(path, &statfs); err != nil {
   455  		return nil, err
   456  	}
   457  
   458  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   459  	unprivilegedFlags := map[uint64]string{
   460  		unix.MS_RDONLY:     "ro",
   461  		unix.MS_NODEV:      "nodev",
   462  		unix.MS_NOEXEC:     "noexec",
   463  		unix.MS_NOSUID:     "nosuid",
   464  		unix.MS_NOATIME:    "noatime",
   465  		unix.MS_RELATIME:   "relatime",
   466  		unix.MS_NODIRATIME: "nodiratime",
   467  	}
   468  
   469  	var flags []string
   470  	for mask, flag := range unprivilegedFlags {
   471  		if uint64(statfs.Flags)&mask == mask {
   472  			flags = append(flags, flag)
   473  		}
   474  	}
   475  
   476  	return flags, nil
   477  }
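
// For illustration (flags are hypothetical): if the filesystem backing the bind
// source is mounted nosuid,nodev,noexec, this returns a slice such as
//
//	[]string{"nosuid", "nodev", "noexec"}
//
// (order is not guaranteed, since it comes from map iteration). WithMounts
// appends these to the bind options so that runc's remount inside the user
// namespace does not try to clear flags the kernel has locked.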
   478  
   479  var (
   480  	mountPropagationMap = map[string]int{
   481  		"private":  mount.PRIVATE,
   482  		"rprivate": mount.RPRIVATE,
   483  		"shared":   mount.SHARED,
   484  		"rshared":  mount.RSHARED,
   485  		"slave":    mount.SLAVE,
   486  		"rslave":   mount.RSLAVE,
   487  	}
   488  
   489  	mountPropagationReverseMap = map[int]string{
   490  		mount.PRIVATE:  "private",
   491  		mount.RPRIVATE: "rprivate",
   492  		mount.SHARED:   "shared",
   493  		mount.RSHARED:  "rshared",
   494  		mount.SLAVE:    "slave",
   495  		mount.RSLAVE:   "rslave",
   496  	}
   497  )
   498  
    499  // inSlice tests whether a string is contained in a slice of strings.
    500  // The comparison is case-sensitive.
   501  func inSlice(slice []string, s string) bool {
   502  	for _, ss := range slice {
   503  		if s == ss {
   504  			return true
   505  		}
   506  	}
   507  	return false
   508  }
   509  
   510  // WithMounts sets the container's mounts
   511  func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
   512  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
   513  		if err := daemon.setupContainerMountsRoot(c); err != nil {
   514  			return err
   515  		}
   516  
   517  		if err := daemon.setupIpcDirs(c); err != nil {
   518  			return err
   519  		}
   520  
   521  		defer func() {
   522  			if err != nil {
   523  				daemon.cleanupSecretDir(c)
   524  			}
   525  		}()
   526  
   527  		if err := daemon.setupSecretDir(c); err != nil {
   528  			return err
   529  		}
   530  
   531  		ms, err := daemon.setupMounts(c)
   532  		if err != nil {
   533  			return err
   534  		}
   535  
   536  		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
   537  			ms = append(ms, c.IpcMounts()...)
   538  		}
   539  
   540  		tmpfsMounts, err := c.TmpfsMounts()
   541  		if err != nil {
   542  			return err
   543  		}
   544  		ms = append(ms, tmpfsMounts...)
   545  
   546  		secretMounts, err := c.SecretMounts()
   547  		if err != nil {
   548  			return err
   549  		}
   550  		ms = append(ms, secretMounts...)
   551  
   552  		sort.Sort(mounts(ms))
   553  
   554  		mounts := ms
   555  
   556  		userMounts := make(map[string]struct{})
   557  		for _, m := range mounts {
   558  			userMounts[m.Destination] = struct{}{}
   559  		}
   560  
   561  		// Copy all mounts from spec to defaultMounts, except for
   562  		//  - mounts overridden by a user supplied mount;
   563  		//  - all mounts under /dev if a user supplied /dev is present;
   564  		//  - /dev/shm, in case IpcMode is none.
   565  		// While at it, also
   566  		//  - set size for /dev/shm from shmsize.
   567  		defaultMounts := s.Mounts[:0]
   568  		_, mountDev := userMounts["/dev"]
   569  		for _, m := range s.Mounts {
   570  			if _, ok := userMounts[m.Destination]; ok {
   571  				// filter out mount overridden by a user supplied mount
   572  				continue
   573  			}
   574  			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
   575  				// filter out everything under /dev if /dev is user-mounted
   576  				continue
   577  			}
   578  
   579  			if m.Destination == "/dev/shm" {
   580  				if c.HostConfig.IpcMode.IsNone() {
   581  					// filter out /dev/shm for "none" IpcMode
   582  					continue
   583  				}
   584  				// set size for /dev/shm mount from spec
   585  				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
   586  				m.Options = append(m.Options, sizeOpt)
   587  			}
   588  
   589  			defaultMounts = append(defaultMounts, m)
   590  		}
   591  
   592  		s.Mounts = defaultMounts
   593  		for _, m := range mounts {
   594  			if m.Source == "tmpfs" {
   595  				data := m.Data
   596  				parser := volumemounts.NewParser()
   597  				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
   598  				if data != "" {
   599  					options = append(options, strings.Split(data, ",")...)
   600  				}
   601  
   602  				merged, err := mount.MergeTmpfsOptions(options)
   603  				if err != nil {
   604  					return err
   605  				}
   606  
   607  				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
   608  				continue
   609  			}
   610  
   611  			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
   612  
   613  			// Determine property of RootPropagation based on volume
   614  			// properties. If a volume is shared, then keep root propagation
   615  			// shared. This should work for slave and private volumes too.
   616  			//
   617  			// For slave volumes, it can be either [r]shared/[r]slave.
   618  			//
   619  			// For private volumes any root propagation value should work.
   620  			pFlag := mountPropagationMap[m.Propagation]
   621  			switch pFlag {
   622  			case mount.SHARED, mount.RSHARED:
   623  				if err := ensureShared(m.Source); err != nil {
   624  					return err
   625  				}
   626  				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   627  				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
   628  					if s.Linux == nil {
   629  						s.Linux = &specs.Linux{}
   630  					}
   631  					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
   632  				}
   633  			case mount.SLAVE, mount.RSLAVE:
   634  				var fallback bool
   635  				if err := ensureSharedOrSlave(m.Source); err != nil {
   636  					// For backwards compatibility purposes, treat mounts from the daemon root
   637  					// as special since we automatically add rslave propagation to these mounts
   638  					// when the user did not set anything, so we should fallback to the old
   639  					// behavior which is to use private propagation which is normally the
   640  					// default.
   641  					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
   642  						return err
   643  					}
   644  
   645  					cm, ok := c.MountPoints[m.Destination]
   646  					if !ok {
   647  						return err
   648  					}
   649  					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
   650  						// This means the user explicitly set a propagation, do not fallback in that case.
   651  						return err
   652  					}
   653  					fallback = true
   654  					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
   655  				}
   656  				if !fallback {
   657  					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   658  					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
   659  						if s.Linux == nil {
   660  							s.Linux = &specs.Linux{}
   661  						}
   662  						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
   663  					}
   664  				}
   665  			}
   666  
   667  			bindMode := "rbind"
   668  			if m.NonRecursive {
   669  				bindMode = "bind"
   670  			}
   671  			opts := []string{bindMode}
   672  			if !m.Writable {
   673  				opts = append(opts, "ro")
   674  			}
   675  			if pFlag != 0 {
   676  				opts = append(opts, mountPropagationReverseMap[pFlag])
   677  			}
   678  
   679  			// If we are using user namespaces, then we must make sure that we
   680  			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
   681  			// "mount" when we bind-mount. The reason for this is that at the point
   682  			// when runc sets up the root filesystem, it is already inside a user
   683  			// namespace, and thus cannot change any flags that are locked.
   684  			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
   685  				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
   686  				if err != nil {
   687  					return err
   688  				}
   689  				opts = append(opts, unprivOpts...)
   690  			}
   691  
   692  			mt.Options = opts
   693  			s.Mounts = append(s.Mounts, mt)
   694  		}
   695  
   696  		if s.Root.Readonly {
   697  			for i, m := range s.Mounts {
   698  				switch m.Destination {
   699  				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
   700  					continue
   701  				}
   702  				if _, ok := userMounts[m.Destination]; !ok {
   703  					if !inSlice(m.Options, "ro") {
   704  						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
   705  					}
   706  				}
   707  			}
   708  		}
   709  
   710  		if c.HostConfig.Privileged {
   711  			// clear readonly for /sys
   712  			for i := range s.Mounts {
   713  				if s.Mounts[i].Destination == "/sys" {
   714  					clearReadOnly(&s.Mounts[i])
   715  				}
   716  			}
   717  			if s.Linux != nil {
   718  				s.Linux.ReadonlyPaths = nil
   719  				s.Linux.MaskedPaths = nil
   720  			}
   721  		}
   722  
   723  		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
   724  		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
   725  		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
   726  			for i, m := range s.Mounts {
   727  				if m.Type == "cgroup" {
   728  					clearReadOnly(&s.Mounts[i])
   729  				}
   730  			}
   731  		}
   732  
   733  		return nil
   734  	}
   735  }
   736  
   737  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   738  // exist, so do not add the default ones if running on an old kernel.
   739  func sysctlExists(s string) bool {
   740  	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
   741  	_, err := os.Stat(f)
   742  	return err == nil
   743  }
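
// For illustration: sysctl names map onto /proc/sys paths by replacing dots
// with slashes, e.g.
//
//	sysctlExists("net.ipv4.ping_group_range") // checks /proc/sys/net/ipv4/ping_group_range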
   744  
   745  // WithCommonOptions sets common docker options
   746  func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
   747  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   748  		if c.BaseFS == "" {
   749  			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
   750  		}
   751  		linkedEnv, err := daemon.setupLinkedContainers(c)
   752  		if err != nil {
   753  			return err
   754  		}
   755  		s.Root = &specs.Root{
   756  			Path:     c.BaseFS,
   757  			Readonly: c.HostConfig.ReadonlyRootfs,
   758  		}
   759  		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
   760  			return err
   761  		}
   762  		cwd := c.Config.WorkingDir
   763  		if len(cwd) == 0 {
   764  			cwd = "/"
   765  		}
   766  		if s.Process == nil {
   767  			s.Process = &specs.Process{}
   768  		}
   769  		s.Process.Args = append([]string{c.Path}, c.Args...)
   770  
   771  		// only add the custom init if it is specified and the container is running in its
   772  		// own private pid namespace.  It does not make sense to add if it is running in the
   773  		// host namespace or another container's pid namespace where we already have an init
   774  		if c.HostConfig.PidMode.IsPrivate() {
   775  			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
   776  				(c.HostConfig.Init == nil && daemon.configStore.Init) {
   777  				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
   778  				path, err := daemon.configStore.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
   779  				if err != nil {
   780  					return err
   781  				}
   782  				s.Mounts = append(s.Mounts, specs.Mount{
   783  					Destination: inContainerInitPath,
   784  					Type:        "bind",
   785  					Source:      path,
   786  					Options:     []string{"bind", "ro"},
   787  				})
   788  			}
   789  		}
   790  		s.Process.Cwd = cwd
   791  		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
   792  		s.Process.Terminal = c.Config.Tty
   793  
   794  		s.Hostname = c.Config.Hostname
   795  		setLinuxDomainname(c, s)
   796  
   797  		// Add default sysctls that are generally safe and useful; currently we
   798  		// grant the capabilities to allow these anyway. You can override if
   799  		// you want to restore the original behaviour.
   800  		// We do not set network sysctls if network namespace is host, or if we are
   801  		// joining an existing namespace, only if we create a new net namespace.
   802  		if c.HostConfig.NetworkMode.IsPrivate() {
   803  			// We cannot set up ping socket support in a user namespace
   804  			userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
   805  			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
   806  				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
   807  				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
   808  			}
   809  			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
   810  			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
   811  				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
   812  			}
   813  		}
   814  
   815  		return nil
   816  	}
   817  }
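
// For illustration (the entrypoint and resolved init path are hypothetical):
// when an init process is requested for a container with a private PID
// namespace, the process args and mounts above come out roughly as
//
//	s.Process.Args = []string{"/sbin/docker-init", "--", "nginx", "-g", "daemon off;"}
//	s.Mounts = append(s.Mounts, specs.Mount{
//		Destination: "/sbin/docker-init",
//		Type:        "bind",
//		Source:      "/usr/libexec/docker/docker-init", // resolved by LookupInitPath
//		Options:     []string{"bind", "ro"},
//	})
//
// i.e. the real entrypoint is wrapped by docker-init, which is bind-mounted
// read-only into the container.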
   818  
   819  // WithCgroups sets the container's cgroups
   820  func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
   821  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   822  		var cgroupsPath string
   823  		scopePrefix := "docker"
   824  		parent := "/docker"
   825  		useSystemd := UsingSystemd(daemon.configStore)
   826  		if useSystemd {
   827  			parent = "system.slice"
   828  			if daemon.configStore.Rootless {
   829  				parent = "user.slice"
   830  			}
   831  		}
   832  
   833  		if c.HostConfig.CgroupParent != "" {
   834  			parent = c.HostConfig.CgroupParent
   835  		} else if daemon.configStore.CgroupParent != "" {
   836  			parent = daemon.configStore.CgroupParent
   837  		}
   838  
   839  		if useSystemd {
   840  			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
   841  			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
   842  		} else {
   843  			cgroupsPath = filepath.Join(parent, c.ID)
   844  		}
   845  		if s.Linux == nil {
   846  			s.Linux = &specs.Linux{}
   847  		}
   848  		s.Linux.CgroupsPath = cgroupsPath
   849  
   850  		// the rest is only needed for CPU RT controller
   851  
   852  		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
   853  			return nil
   854  		}
   855  
   856  		p := cgroupsPath
   857  		if useSystemd {
   858  			initPath, err := cgroups.GetInitCgroup("cpu")
   859  			if err != nil {
   860  				return errors.Wrap(err, "unable to init CPU RT controller")
   861  			}
   862  			_, err = cgroups.GetOwnCgroup("cpu")
   863  			if err != nil {
   864  				return errors.Wrap(err, "unable to init CPU RT controller")
   865  			}
   866  			p = filepath.Join(initPath, s.Linux.CgroupsPath)
   867  		}
   868  
   869  		// Clean path to guard against things like ../../../BAD
   870  		parentPath := filepath.Dir(p)
   871  		if !filepath.IsAbs(parentPath) {
   872  			parentPath = filepath.Clean("/" + parentPath)
   873  		}
   874  
   875  		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
   876  		if err != nil {
   877  			return errors.Wrap(err, "unable to init CPU RT controller")
   878  		}
   879  		// When docker is run inside docker, the root is based on the host cgroup.
   880  		// Should this be handled in runc/libcontainer/cgroups?
   881  		if strings.HasPrefix(root, "/docker/") {
   882  			root = "/"
   883  		}
   884  		mnt = filepath.Join(mnt, root)
   885  
   886  		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
   887  			return errors.Wrap(err, "unable to init CPU RT controller")
   888  		}
   889  		return nil
   890  	}
   891  }
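
// For illustration (the container ID is a placeholder): the resulting cgroups
// path depends on the cgroup driver:
//
//	cgroupfs: s.Linux.CgroupsPath = "/docker/<container-id>"
//	systemd:  s.Linux.CgroupsPath = "system.slice:docker:<container-id>"
//	          (or "user.slice:docker:<container-id>" when the daemon is rootless)
//
// A cgroup parent set on the container or on the daemon replaces the default
// parent in either form.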
   892  
   893  // WithDevices sets the container's devices
   894  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   895  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   896  		// Build lists of devices allowed and created within the container.
   897  		var devs []specs.LinuxDevice
   898  		devPermissions := s.Linux.Resources.Devices
   899  
   900  		if c.HostConfig.Privileged {
   901  			hostDevices, err := coci.HostDevices()
   902  			if err != nil {
   903  				return err
   904  			}
   905  			devs = append(devs, hostDevices...)
   906  
   907  			// adding device mappings in privileged containers
   908  			for _, deviceMapping := range c.HostConfig.Devices {
   909  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   910  				if deviceMapping.CgroupPermissions != "rwm" {
   911  					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   912  				}
   913  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   914  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   915  					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   916  					continue
   917  				}
   918  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   919  				if err != nil {
   920  					return err
   921  				}
   922  				devs = append(devs, d...)
   923  			}
   924  
   925  			devPermissions = []specs.LinuxDeviceCgroup{
   926  				{
   927  					Allow:  true,
   928  					Access: "rwm",
   929  				},
   930  			}
   931  		} else {
   932  			for _, deviceMapping := range c.HostConfig.Devices {
   933  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   934  				if err != nil {
   935  					return err
   936  				}
   937  				devs = append(devs, d...)
   938  				devPermissions = append(devPermissions, dPermissions...)
   939  			}
   940  
   941  			var err error
   942  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   943  			if err != nil {
   944  				return err
   945  			}
   946  		}
   947  
   948  		if s.Linux == nil {
   949  			s.Linux = &specs.Linux{}
   950  		}
   951  		if s.Linux.Resources == nil {
   952  			s.Linux.Resources = &specs.LinuxResources{}
   953  		}
   954  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   955  		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
   956  
   957  		for _, req := range c.HostConfig.DeviceRequests {
   958  			if err := daemon.handleDevice(req, s); err != nil {
   959  				return err
   960  			}
   961  		}
   962  		return nil
   963  	}
   964  }
   965  
   966  // WithResources applies the container resources
   967  func WithResources(c *container.Container) coci.SpecOpts {
   968  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   969  		r := c.HostConfig.Resources
   970  		weightDevices, err := getBlkioWeightDevices(r)
   971  		if err != nil {
   972  			return err
   973  		}
   974  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   975  		if err != nil {
   976  			return err
   977  		}
   978  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   979  		if err != nil {
   980  			return err
   981  		}
   982  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   983  		if err != nil {
   984  			return err
   985  		}
   986  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   987  		if err != nil {
   988  			return err
   989  		}
   990  
   991  		memoryRes := getMemoryResources(r)
   992  		cpuRes, err := getCPUResources(r)
   993  		if err != nil {
   994  			return err
   995  		}
   996  
   997  		if s.Linux == nil {
   998  			s.Linux = &specs.Linux{}
   999  		}
  1000  		if s.Linux.Resources == nil {
  1001  			s.Linux.Resources = &specs.LinuxResources{}
  1002  		}
  1003  		s.Linux.Resources.Memory = memoryRes
  1004  		s.Linux.Resources.CPU = cpuRes
  1005  		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
  1006  			WeightDevice:            weightDevices,
  1007  			ThrottleReadBpsDevice:   readBpsDevice,
  1008  			ThrottleWriteBpsDevice:  writeBpsDevice,
  1009  			ThrottleReadIOPSDevice:  readIOpsDevice,
  1010  			ThrottleWriteIOPSDevice: writeIOpsDevice,
  1011  		}
  1012  		if r.BlkioWeight != 0 {
  1013  			w := r.BlkioWeight
  1014  			s.Linux.Resources.BlockIO.Weight = &w
  1015  		}
  1016  		s.Linux.Resources.Pids = getPidsLimit(r)
  1017  
  1018  		return nil
  1019  	}
  1020  }
  1021  
  1022  // WithSysctls sets the container's sysctls
  1023  func WithSysctls(c *container.Container) coci.SpecOpts {
  1024  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1025  		if len(c.HostConfig.Sysctls) == 0 {
  1026  			return nil
  1027  		}
  1028  		if s.Linux == nil {
  1029  			s.Linux = &specs.Linux{}
  1030  		}
  1031  		if s.Linux.Sysctl == nil {
  1032  			s.Linux.Sysctl = make(map[string]string)
  1033  		}
  1034  		// We merge the sysctls injected above with the HostConfig (latter takes
  1035  		// precedence for backwards-compatibility reasons).
  1036  		for k, v := range c.HostConfig.Sysctls {
  1037  			s.Linux.Sysctl[k] = v
  1038  		}
  1039  		return nil
  1040  	}
  1041  }
  1042  
  1043  // WithUser sets the container's user
  1044  func WithUser(c *container.Container) coci.SpecOpts {
  1045  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1046  		if s.Process == nil {
  1047  			s.Process = &specs.Process{}
  1048  		}
  1049  		var err error
  1050  		s.Process.User, err = getUser(c, c.Config.User)
  1051  		return err
  1052  	}
  1053  }
  1054  
  1055  func (daemon *Daemon) createSpec(ctx context.Context, c *container.Container) (retSpec *specs.Spec, err error) {
  1056  	var (
  1057  		opts []coci.SpecOpts
  1058  		s    = oci.DefaultSpec()
  1059  	)
  1060  	opts = append(opts,
  1061  		WithCommonOptions(daemon, c),
  1062  		WithCgroups(daemon, c),
  1063  		WithResources(c),
  1064  		WithSysctls(c),
  1065  		WithDevices(daemon, c),
  1066  		WithRlimits(daemon, c),
  1067  		WithNamespaces(daemon, c),
  1068  		WithCapabilities(c),
  1069  		WithSeccomp(daemon, c),
  1070  		WithMounts(daemon, c),
  1071  		WithLibnetwork(daemon, c),
  1072  		WithApparmor(c),
  1073  		WithSelinux(c),
  1074  		WithOOMScore(&c.HostConfig.OomScoreAdj),
  1075  		coci.WithAnnotations(c.HostConfig.Annotations),
  1076  		WithUser(c),
  1077  	)
  1078  
  1079  	if c.NoNewPrivileges {
  1080  		opts = append(opts, coci.WithNoNewPrivileges)
  1081  	}
  1082  	if c.Config.Tty {
  1083  		opts = append(opts, WithConsoleSize(c))
  1084  	}
  1085  	// Set the masked and readonly paths with regard to the host config options if they are set.
  1086  	if c.HostConfig.MaskedPaths != nil {
  1087  		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1088  	}
  1089  	if c.HostConfig.ReadonlyPaths != nil {
  1090  		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1091  	}
  1092  	if daemon.configStore.Rootless {
  1093  		opts = append(opts, WithRootless(daemon))
  1094  	}
  1095  
  1096  	var snapshotter, snapshotKey string
  1097  	if daemon.UsesSnapshotter() {
  1098  		snapshotter = daemon.imageService.StorageDriver()
  1099  		snapshotKey = c.ID
  1100  	}
  1101  
  1102  	return &s, coci.ApplyOpts(ctx, daemon.containerdCli, &containers.Container{
  1103  		ID:          c.ID,
  1104  		Snapshotter: snapshotter,
  1105  		SnapshotKey: snapshotKey,
  1106  	}, &s, opts...)
  1107  }
  1108  
  1109  func clearReadOnly(m *specs.Mount) {
  1110  	var opt []string
  1111  	for _, o := range m.Options {
  1112  		if o != "ro" {
  1113  			opt = append(opt, o)
  1114  		}
  1115  	}
  1116  	m.Options = opt
  1117  }
  1118  
   1119  // mergeUlimits merges the Ulimits from HostConfig with the daemon defaults, and updates HostConfig
  1120  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  1121  	ulimits := c.Ulimits
  1122  	// Merge ulimits with daemon defaults
  1123  	ulIdx := make(map[string]struct{})
  1124  	for _, ul := range ulimits {
  1125  		ulIdx[ul.Name] = struct{}{}
  1126  	}
  1127  	for name, ul := range daemon.configStore.Ulimits {
  1128  		if _, exists := ulIdx[name]; !exists {
  1129  			ulimits = append(ulimits, ul)
  1130  		}
  1131  	}
  1132  	c.Ulimits = ulimits
  1133  }
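
// For illustration (values are hypothetical): with a daemon started with
// --default-ulimit nofile=1024:2048 and a container created with
// --ulimit nofile=4096:8192 --ulimit nproc=64:64, the merged HostConfig keeps
// nofile=4096:8192 (the container wins) and nproc=64:64; only limits whose
// names are absent from the container are taken from the daemon defaults.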