github.com/rish1988/moby@v25.0.2+incompatible/daemon/oci_linux.go

github.com/rish1988/moby@v25.0.2+incompatible/daemon/oci_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"path/filepath"
     8  	"sort"
     9  	"strconv"
    10  	"strings"
    11  
    12  	cdcgroups "github.com/containerd/cgroups/v3"
    13  	"github.com/containerd/containerd/containers"
    14  	coci "github.com/containerd/containerd/oci"
    15  	"github.com/containerd/containerd/pkg/apparmor"
    16  	"github.com/containerd/containerd/pkg/userns"
    17  	"github.com/containerd/log"
    18  	containertypes "github.com/docker/docker/api/types/container"
    19  	"github.com/docker/docker/container"
    20  	dconfig "github.com/docker/docker/daemon/config"
    21  	"github.com/docker/docker/errdefs"
    22  	"github.com/docker/docker/oci"
    23  	"github.com/docker/docker/oci/caps"
    24  	"github.com/docker/docker/pkg/idtools"
    25  	"github.com/docker/docker/pkg/rootless/specconv"
    26  	"github.com/docker/docker/pkg/stringid"
    27  	volumemounts "github.com/docker/docker/volume/mounts"
    28  	"github.com/moby/sys/mount"
    29  	"github.com/moby/sys/mountinfo"
    30  	"github.com/moby/sys/user"
    31  	"github.com/opencontainers/runc/libcontainer/cgroups"
    32  	specs "github.com/opencontainers/runtime-spec/specs-go"
    33  	"github.com/pkg/errors"
    34  	"golang.org/x/sys/unix"
    35  )
    36  
// inContainerInitPath is the in-container destination at which the init
// binary is bind-mounted (and from which it is executed) when a container is
// started with an init process.
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
    38  
    39  // withRlimits sets the container's rlimits along with merging the daemon's rlimits
    40  func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
    41  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    42  		var rlimits []specs.POSIXRlimit
    43  
    44  		// We want to leave the original HostConfig alone so make a copy here
    45  		hostConfig := *c.HostConfig
    46  		// Merge with the daemon defaults
    47  		daemon.mergeUlimits(&hostConfig, daemonCfg)
    48  		for _, ul := range hostConfig.Ulimits {
    49  			rlimits = append(rlimits, specs.POSIXRlimit{
    50  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    51  				Soft: uint64(ul.Soft),
    52  				Hard: uint64(ul.Hard),
    53  			})
    54  		}
    55  
    56  		if s.Process == nil {
    57  			s.Process = &specs.Process{}
    58  		}
    59  		s.Process.Rlimits = rlimits
    60  		return nil
    61  	}
    62  }
    63  
    64  // withLibnetwork sets the libnetwork hook
    65  func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
    66  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    67  		if c.Config.NetworkDisabled {
    68  			return nil
    69  		}
    70  		for _, ns := range s.Linux.Namespaces {
    71  			if ns.Type == specs.NetworkNamespace && ns.Path == "" {
    72  				if s.Hooks == nil {
    73  					s.Hooks = &specs.Hooks{}
    74  				}
    75  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    76  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    77  					Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"),
    78  					Args: []string{"libnetwork-setkey", "-exec-root=" + daemonCfg.GetExecRoot(), c.ID, shortNetCtlrID},
    79  				})
    80  			}
    81  		}
    82  		return nil
    83  	}
    84  }
    85  
    86  // withRootless sets the spec to the rootless configuration
    87  func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
    88  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    89  		var v2Controllers []string
    90  		if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
    91  			if cdcgroups.Mode() != cdcgroups.Unified {
    92  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    93  			}
    94  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    95  			if rootlesskitParentEUID == "" {
    96  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    97  			}
    98  			euid, err := strconv.Atoi(rootlesskitParentEUID)
    99  			if err != nil {
   100  				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
   101  			}
   102  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
   103  			controllersFile, err := os.ReadFile(controllersPath)
   104  			if err != nil {
   105  				return err
   106  			}
   107  			v2Controllers = strings.Fields(string(controllersFile))
   108  		}
   109  		return specconv.ToRootless(s, v2Controllers)
   110  	}
   111  }
   112  
// withRootfulInRootless is used for "rootful-in-rootless" dind;
// the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
//
// The returned SpecOpts delegates to specconv.ToRootfulInRootless and never
// returns an error. Neither daemon nor daemonCfg is consulted.
func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		specconv.ToRootfulInRootless(s)
		return nil
	}
}
   121  
// WithOOMScore returns a SpecOpts that sets the OOM score adjustment of the
// container process to score. The pointer is stored as-is (a nil score leaves
// the field nil); the Process section is created if the spec has none.
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.OOMScoreAdj = score
		return nil
	}
}
   132  
   133  // WithSelinux sets the selinux labels
   134  func WithSelinux(c *container.Container) coci.SpecOpts {
   135  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   136  		if s.Process == nil {
   137  			s.Process = &specs.Process{}
   138  		}
   139  		if s.Linux == nil {
   140  			s.Linux = &specs.Linux{}
   141  		}
   142  		s.Process.SelinuxLabel = c.GetProcessLabel()
   143  		s.Linux.MountLabel = c.MountLabel
   144  		return nil
   145  	}
   146  }
   147  
   148  // WithApparmor sets the apparmor profile
   149  func WithApparmor(c *container.Container) coci.SpecOpts {
   150  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   151  		if apparmor.HostSupports() {
   152  			var appArmorProfile string
   153  			if c.AppArmorProfile != "" {
   154  				appArmorProfile = c.AppArmorProfile
   155  			} else if c.HostConfig.Privileged {
   156  				appArmorProfile = unconfinedAppArmorProfile
   157  			} else {
   158  				appArmorProfile = defaultAppArmorProfile
   159  			}
   160  
   161  			if appArmorProfile == defaultAppArmorProfile {
   162  				// Unattended upgrades and other fun services can unload AppArmor
   163  				// profiles inadvertently. Since we cannot store our profile in
   164  				// /etc/apparmor.d, nor can we practically add other ways of
   165  				// telling the system to keep our profile loaded, in order to make
   166  				// sure that we keep the default profile enabled we dynamically
   167  				// reload it if necessary.
   168  				if err := ensureDefaultAppArmorProfile(); err != nil {
   169  					return err
   170  				}
   171  			}
   172  			if s.Process == nil {
   173  				s.Process = &specs.Process{}
   174  			}
   175  			s.Process.ApparmorProfile = appArmorProfile
   176  		}
   177  		return nil
   178  	}
   179  }
   180  
// WithCapabilities sets the container's capabilities.
//
// The set is computed by caps.TweakCapabilities from the default
// capabilities together with the container's CapAdd, CapDrop and Privileged
// settings, and is then applied to the spec with oci.SetCapabilities. An
// error from either step is returned unchanged.
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}
   196  
// resourcePath obtains a path from getPath and resolves it through
// c.GetResourcePath (presumably scoping it to the container's filesystem;
// confirm against GetResourcePath). An error from either step is returned.
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}
   204  
   205  func getUser(c *container.Container, username string) (specs.User, error) {
   206  	var usr specs.User
   207  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   208  	if err != nil {
   209  		return usr, err
   210  	}
   211  	groupPath, err := resourcePath(c, user.GetGroupPath)
   212  	if err != nil {
   213  		return usr, err
   214  	}
   215  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   216  	if err != nil {
   217  		return usr, err
   218  	}
   219  	usr.UID = uint32(execUser.Uid)
   220  	usr.GID = uint32(execUser.Gid)
   221  	usr.AdditionalGids = []uint32{usr.GID}
   222  
   223  	var addGroups []int
   224  	if len(c.HostConfig.GroupAdd) > 0 {
   225  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   226  		if err != nil {
   227  			return usr, err
   228  		}
   229  	}
   230  	for _, g := range append(execUser.Sgids, addGroups...) {
   231  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   232  	}
   233  	return usr, nil
   234  }
   235  
   236  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   237  	if s.Linux == nil {
   238  		s.Linux = &specs.Linux{}
   239  	}
   240  
   241  	for i, n := range s.Linux.Namespaces {
   242  		if n.Type == ns.Type {
   243  			s.Linux.Namespaces[i] = ns
   244  			return
   245  		}
   246  	}
   247  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   248  }
   249  
// WithNamespaces sets the container's namespaces
//
// The returned SpecOpts configures, in order, the user, network, IPC, PID,
// UTS and cgroup namespaces from the container's HostConfig:
//   - "container" modes join the namespace of the referenced container via
//     /proc/<pid>/ns/<type>;
//   - "host" modes remove the namespace from the spec;
//   - otherwise a new (path-less) namespace is requested.
//
// Invalid IPC, PID, UTS or cgroup namespace modes are reported as
// InvalidParameter errors.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// userNS records whether a user namespace with ID mappings was set up;
		// it decides below whether joining another container's namespace also
		// joins that container's user namespace.
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
				userNS = true
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
				})
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			networkMode := c.HostConfig.NetworkMode
			switch {
			case networkMode.IsContainer():
				nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
				})
				if userNS {
					// to share a net namespace, the containers must also share a user namespace.
					//
					// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
					setNamespace(s, specs.LinuxNamespace{
						Type: specs.UserNamespace,
						Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
					})
				}
			case networkMode.IsHost():
				oci.RemoveNamespace(s, specs.NetworkNamespace)
			default:
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
				})
			}
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ic, err := daemon.getIPCContainer(ipcMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join IPC namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
			})
			if userNS {
				// to share a IPC namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
				})
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.IPCNamespace)
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
			})
		}

		// pid
		pidMode := c.HostConfig.PidMode
		if !pidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
		}
		switch {
		case pidMode.IsContainer():
			pc, err := daemon.getPIDContainer(pidMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join PID namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			})
			if userNS {
				// to share a PID namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				})
			}
		case pidMode.IsHost():
			oci.RemoveNamespace(s, specs.PIDNamespace)
		default:
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
			})
		}

		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			// Sharing the host's UTS namespace: drop the namespace and clear
			// the hostname in the spec.
			oci.RemoveNamespace(s, specs.UTSNamespace)
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if c.HostConfig.CgroupnsMode.IsPrivate() {
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.CgroupNamespace,
			})
		}

		return nil
	}
}
   386  
   387  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   388  	var ids []specs.LinuxIDMapping
   389  	for _, item := range s {
   390  		ids = append(ids, specs.LinuxIDMapping{
   391  			HostID:      uint32(item.HostID),
   392  			ContainerID: uint32(item.ContainerID),
   393  			Size:        uint32(item.Size),
   394  		})
   395  	}
   396  	return ids
   397  }
   398  
   399  // Get the source mount point of directory passed in as argument. Also return
   400  // optional fields.
   401  func getSourceMount(source string) (string, string, error) {
   402  	// Ensure any symlinks are resolved.
   403  	sourcePath, err := filepath.EvalSymlinks(source)
   404  	if err != nil {
   405  		return "", "", err
   406  	}
   407  
   408  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   409  	if err != nil {
   410  		return "", "", err
   411  	}
   412  	if len(mi) < 1 {
   413  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   414  	}
   415  
   416  	// find the longest mount point
   417  	var idx, maxlen int
   418  	for i := range mi {
   419  		if len(mi[i].Mountpoint) > maxlen {
   420  			maxlen = len(mi[i].Mountpoint)
   421  			idx = i
   422  		}
   423  	}
   424  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   425  }
   426  
// Prefixes looked for in a mount's optional mountinfo fields to determine its
// propagation type.
const (
	// sharedPropagationOption is the prefix that marks a shared mount.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix that marks a slave mount.
	slavePropagationOption  = "master:"
)
   431  
   432  // hasMountInfoOption checks if any of the passed any of the given option values
   433  // are set in the passed in option string.
   434  func hasMountInfoOption(opts string, vals ...string) bool {
   435  	for _, opt := range strings.Split(opts, " ") {
   436  		for _, val := range vals {
   437  			if strings.HasPrefix(opt, val) {
   438  				return true
   439  			}
   440  		}
   441  	}
   442  	return false
   443  }
   444  
   445  // Ensure mount point on which path is mounted, is shared.
   446  func ensureShared(path string) error {
   447  	sourceMount, optionalOpts, err := getSourceMount(path)
   448  	if err != nil {
   449  		return err
   450  	}
   451  	// Make sure source mount point is shared.
   452  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   453  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   454  	}
   455  	return nil
   456  }
   457  
   458  // Ensure mount point on which path is mounted, is either shared or slave.
   459  func ensureSharedOrSlave(path string) error {
   460  	sourceMount, optionalOpts, err := getSourceMount(path)
   461  	if err != nil {
   462  		return err
   463  	}
   464  
   465  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   466  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   467  	}
   468  	return nil
   469  }
   470  
   471  // Get the set of mount flags that are set on the mount that contains the given
   472  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   473  // bind-mounting "with options" will not fail with user namespaces, due to
   474  // kernel restrictions that require user namespace mounts to preserve
   475  // CL_UNPRIVILEGED locked flags.
   476  func getUnprivilegedMountFlags(path string) ([]string, error) {
   477  	var statfs unix.Statfs_t
   478  	if err := unix.Statfs(path, &statfs); err != nil {
   479  		return nil, err
   480  	}
   481  
   482  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   483  	unprivilegedFlags := map[uint64]string{
   484  		unix.MS_RDONLY:     "ro",
   485  		unix.MS_NODEV:      "nodev",
   486  		unix.MS_NOEXEC:     "noexec",
   487  		unix.MS_NOSUID:     "nosuid",
   488  		unix.MS_NOATIME:    "noatime",
   489  		unix.MS_RELATIME:   "relatime",
   490  		unix.MS_NODIRATIME: "nodiratime",
   491  	}
   492  
   493  	var flags []string
   494  	for mask, flag := range unprivilegedFlags {
   495  		if uint64(statfs.Flags)&mask == mask {
   496  			flags = append(flags, flag)
   497  		}
   498  	}
   499  
   500  	return flags, nil
   501  }
   502  
var (
	// mountPropagationMap maps a propagation mode name to its mount flag.
	// Looking up an unknown or empty name yields 0.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap: it
	// maps a mount flag back to the propagation mode name.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   522  
   523  // inSlice tests whether a string is contained in a slice of strings or not.
   524  // Comparison is case sensitive
   525  func inSlice(slice []string, s string) bool {
   526  	for _, ss := range slice {
   527  		if s == ss {
   528  			return true
   529  		}
   530  	}
   531  	return false
   532  }
   533  
   534  // withMounts sets the container's mounts
   535  func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
   536  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
   537  		sort.Sort(mounts(ms))
   538  
   539  		mounts := ms
   540  
   541  		userMounts := make(map[string]struct{})
   542  		for _, m := range mounts {
   543  			userMounts[m.Destination] = struct{}{}
   544  		}
   545  
   546  		// Copy all mounts from spec to defaultMounts, except for
   547  		//  - mounts overridden by a user supplied mount;
   548  		//  - all mounts under /dev if a user supplied /dev is present;
   549  		//  - /dev/shm, in case IpcMode is none.
   550  		// While at it, also
   551  		//  - set size for /dev/shm from shmsize.
   552  		defaultMounts := s.Mounts[:0]
   553  		_, mountDev := userMounts["/dev"]
   554  		for _, m := range s.Mounts {
   555  			if _, ok := userMounts[m.Destination]; ok {
   556  				// filter out mount overridden by a user supplied mount
   557  				continue
   558  			}
   559  			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
   560  				// filter out everything under /dev if /dev is user-mounted
   561  				continue
   562  			}
   563  
   564  			if m.Destination == "/dev/shm" {
   565  				if c.HostConfig.IpcMode.IsNone() {
   566  					// filter out /dev/shm for "none" IpcMode
   567  					continue
   568  				}
   569  				// set size for /dev/shm mount from spec
   570  				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
   571  				m.Options = append(m.Options, sizeOpt)
   572  			}
   573  
   574  			defaultMounts = append(defaultMounts, m)
   575  		}
   576  
   577  		s.Mounts = defaultMounts
   578  		for _, m := range mounts {
   579  			if m.Source == "tmpfs" {
   580  				data := m.Data
   581  				parser := volumemounts.NewParser()
   582  				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
   583  				if data != "" {
   584  					options = append(options, strings.Split(data, ",")...)
   585  				}
   586  
   587  				merged, err := mount.MergeTmpfsOptions(options)
   588  				if err != nil {
   589  					return err
   590  				}
   591  
   592  				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
   593  				continue
   594  			}
   595  
   596  			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
   597  
   598  			// Determine property of RootPropagation based on volume
   599  			// properties. If a volume is shared, then keep root propagation
   600  			// shared. This should work for slave and private volumes too.
   601  			//
   602  			// For slave volumes, it can be either [r]shared/[r]slave.
   603  			//
   604  			// For private volumes any root propagation value should work.
   605  			pFlag := mountPropagationMap[m.Propagation]
   606  			switch pFlag {
   607  			case mount.SHARED, mount.RSHARED:
   608  				if err := ensureShared(m.Source); err != nil {
   609  					return err
   610  				}
   611  				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   612  				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
   613  					if s.Linux == nil {
   614  						s.Linux = &specs.Linux{}
   615  					}
   616  					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
   617  				}
   618  			case mount.SLAVE, mount.RSLAVE:
   619  				var fallback bool
   620  				if err := ensureSharedOrSlave(m.Source); err != nil {
   621  					// For backwards compatibility purposes, treat mounts from the daemon root
   622  					// as special since we automatically add rslave propagation to these mounts
   623  					// when the user did not set anything, so we should fallback to the old
   624  					// behavior which is to use private propagation which is normally the
   625  					// default.
   626  					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
   627  						return err
   628  					}
   629  
   630  					cm, ok := c.MountPoints[m.Destination]
   631  					if !ok {
   632  						return err
   633  					}
   634  					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
   635  						// This means the user explicitly set a propagation, do not fallback in that case.
   636  						return err
   637  					}
   638  					fallback = true
   639  					log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
   640  				}
   641  				if !fallback {
   642  					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   643  					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
   644  						if s.Linux == nil {
   645  							s.Linux = &specs.Linux{}
   646  						}
   647  						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
   648  					}
   649  				}
   650  			}
   651  
   652  			bindMode := "rbind"
   653  			if m.NonRecursive {
   654  				bindMode = "bind"
   655  			}
   656  			opts := []string{bindMode}
   657  			if !m.Writable {
   658  				rro := true
   659  				if m.ReadOnlyNonRecursive {
   660  					rro = false
   661  					if m.ReadOnlyForceRecursive {
   662  						return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
   663  					}
   664  				}
   665  				if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
   666  					rro = false
   667  					if m.ReadOnlyForceRecursive {
   668  						return rroErr
   669  					}
   670  				}
   671  				if rro {
   672  					opts = append(opts, "rro")
   673  				} else {
   674  					opts = append(opts, "ro")
   675  				}
   676  			}
   677  			if pFlag != 0 {
   678  				opts = append(opts, mountPropagationReverseMap[pFlag])
   679  			}
   680  
   681  			// If we are using user namespaces, then we must make sure that we
   682  			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
   683  			// "mount" when we bind-mount. The reason for this is that at the point
   684  			// when runc sets up the root filesystem, it is already inside a user
   685  			// namespace, and thus cannot change any flags that are locked.
   686  			if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
   687  				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
   688  				if err != nil {
   689  					return err
   690  				}
   691  				opts = append(opts, unprivOpts...)
   692  			}
   693  
   694  			mt.Options = opts
   695  			s.Mounts = append(s.Mounts, mt)
   696  		}
   697  
   698  		if s.Root.Readonly {
   699  			for i, m := range s.Mounts {
   700  				switch m.Destination {
   701  				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
   702  					continue
   703  				}
   704  				if _, ok := userMounts[m.Destination]; !ok {
   705  					if !inSlice(m.Options, "ro") {
   706  						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
   707  					}
   708  				}
   709  			}
   710  		}
   711  
   712  		if c.HostConfig.Privileged {
   713  			// clear readonly for /sys
   714  			for i := range s.Mounts {
   715  				if s.Mounts[i].Destination == "/sys" {
   716  					clearReadOnly(&s.Mounts[i])
   717  				}
   718  			}
   719  			if s.Linux != nil {
   720  				s.Linux.ReadonlyPaths = nil
   721  				s.Linux.MaskedPaths = nil
   722  			}
   723  		}
   724  
   725  		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
   726  		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
   727  		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
   728  			for i, m := range s.Mounts {
   729  				if m.Type == "cgroup" {
   730  					clearReadOnly(&s.Mounts[i])
   731  				}
   732  			}
   733  		}
   734  
   735  		return nil
   736  	}
   737  }
   738  
   739  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   740  // exist, so do not add the default ones if running on an old kernel.
   741  func sysctlExists(s string) bool {
   742  	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
   743  	_, err := os.Stat(f)
   744  	return err == nil
   745  }
   746  
   747  // withCommonOptions sets common docker options
   748  func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
   749  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   750  		if c.BaseFS == "" {
   751  			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
   752  		}
   753  		linkedEnv, err := daemon.setupLinkedContainers(c)
   754  		if err != nil {
   755  			return err
   756  		}
   757  		s.Root = &specs.Root{
   758  			Path:     c.BaseFS,
   759  			Readonly: c.HostConfig.ReadonlyRootfs,
   760  		}
   761  		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
   762  			return err
   763  		}
   764  		cwd := c.Config.WorkingDir
   765  		if len(cwd) == 0 {
   766  			cwd = "/"
   767  		}
   768  		if s.Process == nil {
   769  			s.Process = &specs.Process{}
   770  		}
   771  		s.Process.Args = append([]string{c.Path}, c.Args...)
   772  
   773  		// only add the custom init if it is specified and the container is running in its
   774  		// own private pid namespace.  It does not make sense to add if it is running in the
   775  		// host namespace or another container's pid namespace where we already have an init
   776  		if c.HostConfig.PidMode.IsPrivate() {
   777  			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
   778  				(c.HostConfig.Init == nil && daemonCfg.Init) {
   779  				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
   780  				path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
   781  				if err != nil {
   782  					return err
   783  				}
   784  				s.Mounts = append(s.Mounts, specs.Mount{
   785  					Destination: inContainerInitPath,
   786  					Type:        "bind",
   787  					Source:      path,
   788  					Options:     []string{"bind", "ro"},
   789  				})
   790  			}
   791  		}
   792  		s.Process.Cwd = cwd
   793  		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
   794  		s.Process.Terminal = c.Config.Tty
   795  
   796  		s.Hostname = c.Config.Hostname
   797  		setLinuxDomainname(c, s)
   798  
   799  		// Add default sysctls that are generally safe and useful; currently we
   800  		// grant the capabilities to allow these anyway. You can override if
   801  		// you want to restore the original behaviour.
   802  		// We do not set network sysctls if network namespace is host, or if we are
   803  		// joining an existing namespace, only if we create a new net namespace.
   804  		if c.HostConfig.NetworkMode.IsPrivate() {
   805  			// We cannot set up ping socket support in a user namespace
   806  			userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
   807  			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
   808  				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
   809  				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
   810  			}
   811  			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
   812  			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
   813  				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
   814  			}
   815  		}
   816  
   817  		return nil
   818  	}
   819  }
   820  
   821  // withCgroups sets the container's cgroups
   822  func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
   823  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   824  		var cgroupsPath string
   825  		scopePrefix := "docker"
   826  		parent := "/docker"
   827  		useSystemd := UsingSystemd(daemonCfg)
   828  		if useSystemd {
   829  			parent = "system.slice"
   830  			if daemonCfg.Rootless {
   831  				parent = "user.slice"
   832  			}
   833  		}
   834  
   835  		if c.HostConfig.CgroupParent != "" {
   836  			parent = c.HostConfig.CgroupParent
   837  		} else if daemonCfg.CgroupParent != "" {
   838  			parent = daemonCfg.CgroupParent
   839  		}
   840  
   841  		if useSystemd {
   842  			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
   843  			log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
   844  		} else {
   845  			cgroupsPath = filepath.Join(parent, c.ID)
   846  		}
   847  		if s.Linux == nil {
   848  			s.Linux = &specs.Linux{}
   849  		}
   850  		s.Linux.CgroupsPath = cgroupsPath
   851  
   852  		// the rest is only needed for CPU RT controller
   853  
   854  		if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
   855  			return nil
   856  		}
   857  
   858  		p := cgroupsPath
   859  		if useSystemd {
   860  			initPath, err := cgroups.GetInitCgroup("cpu")
   861  			if err != nil {
   862  				return errors.Wrap(err, "unable to init CPU RT controller")
   863  			}
   864  			_, err = cgroups.GetOwnCgroup("cpu")
   865  			if err != nil {
   866  				return errors.Wrap(err, "unable to init CPU RT controller")
   867  			}
   868  			p = filepath.Join(initPath, s.Linux.CgroupsPath)
   869  		}
   870  
   871  		// Clean path to guard against things like ../../../BAD
   872  		parentPath := filepath.Dir(p)
   873  		if !filepath.IsAbs(parentPath) {
   874  			parentPath = filepath.Clean("/" + parentPath)
   875  		}
   876  
   877  		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
   878  		if err != nil {
   879  			return errors.Wrap(err, "unable to init CPU RT controller")
   880  		}
   881  		// When docker is run inside docker, the root is based of the host cgroup.
   882  		// Should this be handled in runc/libcontainer/cgroups ?
   883  		if strings.HasPrefix(root, "/docker/") {
   884  			root = "/"
   885  		}
   886  		mnt = filepath.Join(mnt, root)
   887  
   888  		if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
   889  			return errors.Wrap(err, "unable to init CPU RT controller")
   890  		}
   891  		return nil
   892  	}
   893  }
   894  
   895  // WithDevices sets the container's devices
   896  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   897  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   898  		// Build lists of devices allowed and created within the container.
   899  		var devs []specs.LinuxDevice
   900  		devPermissions := s.Linux.Resources.Devices
   901  
   902  		if c.HostConfig.Privileged {
   903  			hostDevices, err := coci.HostDevices()
   904  			if err != nil {
   905  				return err
   906  			}
   907  			devs = append(devs, hostDevices...)
   908  
   909  			// adding device mappings in privileged containers
   910  			for _, deviceMapping := range c.HostConfig.Devices {
   911  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   912  				if deviceMapping.CgroupPermissions != "rwm" {
   913  					log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   914  				}
   915  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   916  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   917  					log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   918  					continue
   919  				}
   920  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   921  				if err != nil {
   922  					return err
   923  				}
   924  				devs = append(devs, d...)
   925  			}
   926  
   927  			devPermissions = []specs.LinuxDeviceCgroup{
   928  				{
   929  					Allow:  true,
   930  					Access: "rwm",
   931  				},
   932  			}
   933  		} else {
   934  			for _, deviceMapping := range c.HostConfig.Devices {
   935  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   936  				if err != nil {
   937  					return err
   938  				}
   939  				devs = append(devs, d...)
   940  				devPermissions = append(devPermissions, dPermissions...)
   941  			}
   942  
   943  			var err error
   944  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   945  			if err != nil {
   946  				return err
   947  			}
   948  		}
   949  
   950  		if s.Linux == nil {
   951  			s.Linux = &specs.Linux{}
   952  		}
   953  		if s.Linux.Resources == nil {
   954  			s.Linux.Resources = &specs.LinuxResources{}
   955  		}
   956  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   957  		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
   958  
   959  		for _, req := range c.HostConfig.DeviceRequests {
   960  			if err := daemon.handleDevice(req, s); err != nil {
   961  				return err
   962  			}
   963  		}
   964  		return nil
   965  	}
   966  }
   967  
   968  // WithResources applies the container resources
   969  func WithResources(c *container.Container) coci.SpecOpts {
   970  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   971  		r := c.HostConfig.Resources
   972  		weightDevices, err := getBlkioWeightDevices(r)
   973  		if err != nil {
   974  			return err
   975  		}
   976  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   977  		if err != nil {
   978  			return err
   979  		}
   980  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   981  		if err != nil {
   982  			return err
   983  		}
   984  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   985  		if err != nil {
   986  			return err
   987  		}
   988  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   989  		if err != nil {
   990  			return err
   991  		}
   992  
   993  		memoryRes := getMemoryResources(r)
   994  		cpuRes, err := getCPUResources(r)
   995  		if err != nil {
   996  			return err
   997  		}
   998  
   999  		if s.Linux == nil {
  1000  			s.Linux = &specs.Linux{}
  1001  		}
  1002  		if s.Linux.Resources == nil {
  1003  			s.Linux.Resources = &specs.LinuxResources{}
  1004  		}
  1005  		s.Linux.Resources.Memory = memoryRes
  1006  		s.Linux.Resources.CPU = cpuRes
  1007  		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
  1008  			WeightDevice:            weightDevices,
  1009  			ThrottleReadBpsDevice:   readBpsDevice,
  1010  			ThrottleWriteBpsDevice:  writeBpsDevice,
  1011  			ThrottleReadIOPSDevice:  readIOpsDevice,
  1012  			ThrottleWriteIOPSDevice: writeIOpsDevice,
  1013  		}
  1014  		if r.BlkioWeight != 0 {
  1015  			w := r.BlkioWeight
  1016  			s.Linux.Resources.BlockIO.Weight = &w
  1017  		}
  1018  		s.Linux.Resources.Pids = getPidsLimit(r)
  1019  
  1020  		return nil
  1021  	}
  1022  }
  1023  
  1024  // WithSysctls sets the container's sysctls
  1025  func WithSysctls(c *container.Container) coci.SpecOpts {
  1026  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1027  		if len(c.HostConfig.Sysctls) == 0 {
  1028  			return nil
  1029  		}
  1030  		if s.Linux == nil {
  1031  			s.Linux = &specs.Linux{}
  1032  		}
  1033  		if s.Linux.Sysctl == nil {
  1034  			s.Linux.Sysctl = make(map[string]string)
  1035  		}
  1036  		// We merge the sysctls injected above with the HostConfig (latter takes
  1037  		// precedence for backwards-compatibility reasons).
  1038  		for k, v := range c.HostConfig.Sysctls {
  1039  			s.Linux.Sysctl[k] = v
  1040  		}
  1041  		return nil
  1042  	}
  1043  }
  1044  
  1045  // WithUser sets the container's user
  1046  func WithUser(c *container.Container) coci.SpecOpts {
  1047  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1048  		if s.Process == nil {
  1049  			s.Process = &specs.Process{}
  1050  		}
  1051  		var err error
  1052  		s.Process.User, err = getUser(c, c.Config.User)
  1053  		return err
  1054  	}
  1055  }
  1056  
  1057  func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
  1058  	var (
  1059  		opts []coci.SpecOpts
  1060  		s    = oci.DefaultSpec()
  1061  	)
  1062  	opts = append(opts,
  1063  		withCommonOptions(daemon, &daemonCfg.Config, c),
  1064  		withCgroups(daemon, &daemonCfg.Config, c),
  1065  		WithResources(c),
  1066  		WithSysctls(c),
  1067  		WithDevices(daemon, c),
  1068  		withRlimits(daemon, &daemonCfg.Config, c),
  1069  		WithNamespaces(daemon, c),
  1070  		WithCapabilities(c),
  1071  		WithSeccomp(daemon, c),
  1072  		withMounts(daemon, daemonCfg, c, mounts),
  1073  		withLibnetwork(daemon, &daemonCfg.Config, c),
  1074  		WithApparmor(c),
  1075  		WithSelinux(c),
  1076  		WithOOMScore(&c.HostConfig.OomScoreAdj),
  1077  		coci.WithAnnotations(c.HostConfig.Annotations),
  1078  		WithUser(c),
  1079  	)
  1080  
  1081  	if c.NoNewPrivileges {
  1082  		opts = append(opts, coci.WithNoNewPrivileges)
  1083  	}
  1084  	if c.Config.Tty {
  1085  		opts = append(opts, WithConsoleSize(c))
  1086  	}
  1087  	// Set the masked and readonly paths with regard to the host config options if they are set.
  1088  	if c.HostConfig.MaskedPaths != nil {
  1089  		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1090  	}
  1091  	if c.HostConfig.ReadonlyPaths != nil {
  1092  		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1093  	}
  1094  	if daemonCfg.Rootless {
  1095  		opts = append(opts, withRootless(daemon, &daemonCfg.Config))
  1096  	} else if userns.RunningInUserNS() {
  1097  		opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
  1098  	}
  1099  
  1100  	var snapshotter, snapshotKey string
  1101  	if daemon.UsesSnapshotter() {
  1102  		snapshotter = daemon.imageService.StorageDriver()
  1103  		snapshotKey = c.ID
  1104  	}
  1105  
  1106  	return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
  1107  		ID:          c.ID,
  1108  		Snapshotter: snapshotter,
  1109  		SnapshotKey: snapshotKey,
  1110  	}, &s, opts...)
  1111  }
  1112  
  1113  func clearReadOnly(m *specs.Mount) {
  1114  	var opt []string
  1115  	for _, o := range m.Options {
  1116  		if o != "ro" {
  1117  			opt = append(opt, o)
  1118  		}
  1119  	}
  1120  	m.Options = opt
  1121  }
  1122  
  1123  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1124  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
  1125  	ulimits := c.Ulimits
  1126  	// Merge ulimits with daemon defaults
  1127  	ulIdx := make(map[string]struct{})
  1128  	for _, ul := range ulimits {
  1129  		ulIdx[ul.Name] = struct{}{}
  1130  	}
  1131  	for name, ul := range daemonCfg.Ulimits {
  1132  		if _, exists := ulIdx[name]; !exists {
  1133  			ulimits = append(ulimits, ul)
  1134  		}
  1135  	}
  1136  	c.Ulimits = ulimits
  1137  }