github.com/moby/docker@v26.1.3+incompatible/daemon/oci_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"path/filepath"
     8  	"sort"
     9  	"strconv"
    10  	"strings"
    11  
    12  	cdcgroups "github.com/containerd/cgroups/v3"
    13  	"github.com/containerd/containerd/containers"
    14  	coci "github.com/containerd/containerd/oci"
    15  	"github.com/containerd/containerd/pkg/apparmor"
    16  	"github.com/containerd/containerd/pkg/userns"
    17  	"github.com/containerd/log"
    18  	containertypes "github.com/docker/docker/api/types/container"
    19  	"github.com/docker/docker/container"
    20  	dconfig "github.com/docker/docker/daemon/config"
    21  	"github.com/docker/docker/errdefs"
    22  	"github.com/docker/docker/internal/rootless/mountopts"
    23  	"github.com/docker/docker/oci"
    24  	"github.com/docker/docker/oci/caps"
    25  	"github.com/docker/docker/pkg/idtools"
    26  	"github.com/docker/docker/pkg/rootless/specconv"
    27  	"github.com/docker/docker/pkg/stringid"
    28  	volumemounts "github.com/docker/docker/volume/mounts"
    29  	"github.com/moby/sys/mount"
    30  	"github.com/moby/sys/mountinfo"
    31  	"github.com/moby/sys/user"
    32  	"github.com/opencontainers/runc/libcontainer/cgroups"
    33  	specs "github.com/opencontainers/runtime-spec/specs-go"
    34  	"github.com/pkg/errors"
    35  )
    36  
    37  const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
    38  
    39  // withRlimits sets the container's rlimits along with merging the daemon's rlimits
    40  func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
    41  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    42  		var rlimits []specs.POSIXRlimit
    43  
    44  		// We want to leave the original HostConfig alone so make a copy here
    45  		hostConfig := *c.HostConfig
    46  		// Merge with the daemon defaults
    47  		daemon.mergeUlimits(&hostConfig, daemonCfg)
    48  		for _, ul := range hostConfig.Ulimits {
    49  			rlimits = append(rlimits, specs.POSIXRlimit{
    50  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    51  				Soft: uint64(ul.Soft),
    52  				Hard: uint64(ul.Hard),
    53  			})
    54  		}
    55  
    56  		if s.Process == nil {
    57  			s.Process = &specs.Process{}
    58  		}
    59  		s.Process.Rlimits = rlimits
    60  		return nil
    61  	}
    62  }
    63  
    64  // withLibnetwork sets the libnetwork hook
    65  func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
    66  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    67  		if c.Config.NetworkDisabled {
    68  			return nil
    69  		}
    70  		for _, ns := range s.Linux.Namespaces {
    71  			if ns.Type == specs.NetworkNamespace && ns.Path == "" {
    72  				if s.Hooks == nil {
    73  					s.Hooks = &specs.Hooks{}
    74  				}
    75  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    76  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ //nolint:staticcheck // FIXME(thaJeztah); replace prestart hook with a non-deprecated one.
    77  					Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"),
    78  					Args: []string{"libnetwork-setkey", "-exec-root=" + daemonCfg.GetExecRoot(), c.ID, shortNetCtlrID},
    79  				})
    80  			}
    81  		}
    82  		return nil
    83  	}
    84  }
    85  
    86  // withRootless sets the spec to the rootless configuration
    87  func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
    88  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    89  		var v2Controllers []string
    90  		if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
    91  			if cdcgroups.Mode() != cdcgroups.Unified {
    92  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    93  			}
    94  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    95  			if rootlesskitParentEUID == "" {
    96  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    97  			}
    98  			euid, err := strconv.Atoi(rootlesskitParentEUID)
    99  			if err != nil {
   100  				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
   101  			}
   102  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
   103  			controllersFile, err := os.ReadFile(controllersPath)
   104  			if err != nil {
   105  				return err
   106  			}
   107  			v2Controllers = strings.Fields(string(controllersFile))
   108  		}
   109  		return specconv.ToRootless(s, v2Controllers)
   110  	}
   111  }
   112  
   113  // withRootfulInRootless is used for "rootful-in-rootless" dind;
   114  // the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
   115  func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
   116  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   117  		specconv.ToRootfulInRootless(s)
   118  		return nil
   119  	}
   120  }
   121  
   122  // WithOOMScore sets the oom score
   123  func WithOOMScore(score *int) coci.SpecOpts {
   124  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   125  		if s.Process == nil {
   126  			s.Process = &specs.Process{}
   127  		}
   128  		s.Process.OOMScoreAdj = score
   129  		return nil
   130  	}
   131  }
   132  
   133  // WithSelinux sets the selinux labels
   134  func WithSelinux(c *container.Container) coci.SpecOpts {
   135  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   136  		if s.Process == nil {
   137  			s.Process = &specs.Process{}
   138  		}
   139  		if s.Linux == nil {
   140  			s.Linux = &specs.Linux{}
   141  		}
   142  		s.Process.SelinuxLabel = c.GetProcessLabel()
   143  		s.Linux.MountLabel = c.MountLabel
   144  		return nil
   145  	}
   146  }
   147  
   148  // WithApparmor sets the apparmor profile
   149  func WithApparmor(c *container.Container) coci.SpecOpts {
   150  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   151  		if apparmor.HostSupports() {
   152  			var appArmorProfile string
   153  			if c.AppArmorProfile != "" {
   154  				appArmorProfile = c.AppArmorProfile
   155  			} else if c.HostConfig.Privileged {
   156  				appArmorProfile = unconfinedAppArmorProfile
   157  			} else {
   158  				appArmorProfile = defaultAppArmorProfile
   159  			}
   160  
   161  			if appArmorProfile == defaultAppArmorProfile {
   162  				// Unattended upgrades and other fun services can unload AppArmor
   163  				// profiles inadvertently. Since we cannot store our profile in
   164  				// /etc/apparmor.d, nor can we practically add other ways of
   165  				// telling the system to keep our profile loaded, in order to make
   166  				// sure that we keep the default profile enabled we dynamically
   167  				// reload it if necessary.
   168  				if err := ensureDefaultAppArmorProfile(); err != nil {
   169  					return err
   170  				}
   171  			}
   172  			if s.Process == nil {
   173  				s.Process = &specs.Process{}
   174  			}
   175  			s.Process.ApparmorProfile = appArmorProfile
   176  		}
   177  		return nil
   178  	}
   179  }
   180  
   181  // WithCapabilities sets the container's capabilties
   182  func WithCapabilities(c *container.Container) coci.SpecOpts {
   183  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   184  		capabilities, err := caps.TweakCapabilities(
   185  			caps.DefaultCapabilities(),
   186  			c.HostConfig.CapAdd,
   187  			c.HostConfig.CapDrop,
   188  			c.HostConfig.Privileged,
   189  		)
   190  		if err != nil {
   191  			return err
   192  		}
   193  		return oci.SetCapabilities(s, capabilities)
   194  	}
   195  }
   196  
   197  func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
   198  	p, err := getPath()
   199  	if err != nil {
   200  		return "", err
   201  	}
   202  	return c.GetResourcePath(p)
   203  }
   204  
   205  func getUser(c *container.Container, username string) (specs.User, error) {
   206  	var usr specs.User
   207  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   208  	if err != nil {
   209  		return usr, err
   210  	}
   211  	groupPath, err := resourcePath(c, user.GetGroupPath)
   212  	if err != nil {
   213  		return usr, err
   214  	}
   215  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   216  	if err != nil {
   217  		return usr, err
   218  	}
   219  	usr.UID = uint32(execUser.Uid)
   220  	usr.GID = uint32(execUser.Gid)
   221  	usr.AdditionalGids = []uint32{usr.GID}
   222  
   223  	var addGroups []int
   224  	if len(c.HostConfig.GroupAdd) > 0 {
   225  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   226  		if err != nil {
   227  			return usr, err
   228  		}
   229  	}
   230  	for _, g := range append(execUser.Sgids, addGroups...) {
   231  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   232  	}
   233  	return usr, nil
   234  }
   235  
   236  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   237  	if s.Linux == nil {
   238  		s.Linux = &specs.Linux{}
   239  	}
   240  
   241  	for i, n := range s.Linux.Namespaces {
   242  		if n.Type == ns.Type {
   243  			s.Linux.Namespaces[i] = ns
   244  			return
   245  		}
   246  	}
   247  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   248  }
   249  
// WithNamespaces sets the container's namespaces (user, network, ipc, pid,
// uts, cgroup). For each kind, depending on the configured mode, a fresh
// namespace is created, another container's (or the host's) namespace is
// joined, or the host namespace is kept by removing the entry from the spec.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// userNS records whether this container runs in its own user
		// namespace; namespaces joined from other containers must then also
		// join that container's user namespace.
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
				userNS = true
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
				})
				// setNamespace initialized s.Linux, so dereferencing is safe here.
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			networkMode := c.HostConfig.NetworkMode
			switch {
			case networkMode.IsContainer():
				// join the network namespace of another (running) container
				nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
				})
				if userNS {
					// to share a net namespace, the containers must also share a user namespace.
					//
					// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
					setNamespace(s, specs.LinuxNamespace{
						Type: specs.UserNamespace,
						Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
					})
				}
			case networkMode.IsHost():
				oci.RemoveNamespace(s, specs.NetworkNamespace)
			default:
				// private network namespace: a fresh netns for this container
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
				})
			}
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ic, err := daemon.getIPCContainer(ipcMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join IPC namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
			})
			if userNS {
				// to share a IPC namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
				})
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.IPCNamespace)
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			// "none" still gets a private IPC namespace; /dev/shm is filtered
			// out elsewhere (see withMounts).
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
			})
		}

		// pid
		pidMode := c.HostConfig.PidMode
		if !pidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
		}
		switch {
		case pidMode.IsContainer():
			pc, err := daemon.getPIDContainer(pidMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join PID namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			})
			if userNS {
				// to share a PID namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				})
			}
		case pidMode.IsHost():
			oci.RemoveNamespace(s, specs.PIDNamespace)
		default:
			// private PID namespace
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
			})
		}

		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, specs.UTSNamespace)
			// a per-container hostname is meaningless when sharing the
			// host's UTS namespace
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if c.HostConfig.CgroupnsMode.IsPrivate() {
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.CgroupNamespace,
			})
		}

		return nil
	}
}
   386  
   387  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   388  	var ids []specs.LinuxIDMapping
   389  	for _, item := range s {
   390  		ids = append(ids, specs.LinuxIDMapping{
   391  			HostID:      uint32(item.HostID),
   392  			ContainerID: uint32(item.ContainerID),
   393  			Size:        uint32(item.Size),
   394  		})
   395  	}
   396  	return ids
   397  }
   398  
   399  // Get the source mount point of directory passed in as argument. Also return
   400  // optional fields.
   401  func getSourceMount(source string) (string, string, error) {
   402  	// Ensure any symlinks are resolved.
   403  	sourcePath, err := filepath.EvalSymlinks(source)
   404  	if err != nil {
   405  		return "", "", err
   406  	}
   407  
   408  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   409  	if err != nil {
   410  		return "", "", err
   411  	}
   412  	if len(mi) < 1 {
   413  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   414  	}
   415  
   416  	// find the longest mount point
   417  	var idx, maxlen int
   418  	for i := range mi {
   419  		if len(mi[i].Mountpoint) > maxlen {
   420  			maxlen = len(mi[i].Mountpoint)
   421  			idx = i
   422  		}
   423  	}
   424  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   425  }
   426  
// Prefixes of the mountinfo "optional fields" column used to detect mount
// propagation: "shared:N" marks a shared mount (peer group N), while
// "master:N" marks a slave mount whose master is peer group N.
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
   431  
   432  // hasMountInfoOption checks if any of the passed any of the given option values
   433  // are set in the passed in option string.
   434  func hasMountInfoOption(opts string, vals ...string) bool {
   435  	for _, opt := range strings.Split(opts, " ") {
   436  		for _, val := range vals {
   437  			if strings.HasPrefix(opt, val) {
   438  				return true
   439  			}
   440  		}
   441  	}
   442  	return false
   443  }
   444  
   445  // Ensure mount point on which path is mounted, is shared.
   446  func ensureShared(path string) error {
   447  	sourceMount, optionalOpts, err := getSourceMount(path)
   448  	if err != nil {
   449  		return err
   450  	}
   451  	// Make sure source mount point is shared.
   452  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   453  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   454  	}
   455  	return nil
   456  }
   457  
   458  // Ensure mount point on which path is mounted, is either shared or slave.
   459  func ensureSharedOrSlave(path string) error {
   460  	sourceMount, optionalOpts, err := getSourceMount(path)
   461  	if err != nil {
   462  		return err
   463  	}
   464  
   465  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   466  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   467  	}
   468  	return nil
   469  }
   470  
var (
	// mountPropagationMap maps the user-facing propagation mode names (as
	// used in the API / volume options) to moby/sys mount flag values.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap,
	// used to render a flag value back into its string form (e.g. for
	// mount options and RootfsPropagation).
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   490  
   491  // inSlice tests whether a string is contained in a slice of strings or not.
   492  // Comparison is case sensitive
   493  func inSlice(slice []string, s string) bool {
   494  	for _, ss := range slice {
   495  		if s == ss {
   496  			return true
   497  		}
   498  	}
   499  	return false
   500  }
   501  
   502  // withMounts sets the container's mounts
   503  func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
   504  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
   505  		sort.Sort(mounts(ms))
   506  
   507  		mounts := ms
   508  
   509  		userMounts := make(map[string]struct{})
   510  		for _, m := range mounts {
   511  			userMounts[m.Destination] = struct{}{}
   512  		}
   513  
   514  		// Copy all mounts from spec to defaultMounts, except for
   515  		//  - mounts overridden by a user supplied mount;
   516  		//  - all mounts under /dev if a user supplied /dev is present;
   517  		//  - /dev/shm, in case IpcMode is none.
   518  		// While at it, also
   519  		//  - set size for /dev/shm from shmsize.
   520  		defaultMounts := s.Mounts[:0]
   521  		_, mountDev := userMounts["/dev"]
   522  		for _, m := range s.Mounts {
   523  			if _, ok := userMounts[m.Destination]; ok {
   524  				// filter out mount overridden by a user supplied mount
   525  				continue
   526  			}
   527  			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
   528  				// filter out everything under /dev if /dev is user-mounted
   529  				continue
   530  			}
   531  
   532  			if m.Destination == "/dev/shm" {
   533  				if c.HostConfig.IpcMode.IsNone() {
   534  					// filter out /dev/shm for "none" IpcMode
   535  					continue
   536  				}
   537  				// set size for /dev/shm mount from spec
   538  				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
   539  				m.Options = append(m.Options, sizeOpt)
   540  			}
   541  
   542  			defaultMounts = append(defaultMounts, m)
   543  		}
   544  
   545  		s.Mounts = defaultMounts
   546  		for _, m := range mounts {
   547  			if m.Source == "tmpfs" {
   548  				data := m.Data
   549  				parser := volumemounts.NewParser()
   550  				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
   551  				if data != "" {
   552  					options = append(options, strings.Split(data, ",")...)
   553  				}
   554  
   555  				merged, err := mount.MergeTmpfsOptions(options)
   556  				if err != nil {
   557  					return err
   558  				}
   559  
   560  				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
   561  				continue
   562  			}
   563  
   564  			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
   565  
   566  			// Determine property of RootPropagation based on volume
   567  			// properties. If a volume is shared, then keep root propagation
   568  			// shared. This should work for slave and private volumes too.
   569  			//
   570  			// For slave volumes, it can be either [r]shared/[r]slave.
   571  			//
   572  			// For private volumes any root propagation value should work.
   573  			pFlag := mountPropagationMap[m.Propagation]
   574  			switch pFlag {
   575  			case mount.SHARED, mount.RSHARED:
   576  				if err := ensureShared(m.Source); err != nil {
   577  					return err
   578  				}
   579  				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   580  				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
   581  					if s.Linux == nil {
   582  						s.Linux = &specs.Linux{}
   583  					}
   584  					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
   585  				}
   586  			case mount.SLAVE, mount.RSLAVE:
   587  				var fallback bool
   588  				if err := ensureSharedOrSlave(m.Source); err != nil {
   589  					// For backwards compatibility purposes, treat mounts from the daemon root
   590  					// as special since we automatically add rslave propagation to these mounts
   591  					// when the user did not set anything, so we should fallback to the old
   592  					// behavior which is to use private propagation which is normally the
   593  					// default.
   594  					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
   595  						return err
   596  					}
   597  
   598  					cm, ok := c.MountPoints[m.Destination]
   599  					if !ok {
   600  						return err
   601  					}
   602  					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
   603  						// This means the user explicitly set a propagation, do not fallback in that case.
   604  						return err
   605  					}
   606  					fallback = true
   607  					log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
   608  				}
   609  				if !fallback {
   610  					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   611  					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
   612  						if s.Linux == nil {
   613  							s.Linux = &specs.Linux{}
   614  						}
   615  						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
   616  					}
   617  				}
   618  			}
   619  
   620  			bindMode := "rbind"
   621  			if m.NonRecursive {
   622  				bindMode = "bind"
   623  			}
   624  			opts := []string{bindMode}
   625  			if !m.Writable {
   626  				rro := true
   627  				if m.ReadOnlyNonRecursive {
   628  					rro = false
   629  					if m.ReadOnlyForceRecursive {
   630  						return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
   631  					}
   632  				}
   633  				if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
   634  					rro = false
   635  					if m.ReadOnlyForceRecursive {
   636  						return rroErr
   637  					}
   638  				}
   639  				if rro {
   640  					opts = append(opts, "rro")
   641  				} else {
   642  					opts = append(opts, "ro")
   643  				}
   644  			}
   645  			if pFlag != 0 {
   646  				opts = append(opts, mountPropagationReverseMap[pFlag])
   647  			}
   648  
   649  			// If we are using user namespaces, then we must make sure that we
   650  			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
   651  			// "mount" when we bind-mount. The reason for this is that at the point
   652  			// when runc sets up the root filesystem, it is already inside a user
   653  			// namespace, and thus cannot change any flags that are locked.
   654  			if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
   655  				unprivOpts, err := mountopts.UnprivilegedMountFlags(m.Source)
   656  				if err != nil {
   657  					return err
   658  				}
   659  				opts = append(opts, unprivOpts...)
   660  			}
   661  
   662  			mt.Options = opts
   663  			s.Mounts = append(s.Mounts, mt)
   664  		}
   665  
   666  		if s.Root.Readonly {
   667  			for i, m := range s.Mounts {
   668  				switch m.Destination {
   669  				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
   670  					continue
   671  				}
   672  				if _, ok := userMounts[m.Destination]; !ok {
   673  					if !inSlice(m.Options, "ro") {
   674  						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
   675  					}
   676  				}
   677  			}
   678  		}
   679  
   680  		if c.HostConfig.Privileged {
   681  			// clear readonly for /sys
   682  			for i := range s.Mounts {
   683  				if s.Mounts[i].Destination == "/sys" {
   684  					clearReadOnly(&s.Mounts[i])
   685  				}
   686  			}
   687  			if s.Linux != nil {
   688  				s.Linux.ReadonlyPaths = nil
   689  				s.Linux.MaskedPaths = nil
   690  			}
   691  		}
   692  
   693  		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
   694  		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
   695  		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
   696  			for i, m := range s.Mounts {
   697  				if m.Type == "cgroup" {
   698  					clearReadOnly(&s.Mounts[i])
   699  				}
   700  			}
   701  		}
   702  
   703  		return nil
   704  	}
   705  }
   706  
   707  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   708  // exist, so do not add the default ones if running on an old kernel.
   709  func sysctlExists(s string) bool {
   710  	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
   711  	_, err := os.Stat(f)
   712  	return err == nil
   713  }
   714  
   715  // withCommonOptions sets common docker options
   716  func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
   717  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   718  		if c.BaseFS == "" {
   719  			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
   720  		}
   721  		linkedEnv, err := daemon.setupLinkedContainers(c)
   722  		if err != nil {
   723  			return err
   724  		}
   725  		s.Root = &specs.Root{
   726  			Path:     c.BaseFS,
   727  			Readonly: c.HostConfig.ReadonlyRootfs,
   728  		}
   729  		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
   730  			return err
   731  		}
   732  		cwd := c.Config.WorkingDir
   733  		if len(cwd) == 0 {
   734  			cwd = "/"
   735  		}
   736  		if s.Process == nil {
   737  			s.Process = &specs.Process{}
   738  		}
   739  		s.Process.Args = append([]string{c.Path}, c.Args...)
   740  
   741  		// only add the custom init if it is specified and the container is running in its
   742  		// own private pid namespace.  It does not make sense to add if it is running in the
   743  		// host namespace or another container's pid namespace where we already have an init
   744  		if c.HostConfig.PidMode.IsPrivate() {
   745  			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
   746  				(c.HostConfig.Init == nil && daemonCfg.Init) {
   747  				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
   748  				path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
   749  				if err != nil {
   750  					return err
   751  				}
   752  				s.Mounts = append(s.Mounts, specs.Mount{
   753  					Destination: inContainerInitPath,
   754  					Type:        "bind",
   755  					Source:      path,
   756  					Options:     []string{"bind", "ro"},
   757  				})
   758  			}
   759  		}
   760  		s.Process.Cwd = cwd
   761  		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
   762  		s.Process.Terminal = c.Config.Tty
   763  
   764  		s.Hostname = c.Config.Hostname
   765  		setLinuxDomainname(c, s)
   766  
   767  		// Add default sysctls that are generally safe and useful; currently we
   768  		// grant the capabilities to allow these anyway. You can override if
   769  		// you want to restore the original behaviour.
   770  		// We do not set network sysctls if network namespace is host, or if we are
   771  		// joining an existing namespace, only if we create a new net namespace.
   772  		if c.HostConfig.NetworkMode.IsPrivate() {
   773  			// We cannot set up ping socket support in a user namespace
   774  			userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
   775  			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
   776  				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
   777  				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
   778  			}
   779  			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
   780  			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
   781  				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
   782  			}
   783  		}
   784  
   785  		return nil
   786  	}
   787  }
   788  
   789  // withCgroups sets the container's cgroups
   790  func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
   791  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   792  		var cgroupsPath string
   793  		scopePrefix := "docker"
   794  		parent := "/docker"
   795  		useSystemd := UsingSystemd(daemonCfg)
   796  		if useSystemd {
   797  			parent = "system.slice"
   798  			if daemonCfg.Rootless {
   799  				parent = "user.slice"
   800  			}
   801  		}
   802  
   803  		if c.HostConfig.CgroupParent != "" {
   804  			parent = c.HostConfig.CgroupParent
   805  		} else if daemonCfg.CgroupParent != "" {
   806  			parent = daemonCfg.CgroupParent
   807  		}
   808  
   809  		if useSystemd {
   810  			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
   811  			log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
   812  		} else {
   813  			cgroupsPath = filepath.Join(parent, c.ID)
   814  		}
   815  		if s.Linux == nil {
   816  			s.Linux = &specs.Linux{}
   817  		}
   818  		s.Linux.CgroupsPath = cgroupsPath
   819  
   820  		// the rest is only needed for CPU RT controller
   821  
   822  		if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
   823  			return nil
   824  		}
   825  
   826  		p := cgroupsPath
   827  		if useSystemd {
   828  			initPath, err := cgroups.GetInitCgroup("cpu")
   829  			if err != nil {
   830  				return errors.Wrap(err, "unable to init CPU RT controller")
   831  			}
   832  			_, err = cgroups.GetOwnCgroup("cpu")
   833  			if err != nil {
   834  				return errors.Wrap(err, "unable to init CPU RT controller")
   835  			}
   836  			p = filepath.Join(initPath, s.Linux.CgroupsPath)
   837  		}
   838  
   839  		// Clean path to guard against things like ../../../BAD
   840  		parentPath := filepath.Dir(p)
   841  		if !filepath.IsAbs(parentPath) {
   842  			parentPath = filepath.Clean("/" + parentPath)
   843  		}
   844  
   845  		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
   846  		if err != nil {
   847  			return errors.Wrap(err, "unable to init CPU RT controller")
   848  		}
   849  		// When docker is run inside docker, the root is based of the host cgroup.
   850  		// Should this be handled in runc/libcontainer/cgroups ?
   851  		if strings.HasPrefix(root, "/docker/") {
   852  			root = "/"
   853  		}
   854  		mnt = filepath.Join(mnt, root)
   855  
   856  		if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
   857  			return errors.Wrap(err, "unable to init CPU RT controller")
   858  		}
   859  		return nil
   860  	}
   861  }
   862  
   863  // WithDevices sets the container's devices
   864  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   865  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   866  		// Build lists of devices allowed and created within the container.
   867  		var devs []specs.LinuxDevice
   868  		devPermissions := s.Linux.Resources.Devices
   869  
   870  		if c.HostConfig.Privileged {
   871  			hostDevices, err := coci.HostDevices()
   872  			if err != nil {
   873  				return err
   874  			}
   875  			devs = append(devs, hostDevices...)
   876  
   877  			// adding device mappings in privileged containers
   878  			for _, deviceMapping := range c.HostConfig.Devices {
   879  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   880  				if deviceMapping.CgroupPermissions != "rwm" {
   881  					log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   882  				}
   883  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   884  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   885  					log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   886  					continue
   887  				}
   888  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   889  				if err != nil {
   890  					return err
   891  				}
   892  				devs = append(devs, d...)
   893  			}
   894  
   895  			devPermissions = []specs.LinuxDeviceCgroup{
   896  				{
   897  					Allow:  true,
   898  					Access: "rwm",
   899  				},
   900  			}
   901  		} else {
   902  			for _, deviceMapping := range c.HostConfig.Devices {
   903  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   904  				if err != nil {
   905  					return err
   906  				}
   907  				devs = append(devs, d...)
   908  				devPermissions = append(devPermissions, dPermissions...)
   909  			}
   910  
   911  			var err error
   912  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   913  			if err != nil {
   914  				return err
   915  			}
   916  		}
   917  
   918  		if s.Linux == nil {
   919  			s.Linux = &specs.Linux{}
   920  		}
   921  		if s.Linux.Resources == nil {
   922  			s.Linux.Resources = &specs.LinuxResources{}
   923  		}
   924  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   925  		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
   926  
   927  		for _, req := range c.HostConfig.DeviceRequests {
   928  			if err := daemon.handleDevice(req, s); err != nil {
   929  				return err
   930  			}
   931  		}
   932  		return nil
   933  	}
   934  }
   935  
   936  // WithResources applies the container resources
   937  func WithResources(c *container.Container) coci.SpecOpts {
   938  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   939  		r := c.HostConfig.Resources
   940  		weightDevices, err := getBlkioWeightDevices(r)
   941  		if err != nil {
   942  			return err
   943  		}
   944  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   945  		if err != nil {
   946  			return err
   947  		}
   948  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   949  		if err != nil {
   950  			return err
   951  		}
   952  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   953  		if err != nil {
   954  			return err
   955  		}
   956  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   957  		if err != nil {
   958  			return err
   959  		}
   960  
   961  		memoryRes := getMemoryResources(r)
   962  		cpuRes, err := getCPUResources(r)
   963  		if err != nil {
   964  			return err
   965  		}
   966  
   967  		if s.Linux == nil {
   968  			s.Linux = &specs.Linux{}
   969  		}
   970  		if s.Linux.Resources == nil {
   971  			s.Linux.Resources = &specs.LinuxResources{}
   972  		}
   973  		s.Linux.Resources.Memory = memoryRes
   974  		s.Linux.Resources.CPU = cpuRes
   975  		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
   976  			WeightDevice:            weightDevices,
   977  			ThrottleReadBpsDevice:   readBpsDevice,
   978  			ThrottleWriteBpsDevice:  writeBpsDevice,
   979  			ThrottleReadIOPSDevice:  readIOpsDevice,
   980  			ThrottleWriteIOPSDevice: writeIOpsDevice,
   981  		}
   982  		if r.BlkioWeight != 0 {
   983  			w := r.BlkioWeight
   984  			s.Linux.Resources.BlockIO.Weight = &w
   985  		}
   986  		s.Linux.Resources.Pids = getPidsLimit(r)
   987  
   988  		return nil
   989  	}
   990  }
   991  
   992  // WithSysctls sets the container's sysctls
   993  func WithSysctls(c *container.Container) coci.SpecOpts {
   994  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   995  		if len(c.HostConfig.Sysctls) == 0 {
   996  			return nil
   997  		}
   998  		if s.Linux == nil {
   999  			s.Linux = &specs.Linux{}
  1000  		}
  1001  		if s.Linux.Sysctl == nil {
  1002  			s.Linux.Sysctl = make(map[string]string)
  1003  		}
  1004  		// We merge the sysctls injected above with the HostConfig (latter takes
  1005  		// precedence for backwards-compatibility reasons).
  1006  		for k, v := range c.HostConfig.Sysctls {
  1007  			s.Linux.Sysctl[k] = v
  1008  		}
  1009  		return nil
  1010  	}
  1011  }
  1012  
  1013  // WithUser sets the container's user
  1014  func WithUser(c *container.Container) coci.SpecOpts {
  1015  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1016  		if s.Process == nil {
  1017  			s.Process = &specs.Process{}
  1018  		}
  1019  		var err error
  1020  		s.Process.User, err = getUser(c, c.Config.User)
  1021  		return err
  1022  	}
  1023  }
  1024  
// createSpec builds the OCI runtime spec for container c by applying an
// ordered list of SpecOpts to the default spec. The order of opts is
// significant: later opts can observe and override state set by earlier ones
// (e.g. WithSysctls merges HostConfig values over sysctls injected earlier).
// Returns a pointer to the assembled spec, or an error from the first opt
// that fails.
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		withCommonOptions(daemon, &daemonCfg.Config, c),
		withCgroups(daemon, &daemonCfg.Config, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		withRlimits(daemon, &daemonCfg.Config, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		withMounts(daemon, daemonCfg, c, mounts),
		withLibnetwork(daemon, &daemonCfg.Config, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
		coci.WithAnnotations(c.HostConfig.Annotations),
		WithUser(c),
	)

	// Conditional opts appended after the base list so they can override it.
	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}
	if c.Config.Tty {
		opts = append(opts, WithConsoleSize(c))
	}
	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	// Rootless daemon and "rootful-in-userns" (daemon itself running inside a
	// user namespace) need different spec adjustments; the two are exclusive.
	if daemonCfg.Rootless {
		opts = append(opts, withRootless(daemon, &daemonCfg.Config))
	} else if userns.RunningInUserNS() {
		opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
	}

	// Snapshotter info is only populated when the containerd image store is
	// in use; left empty otherwise.
	var snapshotter, snapshotKey string
	if daemon.UsesSnapshotter() {
		snapshotter = daemon.imageService.StorageDriver()
		snapshotKey = c.ID
	}

	return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
		ID:          c.ID,
		Snapshotter: snapshotter,
		SnapshotKey: snapshotKey,
	}, &s, opts...)
}
  1080  
  1081  func clearReadOnly(m *specs.Mount) {
  1082  	var opt []string
  1083  	for _, o := range m.Options {
  1084  		if o != "ro" {
  1085  			opt = append(opt, o)
  1086  		}
  1087  	}
  1088  	m.Options = opt
  1089  }
  1090  
  1091  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1092  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
  1093  	ulimits := c.Ulimits
  1094  	// Merge ulimits with daemon defaults
  1095  	ulIdx := make(map[string]struct{})
  1096  	for _, ul := range ulimits {
  1097  		ulIdx[ul.Name] = struct{}{}
  1098  	}
  1099  	for name, ul := range daemonCfg.Ulimits {
  1100  		if _, exists := ulIdx[name]; !exists {
  1101  			ulimits = append(ulimits, ul)
  1102  		}
  1103  	}
  1104  	c.Ulimits = ulimits
  1105  }