github.com/tonistiigi/docker@v0.10.1-0.20240229224939-974013b0dc6a/daemon/oci_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"path/filepath"
     8  	"sort"
     9  	"strconv"
    10  	"strings"
    11  
    12  	cdcgroups "github.com/containerd/cgroups/v3"
    13  	"github.com/containerd/containerd/containers"
    14  	coci "github.com/containerd/containerd/oci"
    15  	"github.com/containerd/containerd/pkg/apparmor"
    16  	"github.com/containerd/containerd/pkg/userns"
    17  	"github.com/containerd/log"
    18  	containertypes "github.com/docker/docker/api/types/container"
    19  	"github.com/docker/docker/container"
    20  	dconfig "github.com/docker/docker/daemon/config"
    21  	"github.com/docker/docker/errdefs"
    22  	"github.com/docker/docker/oci"
    23  	"github.com/docker/docker/oci/caps"
    24  	"github.com/docker/docker/pkg/idtools"
    25  	"github.com/docker/docker/pkg/rootless/specconv"
    26  	volumemounts "github.com/docker/docker/volume/mounts"
    27  	"github.com/moby/sys/mount"
    28  	"github.com/moby/sys/mountinfo"
    29  	"github.com/moby/sys/user"
    30  	"github.com/opencontainers/runc/libcontainer/cgroups"
    31  	specs "github.com/opencontainers/runtime-spec/specs-go"
    32  	"github.com/pkg/errors"
    33  	"golang.org/x/sys/unix"
    34  )
    35  
// inContainerInitPath is the well-known path inside the container at which
// the docker-init binary is bind-mounted when --init is enabled.
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
    37  
    38  // withRlimits sets the container's rlimits along with merging the daemon's rlimits
    39  func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
    40  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    41  		var rlimits []specs.POSIXRlimit
    42  
    43  		// We want to leave the original HostConfig alone so make a copy here
    44  		hostConfig := *c.HostConfig
    45  		// Merge with the daemon defaults
    46  		daemon.mergeUlimits(&hostConfig, daemonCfg)
    47  		for _, ul := range hostConfig.Ulimits {
    48  			rlimits = append(rlimits, specs.POSIXRlimit{
    49  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    50  				Soft: uint64(ul.Soft),
    51  				Hard: uint64(ul.Hard),
    52  			})
    53  		}
    54  
    55  		if s.Process == nil {
    56  			s.Process = &specs.Process{}
    57  		}
    58  		s.Process.Rlimits = rlimits
    59  		return nil
    60  	}
    61  }
    62  
    63  // withRootless sets the spec to the rootless configuration
    64  func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
    65  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    66  		var v2Controllers []string
    67  		if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
    68  			if cdcgroups.Mode() != cdcgroups.Unified {
    69  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    70  			}
    71  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    72  			if rootlesskitParentEUID == "" {
    73  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    74  			}
    75  			euid, err := strconv.Atoi(rootlesskitParentEUID)
    76  			if err != nil {
    77  				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
    78  			}
    79  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
    80  			controllersFile, err := os.ReadFile(controllersPath)
    81  			if err != nil {
    82  				return err
    83  			}
    84  			v2Controllers = strings.Fields(string(controllersFile))
    85  		}
    86  		return specconv.ToRootless(s, v2Controllers)
    87  	}
    88  }
    89  
    90  // withRootfulInRootless is used for "rootful-in-rootless" dind;
    91  // the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
    92  func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
    93  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    94  		specconv.ToRootfulInRootless(s)
    95  		return nil
    96  	}
    97  }
    98  
    99  // WithOOMScore sets the oom score
   100  func WithOOMScore(score *int) coci.SpecOpts {
   101  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   102  		if s.Process == nil {
   103  			s.Process = &specs.Process{}
   104  		}
   105  		s.Process.OOMScoreAdj = score
   106  		return nil
   107  	}
   108  }
   109  
   110  // WithSelinux sets the selinux labels
   111  func WithSelinux(c *container.Container) coci.SpecOpts {
   112  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   113  		if s.Process == nil {
   114  			s.Process = &specs.Process{}
   115  		}
   116  		if s.Linux == nil {
   117  			s.Linux = &specs.Linux{}
   118  		}
   119  		s.Process.SelinuxLabel = c.GetProcessLabel()
   120  		s.Linux.MountLabel = c.MountLabel
   121  		return nil
   122  	}
   123  }
   124  
   125  // WithApparmor sets the apparmor profile
   126  func WithApparmor(c *container.Container) coci.SpecOpts {
   127  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   128  		if apparmor.HostSupports() {
   129  			var appArmorProfile string
   130  			if c.AppArmorProfile != "" {
   131  				appArmorProfile = c.AppArmorProfile
   132  			} else if c.HostConfig.Privileged {
   133  				appArmorProfile = unconfinedAppArmorProfile
   134  			} else {
   135  				appArmorProfile = defaultAppArmorProfile
   136  			}
   137  
   138  			if appArmorProfile == defaultAppArmorProfile {
   139  				// Unattended upgrades and other fun services can unload AppArmor
   140  				// profiles inadvertently. Since we cannot store our profile in
   141  				// /etc/apparmor.d, nor can we practically add other ways of
   142  				// telling the system to keep our profile loaded, in order to make
   143  				// sure that we keep the default profile enabled we dynamically
   144  				// reload it if necessary.
   145  				if err := ensureDefaultAppArmorProfile(); err != nil {
   146  					return err
   147  				}
   148  			}
   149  			if s.Process == nil {
   150  				s.Process = &specs.Process{}
   151  			}
   152  			s.Process.ApparmorProfile = appArmorProfile
   153  		}
   154  		return nil
   155  	}
   156  }
   157  
   158  // WithCapabilities sets the container's capabilties
   159  func WithCapabilities(c *container.Container) coci.SpecOpts {
   160  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   161  		capabilities, err := caps.TweakCapabilities(
   162  			caps.DefaultCapabilities(),
   163  			c.HostConfig.CapAdd,
   164  			c.HostConfig.CapDrop,
   165  			c.HostConfig.Privileged,
   166  		)
   167  		if err != nil {
   168  			return err
   169  		}
   170  		return oci.SetCapabilities(s, capabilities)
   171  	}
   172  }
   173  
   174  func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
   175  	p, err := getPath()
   176  	if err != nil {
   177  		return "", err
   178  	}
   179  	return c.GetResourcePath(p)
   180  }
   181  
   182  func getUser(c *container.Container, username string) (specs.User, error) {
   183  	var usr specs.User
   184  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   185  	if err != nil {
   186  		return usr, err
   187  	}
   188  	groupPath, err := resourcePath(c, user.GetGroupPath)
   189  	if err != nil {
   190  		return usr, err
   191  	}
   192  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   193  	if err != nil {
   194  		return usr, err
   195  	}
   196  	usr.UID = uint32(execUser.Uid)
   197  	usr.GID = uint32(execUser.Gid)
   198  	usr.AdditionalGids = []uint32{usr.GID}
   199  
   200  	var addGroups []int
   201  	if len(c.HostConfig.GroupAdd) > 0 {
   202  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   203  		if err != nil {
   204  			return usr, err
   205  		}
   206  	}
   207  	for _, g := range append(execUser.Sgids, addGroups...) {
   208  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   209  	}
   210  	return usr, nil
   211  }
   212  
   213  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   214  	if s.Linux == nil {
   215  		s.Linux = &specs.Linux{}
   216  	}
   217  
   218  	for i, n := range s.Linux.Namespaces {
   219  		if n.Type == ns.Type {
   220  			s.Linux.Namespaces[i] = ns
   221  			return
   222  		}
   223  	}
   224  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   225  }
   226  
// WithNamespaces configures the container's user, network, IPC, PID, UTS,
// and cgroup namespaces on the spec, based on the container's HostConfig.
// Modes that join another container's namespace resolve the peer container
// and reference its namespace via /proc/<pid>/ns/<type>; "host" modes remove
// the namespace entry so the host's namespace is used.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			// A user namespace is only set up when the daemon has UID
			// remapping configured.
			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
				userNS = true
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
				})
				// setNamespace above guarantees s.Linux is non-nil here.
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			networkMode := c.HostConfig.NetworkMode
			switch {
			case networkMode.IsContainer():
				nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
				})
				if userNS {
					// to share a net namespace, the containers must also share a user namespace.
					//
					// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
					setNamespace(s, specs.LinuxNamespace{
						Type: specs.UserNamespace,
						Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
					})
				}
			case networkMode.IsHost():
				oci.RemoveNamespace(s, specs.NetworkNamespace)
			default:
				// Private network namespace (the default).
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
				})
			}
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ic, err := daemon.getIPCContainer(ipcMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join IPC namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
			})
			if userNS {
				// to share a IPC namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
				})
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.IPCNamespace)
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
			})
		}

		// pid
		pidMode := c.HostConfig.PidMode
		if !pidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
		}
		switch {
		case pidMode.IsContainer():
			pc, err := daemon.getPIDContainer(pidMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join PID namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			})
			if userNS {
				// to share a PID namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				})
			}
		case pidMode.IsHost():
			oci.RemoveNamespace(s, specs.PIDNamespace)
		default:
			// Private PID namespace (the default).
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
			})
		}

		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			// Sharing the host's UTS namespace means the container cannot
			// have its own hostname.
			oci.RemoveNamespace(s, specs.UTSNamespace)
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if c.HostConfig.CgroupnsMode.IsPrivate() {
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.CgroupNamespace,
			})
		}

		return nil
	}
}
   363  
   364  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   365  	var ids []specs.LinuxIDMapping
   366  	for _, item := range s {
   367  		ids = append(ids, specs.LinuxIDMapping{
   368  			HostID:      uint32(item.HostID),
   369  			ContainerID: uint32(item.ContainerID),
   370  			Size:        uint32(item.Size),
   371  		})
   372  	}
   373  	return ids
   374  }
   375  
   376  // Get the source mount point of directory passed in as argument. Also return
   377  // optional fields.
   378  func getSourceMount(source string) (string, string, error) {
   379  	// Ensure any symlinks are resolved.
   380  	sourcePath, err := filepath.EvalSymlinks(source)
   381  	if err != nil {
   382  		return "", "", err
   383  	}
   384  
   385  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   386  	if err != nil {
   387  		return "", "", err
   388  	}
   389  	if len(mi) < 1 {
   390  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   391  	}
   392  
   393  	// find the longest mount point
   394  	var idx, maxlen int
   395  	for i := range mi {
   396  		if len(mi[i].Mountpoint) > maxlen {
   397  			maxlen = len(mi[i].Mountpoint)
   398  			idx = i
   399  		}
   400  	}
   401  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   402  }
   403  
const (
	// Prefixes of the optional fields in /proc/self/mountinfo that describe
	// mount propagation: "shared:N" marks a shared mount in peer group N,
	// and "master:N" marks a slave mount receiving propagation from peer
	// group N.
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
   408  
   409  // hasMountInfoOption checks if any of the passed any of the given option values
   410  // are set in the passed in option string.
   411  func hasMountInfoOption(opts string, vals ...string) bool {
   412  	for _, opt := range strings.Split(opts, " ") {
   413  		for _, val := range vals {
   414  			if strings.HasPrefix(opt, val) {
   415  				return true
   416  			}
   417  		}
   418  	}
   419  	return false
   420  }
   421  
   422  // Ensure mount point on which path is mounted, is shared.
   423  func ensureShared(path string) error {
   424  	sourceMount, optionalOpts, err := getSourceMount(path)
   425  	if err != nil {
   426  		return err
   427  	}
   428  	// Make sure source mount point is shared.
   429  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   430  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   431  	}
   432  	return nil
   433  }
   434  
   435  // Ensure mount point on which path is mounted, is either shared or slave.
   436  func ensureSharedOrSlave(path string) error {
   437  	sourceMount, optionalOpts, err := getSourceMount(path)
   438  	if err != nil {
   439  		return err
   440  	}
   441  
   442  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   443  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   444  	}
   445  	return nil
   446  }
   447  
   448  // Get the set of mount flags that are set on the mount that contains the given
   449  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   450  // bind-mounting "with options" will not fail with user namespaces, due to
   451  // kernel restrictions that require user namespace mounts to preserve
   452  // CL_UNPRIVILEGED locked flags.
   453  func getUnprivilegedMountFlags(path string) ([]string, error) {
   454  	var statfs unix.Statfs_t
   455  	if err := unix.Statfs(path, &statfs); err != nil {
   456  		return nil, err
   457  	}
   458  
   459  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   460  	unprivilegedFlags := map[uint64]string{
   461  		unix.MS_RDONLY:     "ro",
   462  		unix.MS_NODEV:      "nodev",
   463  		unix.MS_NOEXEC:     "noexec",
   464  		unix.MS_NOSUID:     "nosuid",
   465  		unix.MS_NOATIME:    "noatime",
   466  		unix.MS_RELATIME:   "relatime",
   467  		unix.MS_NODIRATIME: "nodiratime",
   468  	}
   469  
   470  	var flags []string
   471  	for mask, flag := range unprivilegedFlags {
   472  		if uint64(statfs.Flags)&mask == mask {
   473  			flags = append(flags, flag)
   474  		}
   475  	}
   476  
   477  	return flags, nil
   478  }
   479  
var (
	// mountPropagationMap maps user-facing propagation-mode names to the
	// moby/sys/mount flag constants.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap,
	// used to render a flag back into its option-string form.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   499  
   500  // inSlice tests whether a string is contained in a slice of strings or not.
   501  // Comparison is case sensitive
   502  func inSlice(slice []string, s string) bool {
   503  	for _, ss := range slice {
   504  		if s == ss {
   505  			return true
   506  		}
   507  	}
   508  	return false
   509  }
   510  
// withMounts sets the container's mounts: it filters the spec's default
// mounts against user-supplied ones, translates each container.Mount into
// an OCI bind or tmpfs mount (including propagation and read-only options),
// and applies read-only/privileged adjustments afterwards.
func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		// Sort so that parent directories are mounted before their children.
		sort.Sort(mounts(ms))

		mounts := ms

		// Set of destinations supplied by the user; used below to drop
		// conflicting default mounts and to skip the read-only pass.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		// Note: defaultMounts deliberately reuses s.Mounts' backing array
		// (filter-in-place), which is safe because it only ever shrinks.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				// tmpfs mounts get default safety options plus whatever the
				// user specified in m.Data, merged/deduplicated.
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				// NOTE(review): s.Linux is dereferenced here before the
				// nil check just below; if s.Linux can ever be nil at this
				// point this would panic — confirm the spec always has
				// Linux populated before withMounts runs.
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					if s.Linux == nil {
						s.Linux = &specs.Linux{}
					}
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					// NOTE(review): same pattern as above — s.Linux is read
					// before the nil check; confirm s.Linux is non-nil here.
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						if s.Linux == nil {
							s.Linux = &specs.Linux{}
						}
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			// Recursive bind by default; plain bind when NonRecursive is set.
			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				// Prefer recursively-read-only ("rro") when supported and
				// not explicitly disabled; otherwise fall back to "ro".
				rro := true
				if m.ReadOnlyNonRecursive {
					rro = false
					if m.ReadOnlyForceRecursive {
						return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
					}
				}
				if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
					rro = false
					if m.ReadOnlyForceRecursive {
						return rroErr
					}
				}
				if rro {
					opts = append(opts, "rro")
				} else {
					opts = append(opts, "ro")
				}
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			// With a read-only rootfs, force "ro" onto every non-user mount
			// except the special pseudo-filesystems listed below.
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			// Privileged containers get no masked or read-only paths.
			if s.Linux != nil {
				s.Linux.ReadonlyPaths = nil
				s.Linux.MaskedPaths = nil
			}
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}
   715  
   716  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   717  // exist, so do not add the default ones if running on an old kernel.
   718  func sysctlExists(s string) bool {
   719  	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
   720  	_, err := os.Stat(f)
   721  	return err == nil
   722  }
   723  
// withCommonOptions sets the spec fields common to all containers: the
// rootfs, process args/env/cwd/terminal, hostname, the optional docker-init
// wrapper, and a couple of default network sysctls for private network
// namespaces.
func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == "" {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
		}
		// Resolve environment variables contributed by linked containers.
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS,
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		// Create the working directory (owned by the remapped root) if needed.
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace.  It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			// Per-container Init setting overrides the daemon-wide default.
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemonCfg.Init) {
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
				if err != nil {
					return err
				}
				// Bind-mount the init binary read-only into the container.
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
			// NOTE(review): the writes below assume s.Linux and s.Linux.Sysctl
			// are already non-nil (presumably initialized by the default spec);
			// a nil map here would panic — confirm against the spec builder.
			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}
   797  
   798  // withCgroups sets the container's cgroups
   799  func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
   800  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   801  		var cgroupsPath string
   802  		scopePrefix := "docker"
   803  		parent := "/docker"
   804  		useSystemd := UsingSystemd(daemonCfg)
   805  		if useSystemd {
   806  			parent = "system.slice"
   807  			if daemonCfg.Rootless {
   808  				parent = "user.slice"
   809  			}
   810  		}
   811  
   812  		if c.HostConfig.CgroupParent != "" {
   813  			parent = c.HostConfig.CgroupParent
   814  		} else if daemonCfg.CgroupParent != "" {
   815  			parent = daemonCfg.CgroupParent
   816  		}
   817  
   818  		if useSystemd {
   819  			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
   820  			log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
   821  		} else {
   822  			cgroupsPath = filepath.Join(parent, c.ID)
   823  		}
   824  		if s.Linux == nil {
   825  			s.Linux = &specs.Linux{}
   826  		}
   827  		s.Linux.CgroupsPath = cgroupsPath
   828  
   829  		// the rest is only needed for CPU RT controller
   830  
   831  		if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
   832  			return nil
   833  		}
   834  
   835  		p := cgroupsPath
   836  		if useSystemd {
   837  			initPath, err := cgroups.GetInitCgroup("cpu")
   838  			if err != nil {
   839  				return errors.Wrap(err, "unable to init CPU RT controller")
   840  			}
   841  			_, err = cgroups.GetOwnCgroup("cpu")
   842  			if err != nil {
   843  				return errors.Wrap(err, "unable to init CPU RT controller")
   844  			}
   845  			p = filepath.Join(initPath, s.Linux.CgroupsPath)
   846  		}
   847  
   848  		// Clean path to guard against things like ../../../BAD
   849  		parentPath := filepath.Dir(p)
   850  		if !filepath.IsAbs(parentPath) {
   851  			parentPath = filepath.Clean("/" + parentPath)
   852  		}
   853  
   854  		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
   855  		if err != nil {
   856  			return errors.Wrap(err, "unable to init CPU RT controller")
   857  		}
   858  		// When docker is run inside docker, the root is based of the host cgroup.
   859  		// Should this be handled in runc/libcontainer/cgroups ?
   860  		if strings.HasPrefix(root, "/docker/") {
   861  			root = "/"
   862  		}
   863  		mnt = filepath.Join(mnt, root)
   864  
   865  		if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
   866  			return errors.Wrap(err, "unable to init CPU RT controller")
   867  		}
   868  		return nil
   869  	}
   870  }
   871  
   872  // WithDevices sets the container's devices
   873  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   874  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   875  		// Build lists of devices allowed and created within the container.
   876  		var devs []specs.LinuxDevice
   877  		devPermissions := s.Linux.Resources.Devices
   878  
   879  		if c.HostConfig.Privileged {
   880  			hostDevices, err := coci.HostDevices()
   881  			if err != nil {
   882  				return err
   883  			}
   884  			devs = append(devs, hostDevices...)
   885  
   886  			// adding device mappings in privileged containers
   887  			for _, deviceMapping := range c.HostConfig.Devices {
   888  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   889  				if deviceMapping.CgroupPermissions != "rwm" {
   890  					log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   891  				}
   892  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   893  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   894  					log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   895  					continue
   896  				}
   897  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   898  				if err != nil {
   899  					return err
   900  				}
   901  				devs = append(devs, d...)
   902  			}
   903  
   904  			devPermissions = []specs.LinuxDeviceCgroup{
   905  				{
   906  					Allow:  true,
   907  					Access: "rwm",
   908  				},
   909  			}
   910  		} else {
   911  			for _, deviceMapping := range c.HostConfig.Devices {
   912  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   913  				if err != nil {
   914  					return err
   915  				}
   916  				devs = append(devs, d...)
   917  				devPermissions = append(devPermissions, dPermissions...)
   918  			}
   919  
   920  			var err error
   921  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   922  			if err != nil {
   923  				return err
   924  			}
   925  		}
   926  
   927  		if s.Linux == nil {
   928  			s.Linux = &specs.Linux{}
   929  		}
   930  		if s.Linux.Resources == nil {
   931  			s.Linux.Resources = &specs.LinuxResources{}
   932  		}
   933  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   934  		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
   935  
   936  		for _, req := range c.HostConfig.DeviceRequests {
   937  			if err := daemon.handleDevice(req, s); err != nil {
   938  				return err
   939  			}
   940  		}
   941  		return nil
   942  	}
   943  }
   944  
   945  // WithResources applies the container resources
   946  func WithResources(c *container.Container) coci.SpecOpts {
   947  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   948  		r := c.HostConfig.Resources
   949  		weightDevices, err := getBlkioWeightDevices(r)
   950  		if err != nil {
   951  			return err
   952  		}
   953  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   954  		if err != nil {
   955  			return err
   956  		}
   957  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   958  		if err != nil {
   959  			return err
   960  		}
   961  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   962  		if err != nil {
   963  			return err
   964  		}
   965  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   966  		if err != nil {
   967  			return err
   968  		}
   969  
   970  		memoryRes := getMemoryResources(r)
   971  		cpuRes, err := getCPUResources(r)
   972  		if err != nil {
   973  			return err
   974  		}
   975  
   976  		if s.Linux == nil {
   977  			s.Linux = &specs.Linux{}
   978  		}
   979  		if s.Linux.Resources == nil {
   980  			s.Linux.Resources = &specs.LinuxResources{}
   981  		}
   982  		s.Linux.Resources.Memory = memoryRes
   983  		s.Linux.Resources.CPU = cpuRes
   984  		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
   985  			WeightDevice:            weightDevices,
   986  			ThrottleReadBpsDevice:   readBpsDevice,
   987  			ThrottleWriteBpsDevice:  writeBpsDevice,
   988  			ThrottleReadIOPSDevice:  readIOpsDevice,
   989  			ThrottleWriteIOPSDevice: writeIOpsDevice,
   990  		}
   991  		if r.BlkioWeight != 0 {
   992  			w := r.BlkioWeight
   993  			s.Linux.Resources.BlockIO.Weight = &w
   994  		}
   995  		s.Linux.Resources.Pids = getPidsLimit(r)
   996  
   997  		return nil
   998  	}
   999  }
  1000  
  1001  // WithSysctls sets the container's sysctls
  1002  func WithSysctls(c *container.Container) coci.SpecOpts {
  1003  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1004  		if len(c.HostConfig.Sysctls) == 0 {
  1005  			return nil
  1006  		}
  1007  		if s.Linux == nil {
  1008  			s.Linux = &specs.Linux{}
  1009  		}
  1010  		if s.Linux.Sysctl == nil {
  1011  			s.Linux.Sysctl = make(map[string]string)
  1012  		}
  1013  		// We merge the sysctls injected above with the HostConfig (latter takes
  1014  		// precedence for backwards-compatibility reasons).
  1015  		for k, v := range c.HostConfig.Sysctls {
  1016  			s.Linux.Sysctl[k] = v
  1017  		}
  1018  		return nil
  1019  	}
  1020  }
  1021  
  1022  // WithUser sets the container's user
  1023  func WithUser(c *container.Container) coci.SpecOpts {
  1024  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1025  		if s.Process == nil {
  1026  			s.Process = &specs.Process{}
  1027  		}
  1028  		var err error
  1029  		s.Process.User, err = getUser(c, c.Config.User)
  1030  		return err
  1031  	}
  1032  }
  1033  
// createSpec builds the OCI runtime spec for the given container by applying
// an ordered list of SpecOpts on top of oci.DefaultSpec(). The order of the
// opts matters: later opts can observe and override what earlier ones set.
// Returns the finished spec, or the first error produced by any opt.
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	// Unconditional base options, applied in this order for every container.
	opts = append(opts,
		withCommonOptions(daemon, &daemonCfg.Config, c),
		withCgroups(daemon, &daemonCfg.Config, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		withRlimits(daemon, &daemonCfg.Config, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		withMounts(daemon, daemonCfg, c, mounts),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
		coci.WithAnnotations(c.HostConfig.Annotations),
		WithUser(c),
	)

	// Conditional options driven by container / host configuration.
	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}
	if c.Config.Tty {
		opts = append(opts, WithConsoleSize(c))
	}
	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	// Rootless daemon and "rootful-in-userns" need extra spec adjustments;
	// the two cases are mutually exclusive.
	if daemonCfg.Rootless {
		opts = append(opts, withRootless(daemon, &daemonCfg.Config))
	} else if userns.RunningInUserNS() {
		opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
	}

	// Snapshotter info is only meaningful when containerd snapshotters are
	// used for image storage (as opposed to graphdrivers).
	var snapshotter, snapshotKey string
	if daemon.UsesSnapshotter() {
		snapshotter = daemon.imageService.StorageDriver()
		snapshotKey = c.ID
	}

	// ApplyOpts mutates s in place; on error the partially-built spec is
	// still returned but err is non-nil.
	return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
		ID:          c.ID,
		Snapshotter: snapshotter,
		SnapshotKey: snapshotKey,
	}, &s, opts...)
}
  1088  
  1089  func clearReadOnly(m *specs.Mount) {
  1090  	var opt []string
  1091  	for _, o := range m.Options {
  1092  		if o != "ro" {
  1093  			opt = append(opt, o)
  1094  		}
  1095  	}
  1096  	m.Options = opt
  1097  }
  1098  
  1099  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1100  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
  1101  	ulimits := c.Ulimits
  1102  	// Merge ulimits with daemon defaults
  1103  	ulIdx := make(map[string]struct{})
  1104  	for _, ul := range ulimits {
  1105  		ulIdx[ul.Name] = struct{}{}
  1106  	}
  1107  	for name, ul := range daemonCfg.Ulimits {
  1108  		if _, exists := ulIdx[name]; !exists {
  1109  			ulimits = append(ulimits, ul)
  1110  		}
  1111  	}
  1112  	c.Ulimits = ulimits
  1113  }