github.com/rish1988/moby@v25.0.2+incompatible/daemon/oci_linux.go (about) 1 package daemon // import "github.com/docker/docker/daemon" 2 3 import ( 4 "context" 5 "fmt" 6 "os" 7 "path/filepath" 8 "sort" 9 "strconv" 10 "strings" 11 12 cdcgroups "github.com/containerd/cgroups/v3" 13 "github.com/containerd/containerd/containers" 14 coci "github.com/containerd/containerd/oci" 15 "github.com/containerd/containerd/pkg/apparmor" 16 "github.com/containerd/containerd/pkg/userns" 17 "github.com/containerd/log" 18 containertypes "github.com/docker/docker/api/types/container" 19 "github.com/docker/docker/container" 20 dconfig "github.com/docker/docker/daemon/config" 21 "github.com/docker/docker/errdefs" 22 "github.com/docker/docker/oci" 23 "github.com/docker/docker/oci/caps" 24 "github.com/docker/docker/pkg/idtools" 25 "github.com/docker/docker/pkg/rootless/specconv" 26 "github.com/docker/docker/pkg/stringid" 27 volumemounts "github.com/docker/docker/volume/mounts" 28 "github.com/moby/sys/mount" 29 "github.com/moby/sys/mountinfo" 30 "github.com/moby/sys/user" 31 "github.com/opencontainers/runc/libcontainer/cgroups" 32 specs "github.com/opencontainers/runtime-spec/specs-go" 33 "github.com/pkg/errors" 34 "golang.org/x/sys/unix" 35 ) 36 37 const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary 38 39 // withRlimits sets the container's rlimits along with merging the daemon's rlimits 40 func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts { 41 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 42 var rlimits []specs.POSIXRlimit 43 44 // We want to leave the original HostConfig alone so make a copy here 45 hostConfig := *c.HostConfig 46 // Merge with the daemon defaults 47 daemon.mergeUlimits(&hostConfig, daemonCfg) 48 for _, ul := range hostConfig.Ulimits { 49 rlimits = append(rlimits, specs.POSIXRlimit{ 50 Type: "RLIMIT_" + strings.ToUpper(ul.Name), 51 Soft: uint64(ul.Soft), 52 Hard: uint64(ul.Hard), 53 }) 54 } 55 56 if s.Process == nil { 57 s.Process = &specs.Process{} 58 } 59 s.Process.Rlimits = rlimits 60 return nil 61 } 62 } 63 64 // withLibnetwork sets the libnetwork hook 65 func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts { 66 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 67 if c.Config.NetworkDisabled { 68 return nil 69 } 70 for _, ns := range s.Linux.Namespaces { 71 if ns.Type == specs.NetworkNamespace && ns.Path == "" { 72 if s.Hooks == nil { 73 s.Hooks = &specs.Hooks{} 74 } 75 shortNetCtlrID := stringid.TruncateID(daemon.netController.ID()) 76 s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ 77 Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"), 78 Args: []string{"libnetwork-setkey", "-exec-root=" + daemonCfg.GetExecRoot(), c.ID, shortNetCtlrID}, 79 }) 80 } 81 } 82 return nil 83 } 84 } 85 86 // withRootless sets the spec to the rootless configuration 87 func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts { 88 return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 89 var v2Controllers []string 90 if cgroupDriver(daemonCfg) == cgroupSystemdDriver { 91 if cdcgroups.Mode() != cdcgroups.Unified { 92 return errors.New("rootless systemd driver doesn't support cgroup v1") 93 } 94 rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID") 95 if rootlesskitParentEUID == "" { 96 return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)") 97 } 98 euid, err := strconv.Atoi(rootlesskitParentEUID) 99 if err != nil { 100 return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value") 101 } 102 controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid) 103 controllersFile, err := os.ReadFile(controllersPath) 104 if err != nil { 105 return err 106 } 107 v2Controllers = strings.Fields(string(controllersFile)) 108 } 109 return specconv.ToRootless(s, v2Controllers) 110 } 111 } 112 113 // withRootfulInRootless is used for "rootful-in-rootless" dind; 114 // the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc. 115 func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts { 116 return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 117 specconv.ToRootfulInRootless(s) 118 return nil 119 } 120 } 121 122 // WithOOMScore sets the oom score 123 func WithOOMScore(score *int) coci.SpecOpts { 124 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 125 if s.Process == nil { 126 s.Process = &specs.Process{} 127 } 128 s.Process.OOMScoreAdj = score 129 return nil 130 } 131 } 132 133 // WithSelinux sets the selinux labels 134 func WithSelinux(c *container.Container) coci.SpecOpts { 135 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 136 if s.Process == nil { 137 s.Process = &specs.Process{} 138 } 139 if s.Linux == nil { 140 s.Linux = &specs.Linux{} 141 } 142 s.Process.SelinuxLabel = c.GetProcessLabel() 143 s.Linux.MountLabel = c.MountLabel 144 return nil 145 } 146 } 147 148 // WithApparmor sets the apparmor profile 149 func WithApparmor(c *container.Container) coci.SpecOpts { 150 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 151 if apparmor.HostSupports() { 152 var appArmorProfile string 153 if c.AppArmorProfile != "" { 154 appArmorProfile = c.AppArmorProfile 155 } else if c.HostConfig.Privileged { 156 appArmorProfile = unconfinedAppArmorProfile 157 } else { 158 appArmorProfile = defaultAppArmorProfile 159 } 160 161 if appArmorProfile == defaultAppArmorProfile { 162 // Unattended upgrades and other fun services can unload AppArmor 163 // profiles inadvertently. Since we cannot store our profile in 164 // /etc/apparmor.d, nor can we practically add other ways of 165 // telling the system to keep our profile loaded, in order to make 166 // sure that we keep the default profile enabled we dynamically 167 // reload it if necessary. 168 if err := ensureDefaultAppArmorProfile(); err != nil { 169 return err 170 } 171 } 172 if s.Process == nil { 173 s.Process = &specs.Process{} 174 } 175 s.Process.ApparmorProfile = appArmorProfile 176 } 177 return nil 178 } 179 } 180 181 // WithCapabilities sets the container's capabilties 182 func WithCapabilities(c *container.Container) coci.SpecOpts { 183 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 184 capabilities, err := caps.TweakCapabilities( 185 caps.DefaultCapabilities(), 186 c.HostConfig.CapAdd, 187 c.HostConfig.CapDrop, 188 c.HostConfig.Privileged, 189 ) 190 if err != nil { 191 return err 192 } 193 return oci.SetCapabilities(s, capabilities) 194 } 195 } 196 197 func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) { 198 p, err := getPath() 199 if err != nil { 200 return "", err 201 } 202 return c.GetResourcePath(p) 203 } 204 205 func getUser(c *container.Container, username string) (specs.User, error) { 206 var usr specs.User 207 passwdPath, err := resourcePath(c, user.GetPasswdPath) 208 if err != nil { 209 return usr, err 210 } 211 groupPath, err := resourcePath(c, user.GetGroupPath) 212 if err != nil { 213 return usr, err 214 } 215 execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath) 216 if err != nil { 217 return usr, err 218 } 219 usr.UID = uint32(execUser.Uid) 220 usr.GID = uint32(execUser.Gid) 221 usr.AdditionalGids = []uint32{usr.GID} 222 223 var addGroups []int 224 if len(c.HostConfig.GroupAdd) > 0 { 225 addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath) 226 if err != nil { 227 return usr, err 228 } 229 } 230 for _, g := range append(execUser.Sgids, addGroups...) { 231 usr.AdditionalGids = append(usr.AdditionalGids, uint32(g)) 232 } 233 return usr, nil 234 } 235 236 func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { 237 if s.Linux == nil { 238 s.Linux = &specs.Linux{} 239 } 240 241 for i, n := range s.Linux.Namespaces { 242 if n.Type == ns.Type { 243 s.Linux.Namespaces[i] = ns 244 return 245 } 246 } 247 s.Linux.Namespaces = append(s.Linux.Namespaces, ns) 248 } 249 250 // WithNamespaces sets the container's namespaces 251 func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts { 252 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 253 userNS := false 254 // user 255 if c.HostConfig.UsernsMode.IsPrivate() { 256 if uidMap := daemon.idMapping.UIDMaps; uidMap != nil { 257 userNS = true 258 setNamespace(s, specs.LinuxNamespace{ 259 Type: specs.UserNamespace, 260 }) 261 s.Linux.UIDMappings = specMapping(uidMap) 262 s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps) 263 } 264 } 265 // network 266 if !c.Config.NetworkDisabled { 267 networkMode := c.HostConfig.NetworkMode 268 switch { 269 case networkMode.IsContainer(): 270 nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer()) 271 if err != nil { 272 return err 273 } 274 setNamespace(s, specs.LinuxNamespace{ 275 Type: specs.NetworkNamespace, 276 Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()), 277 }) 278 if userNS { 279 // to share a net namespace, the containers must also share a user namespace. 280 // 281 // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210 282 setNamespace(s, specs.LinuxNamespace{ 283 Type: specs.UserNamespace, 284 Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()), 285 }) 286 } 287 case networkMode.IsHost(): 288 oci.RemoveNamespace(s, specs.NetworkNamespace) 289 default: 290 setNamespace(s, specs.LinuxNamespace{ 291 Type: specs.NetworkNamespace, 292 }) 293 } 294 } 295 296 // ipc 297 ipcMode := c.HostConfig.IpcMode 298 if !ipcMode.Valid() { 299 return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode)) 300 } 301 switch { 302 case ipcMode.IsContainer(): 303 ic, err := daemon.getIPCContainer(ipcMode.Container()) 304 if err != nil { 305 return errors.Wrap(err, "failed to join IPC namespace") 306 } 307 setNamespace(s, specs.LinuxNamespace{ 308 Type: specs.IPCNamespace, 309 Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()), 310 }) 311 if userNS { 312 // to share a IPC namespace, the containers must also share a user namespace. 313 // 314 // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210 315 setNamespace(s, specs.LinuxNamespace{ 316 Type: specs.UserNamespace, 317 Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()), 318 }) 319 } 320 case ipcMode.IsHost(): 321 oci.RemoveNamespace(s, specs.IPCNamespace) 322 case ipcMode.IsEmpty(): 323 // A container was created by an older version of the daemon. 324 // The default behavior used to be what is now called "shareable". 325 fallthrough 326 case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone(): 327 setNamespace(s, specs.LinuxNamespace{ 328 Type: specs.IPCNamespace, 329 }) 330 } 331 332 // pid 333 pidMode := c.HostConfig.PidMode 334 if !pidMode.Valid() { 335 return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode)) 336 } 337 switch { 338 case pidMode.IsContainer(): 339 pc, err := daemon.getPIDContainer(pidMode.Container()) 340 if err != nil { 341 return errors.Wrap(err, "failed to join PID namespace") 342 } 343 setNamespace(s, specs.LinuxNamespace{ 344 Type: specs.PIDNamespace, 345 Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()), 346 }) 347 if userNS { 348 // to share a PID namespace, the containers must also share a user namespace. 349 // 350 // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210 351 setNamespace(s, specs.LinuxNamespace{ 352 Type: specs.UserNamespace, 353 Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()), 354 }) 355 } 356 case pidMode.IsHost(): 357 oci.RemoveNamespace(s, specs.PIDNamespace) 358 default: 359 setNamespace(s, specs.LinuxNamespace{ 360 Type: specs.PIDNamespace, 361 }) 362 } 363 364 // uts 365 if !c.HostConfig.UTSMode.Valid() { 366 return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode)) 367 } 368 if c.HostConfig.UTSMode.IsHost() { 369 oci.RemoveNamespace(s, specs.UTSNamespace) 370 s.Hostname = "" 371 } 372 373 // cgroup 374 if !c.HostConfig.CgroupnsMode.Valid() { 375 return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode)) 376 } 377 if c.HostConfig.CgroupnsMode.IsPrivate() { 378 setNamespace(s, specs.LinuxNamespace{ 379 Type: specs.CgroupNamespace, 380 }) 381 } 382 383 return nil 384 } 385 } 386 387 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping { 388 var ids []specs.LinuxIDMapping 389 for _, item := range s { 390 ids = append(ids, specs.LinuxIDMapping{ 391 HostID: uint32(item.HostID), 392 ContainerID: uint32(item.ContainerID), 393 Size: uint32(item.Size), 394 }) 395 } 396 return ids 397 } 398 399 // Get the source mount point of directory passed in as argument. Also return 400 // optional fields. 401 func getSourceMount(source string) (string, string, error) { 402 // Ensure any symlinks are resolved. 403 sourcePath, err := filepath.EvalSymlinks(source) 404 if err != nil { 405 return "", "", err 406 } 407 408 mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath)) 409 if err != nil { 410 return "", "", err 411 } 412 if len(mi) < 1 { 413 return "", "", fmt.Errorf("Can't find mount point of %s", source) 414 } 415 416 // find the longest mount point 417 var idx, maxlen int 418 for i := range mi { 419 if len(mi[i].Mountpoint) > maxlen { 420 maxlen = len(mi[i].Mountpoint) 421 idx = i 422 } 423 } 424 return mi[idx].Mountpoint, mi[idx].Optional, nil 425 } 426 427 const ( 428 sharedPropagationOption = "shared:" 429 slavePropagationOption = "master:" 430 ) 431 432 // hasMountInfoOption checks if any of the passed any of the given option values 433 // are set in the passed in option string. 434 func hasMountInfoOption(opts string, vals ...string) bool { 435 for _, opt := range strings.Split(opts, " ") { 436 for _, val := range vals { 437 if strings.HasPrefix(opt, val) { 438 return true 439 } 440 } 441 } 442 return false 443 } 444 445 // Ensure mount point on which path is mounted, is shared. 446 func ensureShared(path string) error { 447 sourceMount, optionalOpts, err := getSourceMount(path) 448 if err != nil { 449 return err 450 } 451 // Make sure source mount point is shared. 452 if !hasMountInfoOption(optionalOpts, sharedPropagationOption) { 453 return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount) 454 } 455 return nil 456 } 457 458 // Ensure mount point on which path is mounted, is either shared or slave. 459 func ensureSharedOrSlave(path string) error { 460 sourceMount, optionalOpts, err := getSourceMount(path) 461 if err != nil { 462 return err 463 } 464 465 if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) { 466 return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount) 467 } 468 return nil 469 } 470 471 // Get the set of mount flags that are set on the mount that contains the given 472 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that 473 // bind-mounting "with options" will not fail with user namespaces, due to 474 // kernel restrictions that require user namespace mounts to preserve 475 // CL_UNPRIVILEGED locked flags. 476 func getUnprivilegedMountFlags(path string) ([]string, error) { 477 var statfs unix.Statfs_t 478 if err := unix.Statfs(path, &statfs); err != nil { 479 return nil, err 480 } 481 482 // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. 483 unprivilegedFlags := map[uint64]string{ 484 unix.MS_RDONLY: "ro", 485 unix.MS_NODEV: "nodev", 486 unix.MS_NOEXEC: "noexec", 487 unix.MS_NOSUID: "nosuid", 488 unix.MS_NOATIME: "noatime", 489 unix.MS_RELATIME: "relatime", 490 unix.MS_NODIRATIME: "nodiratime", 491 } 492 493 var flags []string 494 for mask, flag := range unprivilegedFlags { 495 if uint64(statfs.Flags)&mask == mask { 496 flags = append(flags, flag) 497 } 498 } 499 500 return flags, nil 501 } 502 503 var ( 504 mountPropagationMap = map[string]int{ 505 "private": mount.PRIVATE, 506 "rprivate": mount.RPRIVATE, 507 "shared": mount.SHARED, 508 "rshared": mount.RSHARED, 509 "slave": mount.SLAVE, 510 "rslave": mount.RSLAVE, 511 } 512 513 mountPropagationReverseMap = map[int]string{ 514 mount.PRIVATE: "private", 515 mount.RPRIVATE: "rprivate", 516 mount.SHARED: "shared", 517 mount.RSHARED: "rshared", 518 mount.SLAVE: "slave", 519 mount.RSLAVE: "rslave", 520 } 521 ) 522 523 // inSlice tests whether a string is contained in a slice of strings or not. 524 // Comparison is case sensitive 525 func inSlice(slice []string, s string) bool { 526 for _, ss := range slice { 527 if s == ss { 528 return true 529 } 530 } 531 return false 532 } 533 534 // withMounts sets the container's mounts 535 func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts { 536 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) { 537 sort.Sort(mounts(ms)) 538 539 mounts := ms 540 541 userMounts := make(map[string]struct{}) 542 for _, m := range mounts { 543 userMounts[m.Destination] = struct{}{} 544 } 545 546 // Copy all mounts from spec to defaultMounts, except for 547 // - mounts overridden by a user supplied mount; 548 // - all mounts under /dev if a user supplied /dev is present; 549 // - /dev/shm, in case IpcMode is none. 550 // While at it, also 551 // - set size for /dev/shm from shmsize. 552 defaultMounts := s.Mounts[:0] 553 _, mountDev := userMounts["/dev"] 554 for _, m := range s.Mounts { 555 if _, ok := userMounts[m.Destination]; ok { 556 // filter out mount overridden by a user supplied mount 557 continue 558 } 559 if mountDev && strings.HasPrefix(m.Destination, "/dev/") { 560 // filter out everything under /dev if /dev is user-mounted 561 continue 562 } 563 564 if m.Destination == "/dev/shm" { 565 if c.HostConfig.IpcMode.IsNone() { 566 // filter out /dev/shm for "none" IpcMode 567 continue 568 } 569 // set size for /dev/shm mount from spec 570 sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10) 571 m.Options = append(m.Options, sizeOpt) 572 } 573 574 defaultMounts = append(defaultMounts, m) 575 } 576 577 s.Mounts = defaultMounts 578 for _, m := range mounts { 579 if m.Source == "tmpfs" { 580 data := m.Data 581 parser := volumemounts.NewParser() 582 options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())} 583 if data != "" { 584 options = append(options, strings.Split(data, ",")...) 585 } 586 587 merged, err := mount.MergeTmpfsOptions(options) 588 if err != nil { 589 return err 590 } 591 592 s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged}) 593 continue 594 } 595 596 mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"} 597 598 // Determine property of RootPropagation based on volume 599 // properties. If a volume is shared, then keep root propagation 600 // shared. This should work for slave and private volumes too. 601 // 602 // For slave volumes, it can be either [r]shared/[r]slave. 603 // 604 // For private volumes any root propagation value should work. 605 pFlag := mountPropagationMap[m.Propagation] 606 switch pFlag { 607 case mount.SHARED, mount.RSHARED: 608 if err := ensureShared(m.Source); err != nil { 609 return err 610 } 611 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 612 if rootpg != mount.SHARED && rootpg != mount.RSHARED { 613 if s.Linux == nil { 614 s.Linux = &specs.Linux{} 615 } 616 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] 617 } 618 case mount.SLAVE, mount.RSLAVE: 619 var fallback bool 620 if err := ensureSharedOrSlave(m.Source); err != nil { 621 // For backwards compatibility purposes, treat mounts from the daemon root 622 // as special since we automatically add rslave propagation to these mounts 623 // when the user did not set anything, so we should fallback to the old 624 // behavior which is to use private propagation which is normally the 625 // default. 626 if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) { 627 return err 628 } 629 630 cm, ok := c.MountPoints[m.Destination] 631 if !ok { 632 return err 633 } 634 if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" { 635 // This means the user explicitly set a propagation, do not fallback in that case. 636 return err 637 } 638 fallback = true 639 log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root") 640 } 641 if !fallback { 642 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 643 if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { 644 if s.Linux == nil { 645 s.Linux = &specs.Linux{} 646 } 647 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] 648 } 649 } 650 } 651 652 bindMode := "rbind" 653 if m.NonRecursive { 654 bindMode = "bind" 655 } 656 opts := []string{bindMode} 657 if !m.Writable { 658 rro := true 659 if m.ReadOnlyNonRecursive { 660 rro = false 661 if m.ReadOnlyForceRecursive { 662 return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive") 663 } 664 } 665 if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil { 666 rro = false 667 if m.ReadOnlyForceRecursive { 668 return rroErr 669 } 670 } 671 if rro { 672 opts = append(opts, "rro") 673 } else { 674 opts = append(opts, "ro") 675 } 676 } 677 if pFlag != 0 { 678 opts = append(opts, mountPropagationReverseMap[pFlag]) 679 } 680 681 // If we are using user namespaces, then we must make sure that we 682 // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source 683 // "mount" when we bind-mount. The reason for this is that at the point 684 // when runc sets up the root filesystem, it is already inside a user 685 // namespace, and thus cannot change any flags that are locked. 686 if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() { 687 unprivOpts, err := getUnprivilegedMountFlags(m.Source) 688 if err != nil { 689 return err 690 } 691 opts = append(opts, unprivOpts...) 692 } 693 694 mt.Options = opts 695 s.Mounts = append(s.Mounts, mt) 696 } 697 698 if s.Root.Readonly { 699 for i, m := range s.Mounts { 700 switch m.Destination { 701 case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev": 702 continue 703 } 704 if _, ok := userMounts[m.Destination]; !ok { 705 if !inSlice(m.Options, "ro") { 706 s.Mounts[i].Options = append(s.Mounts[i].Options, "ro") 707 } 708 } 709 } 710 } 711 712 if c.HostConfig.Privileged { 713 // clear readonly for /sys 714 for i := range s.Mounts { 715 if s.Mounts[i].Destination == "/sys" { 716 clearReadOnly(&s.Mounts[i]) 717 } 718 } 719 if s.Linux != nil { 720 s.Linux.ReadonlyPaths = nil 721 s.Linux.MaskedPaths = nil 722 } 723 } 724 725 // TODO: until a kernel/mount solution exists for handling remount in a user namespace, 726 // we must clear the readonly flag for the cgroups mount (@mrunalp concurs) 727 if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged { 728 for i, m := range s.Mounts { 729 if m.Type == "cgroup" { 730 clearReadOnly(&s.Mounts[i]) 731 } 732 } 733 } 734 735 return nil 736 } 737 } 738 739 // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually 740 // exist, so do not add the default ones if running on an old kernel. 741 func sysctlExists(s string) bool { 742 f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/")) 743 _, err := os.Stat(f) 744 return err == nil 745 } 746 747 // withCommonOptions sets common docker options 748 func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts { 749 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 750 if c.BaseFS == "" { 751 return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty") 752 } 753 linkedEnv, err := daemon.setupLinkedContainers(c) 754 if err != nil { 755 return err 756 } 757 s.Root = &specs.Root{ 758 Path: c.BaseFS, 759 Readonly: c.HostConfig.ReadonlyRootfs, 760 } 761 if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil { 762 return err 763 } 764 cwd := c.Config.WorkingDir 765 if len(cwd) == 0 { 766 cwd = "/" 767 } 768 if s.Process == nil { 769 s.Process = &specs.Process{} 770 } 771 s.Process.Args = append([]string{c.Path}, c.Args...) 772 773 // only add the custom init if it is specified and the container is running in its 774 // own private pid namespace. It does not make sense to add if it is running in the 775 // host namespace or another container's pid namespace where we already have an init 776 if c.HostConfig.PidMode.IsPrivate() { 777 if (c.HostConfig.Init != nil && *c.HostConfig.Init) || 778 (c.HostConfig.Init == nil && daemonCfg.Init) { 779 s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...) 780 path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path 781 if err != nil { 782 return err 783 } 784 s.Mounts = append(s.Mounts, specs.Mount{ 785 Destination: inContainerInitPath, 786 Type: "bind", 787 Source: path, 788 Options: []string{"bind", "ro"}, 789 }) 790 } 791 } 792 s.Process.Cwd = cwd 793 s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv) 794 s.Process.Terminal = c.Config.Tty 795 796 s.Hostname = c.Config.Hostname 797 setLinuxDomainname(c, s) 798 799 // Add default sysctls that are generally safe and useful; currently we 800 // grant the capabilities to allow these anyway. You can override if 801 // you want to restore the original behaviour. 802 // We do not set network sysctls if network namespace is host, or if we are 803 // joining an existing namespace, only if we create a new net namespace. 804 if c.HostConfig.NetworkMode.IsPrivate() { 805 // We cannot set up ping socket support in a user namespace 806 userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate() 807 if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") { 808 // allow unprivileged ICMP echo sockets without CAP_NET_RAW 809 s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647" 810 } 811 // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE 812 if sysctlExists("net.ipv4.ip_unprivileged_port_start") { 813 s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0" 814 } 815 } 816 817 return nil 818 } 819 } 820 821 // withCgroups sets the container's cgroups 822 func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts { 823 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 824 var cgroupsPath string 825 scopePrefix := "docker" 826 parent := "/docker" 827 useSystemd := UsingSystemd(daemonCfg) 828 if useSystemd { 829 parent = "system.slice" 830 if daemonCfg.Rootless { 831 parent = "user.slice" 832 } 833 } 834 835 if c.HostConfig.CgroupParent != "" { 836 parent = c.HostConfig.CgroupParent 837 } else if daemonCfg.CgroupParent != "" { 838 parent = daemonCfg.CgroupParent 839 } 840 841 if useSystemd { 842 cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID 843 log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath) 844 } else { 845 cgroupsPath = filepath.Join(parent, c.ID) 846 } 847 if s.Linux == nil { 848 s.Linux = &specs.Linux{} 849 } 850 s.Linux.CgroupsPath = cgroupsPath 851 852 // the rest is only needed for CPU RT controller 853 854 if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 { 855 return nil 856 } 857 858 p := cgroupsPath 859 if useSystemd { 860 initPath, err := cgroups.GetInitCgroup("cpu") 861 if err != nil { 862 return errors.Wrap(err, "unable to init CPU RT controller") 863 } 864 _, err = cgroups.GetOwnCgroup("cpu") 865 if err != nil { 866 return errors.Wrap(err, "unable to init CPU RT controller") 867 } 868 p = filepath.Join(initPath, s.Linux.CgroupsPath) 869 } 870 871 // Clean path to guard against things like ../../../BAD 872 parentPath := filepath.Dir(p) 873 if !filepath.IsAbs(parentPath) { 874 parentPath = filepath.Clean("/" + parentPath) 875 } 876 877 mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu") 878 if err != nil { 879 return errors.Wrap(err, "unable to init CPU RT controller") 880 } 881 // When docker is run inside docker, the root is based of the host cgroup. 882 // Should this be handled in runc/libcontainer/cgroups ? 883 if strings.HasPrefix(root, "/docker/") { 884 root = "/" 885 } 886 mnt = filepath.Join(mnt, root) 887 888 if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil { 889 return errors.Wrap(err, "unable to init CPU RT controller") 890 } 891 return nil 892 } 893 } 894 895 // WithDevices sets the container's devices 896 func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts { 897 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 898 // Build lists of devices allowed and created within the container. 899 var devs []specs.LinuxDevice 900 devPermissions := s.Linux.Resources.Devices 901 902 if c.HostConfig.Privileged { 903 hostDevices, err := coci.HostDevices() 904 if err != nil { 905 return err 906 } 907 devs = append(devs, hostDevices...) 908 909 // adding device mappings in privileged containers 910 for _, deviceMapping := range c.HostConfig.Devices { 911 // issue a warning that custom cgroup permissions are ignored in privileged mode 912 if deviceMapping.CgroupPermissions != "rwm" { 913 log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost) 914 } 915 // issue a warning that the device path already exists via /dev mounting in privileged mode 916 if deviceMapping.PathOnHost == deviceMapping.PathInContainer { 917 log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer) 918 continue 919 } 920 d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm") 921 if err != nil { 922 return err 923 } 924 devs = append(devs, d...) 925 } 926 927 devPermissions = []specs.LinuxDeviceCgroup{ 928 { 929 Allow: true, 930 Access: "rwm", 931 }, 932 } 933 } else { 934 for _, deviceMapping := range c.HostConfig.Devices { 935 d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions) 936 if err != nil { 937 return err 938 } 939 devs = append(devs, d...) 940 devPermissions = append(devPermissions, dPermissions...) 941 } 942 943 var err error 944 devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules) 945 if err != nil { 946 return err 947 } 948 } 949 950 if s.Linux == nil { 951 s.Linux = &specs.Linux{} 952 } 953 if s.Linux.Resources == nil { 954 s.Linux.Resources = &specs.LinuxResources{} 955 } 956 s.Linux.Devices = append(s.Linux.Devices, devs...) 957 s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...) 958 959 for _, req := range c.HostConfig.DeviceRequests { 960 if err := daemon.handleDevice(req, s); err != nil { 961 return err 962 } 963 } 964 return nil 965 } 966 } 967 968 // WithResources applies the container resources 969 func WithResources(c *container.Container) coci.SpecOpts { 970 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 971 r := c.HostConfig.Resources 972 weightDevices, err := getBlkioWeightDevices(r) 973 if err != nil { 974 return err 975 } 976 readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps) 977 if err != nil { 978 return err 979 } 980 writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps) 981 if err != nil { 982 return err 983 } 984 readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps) 985 if err != nil { 986 return err 987 } 988 writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps) 989 if err != nil { 990 return err 991 } 992 993 memoryRes := getMemoryResources(r) 994 cpuRes, err := getCPUResources(r) 995 if err != nil { 996 return err 997 } 998 999 if s.Linux == nil { 1000 s.Linux = &specs.Linux{} 1001 } 1002 if s.Linux.Resources == nil { 1003 s.Linux.Resources = &specs.LinuxResources{} 1004 } 1005 s.Linux.Resources.Memory = memoryRes 1006 s.Linux.Resources.CPU = cpuRes 1007 s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{ 1008 WeightDevice: weightDevices, 1009 ThrottleReadBpsDevice: readBpsDevice, 1010 ThrottleWriteBpsDevice: writeBpsDevice, 1011 ThrottleReadIOPSDevice: readIOpsDevice, 1012 ThrottleWriteIOPSDevice: writeIOpsDevice, 1013 } 1014 if r.BlkioWeight != 0 { 1015 w := r.BlkioWeight 1016 s.Linux.Resources.BlockIO.Weight = &w 1017 } 1018 s.Linux.Resources.Pids = getPidsLimit(r) 1019 1020 return nil 1021 } 1022 } 1023 1024 // WithSysctls sets the container's sysctls 1025 func WithSysctls(c *container.Container) coci.SpecOpts { 1026 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 1027 if len(c.HostConfig.Sysctls) == 0 { 1028 return nil 1029 } 1030 if s.Linux == nil { 1031 s.Linux = &specs.Linux{} 1032 } 1033 if s.Linux.Sysctl == nil { 1034 s.Linux.Sysctl = make(map[string]string) 1035 } 1036 // We merge the sysctls injected above with the HostConfig (latter takes 1037 // precedence for backwards-compatibility reasons). 1038 for k, v := range c.HostConfig.Sysctls { 1039 s.Linux.Sysctl[k] = v 1040 } 1041 return nil 1042 } 1043 } 1044 1045 // WithUser sets the container's user 1046 func WithUser(c *container.Container) coci.SpecOpts { 1047 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 1048 if s.Process == nil { 1049 s.Process = &specs.Process{} 1050 } 1051 var err error 1052 s.Process.User, err = getUser(c, c.Config.User) 1053 return err 1054 } 1055 } 1056 1057 func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) { 1058 var ( 1059 opts []coci.SpecOpts 1060 s = oci.DefaultSpec() 1061 ) 1062 opts = append(opts, 1063 withCommonOptions(daemon, &daemonCfg.Config, c), 1064 withCgroups(daemon, &daemonCfg.Config, c), 1065 WithResources(c), 1066 WithSysctls(c), 1067 WithDevices(daemon, c), 1068 withRlimits(daemon, &daemonCfg.Config, c), 1069 WithNamespaces(daemon, c), 1070 WithCapabilities(c), 1071 WithSeccomp(daemon, c), 1072 withMounts(daemon, daemonCfg, c, mounts), 1073 withLibnetwork(daemon, &daemonCfg.Config, c), 1074 WithApparmor(c), 1075 WithSelinux(c), 1076 WithOOMScore(&c.HostConfig.OomScoreAdj), 1077 coci.WithAnnotations(c.HostConfig.Annotations), 1078 WithUser(c), 1079 ) 1080 1081 if c.NoNewPrivileges { 1082 opts = append(opts, coci.WithNoNewPrivileges) 1083 } 1084 if c.Config.Tty { 1085 opts = append(opts, WithConsoleSize(c)) 1086 } 1087 // Set the masked and readonly paths with regard to the host config options if they are set. 1088 if c.HostConfig.MaskedPaths != nil { 1089 opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths)) 1090 } 1091 if c.HostConfig.ReadonlyPaths != nil { 1092 opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths)) 1093 } 1094 if daemonCfg.Rootless { 1095 opts = append(opts, withRootless(daemon, &daemonCfg.Config)) 1096 } else if userns.RunningInUserNS() { 1097 opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config)) 1098 } 1099 1100 var snapshotter, snapshotKey string 1101 if daemon.UsesSnapshotter() { 1102 snapshotter = daemon.imageService.StorageDriver() 1103 snapshotKey = c.ID 1104 } 1105 1106 return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{ 1107 ID: c.ID, 1108 Snapshotter: snapshotter, 1109 SnapshotKey: snapshotKey, 1110 }, &s, opts...) 1111 } 1112 1113 func clearReadOnly(m *specs.Mount) { 1114 var opt []string 1115 for _, o := range m.Options { 1116 if o != "ro" { 1117 opt = append(opt, o) 1118 } 1119 } 1120 m.Options = opt 1121 } 1122 1123 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig 1124 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) { 1125 ulimits := c.Ulimits 1126 // Merge ulimits with daemon defaults 1127 ulIdx := make(map[string]struct{}) 1128 for _, ul := range ulimits { 1129 ulIdx[ul.Name] = struct{}{} 1130 } 1131 for name, ul := range daemonCfg.Ulimits { 1132 if _, exists := ulIdx[name]; !exists { 1133 ulimits = append(ulimits, ul) 1134 } 1135 } 1136 c.Ulimits = ulimits 1137 }