package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	cdcgroups "github.com/containerd/cgroups/v3"
	"github.com/containerd/containerd/containers"
	coci "github.com/containerd/containerd/oci"
	"github.com/containerd/containerd/pkg/apparmor"
	"github.com/containerd/containerd/pkg/userns"
	"github.com/containerd/log"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/container"
	dconfig "github.com/docker/docker/daemon/config"
	"github.com/docker/docker/errdefs"
	"github.com/docker/docker/oci"
	"github.com/docker/docker/oci/caps"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/rootless/specconv"
	volumemounts "github.com/docker/docker/volume/mounts"
	"github.com/moby/sys/mount"
	"github.com/moby/sys/mountinfo"
	"github.com/moby/sys/user"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
	"golang.org/x/sys/unix"
)

// inContainerInitPath is the in-container path at which the docker-init
// binary is bind-mounted when an init process is requested (see
// withCommonOptions, which appends the corresponding mount).
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary

// withRlimits sets the container's rlimits along with merging the daemon's rlimits
func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var rlimits []specs.POSIXRlimit

		// We want to leave the original HostConfig alone so make a copy here
		hostConfig := *c.HostConfig
		// Merge with the daemon defaults
		daemon.mergeUlimits(&hostConfig, daemonCfg)
		for _, ul := range hostConfig.Ulimits {
			// Ulimit names are stored without the "RLIMIT_" prefix and in
			// lowercase (e.g. "nofile"); the OCI spec wants "RLIMIT_NOFILE".
			rlimits = append(rlimits, specs.POSIXRlimit{
				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
				Soft: uint64(ul.Soft),
				Hard: uint64(ul.Hard),
			})
		}

		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		// Replaces (not merges with) any rlimits already present in the spec.
		s.Process.Rlimits = rlimits
		return nil
	}
}

// withRootless sets the spec to the rootless configuration
func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var v2Controllers []string
		if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
			// The systemd driver requires cgroup v2; discover which v2
			// controllers are delegated to the parent user's slice so that
			// specconv can drop resource settings for unavailable ones.
			if cdcgroups.Mode() != cdcgroups.Unified {
				return errors.New("rootless systemd driver doesn't support cgroup v1")
			}
			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
			if rootlesskitParentEUID == "" {
				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
			}
			euid, err := strconv.Atoi(rootlesskitParentEUID)
			if err != nil {
				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
			}
			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
			controllersFile, err := os.ReadFile(controllersPath)
			if err != nil {
				return err
			}
			v2Controllers = strings.Fields(string(controllersFile))
		}
		return specconv.ToRootless(s, v2Controllers)
	}
}

// withRootfulInRootless is used for "rootful-in-rootless" dind;
// the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		specconv.ToRootfulInRootless(s)
		return nil
	}
}

// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.OOMScoreAdj = score
		return nil
	}
}

// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}

// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if apparmor.HostSupports() {
			var appArmorProfile string
			if c.AppArmorProfile != "" {
				// Explicit profile requested by the user wins.
				appArmorProfile = c.AppArmorProfile
			} else if c.HostConfig.Privileged {
				appArmorProfile = unconfinedAppArmorProfile
			} else {
				appArmorProfile = defaultAppArmorProfile
			}

			if appArmorProfile == defaultAppArmorProfile {
				// Unattended upgrades and other fun services can unload AppArmor
				// profiles inadvertently. Since we cannot store our profile in
				// /etc/apparmor.d, nor can we practically add other ways of
				// telling the system to keep our profile loaded, in order to make
				// sure that we keep the default profile enabled we dynamically
				// reload it if necessary.
				if err := ensureDefaultAppArmorProfile(); err != nil {
					return err
				}
			}
			if s.Process == nil {
				s.Process = &specs.Process{}
			}
			s.Process.ApparmorProfile = appArmorProfile
		}
		return nil
	}
}

// WithCapabilities sets the container's capabilities
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}

// resourcePath resolves the path returned by getPath inside the container's
// rootfs (via GetResourcePath), so the result cannot escape the container.
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}

// getUser resolves username against the container's own /etc/passwd and
// /etc/group files and returns the corresponding spec User, including the
// primary group and any supplementary groups from HostConfig.GroupAdd.
func getUser(c *container.Container, username string) (specs.User, error) {
	var usr specs.User
	passwdPath, err := resourcePath(c, user.GetPasswdPath)
	if err != nil {
		return usr, err
	}
	groupPath, err := resourcePath(c, user.GetGroupPath)
	if err != nil {
		return usr, err
	}
	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
	if err != nil {
		return usr, err
	}
	usr.UID = uint32(execUser.Uid)
	usr.GID = uint32(execUser.Gid)
	// The primary GID is also listed in AdditionalGids.
	usr.AdditionalGids = []uint32{usr.GID}

	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
		if err != nil {
			return usr, err
		}
	}
	for _, g := range append(execUser.Sgids, addGroups...) {
		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
	}
	return usr, nil
}

// setNamespace adds ns to the spec, replacing any existing namespace entry
// of the same type so there is at most one entry per namespace type.
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
	if s.Linux == nil {
		s.Linux = &specs.Linux{}
	}

	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
				userNS = true
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
				})
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			networkMode := c.HostConfig.NetworkMode
			switch {
			case networkMode.IsContainer():
				nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
				})
				if userNS {
					// to share a net namespace, the containers must also share a user namespace.
					//
					// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
					setNamespace(s, specs.LinuxNamespace{
						Type: specs.UserNamespace,
						Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
					})
				}
			case networkMode.IsHost():
				oci.RemoveNamespace(s, specs.NetworkNamespace)
			default:
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
				})
			}
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ic, err := daemon.getIPCContainer(ipcMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join IPC namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
			})
			if userNS {
				// to share a IPC namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
				})
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.IPCNamespace)
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
			})
		}

		// pid
		pidMode := c.HostConfig.PidMode
		if !pidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
		}
		switch {
		case pidMode.IsContainer():
			pc, err := daemon.getPIDContainer(pidMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join PID namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			})
			if userNS {
				// to share a PID namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				})
			}
		case pidMode.IsHost():
			oci.RemoveNamespace(s, specs.PIDNamespace)
		default:
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
			})
		}

		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, specs.UTSNamespace)
			// The hostname belongs to the host's UTS namespace; clear it.
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if c.HostConfig.CgroupnsMode.IsPrivate() {
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.CgroupNamespace,
			})
		}

		return nil
	}
}

// specMapping converts idtools ID mappings into the OCI spec's
// LinuxIDMapping representation.
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
	var ids []specs.LinuxIDMapping
	for _, item := range s {
		ids = append(ids, specs.LinuxIDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}

// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
	if err != nil {
		return "", "", err
	}
	if len(mi) < 1 {
		return "", "", fmt.Errorf("Can't find mount point of %s", source)
	}

	// find the longest mount point: the deepest parent mount of sourcePath.
	var idx, maxlen int
	for i := range mi {
		if len(mi[i].Mountpoint) > maxlen {
			maxlen = len(mi[i].Mountpoint)
			idx = i
		}
	}
	return mi[idx].Mountpoint, mi[idx].Optional, nil
}

// Prefixes of the mountinfo "optional fields" that indicate shared and slave
// mount propagation, respectively.
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)

// hasMountInfoOption checks whether any of the given option values is
// present in the passed-in mountinfo optional-fields string.
func hasMountInfoOption(opts string, vals ...string) bool {
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}

// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure source mount point is shared.
	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
	}
	return nil
}

// Ensure mount point on which path is mounted, is either shared or slave.
436 func ensureSharedOrSlave(path string) error { 437 sourceMount, optionalOpts, err := getSourceMount(path) 438 if err != nil { 439 return err 440 } 441 442 if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) { 443 return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount) 444 } 445 return nil 446 } 447 448 // Get the set of mount flags that are set on the mount that contains the given 449 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that 450 // bind-mounting "with options" will not fail with user namespaces, due to 451 // kernel restrictions that require user namespace mounts to preserve 452 // CL_UNPRIVILEGED locked flags. 453 func getUnprivilegedMountFlags(path string) ([]string, error) { 454 var statfs unix.Statfs_t 455 if err := unix.Statfs(path, &statfs); err != nil { 456 return nil, err 457 } 458 459 // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. 
460 unprivilegedFlags := map[uint64]string{ 461 unix.MS_RDONLY: "ro", 462 unix.MS_NODEV: "nodev", 463 unix.MS_NOEXEC: "noexec", 464 unix.MS_NOSUID: "nosuid", 465 unix.MS_NOATIME: "noatime", 466 unix.MS_RELATIME: "relatime", 467 unix.MS_NODIRATIME: "nodiratime", 468 } 469 470 var flags []string 471 for mask, flag := range unprivilegedFlags { 472 if uint64(statfs.Flags)&mask == mask { 473 flags = append(flags, flag) 474 } 475 } 476 477 return flags, nil 478 } 479 480 var ( 481 mountPropagationMap = map[string]int{ 482 "private": mount.PRIVATE, 483 "rprivate": mount.RPRIVATE, 484 "shared": mount.SHARED, 485 "rshared": mount.RSHARED, 486 "slave": mount.SLAVE, 487 "rslave": mount.RSLAVE, 488 } 489 490 mountPropagationReverseMap = map[int]string{ 491 mount.PRIVATE: "private", 492 mount.RPRIVATE: "rprivate", 493 mount.SHARED: "shared", 494 mount.RSHARED: "rshared", 495 mount.SLAVE: "slave", 496 mount.RSLAVE: "rslave", 497 } 498 ) 499 500 // inSlice tests whether a string is contained in a slice of strings or not. 501 // Comparison is case sensitive 502 func inSlice(slice []string, s string) bool { 503 for _, ss := range slice { 504 if s == ss { 505 return true 506 } 507 } 508 return false 509 } 510 511 // withMounts sets the container's mounts 512 func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts { 513 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) { 514 sort.Sort(mounts(ms)) 515 516 mounts := ms 517 518 userMounts := make(map[string]struct{}) 519 for _, m := range mounts { 520 userMounts[m.Destination] = struct{}{} 521 } 522 523 // Copy all mounts from spec to defaultMounts, except for 524 // - mounts overridden by a user supplied mount; 525 // - all mounts under /dev if a user supplied /dev is present; 526 // - /dev/shm, in case IpcMode is none. 527 // While at it, also 528 // - set size for /dev/shm from shmsize. 
529 defaultMounts := s.Mounts[:0] 530 _, mountDev := userMounts["/dev"] 531 for _, m := range s.Mounts { 532 if _, ok := userMounts[m.Destination]; ok { 533 // filter out mount overridden by a user supplied mount 534 continue 535 } 536 if mountDev && strings.HasPrefix(m.Destination, "/dev/") { 537 // filter out everything under /dev if /dev is user-mounted 538 continue 539 } 540 541 if m.Destination == "/dev/shm" { 542 if c.HostConfig.IpcMode.IsNone() { 543 // filter out /dev/shm for "none" IpcMode 544 continue 545 } 546 // set size for /dev/shm mount from spec 547 sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10) 548 m.Options = append(m.Options, sizeOpt) 549 } 550 551 defaultMounts = append(defaultMounts, m) 552 } 553 554 s.Mounts = defaultMounts 555 for _, m := range mounts { 556 if m.Source == "tmpfs" { 557 data := m.Data 558 parser := volumemounts.NewParser() 559 options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())} 560 if data != "" { 561 options = append(options, strings.Split(data, ",")...) 562 } 563 564 merged, err := mount.MergeTmpfsOptions(options) 565 if err != nil { 566 return err 567 } 568 569 s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged}) 570 continue 571 } 572 573 mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"} 574 575 // Determine property of RootPropagation based on volume 576 // properties. If a volume is shared, then keep root propagation 577 // shared. This should work for slave and private volumes too. 578 // 579 // For slave volumes, it can be either [r]shared/[r]slave. 580 // 581 // For private volumes any root propagation value should work. 
582 pFlag := mountPropagationMap[m.Propagation] 583 switch pFlag { 584 case mount.SHARED, mount.RSHARED: 585 if err := ensureShared(m.Source); err != nil { 586 return err 587 } 588 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 589 if rootpg != mount.SHARED && rootpg != mount.RSHARED { 590 if s.Linux == nil { 591 s.Linux = &specs.Linux{} 592 } 593 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] 594 } 595 case mount.SLAVE, mount.RSLAVE: 596 var fallback bool 597 if err := ensureSharedOrSlave(m.Source); err != nil { 598 // For backwards compatibility purposes, treat mounts from the daemon root 599 // as special since we automatically add rslave propagation to these mounts 600 // when the user did not set anything, so we should fallback to the old 601 // behavior which is to use private propagation which is normally the 602 // default. 603 if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) { 604 return err 605 } 606 607 cm, ok := c.MountPoints[m.Destination] 608 if !ok { 609 return err 610 } 611 if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" { 612 // This means the user explicitly set a propagation, do not fallback in that case. 
613 return err 614 } 615 fallback = true 616 log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root") 617 } 618 if !fallback { 619 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 620 if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { 621 if s.Linux == nil { 622 s.Linux = &specs.Linux{} 623 } 624 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] 625 } 626 } 627 } 628 629 bindMode := "rbind" 630 if m.NonRecursive { 631 bindMode = "bind" 632 } 633 opts := []string{bindMode} 634 if !m.Writable { 635 rro := true 636 if m.ReadOnlyNonRecursive { 637 rro = false 638 if m.ReadOnlyForceRecursive { 639 return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive") 640 } 641 } 642 if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil { 643 rro = false 644 if m.ReadOnlyForceRecursive { 645 return rroErr 646 } 647 } 648 if rro { 649 opts = append(opts, "rro") 650 } else { 651 opts = append(opts, "ro") 652 } 653 } 654 if pFlag != 0 { 655 opts = append(opts, mountPropagationReverseMap[pFlag]) 656 } 657 658 // If we are using user namespaces, then we must make sure that we 659 // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source 660 // "mount" when we bind-mount. The reason for this is that at the point 661 // when runc sets up the root filesystem, it is already inside a user 662 // namespace, and thus cannot change any flags that are locked. 663 if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() { 664 unprivOpts, err := getUnprivilegedMountFlags(m.Source) 665 if err != nil { 666 return err 667 } 668 opts = append(opts, unprivOpts...) 
669 } 670 671 mt.Options = opts 672 s.Mounts = append(s.Mounts, mt) 673 } 674 675 if s.Root.Readonly { 676 for i, m := range s.Mounts { 677 switch m.Destination { 678 case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev": 679 continue 680 } 681 if _, ok := userMounts[m.Destination]; !ok { 682 if !inSlice(m.Options, "ro") { 683 s.Mounts[i].Options = append(s.Mounts[i].Options, "ro") 684 } 685 } 686 } 687 } 688 689 if c.HostConfig.Privileged { 690 // clear readonly for /sys 691 for i := range s.Mounts { 692 if s.Mounts[i].Destination == "/sys" { 693 clearReadOnly(&s.Mounts[i]) 694 } 695 } 696 if s.Linux != nil { 697 s.Linux.ReadonlyPaths = nil 698 s.Linux.MaskedPaths = nil 699 } 700 } 701 702 // TODO: until a kernel/mount solution exists for handling remount in a user namespace, 703 // we must clear the readonly flag for the cgroups mount (@mrunalp concurs) 704 if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged { 705 for i, m := range s.Mounts { 706 if m.Type == "cgroup" { 707 clearReadOnly(&s.Mounts[i]) 708 } 709 } 710 } 711 712 return nil 713 } 714 } 715 716 // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually 717 // exist, so do not add the default ones if running on an old kernel. 
func sysctlExists(s string) bool {
	// Dotted sysctl name -> /proc/sys path, e.g. "net.ipv4.ip_forward"
	// becomes /proc/sys/net/ipv4/ip_forward.
	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
	_, err := os.Stat(f)
	return err == nil
}

// withCommonOptions sets common docker options
func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == "" {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS,
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace. It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemonCfg.Init) {
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
				if err != nil {
					return err
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		//
		// NOTE(review): the writes below assume s.Linux and s.Linux.Sysctl are
		// non-nil (presumably guaranteed by oci.DefaultSpec) — a nil map here
		// would panic; verify against createSpec's use of oci.DefaultSpec().
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}

// withCgroups sets the container's cgroups
func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemonCfg)
		if useSystemd {
			parent = "system.slice"
			if daemonCfg.Rootless {
				parent = "user.slice"
			}
		}

		// An explicit cgroup parent (container-level first, then daemon-level)
		// overrides the defaults chosen above.
		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemonCfg.CgroupParent != "" {
			parent = daemonCfg.CgroupParent
		}

		if useSystemd {
			// systemd expects "slice:prefix:name" rather than a filesystem path.
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		s.Linux.CgroupsPath = cgroupsPath

		// the rest is only needed for CPU RT controller

		if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
			return nil
		}

		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
		if err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		// When docker is run inside docker, the root is based of the host cgroup.
		// Should this be handled in runc/libcontainer/cgroups ?
		if strings.HasPrefix(root, "/docker/") {
			root = "/"
		}
		mnt = filepath.Join(mnt, root)

		if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		return nil
	}
}

// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		// NOTE(review): this dereferences s.Linux.Resources before the nil
		// checks further down, so those checks are only reachable when
		// s.Linux/Resources were already non-nil here — presumably guaranteed
		// by oci.DefaultSpec; confirm before reordering spec opts.
		devPermissions := s.Linux.Resources.Devices

		if c.HostConfig.Privileged {
			hostDevices, err := coci.HostDevices()
			if err != nil {
				return err
			}
			devs = append(devs, hostDevices...)

			// adding device mappings in privileged containers
			for _, deviceMapping := range c.HostConfig.Devices {
				// issue a warning that custom cgroup permissions are ignored in privileged mode
				if deviceMapping.CgroupPermissions != "rwm" {
					log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
				}
				// issue a warning that the device path already exists via /dev mounting in privileged mode
				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
					log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
					continue
				}
				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
				if err != nil {
					return err
				}
				devs = append(devs, d...)
			}

			// Privileged containers get blanket rwm access to all devices.
			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}
		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)

		// Delegate driver-backed device requests (e.g. GPUs) to the daemon.
		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}

// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}

		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}
		s.Linux.Resources.Memory = memoryRes
		s.Linux.Resources.CPU = cpuRes
		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
			WeightDevice:            weightDevices,
			ThrottleReadBpsDevice:   readBpsDevice,
			ThrottleWriteBpsDevice:  writeBpsDevice,
			ThrottleReadIOPSDevice:  readIOpsDevice,
			ThrottleWriteIOPSDevice: writeIOpsDevice,
		}
		if r.BlkioWeight != 0 {
			// Copy to a local so the spec does not point into HostConfig.
			w := r.BlkioWeight
			s.Linux.Resources.BlockIO.Weight = &w
		}
		s.Linux.Resources.Pids = getPidsLimit(r)

		return nil
	}
}

// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if len(c.HostConfig.Sysctls) == 0 {
			return nil
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Sysctl == nil {
			s.Linux.Sysctl = make(map[string]string)
		}
		// We merge the sysctls injected above with the HostConfig (latter takes
		// precedence for backwards-compatibility reasons).
		for k, v := range c.HostConfig.Sysctls {
			s.Linux.Sysctl[k] = v
		}
		return nil
	}
}

// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		var err error
		s.Process.User, err = getUser(c, c.Config.User)
		return err
	}
}

// createSpec assembles the OCI runtime spec for container c by applying the
// SpecOpts below, in order, to oci.DefaultSpec().
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		withCommonOptions(daemon, &daemonCfg.Config, c),
		withCgroups(daemon, &daemonCfg.Config, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		withRlimits(daemon, &daemonCfg.Config, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		withMounts(daemon, daemonCfg, c, mounts),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
		coci.WithAnnotations(c.HostConfig.Annotations),
		WithUser(c),
	)

	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}
	if c.Config.Tty {
		opts = append(opts, WithConsoleSize(c))
	}
	// Set the masked and readonly paths with regard to the host config options if they are set.
1064 if c.HostConfig.MaskedPaths != nil { 1065 opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths)) 1066 } 1067 if c.HostConfig.ReadonlyPaths != nil { 1068 opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths)) 1069 } 1070 if daemonCfg.Rootless { 1071 opts = append(opts, withRootless(daemon, &daemonCfg.Config)) 1072 } else if userns.RunningInUserNS() { 1073 opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config)) 1074 } 1075 1076 var snapshotter, snapshotKey string 1077 if daemon.UsesSnapshotter() { 1078 snapshotter = daemon.imageService.StorageDriver() 1079 snapshotKey = c.ID 1080 } 1081 1082 return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{ 1083 ID: c.ID, 1084 Snapshotter: snapshotter, 1085 SnapshotKey: snapshotKey, 1086 }, &s, opts...) 1087 } 1088 1089 func clearReadOnly(m *specs.Mount) { 1090 var opt []string 1091 for _, o := range m.Options { 1092 if o != "ro" { 1093 opt = append(opt, o) 1094 } 1095 } 1096 m.Options = opt 1097 } 1098 1099 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig 1100 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) { 1101 ulimits := c.Ulimits 1102 // Merge ulimits with daemon defaults 1103 ulIdx := make(map[string]struct{}) 1104 for _, ul := range ulimits { 1105 ulIdx[ul.Name] = struct{}{} 1106 } 1107 for name, ul := range daemonCfg.Ulimits { 1108 if _, exists := ulIdx[name]; !exists { 1109 ulimits = append(ulimits, ul) 1110 } 1111 } 1112 c.Ulimits = ulimits 1113 }