github.com/moby/docker@v26.1.3+incompatible/daemon/oci_linux.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	cdcgroups "github.com/containerd/cgroups/v3"
	"github.com/containerd/containerd/containers"
	coci "github.com/containerd/containerd/oci"
	"github.com/containerd/containerd/pkg/apparmor"
	"github.com/containerd/containerd/pkg/userns"
	"github.com/containerd/log"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/container"
	dconfig "github.com/docker/docker/daemon/config"
	"github.com/docker/docker/errdefs"
	"github.com/docker/docker/internal/rootless/mountopts"
	"github.com/docker/docker/oci"
	"github.com/docker/docker/oci/caps"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/rootless/specconv"
	"github.com/docker/docker/pkg/stringid"
	volumemounts "github.com/docker/docker/volume/mounts"
	"github.com/moby/sys/mount"
	"github.com/moby/sys/mountinfo"
	"github.com/moby/sys/user"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
)

const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary

// withRlimits sets the container's rlimits along with merging the daemon's rlimits
func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var rlimits []specs.POSIXRlimit

		// We want to leave the original HostConfig alone so make a copy here
		hostConfig := *c.HostConfig
		// Merge with the daemon defaults
		daemon.mergeUlimits(&hostConfig, daemonCfg)
		for _, ul := range hostConfig.Ulimits {
			rlimits = append(rlimits, specs.POSIXRlimit{
				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
				Soft: uint64(ul.Soft),
				Hard: uint64(ul.Hard),
			})
		}

		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.Rlimits = rlimits
		return nil
	}
}

// withLibnetwork sets the libnetwork hook
func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.Config.NetworkDisabled {
			return nil
		}
		for _, ns := range s.Linux.Namespaces {
			if ns.Type == specs.NetworkNamespace && ns.Path == "" {
				if s.Hooks == nil {
					s.Hooks = &specs.Hooks{}
				}
				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
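				// The prestart hook re-executes the daemon binary ("/proc/<pid>/exe")
				// in "libnetwork-setkey" mode, which hands the container's network
				// namespace back to libnetwork so the sandbox can be configured
				// before the container's process is started.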
				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ //nolint:staticcheck // FIXME(thaJeztah); replace prestart hook with a non-deprecated one.
					Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"),
					Args: []string{"libnetwork-setkey", "-exec-root=" + daemonCfg.GetExecRoot(), c.ID, shortNetCtlrID},
				})
			}
		}
		return nil
	}
}

// withRootless sets the spec to the rootless configuration
func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var v2Controllers []string
		if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
			if cdcgroups.Mode() != cdcgroups.Unified {
				return errors.New("rootless systemd driver doesn't support cgroup v1")
			}
			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
			if rootlesskitParentEUID == "" {
				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
			}
			euid, err := strconv.Atoi(rootlesskitParentEUID)
			if err != nil {
				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
			}
			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
			controllersFile, err := os.ReadFile(controllersPath)
			if err != nil {
				return err
			}
			v2Controllers = strings.Fields(string(controllersFile))
		}
		return specconv.ToRootless(s, v2Controllers)
	}
}

// withRootfulInRootless is used for "rootful-in-rootless" dind;
// the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		specconv.ToRootfulInRootless(s)
		return nil
	}
}

// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.OOMScoreAdj = score
		return nil
	}
}

// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}

// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if apparmor.HostSupports() {
			var appArmorProfile string
			if c.AppArmorProfile != "" {
				appArmorProfile = c.AppArmorProfile
			} else if c.HostConfig.Privileged {
				appArmorProfile = unconfinedAppArmorProfile
			} else {
				appArmorProfile = defaultAppArmorProfile
			}

			if appArmorProfile == defaultAppArmorProfile {
				// Unattended upgrades and other fun services can unload AppArmor
				// profiles inadvertently. Since we cannot store our profile in
				// /etc/apparmor.d, nor can we practically add other ways of
				// telling the system to keep our profile loaded, in order to make
				// sure that we keep the default profile enabled we dynamically
				// reload it if necessary.
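				// (ensureDefaultAppArmorProfile only (re)loads the profile when
				// it is not already loaded in the kernel, so this is cheap in
				// the common case.)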
				if err := ensureDefaultAppArmorProfile(); err != nil {
					return err
				}
			}
			if s.Process == nil {
				s.Process = &specs.Process{}
			}
			s.Process.ApparmorProfile = appArmorProfile
		}
		return nil
	}
}

// WithCapabilities sets the container's capabilities
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}

func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}

func getUser(c *container.Container, username string) (specs.User, error) {
	var usr specs.User
	passwdPath, err := resourcePath(c, user.GetPasswdPath)
	if err != nil {
		return usr, err
	}
	groupPath, err := resourcePath(c, user.GetGroupPath)
	if err != nil {
		return usr, err
	}
	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
	if err != nil {
		return usr, err
	}
	usr.UID = uint32(execUser.Uid)
	usr.GID = uint32(execUser.Gid)
	usr.AdditionalGids = []uint32{usr.GID}

	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
		if err != nil {
			return usr, err
		}
	}
	for _, g := range append(execUser.Sgids, addGroups...) {
		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
	}
	return usr, nil
}

func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
	if s.Linux == nil {
		s.Linux = &specs.Linux{}
	}

	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
				userNS = true
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
				})
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			networkMode := c.HostConfig.NetworkMode
			switch {
			case networkMode.IsContainer():
				nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
				})
				if userNS {
					// to share a net namespace, the containers must also share a user namespace.
					//
					// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
					setNamespace(s, specs.LinuxNamespace{
						Type: specs.UserNamespace,
						Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
					})
				}
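			// Host networking: drop the network namespace so the container
			// shares the host's network stack.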
			case networkMode.IsHost():
				oci.RemoveNamespace(s, specs.NetworkNamespace)
			default:
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
				})
			}
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ic, err := daemon.getIPCContainer(ipcMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join IPC namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
			})
			if userNS {
				// to share an IPC namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
				})
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.IPCNamespace)
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
			})
		}

		// pid
		pidMode := c.HostConfig.PidMode
		if !pidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
		}
		switch {
		case pidMode.IsContainer():
			pc, err := daemon.getPIDContainer(pidMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join PID namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			})
			if userNS {
				// to share a PID namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				})
			}
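		// Host PID mode: drop the PID namespace so the container's processes
		// run in the host's PID namespace.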
		case pidMode.IsHost():
			oci.RemoveNamespace(s, specs.PIDNamespace)
		default:
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
			})
		}

		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, specs.UTSNamespace)
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if c.HostConfig.CgroupnsMode.IsPrivate() {
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.CgroupNamespace,
			})
		}

		return nil
	}
}

func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
	var ids []specs.LinuxIDMapping
	for _, item := range s {
		ids = append(ids, specs.LinuxIDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}

// getSourceMount returns the source mount point of the directory passed in as
// argument, along with its optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
	if err != nil {
		return "", "", err
	}
	if len(mi) < 1 {
		return "", "", fmt.Errorf("Can't find mount point of %s", source)
	}

	// find the longest mount point
	var idx, maxlen int
	for i := range mi {
		if len(mi[i].Mountpoint) > maxlen {
			maxlen = len(mi[i].Mountpoint)
			idx = i
		}
	}
	return mi[idx].Mountpoint, mi[idx].Optional, nil
}

const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)

// hasMountInfoOption checks if any of the given option values are set in the
// passed-in option string.
func hasMountInfoOption(opts string, vals ...string) bool {
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}

// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure source mount point is shared.
	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
	}
	return nil
}

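// A mount's propagation shows up in the optional-fields column of
// /proc/self/mountinfo ("shared:N" for a shared mount, "master:N" for a slave
// mount); ensureShared and ensureSharedOrSlave match on those prefixes via
// sharedPropagationOption and slavePropagationOption.
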
// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}

	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
	}
	return nil
}

var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)

// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
	for _, ss := range slice {
		if s == ss {
			return true
		}
	}
	return false
}

// withMounts sets the container's mounts
func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		sort.Sort(mounts(ms))

		mounts := ms

		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		// - mounts overridden by a user supplied mount;
		// - all mounts under /dev if a user supplied /dev is present;
		// - /dev/shm, in case IpcMode is none.
		// While at it, also
		// - set size for /dev/shm from shmsize.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
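			// An unset or unrecognized propagation mode maps to 0, in which
			// case no propagation option is appended to the bind mount below.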
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					if s.Linux == nil {
						s.Linux = &specs.Linux{}
					}
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						if s.Linux == nil {
							s.Linux = &specs.Linux{}
						}
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				rro := true
				if m.ReadOnlyNonRecursive {
					rro = false
					if m.ReadOnlyForceRecursive {
						return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
					}
				}
				if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
					rro = false
					if m.ReadOnlyForceRecursive {
						return rroErr
					}
				}
				if rro {
					opts = append(opts, "rro")
				} else {
					opts = append(opts, "ro")
				}
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := mountopts.UnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

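		// With a read-only rootfs, mark the remaining default mounts read-only
		// as well, skipping /proc and the /dev mounts, and never touching
		// mounts supplied by the user.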
		if s.Root.Readonly {
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			if s.Linux != nil {
				s.Linux.ReadonlyPaths = nil
				s.Linux.MaskedPaths = nil
			}
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}

// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
	_, err := os.Stat(f)
	return err == nil
}

// withCommonOptions sets common docker options
func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == "" {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS,
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace. It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemonCfg.Init) {
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
				if err != nil {
					return err
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
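		// Both sysctls are scoped to the network namespace, so they affect only
		// the container's new namespace, never the host; the "0 2147483647"
		// group range covers every GID in the container.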
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}

// withCgroups sets the container's cgroups
func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemonCfg)
		if useSystemd {
			parent = "system.slice"
			if daemonCfg.Rootless {
				parent = "user.slice"
			}
		}

		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemonCfg.CgroupParent != "" {
			parent = daemonCfg.CgroupParent
		}

		if useSystemd {
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		s.Linux.CgroupsPath = cgroupsPath

		// the rest is only needed for CPU RT controller

		if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
			return nil
		}

		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
		if err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		// When docker is run inside docker, the root is based on the host cgroup.
		// Should this be handled in runc/libcontainer/cgroups?
		if strings.HasPrefix(root, "/docker/") {
			root = "/"
		}
		mnt = filepath.Join(mnt, root)

		if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		return nil
	}
}

// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		devPermissions := s.Linux.Resources.Devices

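		// Privileged containers are given every host device plus a blanket
		// "allow rwm" device-cgroup rule; otherwise only the explicitly mapped
		// devices and cgroup rules are granted.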
		if c.HostConfig.Privileged {
			hostDevices, err := coci.HostDevices()
			if err != nil {
				return err
			}
			devs = append(devs, hostDevices...)

			// adding device mappings in privileged containers
			for _, deviceMapping := range c.HostConfig.Devices {
				// issue a warning that custom cgroup permissions are ignored in privileged mode
				if deviceMapping.CgroupPermissions != "rwm" {
					log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
				}
				// issue a warning that the device path already exists via /dev mounting in privileged mode
				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
					log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
					continue
				}
				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
				if err != nil {
					return err
				}
				devs = append(devs, d...)
			}

			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}
		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)

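		// Device requests (for example GPU requests) are resolved by device
		// drivers registered with the daemon, which may amend the spec further
		// with additional devices, mounts, hooks, or environment variables.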
		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}

// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}

		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}
		s.Linux.Resources.Memory = memoryRes
		s.Linux.Resources.CPU = cpuRes
		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
			WeightDevice:            weightDevices,
			ThrottleReadBpsDevice:   readBpsDevice,
			ThrottleWriteBpsDevice:  writeBpsDevice,
			ThrottleReadIOPSDevice:  readIOpsDevice,
			ThrottleWriteIOPSDevice: writeIOpsDevice,
		}
		if r.BlkioWeight != 0 {
			w := r.BlkioWeight
			s.Linux.Resources.BlockIO.Weight = &w
		}
		s.Linux.Resources.Pids = getPidsLimit(r)

		return nil
	}
}

// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if len(c.HostConfig.Sysctls) == 0 {
			return nil
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Sysctl == nil {
			s.Linux.Sysctl = make(map[string]string)
		}
		// We merge the sysctls injected above with the HostConfig (latter takes
		// precedence for backwards-compatibility reasons).
		for k, v := range c.HostConfig.Sysctls {
			s.Linux.Sysctl[k] = v
		}
		return nil
	}
}

// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		var err error
		s.Process.User, err = getUser(c, c.Config.User)
		return err
	}
}

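// createSpec assembles the OCI runtime spec for a container: it starts from
// oci.DefaultSpec() and applies the SpecOpts below in order, so later options
// may refine or override fields set by earlier ones.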
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		withCommonOptions(daemon, &daemonCfg.Config, c),
		withCgroups(daemon, &daemonCfg.Config, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		withRlimits(daemon, &daemonCfg.Config, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		withMounts(daemon, daemonCfg, c, mounts),
		withLibnetwork(daemon, &daemonCfg.Config, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
		coci.WithAnnotations(c.HostConfig.Annotations),
		WithUser(c),
	)

	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}
	if c.Config.Tty {
		opts = append(opts, WithConsoleSize(c))
	}
	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	if daemonCfg.Rootless {
		opts = append(opts, withRootless(daemon, &daemonCfg.Config))
	} else if userns.RunningInUserNS() {
		opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
	}

	var snapshotter, snapshotKey string
	if daemon.UsesSnapshotter() {
		snapshotter = daemon.imageService.StorageDriver()
		snapshotKey = c.ID
	}

	return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
		ID:          c.ID,
		Snapshotter: snapshotter,
		SnapshotKey: snapshotKey,
	}, &s, opts...)
}

func clearReadOnly(m *specs.Mount) {
	var opt []string
	for _, o := range m.Options {
		if o != "ro" {
			opt = append(opt, o)
		}
	}
	m.Options = opt
}

// mergeUlimits merges the Ulimits from HostConfig with daemon defaults, and updates HostConfig.
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
	ulimits := c.Ulimits
	// Merge ulimits with daemon defaults
	ulIdx := make(map[string]struct{})
	for _, ul := range ulimits {
		ulIdx[ul.Name] = struct{}{}
	}
	for name, ul := range daemonCfg.Ulimits {
		if _, exists := ulIdx[name]; !exists {
			ulimits = append(ulimits, ul)
		}
	}
	c.Ulimits = ulimits
}