package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	cdcgroups "github.com/containerd/cgroups"
	"github.com/containerd/containerd/containers"
	coci "github.com/containerd/containerd/oci"
	"github.com/containerd/containerd/pkg/apparmor"
	"github.com/containerd/containerd/pkg/userns"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/container"
	dconfig "github.com/docker/docker/daemon/config"
	"github.com/docker/docker/errdefs"
	"github.com/docker/docker/oci"
	"github.com/docker/docker/oci/caps"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/rootless/specconv"
	"github.com/docker/docker/pkg/stringid"
	volumemounts "github.com/docker/docker/volume/mounts"
	"github.com/moby/sys/mount"
	"github.com/moby/sys/mountinfo"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/user"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

// inContainerInitPath is the in-container path at which the docker-init
// binary is bind-mounted when the container is started with --init
// (see WithCommonOptions, which appends the corresponding mount).
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary

// WithRlimits sets the container's rlimits along with merging the daemon's rlimits
func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var rlimits []specs.POSIXRlimit

		// We want to leave the original HostConfig alone so make a copy here
		hostConfig := *c.HostConfig
		// Merge with the daemon defaults
		daemon.mergeUlimits(&hostConfig)
		for _, ul := range hostConfig.Ulimits {
			rlimits = append(rlimits, specs.POSIXRlimit{
				// OCI rlimit types are the uppercased POSIX names, e.g. "RLIMIT_NOFILE".
				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
				Soft: uint64(ul.Soft),
				Hard: uint64(ul.Hard),
			})
		}

		s.Process.Rlimits = rlimits
		return nil
	}
}

// WithLibnetwork sets the libnetwork hook
func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Hooks == nil {
			s.Hooks = &specs.Hooks{}
		}
		// Only install the hook when the container gets a fresh network
		// namespace (type "network" with an empty path) and networking is
		// not disabled for this container.
		for _, ns := range s.Linux.Namespaces {
			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
				// The hook re-executes the dockerd binary itself
				// ("/proc/<pid>/exe") in libnetwork-setkey mode.
				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
					Path: target,
					Args: []string{
						"libnetwork-setkey",
						"-exec-root=" + daemon.configStore.GetExecRoot(),
						c.ID,
						shortNetCtlrID,
					},
				})
			}
		}
		return nil
	}
}

// WithRootless sets the spec to the rootless configuration
func WithRootless(daemon *Daemon) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var v2Controllers []string
		if daemon.getCgroupDriver() == cgroupSystemdDriver {
			// The systemd driver in rootless mode requires cgroup v2;
			// discover which controllers are delegated to the user slice.
			if cdcgroups.Mode() != cdcgroups.Unified {
				return errors.New("rootless systemd driver doesn't support cgroup v1")
			}
			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
			if rootlesskitParentEUID == "" {
				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
			}
			euid, err := strconv.Atoi(rootlesskitParentEUID)
			if err != nil {
				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
			}
			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
			controllersFile, err := os.ReadFile(controllersPath)
			if err != nil {
				return err
			}
			v2Controllers = strings.Fields(string(controllersFile))
		}
		return specconv.ToRootless(s, v2Controllers)
	}
}

// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.OOMScoreAdj = score
		return nil
	}
}

// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}

// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if apparmor.HostSupports() {
			var appArmorProfile string
			if c.AppArmorProfile != "" {
				// Explicit profile requested by the user wins.
				appArmorProfile = c.AppArmorProfile
			} else if c.HostConfig.Privileged {
				appArmorProfile = unconfinedAppArmorProfile
			} else {
				appArmorProfile = defaultAppArmorProfile
			}

			if appArmorProfile == defaultAppArmorProfile {
				// Unattended upgrades and other fun services can unload AppArmor
				// profiles inadvertently. Since we cannot store our profile in
				// /etc/apparmor.d, nor can we practically add other ways of
				// telling the system to keep our profile loaded, in order to make
				// sure that we keep the default profile enabled we dynamically
				// reload it if necessary.
				if err := ensureDefaultAppArmorProfile(); err != nil {
					return err
				}
			}
			s.Process.ApparmorProfile = appArmorProfile
		}
		return nil
	}
}

// WithCapabilities sets the container's capabilties
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}

// resourcePath resolves the path returned by getPath (e.g. an etc/passwd
// location) relative to the container's root filesystem.
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}

// getUser resolves username against the container's own /etc/passwd and
// /etc/group files and returns the OCI user (uid, gid and supplementary
// gids, including any HostConfig.GroupAdd entries).
func getUser(c *container.Container, username string) (specs.User, error) {
	var usr specs.User
	passwdPath, err := resourcePath(c, user.GetPasswdPath)
	if err != nil {
		return usr, err
	}
	groupPath, err := resourcePath(c, user.GetGroupPath)
	if err != nil {
		return usr, err
	}
	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
	if err != nil {
		return usr, err
	}
	usr.UID = uint32(execUser.Uid)
	usr.GID = uint32(execUser.Gid)
	// The primary group is always included in the supplementary groups.
	usr.AdditionalGids = []uint32{usr.GID}

	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
		if err != nil {
			return usr, err
		}
	}
	for _, g := range append(execUser.Sgids, addGroups...) {
		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
	}
	return usr, nil
}

// setNamespace replaces the namespace of the same type in the spec, or
// appends ns if no namespace of that type is present yet.
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDMaps
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				// Join the network namespace of another container.
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		}

		// pid
		if !c.HostConfig.PidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
		}
		if c.HostConfig.PidMode.IsContainer() {
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, "uts")
			// Hostname cannot be set when sharing the host's UTS namespace.
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			if c.HostConfig.CgroupnsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}

// specMapping converts the daemon's idtools ID maps into the OCI
// LinuxIDMapping representation used in the runtime spec.
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
	var ids []specs.LinuxIDMapping
	for _, item := range s {
		ids = append(ids, specs.LinuxIDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}

// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
	if err != nil {
		return "", "", err
	}
	if len(mi) < 1 {
		return "", "", fmt.Errorf("Can't find mount point of %s", source)
	}

	// find the longest mount point
	var idx, maxlen int
	for i := range mi {
		if len(mi[i].Mountpoint) > maxlen {
			maxlen = len(mi[i].Mountpoint)
			idx = i
		}
	}
	return mi[idx].Mountpoint, mi[idx].Optional, nil
}

// Prefixes of the "optional fields" in /proc/self/mountinfo that indicate
// mount propagation: "shared:N" for shared mounts, "master:N" for slaves.
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)

// hasMountInfoOption checks if any of the given option values
// are set in the passed in option string.
func hasMountInfoOption(opts string, vals ...string) bool {
	// opts is a space-separated list of mountinfo "optional fields";
	// each val is matched as a prefix (e.g. "shared:" matches "shared:42").
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}

// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure source mount point is shared.
	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
	}
	return nil
}

// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}

	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
	}
	return nil
}

// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}

	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	unprivilegedFlags := map[uint64]string{
		unix.MS_RDONLY:     "ro",
		unix.MS_NODEV:      "nodev",
		unix.MS_NOEXEC:     "noexec",
		unix.MS_NOSUID:     "nosuid",
		unix.MS_NOATIME:    "noatime",
		unix.MS_RELATIME:   "relatime",
		unix.MS_NODIRATIME: "nodiratime",
	}

	var flags []string
	for mask, flag := range unprivilegedFlags {
		if uint64(statfs.Flags)&mask == mask {
			flags = append(flags, flag)
		}
	}

	return flags, nil
}

// Bidirectional mapping between the user-facing propagation-mode strings
// and the moby/sys/mount flag constants.
var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)

// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
	for _, ss := range slice {
		if s == ss {
			return true
		}
	}
	return false
}

// WithMounts sets the container's mounts
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// Named return err: clean up the secret dir if anything below fails.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		// - mounts overridden by a user supplied mount;
		// - all mounts under /dev if a user supplied /dev is present;
		// - /dev/shm, in case IpcMode is none.
		// While at it, also
		// - set size for /dev/shm from shmsize.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				// tmpfs mounts get default safety options merged with
				// any user-supplied tmpfs options from m.Data.
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			// Propagate read-only rootfs to all non-user mounts, except
			// the special filesystems that must stay writable.
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}

// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
	// Dotted sysctl name maps to a /proc/sys path, e.g.
	// "net.ipv4.ip_forward" -> /proc/sys/net/ipv4/ip_forward.
	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
	_, err := os.Stat(f)
	return err == nil
}

// WithCommonOptions sets common docker options
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == nil {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS.Path(),
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace. It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				// Prepend docker-init so it becomes PID 1 and execs the
				// original entrypoint after "--".
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					path, err = exec.LookPath(dconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}

// WithCgroups sets the container's cgroups
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
			if daemon.configStore.Rootless {
				parent = "user.slice"
			}
		}

		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			// systemd expects "slice:prefix:name".
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath

		// the rest is only needed for CPU RT controller

		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
			return nil
		}

		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
		if err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		// When docker is run inside docker, the root is based of the host cgroup.
		// Should this be handled in runc/libcontainer/cgroups ?
		if strings.HasPrefix(root, "/docker/") {
			root = "/"
		}
		mnt = filepath.Join(mnt, root)

		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		return nil
	}
}

// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		devPermissions := s.Linux.Resources.Devices

		if c.HostConfig.Privileged {
			hostDevices, err := coci.HostDevices()
			if err != nil {
				return err
			}
			devs = append(devs, hostDevices...)

			// adding device mappings in privileged containers
			for _, deviceMapping := range c.HostConfig.Devices {
				// issue a warning that custom cgroup permissions are ignored in privileged mode
				if deviceMapping.CgroupPermissions != "rwm" {
					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
				}
				// issue a warning that the device path already exists via /dev mounting in privileged mode
				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
					continue
				}
				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
				if err != nil {
					return err
				}
				devs = append(devs, d...)
			}

			// Privileged containers get blanket "allow all" device cgroup access.
			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = devPermissions

		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}

// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}
		blkioWeight := r.BlkioWeight

		specResources := &specs.LinuxResources{
			Memory: memoryRes,
			CPU:    cpuRes,
			BlockIO: &specs.LinuxBlockIO{
				Weight:                  &blkioWeight,
				WeightDevice:            weightDevices,
				ThrottleReadBpsDevice:   readBpsDevice,
				ThrottleWriteBpsDevice:  writeBpsDevice,
				ThrottleReadIOPSDevice:  readIOpsDevice,
				ThrottleWriteIOPSDevice: writeIOpsDevice,
			},
			Pids: getPidsLimit(r),
		}

		// Preserve device cgroup rules set earlier (e.g. by WithDevices).
		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
			specResources.Devices = s.Linux.Resources.Devices
		}

		s.Linux.Resources = specResources
		return nil
	}
}

// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// We merge the sysctls injected above with the HostConfig (latter takes
		// precedence for backwards-compatibility reasons).
		for k, v := range c.HostConfig.Sysctls {
			s.Linux.Sysctl[k] = v
		}
		return nil
	}
}

// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var err error
		s.Process.User, err = getUser(c, c.Config.User)
		return err
	}
}

// createSpec builds the OCI runtime spec for container c by applying the
// spec options above, in order, to the daemon's default spec.
func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		WithCommonOptions(daemon, c),
		WithCgroups(daemon, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		WithUser(c),
		WithRlimits(daemon, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		WithMounts(daemon, c),
		WithLibnetwork(daemon, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
	)
	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}
	if c.Config.Tty {
		opts = append(opts, WithConsoleSize(c))
	}
	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	if daemon.configStore.Rootless {
		// Rootless conversion must run last so it sees the final spec.
		opts = append(opts, WithRootless(daemon))
	}
	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
		ID: c.ID,
	}, &s, opts...)
}

// clearReadOnly removes the "ro" option from a mount, leaving all other
// options intact.
func clearReadOnly(m *specs.Mount) {
	var opt []string
	for _, o := range m.Options {
		if o != "ro" {
			opt = append(opt, o)
		}
	}
	m.Options = opt
}

// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
	ulimits := c.Ulimits
	// Merge ulimits with daemon defaults
	ulIdx := make(map[string]struct{})
	for _, ul := range ulimits {
		ulIdx[ul.Name] = struct{}{}
	}
	// Daemon defaults apply only for limit names the container did not set.
	for name, ul := range daemon.configStore.Ulimits {
		if _, exists := ulIdx[name]; !exists {
			ulimits = append(ulimits, ul)
		}
	}
	c.Ulimits = ulimits
}