package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	cdcgroups "github.com/containerd/cgroups"
	"github.com/containerd/containerd/containers"
	coci "github.com/containerd/containerd/oci"
	"github.com/containerd/containerd/sys"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/container"
	daemonconfig "github.com/docker/docker/daemon/config"
	"github.com/docker/docker/oci"
	"github.com/docker/docker/oci/caps"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/stringid"
	"github.com/docker/docker/rootless/specconv"
	volumemounts "github.com/docker/docker/volume/mounts"
	"github.com/moby/sys/mount"
	"github.com/moby/sys/mountinfo"
	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/devices"
	"github.com/opencontainers/runc/libcontainer/user"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

// inContainerInitPath is the path inside the container at which the
// docker-init binary is bind-mounted when the container runs with --init.
const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary

// WithRlimits sets the container's rlimits along with merging the daemon's rlimits
func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var rlimits []specs.POSIXRlimit

		// We want to leave the original HostConfig alone so make a copy here
		hostConfig := *c.HostConfig
		// Merge with the daemon defaults
		daemon.mergeUlimits(&hostConfig)
		for _, ul := range hostConfig.Ulimits {
			rlimits = append(rlimits, specs.POSIXRlimit{
				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
				Soft: uint64(ul.Soft),
				Hard: uint64(ul.Hard),
			})
		}

		s.Process.Rlimits = rlimits
		return nil
	}
}

// WithLibnetwork sets the libnetwork hook
func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Hooks == nil {
			s.Hooks = &specs.Hooks{}
		}
		// Only install the hook when the container gets a fresh (unpathed)
		// network namespace; the hook re-execs dockerd as "libnetwork-setkey"
		// to hand the sandbox key to libnetwork before the container starts.
		for _, ns := range s.Linux.Namespaces {
			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
					Path: target,
					Args: []string{
						"libnetwork-setkey",
						"-exec-root=" + daemon.configStore.GetExecRoot(),
						c.ID,
						shortNetCtlrID,
					},
				})
			}
		}
		return nil
	}
}

// WithRootless sets the spec to the rootless configuration
func WithRootless(daemon *Daemon) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var v2Controllers []string
		if daemon.getCgroupDriver() == cgroupSystemdDriver {
			if cdcgroups.Mode() != cdcgroups.Unified {
				return errors.New("rootless systemd driver doesn't support cgroup v1")
			}
			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
			if rootlesskitParentEUID == "" {
				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
			}
			// Read the delegated cgroup v2 controllers from the parent user's
			// systemd slice; only these can be used by the rootless daemon.
			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%s.slice/cgroup.controllers", rootlesskitParentEUID)
			controllersFile, err := ioutil.ReadFile(controllersPath)
			if err != nil {
				return err
			}
			v2Controllers = strings.Fields(string(controllersFile))
		}
		return specconv.ToRootless(s, v2Controllers)
	}
}

// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.OOMScoreAdj = score
		return nil
	}
}

// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}

// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if apparmor.IsEnabled() {
			var appArmorProfile string
			if c.AppArmorProfile != "" {
				appArmorProfile = c.AppArmorProfile
			} else if c.HostConfig.Privileged {
				appArmorProfile = unconfinedAppArmorProfile
			} else {
				appArmorProfile = defaultAppArmorProfile
			}

			if appArmorProfile == defaultAppArmorProfile {
				// Unattended upgrades and other fun services can unload AppArmor
				// profiles inadvertently. Since we cannot store our profile in
				// /etc/apparmor.d, nor can we practically add other ways of
				// telling the system to keep our profile loaded, in order to make
				// sure that we keep the default profile enabled we dynamically
				// reload it if necessary.
				if err := ensureDefaultAppArmorProfile(); err != nil {
					return err
				}
			}
			s.Process.ApparmorProfile = appArmorProfile
		}
		return nil
	}
}

// WithCapabilities sets the container's capabilities
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}

// resourcePath resolves the path returned by getPath relative to the
// container's root filesystem (scoped so it cannot escape the rootfs).
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}

// getUser resolves username (name, uid, uid:gid, etc.) against the
// container's own /etc/passwd and /etc/group files, and appends any
// supplementary groups requested via HostConfig.GroupAdd.
func getUser(c *container.Container, username string) (specs.User, error) {
	var usr specs.User
	passwdPath, err := resourcePath(c, user.GetPasswdPath)
	if err != nil {
		return usr, err
	}
	groupPath, err := resourcePath(c, user.GetGroupPath)
	if err != nil {
		return usr, err
	}
	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
	if err != nil {
		return usr, err
	}
	usr.UID = uint32(execUser.Uid)
	usr.GID = uint32(execUser.Gid)

	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
		if err != nil {
			return usr, err
		}
	}
	for _, g := range append(execUser.Sgids, addGroups...) {
		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
	}
	return usr, nil
}

// setNamespace replaces an existing namespace of the same type in the spec,
// or appends it if no namespace of that type is present.
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDs()
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		default:
			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
		}

		// pid
		if c.HostConfig.PidMode.IsContainer() {
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, "uts")
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			cgroupNsMode := c.HostConfig.CgroupnsMode
			if !cgroupNsMode.Valid() {
				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
			}
			if cgroupNsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}

// specMapping converts the daemon's idtools ID mappings into the
// equivalent OCI runtime-spec LinuxIDMapping entries.
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
	var ids []specs.LinuxIDMapping
	for _, item := range s {
		ids = append(ids, specs.LinuxIDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}

// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
	if err != nil {
		return "", "", err
	}
	if len(mi) < 1 {
		return "", "", fmt.Errorf("Can't find mount point of %s", source)
	}

	// find the longest mount point
	var idx, maxlen int
	for i := range mi {
		if len(mi[i].Mountpoint) > maxlen {
			maxlen = len(mi[i].Mountpoint)
			idx = i
		}
	}
	return mi[idx].Mountpoint, mi[idx].Optional, nil
}

// Prefixes of the mountinfo "optional fields" that indicate shared and
// slave mount propagation respectively (see proc(5)).
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)

// hasMountInfoOption checks if any of the given option values are set in the
// passed-in mountinfo optional-fields string.
func hasMountInfoOption(opts string, vals ...string) bool {
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}

// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure source mount point is shared.
	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
	}
	return nil
}

// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}

	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
	}
	return nil
}

// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}

	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	unprivilegedFlags := map[uint64]string{
		unix.MS_RDONLY:     "ro",
		unix.MS_NODEV:      "nodev",
		unix.MS_NOEXEC:     "noexec",
		unix.MS_NOSUID:     "nosuid",
		unix.MS_NOATIME:    "noatime",
		unix.MS_RELATIME:   "relatime",
		unix.MS_NODIRATIME: "nodiratime",
	}

	var flags []string
	for mask, flag := range unprivilegedFlags {
		if uint64(statfs.Flags)&mask == mask {
			flags = append(flags, flag)
		}
	}

	return flags, nil
}

// Bidirectional maps between user-facing propagation names and the
// moby/sys/mount flag constants.
var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)

// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
	for _, ss := range slice {
		if s == ss {
			return true
		}
	}
	return false
}

// WithMounts sets the container's mounts
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// The named return err lets this deferred cleanup observe any
		// failure from the code below and undo the secret dir setup.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser("linux")
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil

	}
}

// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
	// Map dotted sysctl name to its /proc/sys path, e.g.
	// "net.ipv4.ip_forward" -> /proc/sys/net/ipv4/ip_forward.
	f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
	_, err := os.Stat(f)
	return err == nil
}

// WithCommonOptions sets common docker options
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == nil {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS.Path(),
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace. It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				// Prepend docker-init so it becomes PID 1 and reaps zombies.
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}

// WithCgroups sets the container's cgroups
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
			if daemon.configStore.Rootless {
				parent = "user.slice"
			}
		}

		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			// systemd expects "slice:prefix:name" rather than a filesystem path.
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath

		// the rest is only needed for CPU RT controller

		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
			return nil
		}

		if cdcgroups.Mode() == cdcgroups.Unified {
			return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2")
		}

		// FIXME this is very expensive way to check if cpu rt is supported
		sysInfo := daemon.RawSysInfo(true)
		if !sysInfo.CPURealtime {
			return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not supported by the kernel")
		}

		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
		if err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		// When docker is run inside docker, the root is based on the host cgroup.
		// Should this be handled in runc/libcontainer/cgroups ?
		if strings.HasPrefix(root, "/docker/") {
			root = "/"
		}
		mnt = filepath.Join(mnt, root)

		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		return nil
	}
}

// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		devPermissions := s.Linux.Resources.Devices

		if c.HostConfig.Privileged && !sys.RunningInUserNS() {
			hostDevices, err := devices.HostDevices()
			if err != nil {
				return err
			}
			for _, d := range hostDevices {
				devs = append(devs, oci.Device(d))
			}

			// adding device mappings in privileged containers
			for _, deviceMapping := range c.HostConfig.Devices {
				// issue a warning that custom cgroup permissions are ignored in privileged mode
				if deviceMapping.CgroupPermissions != "rwm" {
					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
				}
				// issue a warning that the device path already exists via /dev mounting in privileged mode
				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
					continue
				}
				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
				if err != nil {
					return err
				}
				devs = append(devs, d...)
			}

			// Privileged containers get blanket rwm access to all devices.
			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = devPermissions

		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}

// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}
		blkioWeight := r.BlkioWeight

		specResources := &specs.LinuxResources{
			Memory: memoryRes,
			CPU:    cpuRes,
			BlockIO: &specs.LinuxBlockIO{
				Weight:                  &blkioWeight,
				WeightDevice:            weightDevices,
				ThrottleReadBpsDevice:   readBpsDevice,
				ThrottleWriteBpsDevice:  writeBpsDevice,
				ThrottleReadIOPSDevice:  readIOpsDevice,
				ThrottleWriteIOPSDevice: writeIOpsDevice,
			},
			Pids: getPidsLimit(r),
		}

		// Preserve device permissions set up earlier (e.g. by WithDevices).
		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
			specResources.Devices = s.Linux.Resources.Devices
		}

		s.Linux.Resources = specResources
		return nil
	}
}

// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// We merge the sysctls injected above with the HostConfig (latter takes
		// precedence for backwards-compatibility reasons).
		for k, v := range c.HostConfig.Sysctls {
			s.Linux.Sysctl[k] = v
		}
		return nil
	}
}

// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var err error
		s.Process.User, err = getUser(c, c.Config.User)
		return err
	}
}

// createSpec assembles the OCI runtime spec for container c by applying all
// of the SpecOpts above, in order, to a default spec.
func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		WithCommonOptions(daemon, c),
		WithCgroups(daemon, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		WithUser(c),
		WithRlimits(daemon, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		WithMounts(daemon, c),
		WithLibnetwork(daemon, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
	)
	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}

	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	if daemon.configStore.Rootless {
		opts = append(opts, WithRootless(daemon))
	}
	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
		ID: c.ID,
	}, &s, opts...)
}

// clearReadOnly removes the "ro" option from a mount, leaving all other
// options intact.
func clearReadOnly(m *specs.Mount) {
	var opt []string
	for _, o := range m.Options {
		if o != "ro" {
			opt = append(opt, o)
		}
	}
	m.Options = opt
}

// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
	ulimits := c.Ulimits
	// Merge ulimits with daemon defaults
	ulIdx := make(map[string]struct{})
	for _, ul := range ulimits {
		ulIdx[ul.Name] = struct{}{}
	}
	// Daemon defaults only apply where the container did not set a limit.
	for name, ul := range daemon.configStore.Ulimits {
		if _, exists := ulIdx[name]; !exists {
			ulimits = append(ulimits, ul)
		}
	}
	c.Ulimits = ulimits
}