github.com/jfrazelle/docker@v1.1.2-0.20210712172922-bf78e25fe508/daemon/oci_linux.go (about) 1 package daemon // import "github.com/docker/docker/daemon" 2 3 import ( 4 "context" 5 "fmt" 6 "io/ioutil" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "sort" 11 "strconv" 12 "strings" 13 14 cdcgroups "github.com/containerd/cgroups" 15 "github.com/containerd/containerd/containers" 16 coci "github.com/containerd/containerd/oci" 17 "github.com/containerd/containerd/pkg/apparmor" 18 "github.com/containerd/containerd/pkg/userns" 19 containertypes "github.com/docker/docker/api/types/container" 20 "github.com/docker/docker/container" 21 daemonconfig "github.com/docker/docker/daemon/config" 22 "github.com/docker/docker/oci" 23 "github.com/docker/docker/oci/caps" 24 "github.com/docker/docker/pkg/idtools" 25 "github.com/docker/docker/pkg/stringid" 26 "github.com/docker/docker/rootless/specconv" 27 volumemounts "github.com/docker/docker/volume/mounts" 28 "github.com/moby/sys/mount" 29 "github.com/moby/sys/mountinfo" 30 "github.com/opencontainers/runc/libcontainer/cgroups" 31 "github.com/opencontainers/runc/libcontainer/devices" 32 "github.com/opencontainers/runc/libcontainer/user" 33 specs "github.com/opencontainers/runtime-spec/specs-go" 34 "github.com/pkg/errors" 35 "github.com/sirupsen/logrus" 36 "golang.org/x/sys/unix" 37 ) 38 39 const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary 40 41 // WithRlimits sets the container's rlimits along with merging the daemon's rlimits 42 func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts { 43 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 44 var rlimits []specs.POSIXRlimit 45 46 // We want to leave the original HostConfig alone so make a copy here 47 hostConfig := *c.HostConfig 48 // Merge with the daemon defaults 49 daemon.mergeUlimits(&hostConfig) 50 for _, ul := range hostConfig.Ulimits { 51 rlimits = append(rlimits, specs.POSIXRlimit{ 52 Type: "RLIMIT_" + strings.ToUpper(ul.Name), 53 Soft: uint64(ul.Soft), 54 Hard: uint64(ul.Hard), 55 }) 56 } 57 58 s.Process.Rlimits = rlimits 59 return nil 60 } 61 } 62 63 // WithLibnetwork sets the libnetwork hook 64 func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts { 65 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 66 if s.Hooks == nil { 67 s.Hooks = &specs.Hooks{} 68 } 69 for _, ns := range s.Linux.Namespaces { 70 if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled { 71 target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe") 72 shortNetCtlrID := stringid.TruncateID(daemon.netController.ID()) 73 s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ 74 Path: target, 75 Args: []string{ 76 "libnetwork-setkey", 77 "-exec-root=" + daemon.configStore.GetExecRoot(), 78 c.ID, 79 shortNetCtlrID, 80 }, 81 }) 82 } 83 } 84 return nil 85 } 86 } 87 88 // WithRootless sets the spec to the rootless configuration 89 func WithRootless(daemon *Daemon) coci.SpecOpts { 90 return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 91 var v2Controllers []string 92 if daemon.getCgroupDriver() == cgroupSystemdDriver { 93 if cdcgroups.Mode() != cdcgroups.Unified { 94 return errors.New("rootless systemd driver doesn't support cgroup v1") 95 } 96 rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID") 97 if rootlesskitParentEUID == "" { 98 return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)") 99 } 100 euid, err := strconv.Atoi(rootlesskitParentEUID) 101 if err != nil { 102 return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value") 103 } 104 controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid) 105 controllersFile, err := ioutil.ReadFile(controllersPath) 106 if err != nil { 107 return err 108 } 109 v2Controllers = strings.Fields(string(controllersFile)) 110 } 111 return specconv.ToRootless(s, v2Controllers) 112 } 113 } 114 115 // WithOOMScore sets the oom score 116 func WithOOMScore(score *int) coci.SpecOpts { 117 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 118 s.Process.OOMScoreAdj = score 119 return nil 120 } 121 } 122 123 // WithSelinux sets the selinux labels 124 func WithSelinux(c *container.Container) coci.SpecOpts { 125 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 126 s.Process.SelinuxLabel = c.GetProcessLabel() 127 s.Linux.MountLabel = c.MountLabel 128 return nil 129 } 130 } 131 132 // WithApparmor sets the apparmor profile 133 func WithApparmor(c *container.Container) coci.SpecOpts { 134 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 135 if apparmor.HostSupports() { 136 var appArmorProfile string 137 if c.AppArmorProfile != "" { 138 appArmorProfile = c.AppArmorProfile 139 } else if c.HostConfig.Privileged { 140 appArmorProfile = unconfinedAppArmorProfile 141 } else { 142 appArmorProfile = defaultAppArmorProfile 143 } 144 145 if appArmorProfile == defaultAppArmorProfile { 146 // Unattended upgrades and other fun services can unload AppArmor 147 // profiles inadvertently. Since we cannot store our profile in 148 // /etc/apparmor.d, nor can we practically add other ways of 149 // telling the system to keep our profile loaded, in order to make 150 // sure that we keep the default profile enabled we dynamically 151 // reload it if necessary. 152 if err := ensureDefaultAppArmorProfile(); err != nil { 153 return err 154 } 155 } 156 s.Process.ApparmorProfile = appArmorProfile 157 } 158 return nil 159 } 160 } 161 162 // WithCapabilities sets the container's capabilties 163 func WithCapabilities(c *container.Container) coci.SpecOpts { 164 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 165 capabilities, err := caps.TweakCapabilities( 166 caps.DefaultCapabilities(), 167 c.HostConfig.CapAdd, 168 c.HostConfig.CapDrop, 169 c.HostConfig.Privileged, 170 ) 171 if err != nil { 172 return err 173 } 174 return oci.SetCapabilities(s, capabilities) 175 } 176 } 177 178 func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) { 179 p, err := getPath() 180 if err != nil { 181 return "", err 182 } 183 return c.GetResourcePath(p) 184 } 185 186 func getUser(c *container.Container, username string) (specs.User, error) { 187 var usr specs.User 188 passwdPath, err := resourcePath(c, user.GetPasswdPath) 189 if err != nil { 190 return usr, err 191 } 192 groupPath, err := resourcePath(c, user.GetGroupPath) 193 if err != nil { 194 return usr, err 195 } 196 execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath) 197 if err != nil { 198 return usr, err 199 } 200 usr.UID = uint32(execUser.Uid) 201 usr.GID = uint32(execUser.Gid) 202 203 var addGroups []int 204 if len(c.HostConfig.GroupAdd) > 0 { 205 addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath) 206 if err != nil { 207 return usr, err 208 } 209 } 210 for _, g := range append(execUser.Sgids, addGroups...) { 211 usr.AdditionalGids = append(usr.AdditionalGids, uint32(g)) 212 } 213 return usr, nil 214 } 215 216 func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { 217 for i, n := range s.Linux.Namespaces { 218 if n.Type == ns.Type { 219 s.Linux.Namespaces[i] = ns 220 return 221 } 222 } 223 s.Linux.Namespaces = append(s.Linux.Namespaces, ns) 224 } 225 226 // WithNamespaces sets the container's namespaces 227 func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts { 228 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 229 userNS := false 230 // user 231 if c.HostConfig.UsernsMode.IsPrivate() { 232 uidMap := daemon.idMapping.UIDs() 233 if uidMap != nil { 234 userNS = true 235 ns := specs.LinuxNamespace{Type: "user"} 236 setNamespace(s, ns) 237 s.Linux.UIDMappings = specMapping(uidMap) 238 s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs()) 239 } 240 } 241 // network 242 if !c.Config.NetworkDisabled { 243 ns := specs.LinuxNamespace{Type: "network"} 244 parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2) 245 if parts[0] == "container" { 246 nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer()) 247 if err != nil { 248 return err 249 } 250 ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()) 251 if userNS { 252 // to share a net namespace, they must also share a user namespace 253 nsUser := specs.LinuxNamespace{Type: "user"} 254 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()) 255 setNamespace(s, nsUser) 256 } 257 } else if c.HostConfig.NetworkMode.IsHost() { 258 ns.Path = c.NetworkSettings.SandboxKey 259 } 260 setNamespace(s, ns) 261 } 262 263 // ipc 264 ipcMode := c.HostConfig.IpcMode 265 switch { 266 case ipcMode.IsContainer(): 267 ns := specs.LinuxNamespace{Type: "ipc"} 268 ic, err := daemon.getIpcContainer(ipcMode.Container()) 269 if err != nil { 270 return err 271 } 272 ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()) 273 setNamespace(s, ns) 274 if userNS { 275 // to share an IPC namespace, they must also share a user namespace 276 nsUser := specs.LinuxNamespace{Type: "user"} 277 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()) 278 setNamespace(s, nsUser) 279 } 280 case ipcMode.IsHost(): 281 oci.RemoveNamespace(s, "ipc") 282 case ipcMode.IsEmpty(): 283 // A container was created by an older version of the daemon. 284 // The default behavior used to be what is now called "shareable". 285 fallthrough 286 case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone(): 287 ns := specs.LinuxNamespace{Type: "ipc"} 288 setNamespace(s, ns) 289 default: 290 return fmt.Errorf("Invalid IPC mode: %v", ipcMode) 291 } 292 293 // pid 294 if c.HostConfig.PidMode.IsContainer() { 295 pc, err := daemon.getPidContainer(c) 296 if err != nil { 297 return err 298 } 299 ns := specs.LinuxNamespace{ 300 Type: "pid", 301 Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()), 302 } 303 setNamespace(s, ns) 304 if userNS { 305 // to share a PID namespace, they must also share a user namespace 306 nsUser := specs.LinuxNamespace{ 307 Type: "user", 308 Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()), 309 } 310 setNamespace(s, nsUser) 311 } 312 } else if c.HostConfig.PidMode.IsHost() { 313 oci.RemoveNamespace(s, "pid") 314 } else { 315 ns := specs.LinuxNamespace{Type: "pid"} 316 setNamespace(s, ns) 317 } 318 // uts 319 if c.HostConfig.UTSMode.IsHost() { 320 oci.RemoveNamespace(s, "uts") 321 s.Hostname = "" 322 } 323 324 // cgroup 325 if !c.HostConfig.CgroupnsMode.IsEmpty() { 326 cgroupNsMode := c.HostConfig.CgroupnsMode 327 if !cgroupNsMode.Valid() { 328 return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode) 329 } 330 if cgroupNsMode.IsPrivate() { 331 nsCgroup := specs.LinuxNamespace{Type: "cgroup"} 332 setNamespace(s, nsCgroup) 333 } 334 } 335 336 return nil 337 } 338 } 339 340 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping { 341 var ids []specs.LinuxIDMapping 342 for _, item := range s { 343 ids = append(ids, specs.LinuxIDMapping{ 344 HostID: uint32(item.HostID), 345 ContainerID: uint32(item.ContainerID), 346 Size: uint32(item.Size), 347 }) 348 } 349 return ids 350 } 351 352 // Get the source mount point of directory passed in as argument. Also return 353 // optional fields. 354 func getSourceMount(source string) (string, string, error) { 355 // Ensure any symlinks are resolved. 356 sourcePath, err := filepath.EvalSymlinks(source) 357 if err != nil { 358 return "", "", err 359 } 360 361 mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath)) 362 if err != nil { 363 return "", "", err 364 } 365 if len(mi) < 1 { 366 return "", "", fmt.Errorf("Can't find mount point of %s", source) 367 } 368 369 // find the longest mount point 370 var idx, maxlen int 371 for i := range mi { 372 if len(mi[i].Mountpoint) > maxlen { 373 maxlen = len(mi[i].Mountpoint) 374 idx = i 375 } 376 } 377 return mi[idx].Mountpoint, mi[idx].Optional, nil 378 } 379 380 const ( 381 sharedPropagationOption = "shared:" 382 slavePropagationOption = "master:" 383 ) 384 385 // hasMountInfoOption checks if any of the passed any of the given option values 386 // are set in the passed in option string. 387 func hasMountInfoOption(opts string, vals ...string) bool { 388 for _, opt := range strings.Split(opts, " ") { 389 for _, val := range vals { 390 if strings.HasPrefix(opt, val) { 391 return true 392 } 393 } 394 } 395 return false 396 } 397 398 // Ensure mount point on which path is mounted, is shared. 399 func ensureShared(path string) error { 400 sourceMount, optionalOpts, err := getSourceMount(path) 401 if err != nil { 402 return err 403 } 404 // Make sure source mount point is shared. 405 if !hasMountInfoOption(optionalOpts, sharedPropagationOption) { 406 return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount) 407 } 408 return nil 409 } 410 411 // Ensure mount point on which path is mounted, is either shared or slave. 412 func ensureSharedOrSlave(path string) error { 413 sourceMount, optionalOpts, err := getSourceMount(path) 414 if err != nil { 415 return err 416 } 417 418 if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) { 419 return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount) 420 } 421 return nil 422 } 423 424 // Get the set of mount flags that are set on the mount that contains the given 425 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that 426 // bind-mounting "with options" will not fail with user namespaces, due to 427 // kernel restrictions that require user namespace mounts to preserve 428 // CL_UNPRIVILEGED locked flags. 429 func getUnprivilegedMountFlags(path string) ([]string, error) { 430 var statfs unix.Statfs_t 431 if err := unix.Statfs(path, &statfs); err != nil { 432 return nil, err 433 } 434 435 // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. 436 unprivilegedFlags := map[uint64]string{ 437 unix.MS_RDONLY: "ro", 438 unix.MS_NODEV: "nodev", 439 unix.MS_NOEXEC: "noexec", 440 unix.MS_NOSUID: "nosuid", 441 unix.MS_NOATIME: "noatime", 442 unix.MS_RELATIME: "relatime", 443 unix.MS_NODIRATIME: "nodiratime", 444 } 445 446 var flags []string 447 for mask, flag := range unprivilegedFlags { 448 if uint64(statfs.Flags)&mask == mask { 449 flags = append(flags, flag) 450 } 451 } 452 453 return flags, nil 454 } 455 456 var ( 457 mountPropagationMap = map[string]int{ 458 "private": mount.PRIVATE, 459 "rprivate": mount.RPRIVATE, 460 "shared": mount.SHARED, 461 "rshared": mount.RSHARED, 462 "slave": mount.SLAVE, 463 "rslave": mount.RSLAVE, 464 } 465 466 mountPropagationReverseMap = map[int]string{ 467 mount.PRIVATE: "private", 468 mount.RPRIVATE: "rprivate", 469 mount.SHARED: "shared", 470 mount.RSHARED: "rshared", 471 mount.SLAVE: "slave", 472 mount.RSLAVE: "rslave", 473 } 474 ) 475 476 // inSlice tests whether a string is contained in a slice of strings or not. 477 // Comparison is case sensitive 478 func inSlice(slice []string, s string) bool { 479 for _, ss := range slice { 480 if s == ss { 481 return true 482 } 483 } 484 return false 485 } 486 487 // WithMounts sets the container's mounts 488 func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts { 489 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) { 490 if err := daemon.setupContainerMountsRoot(c); err != nil { 491 return err 492 } 493 494 if err := daemon.setupIpcDirs(c); err != nil { 495 return err 496 } 497 498 defer func() { 499 if err != nil { 500 daemon.cleanupSecretDir(c) 501 } 502 }() 503 504 if err := daemon.setupSecretDir(c); err != nil { 505 return err 506 } 507 508 ms, err := daemon.setupMounts(c) 509 if err != nil { 510 return err 511 } 512 513 if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() { 514 ms = append(ms, c.IpcMounts()...) 515 } 516 517 tmpfsMounts, err := c.TmpfsMounts() 518 if err != nil { 519 return err 520 } 521 ms = append(ms, tmpfsMounts...) 522 523 secretMounts, err := c.SecretMounts() 524 if err != nil { 525 return err 526 } 527 ms = append(ms, secretMounts...) 528 529 sort.Sort(mounts(ms)) 530 531 mounts := ms 532 533 userMounts := make(map[string]struct{}) 534 for _, m := range mounts { 535 userMounts[m.Destination] = struct{}{} 536 } 537 538 // Copy all mounts from spec to defaultMounts, except for 539 // - mounts overridden by a user supplied mount; 540 // - all mounts under /dev if a user supplied /dev is present; 541 // - /dev/shm, in case IpcMode is none. 542 // While at it, also 543 // - set size for /dev/shm from shmsize. 544 defaultMounts := s.Mounts[:0] 545 _, mountDev := userMounts["/dev"] 546 for _, m := range s.Mounts { 547 if _, ok := userMounts[m.Destination]; ok { 548 // filter out mount overridden by a user supplied mount 549 continue 550 } 551 if mountDev && strings.HasPrefix(m.Destination, "/dev/") { 552 // filter out everything under /dev if /dev is user-mounted 553 continue 554 } 555 556 if m.Destination == "/dev/shm" { 557 if c.HostConfig.IpcMode.IsNone() { 558 // filter out /dev/shm for "none" IpcMode 559 continue 560 } 561 // set size for /dev/shm mount from spec 562 sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10) 563 m.Options = append(m.Options, sizeOpt) 564 } 565 566 defaultMounts = append(defaultMounts, m) 567 } 568 569 s.Mounts = defaultMounts 570 for _, m := range mounts { 571 if m.Source == "tmpfs" { 572 data := m.Data 573 parser := volumemounts.NewParser("linux") 574 options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())} 575 if data != "" { 576 options = append(options, strings.Split(data, ",")...) 577 } 578 579 merged, err := mount.MergeTmpfsOptions(options) 580 if err != nil { 581 return err 582 } 583 584 s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged}) 585 continue 586 } 587 588 mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"} 589 590 // Determine property of RootPropagation based on volume 591 // properties. If a volume is shared, then keep root propagation 592 // shared. This should work for slave and private volumes too. 593 // 594 // For slave volumes, it can be either [r]shared/[r]slave. 595 // 596 // For private volumes any root propagation value should work. 597 pFlag := mountPropagationMap[m.Propagation] 598 switch pFlag { 599 case mount.SHARED, mount.RSHARED: 600 if err := ensureShared(m.Source); err != nil { 601 return err 602 } 603 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 604 if rootpg != mount.SHARED && rootpg != mount.RSHARED { 605 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] 606 } 607 case mount.SLAVE, mount.RSLAVE: 608 var fallback bool 609 if err := ensureSharedOrSlave(m.Source); err != nil { 610 // For backwards compatibility purposes, treat mounts from the daemon root 611 // as special since we automatically add rslave propagation to these mounts 612 // when the user did not set anything, so we should fallback to the old 613 // behavior which is to use private propagation which is normally the 614 // default. 615 if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) { 616 return err 617 } 618 619 cm, ok := c.MountPoints[m.Destination] 620 if !ok { 621 return err 622 } 623 if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" { 624 // This means the user explicitly set a propagation, do not fallback in that case. 625 return err 626 } 627 fallback = true 628 logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root") 629 } 630 if !fallback { 631 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 632 if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { 633 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] 634 } 635 } 636 } 637 638 bindMode := "rbind" 639 if m.NonRecursive { 640 bindMode = "bind" 641 } 642 opts := []string{bindMode} 643 if !m.Writable { 644 opts = append(opts, "ro") 645 } 646 if pFlag != 0 { 647 opts = append(opts, mountPropagationReverseMap[pFlag]) 648 } 649 650 // If we are using user namespaces, then we must make sure that we 651 // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source 652 // "mount" when we bind-mount. The reason for this is that at the point 653 // when runc sets up the root filesystem, it is already inside a user 654 // namespace, and thus cannot change any flags that are locked. 655 if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() { 656 unprivOpts, err := getUnprivilegedMountFlags(m.Source) 657 if err != nil { 658 return err 659 } 660 opts = append(opts, unprivOpts...) 661 } 662 663 mt.Options = opts 664 s.Mounts = append(s.Mounts, mt) 665 } 666 667 if s.Root.Readonly { 668 for i, m := range s.Mounts { 669 switch m.Destination { 670 case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev": 671 continue 672 } 673 if _, ok := userMounts[m.Destination]; !ok { 674 if !inSlice(m.Options, "ro") { 675 s.Mounts[i].Options = append(s.Mounts[i].Options, "ro") 676 } 677 } 678 } 679 } 680 681 if c.HostConfig.Privileged { 682 // clear readonly for /sys 683 for i := range s.Mounts { 684 if s.Mounts[i].Destination == "/sys" { 685 clearReadOnly(&s.Mounts[i]) 686 } 687 } 688 s.Linux.ReadonlyPaths = nil 689 s.Linux.MaskedPaths = nil 690 } 691 692 // TODO: until a kernel/mount solution exists for handling remount in a user namespace, 693 // we must clear the readonly flag for the cgroups mount (@mrunalp concurs) 694 if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged { 695 for i, m := range s.Mounts { 696 if m.Type == "cgroup" { 697 clearReadOnly(&s.Mounts[i]) 698 } 699 } 700 } 701 702 return nil 703 704 } 705 } 706 707 // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually 708 // exist, so do not add the default ones if running on an old kernel. 709 func sysctlExists(s string) bool { 710 f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1)) 711 _, err := os.Stat(f) 712 return err == nil 713 } 714 715 // WithCommonOptions sets common docker options 716 func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts { 717 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 718 if c.BaseFS == nil { 719 return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil") 720 } 721 linkedEnv, err := daemon.setupLinkedContainers(c) 722 if err != nil { 723 return err 724 } 725 s.Root = &specs.Root{ 726 Path: c.BaseFS.Path(), 727 Readonly: c.HostConfig.ReadonlyRootfs, 728 } 729 if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil { 730 return err 731 } 732 cwd := c.Config.WorkingDir 733 if len(cwd) == 0 { 734 cwd = "/" 735 } 736 s.Process.Args = append([]string{c.Path}, c.Args...) 737 738 // only add the custom init if it is specified and the container is running in its 739 // own private pid namespace. It does not make sense to add if it is running in the 740 // host namespace or another container's pid namespace where we already have an init 741 if c.HostConfig.PidMode.IsPrivate() { 742 if (c.HostConfig.Init != nil && *c.HostConfig.Init) || 743 (c.HostConfig.Init == nil && daemon.configStore.Init) { 744 s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...) 745 path := daemon.configStore.InitPath 746 if path == "" { 747 path, err = exec.LookPath(daemonconfig.DefaultInitBinary) 748 if err != nil { 749 return err 750 } 751 } 752 s.Mounts = append(s.Mounts, specs.Mount{ 753 Destination: inContainerInitPath, 754 Type: "bind", 755 Source: path, 756 Options: []string{"bind", "ro"}, 757 }) 758 } 759 } 760 s.Process.Cwd = cwd 761 s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv) 762 s.Process.Terminal = c.Config.Tty 763 764 s.Hostname = c.Config.Hostname 765 setLinuxDomainname(c, s) 766 767 // Add default sysctls that are generally safe and useful; currently we 768 // grant the capabilities to allow these anyway. You can override if 769 // you want to restore the original behaviour. 770 // We do not set network sysctls if network namespace is host, or if we are 771 // joining an existing namespace, only if we create a new net namespace. 772 if c.HostConfig.NetworkMode.IsPrivate() { 773 // We cannot set up ping socket support in a user namespace 774 if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") { 775 // allow unprivileged ICMP echo sockets without CAP_NET_RAW 776 s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647" 777 } 778 // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE 779 if sysctlExists("net.ipv4.ip_unprivileged_port_start") { 780 s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0" 781 } 782 } 783 784 return nil 785 } 786 } 787 788 // WithCgroups sets the container's cgroups 789 func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts { 790 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 791 var cgroupsPath string 792 scopePrefix := "docker" 793 parent := "/docker" 794 useSystemd := UsingSystemd(daemon.configStore) 795 if useSystemd { 796 parent = "system.slice" 797 if daemon.configStore.Rootless { 798 parent = "user.slice" 799 } 800 } 801 802 if c.HostConfig.CgroupParent != "" { 803 parent = c.HostConfig.CgroupParent 804 } else if daemon.configStore.CgroupParent != "" { 805 parent = daemon.configStore.CgroupParent 806 } 807 808 if useSystemd { 809 cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID 810 logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath) 811 } else { 812 cgroupsPath = filepath.Join(parent, c.ID) 813 } 814 s.Linux.CgroupsPath = cgroupsPath 815 816 // the rest is only needed for CPU RT controller 817 818 if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 { 819 return nil 820 } 821 822 if cdcgroups.Mode() == cdcgroups.Unified { 823 return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2") 824 } 825 826 // FIXME this is very expensive way to check if cpu rt is supported 827 sysInfo := daemon.RawSysInfo(true) 828 if !sysInfo.CPURealtime { 829 return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not supported by the kernel") 830 } 831 832 p := cgroupsPath 833 if useSystemd { 834 initPath, err := cgroups.GetInitCgroup("cpu") 835 if err != nil { 836 return errors.Wrap(err, "unable to init CPU RT controller") 837 } 838 _, err = cgroups.GetOwnCgroup("cpu") 839 if err != nil { 840 return errors.Wrap(err, "unable to init CPU RT controller") 841 } 842 p = filepath.Join(initPath, s.Linux.CgroupsPath) 843 } 844 845 // Clean path to guard against things like ../../../BAD 846 parentPath := filepath.Dir(p) 847 if !filepath.IsAbs(parentPath) { 848 parentPath = filepath.Clean("/" + parentPath) 849 } 850 851 mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu") 852 if err != nil { 853 return errors.Wrap(err, "unable to init CPU RT controller") 854 } 855 // When docker is run inside docker, the root is based of the host cgroup. 856 // Should this be handled in runc/libcontainer/cgroups ? 857 if strings.HasPrefix(root, "/docker/") { 858 root = "/" 859 } 860 mnt = filepath.Join(mnt, root) 861 862 if err := daemon.initCPURtController(mnt, parentPath); err != nil { 863 return errors.Wrap(err, "unable to init CPU RT controller") 864 } 865 return nil 866 } 867 } 868 869 // WithDevices sets the container's devices 870 func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts { 871 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 872 // Build lists of devices allowed and created within the container. 873 var devs []specs.LinuxDevice 874 devPermissions := s.Linux.Resources.Devices 875 876 if c.HostConfig.Privileged && !userns.RunningInUserNS() { 877 hostDevices, err := devices.HostDevices() 878 if err != nil { 879 return err 880 } 881 for _, d := range hostDevices { 882 devs = append(devs, oci.Device(d)) 883 } 884 885 // adding device mappings in privileged containers 886 for _, deviceMapping := range c.HostConfig.Devices { 887 // issue a warning that custom cgroup permissions are ignored in privileged mode 888 if deviceMapping.CgroupPermissions != "rwm" { 889 logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost) 890 } 891 // issue a warning that the device path already exists via /dev mounting in privileged mode 892 if deviceMapping.PathOnHost == deviceMapping.PathInContainer { 893 logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer) 894 continue 895 } 896 d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm") 897 if err != nil { 898 return err 899 } 900 devs = append(devs, d...) 901 } 902 903 devPermissions = []specs.LinuxDeviceCgroup{ 904 { 905 Allow: true, 906 Access: "rwm", 907 }, 908 } 909 } else { 910 for _, deviceMapping := range c.HostConfig.Devices { 911 d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions) 912 if err != nil { 913 return err 914 } 915 devs = append(devs, d...) 916 devPermissions = append(devPermissions, dPermissions...) 917 } 918 919 var err error 920 devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules) 921 if err != nil { 922 return err 923 } 924 } 925 926 s.Linux.Devices = append(s.Linux.Devices, devs...) 927 s.Linux.Resources.Devices = devPermissions 928 929 for _, req := range c.HostConfig.DeviceRequests { 930 if err := daemon.handleDevice(req, s); err != nil { 931 return err 932 } 933 } 934 return nil 935 } 936 } 937 938 // WithResources applies the container resources 939 func WithResources(c *container.Container) coci.SpecOpts { 940 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 941 r := c.HostConfig.Resources 942 weightDevices, err := getBlkioWeightDevices(r) 943 if err != nil { 944 return err 945 } 946 readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps) 947 if err != nil { 948 return err 949 } 950 writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps) 951 if err != nil { 952 return err 953 } 954 readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps) 955 if err != nil { 956 return err 957 } 958 writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps) 959 if err != nil { 960 return err 961 } 962 963 memoryRes := getMemoryResources(r) 964 cpuRes, err := getCPUResources(r) 965 if err != nil { 966 return err 967 } 968 blkioWeight := r.BlkioWeight 969 970 specResources := &specs.LinuxResources{ 971 Memory: memoryRes, 972 CPU: cpuRes, 973 BlockIO: &specs.LinuxBlockIO{ 974 Weight: &blkioWeight, 975 WeightDevice: weightDevices, 976 ThrottleReadBpsDevice: readBpsDevice, 977 ThrottleWriteBpsDevice: writeBpsDevice, 978 ThrottleReadIOPSDevice: readIOpsDevice, 979 ThrottleWriteIOPSDevice: writeIOpsDevice, 980 }, 981 Pids: getPidsLimit(r), 982 } 983 984 if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 { 985 specResources.Devices = s.Linux.Resources.Devices 986 } 987 988 s.Linux.Resources = specResources 989 return nil 990 } 991 } 992 993 // WithSysctls sets the container's sysctls 994 func WithSysctls(c *container.Container) coci.SpecOpts { 995 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 996 // We merge the sysctls injected above with the HostConfig (latter takes 997 // precedence for backwards-compatibility reasons). 998 for k, v := range c.HostConfig.Sysctls { 999 s.Linux.Sysctl[k] = v 1000 } 1001 return nil 1002 } 1003 } 1004 1005 // WithUser sets the container's user 1006 func WithUser(c *container.Container) coci.SpecOpts { 1007 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 1008 var err error 1009 s.Process.User, err = getUser(c, c.Config.User) 1010 return err 1011 } 1012 } 1013 1014 func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) { 1015 var ( 1016 opts []coci.SpecOpts 1017 s = oci.DefaultSpec() 1018 ) 1019 opts = append(opts, 1020 WithCommonOptions(daemon, c), 1021 WithCgroups(daemon, c), 1022 WithResources(c), 1023 WithSysctls(c), 1024 WithDevices(daemon, c), 1025 WithUser(c), 1026 WithRlimits(daemon, c), 1027 WithNamespaces(daemon, c), 1028 WithCapabilities(c), 1029 WithSeccomp(daemon, c), 1030 WithMounts(daemon, c), 1031 WithLibnetwork(daemon, c), 1032 WithApparmor(c), 1033 WithSelinux(c), 1034 WithOOMScore(&c.HostConfig.OomScoreAdj), 1035 ) 1036 if c.NoNewPrivileges { 1037 opts = append(opts, coci.WithNoNewPrivileges) 1038 } 1039 1040 // Set the masked and readonly paths with regard to the host config options if they are set. 1041 if c.HostConfig.MaskedPaths != nil { 1042 opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths)) 1043 } 1044 if c.HostConfig.ReadonlyPaths != nil { 1045 opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths)) 1046 } 1047 if daemon.configStore.Rootless { 1048 opts = append(opts, WithRootless(daemon)) 1049 } 1050 return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{ 1051 ID: c.ID, 1052 }, &s, opts...) 1053 } 1054 1055 func clearReadOnly(m *specs.Mount) { 1056 var opt []string 1057 for _, o := range m.Options { 1058 if o != "ro" { 1059 opt = append(opt, o) 1060 } 1061 } 1062 m.Options = opt 1063 } 1064 1065 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig 1066 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) { 1067 ulimits := c.Ulimits 1068 // Merge ulimits with daemon defaults 1069 ulIdx := make(map[string]struct{}) 1070 for _, ul := range ulimits { 1071 ulIdx[ul.Name] = struct{}{} 1072 } 1073 for name, ul := range daemon.configStore.Ulimits { 1074 if _, exists := ulIdx[name]; !exists { 1075 ulimits = append(ulimits, ul) 1076 } 1077 } 1078 c.Ulimits = ulimits 1079 }