package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/containerd/containerd/containers"
	coci "github.com/containerd/containerd/oci"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/container"
	daemonconfig "github.com/docker/docker/daemon/config"
	"github.com/docker/docker/oci"
	"github.com/docker/docker/oci/caps"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/stringid"
	"github.com/docker/docker/rootless/specconv"
	volumemounts "github.com/docker/docker/volume/mounts"
	"github.com/moby/sys/mount"
	"github.com/moby/sys/mountinfo"
	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/devices"
	rsystem "github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/user"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

// inContainerInitPath is the path inside the container at which the
// docker-init binary is bind-mounted when an init process is requested
// (see WithCommonOptions).
const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary

// WithRlimits sets the container's rlimits along with merging the daemon's rlimits.
// Ulimit names from the HostConfig are upper-cased and prefixed with "RLIMIT_"
// to form the OCI POSIXRlimit type (e.g. "nofile" -> "RLIMIT_NOFILE").
func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var rlimits []specs.POSIXRlimit

		// We want to leave the original HostConfig alone so make a copy here
		hostConfig := *c.HostConfig
		// Merge with the daemon defaults
		daemon.mergeUlimits(&hostConfig)
		for _, ul := range hostConfig.Ulimits {
			rlimits = append(rlimits, specs.POSIXRlimit{
				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
				Soft: uint64(ul.Soft),
				Hard: uint64(ul.Hard),
			})
		}

		s.Process.Rlimits = rlimits
		return nil
	}
}

// WithLibnetwork sets the libnetwork hook. A prestart hook re-executes the
// dockerd binary (via /proc/<pid>/exe) with "libnetwork-setkey" so that the
// sandbox key for the new network namespace can be registered with the
// network controller. The hook is only added when the container gets its own
// (path-less) network namespace and networking is not disabled.
func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Hooks == nil {
			s.Hooks = &specs.Hooks{}
		}
		for _, ns := range s.Linux.Namespaces {
			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
				// /proc/self-pid/exe points back at the running daemon binary.
				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
					Path: target,
					Args: []string{
						"libnetwork-setkey",
						"-exec-root=" + daemon.configStore.GetExecRoot(),
						c.ID,
						shortNetCtlrID,
					},
				})
			}
		}
		return nil
	}
}

// WithRootless sets the spec to the rootless configuration. For the systemd
// cgroup driver (cgroup v2 only) it reads the delegated controllers from the
// parent user slice so specconv.ToRootless knows which resource limits can
// actually be applied.
func WithRootless(daemon *Daemon) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var v2Controllers []string
		if daemon.getCgroupDriver() == cgroupSystemdDriver {
			if !cgroups.IsCgroup2UnifiedMode() {
				return errors.New("rootless systemd driver doesn't support cgroup v1")
			}
			// ROOTLESSKIT_PARENT_EUID identifies the user slice that holds
			// the delegated cgroup controllers.
			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
			if rootlesskitParentEUID == "" {
				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
			}
			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%s.slice/cgroup.controllers", rootlesskitParentEUID)
			controllersFile, err := ioutil.ReadFile(controllersPath)
			if err != nil {
				return err
			}
			v2Controllers = strings.Fields(string(controllersFile))
		}
		return specconv.ToRootless(s, v2Controllers)
	}
}

// WithOOMScore sets the oom score adjustment for the container's init process.
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.OOMScoreAdj = score
		return nil
	}
}

// WithSelinux sets the selinux process and mount labels from the container.
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}

// WithApparmor sets the apparmor profile. Selection order: an explicit
// per-container profile wins; privileged containers get the unconfined
// profile; everything else gets the default profile, which is (re)loaded
// on demand. No-op when AppArmor is not enabled on the host.
func WithApparmor(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if apparmor.IsEnabled() {
			var appArmorProfile string
			if c.AppArmorProfile != "" {
				appArmorProfile = c.AppArmorProfile
			} else if c.HostConfig.Privileged {
				appArmorProfile = unconfinedAppArmorProfile
			} else {
				appArmorProfile = defaultAppArmorProfile
			}

			if appArmorProfile == defaultAppArmorProfile {
				// Unattended upgrades and other fun services can unload AppArmor
				// profiles inadvertently. Since we cannot store our profile in
				// /etc/apparmor.d, nor can we practically add other ways of
				// telling the system to keep our profile loaded, in order to make
				// sure that we keep the default profile enabled we dynamically
				// reload it if necessary.
				if err := ensureDefaultAppArmorProfile(); err != nil {
					return err
				}
			}
			s.Process.ApparmorProfile = appArmorProfile
		}
		return nil
	}
}

// WithCapabilities sets the container's capabilties by applying the
// HostConfig's add/drop lists (and privileged flag) on top of the defaults.
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Capabilities,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}

// readUserFile opens the file at path p resolved inside the container's
// root filesystem (so symlinks cannot escape the container).
func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
	fp, err := c.GetResourcePath(p)
	if err != nil {
		return nil, err
	}
	return os.Open(fp)
}

// getUser resolves username against the container's own /etc/passwd and
// /etc/group files and returns (uid, gid, additional gids). Read errors on
// those files are deliberately tolerated: GetExecUser is called with a nil
// reader in that case and falls back to its defaults.
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return 0, 0, nil, err
	}
	groupPath, err := user.GetGroupPath()
	if err != nil {
		return 0, 0, nil, err
	}
	passwdFile, err := readUserFile(c, passwdPath)
	if err == nil {
		defer passwdFile.Close()
	}
	groupFile, err := readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}

	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
	if err != nil {
		return 0, 0, nil, err
	}

	// The group file was consumed by GetExecUser above, so open it again
	// for the additional-groups lookup.
	// todo: fix this double read by a change to libcontainer/user pkg
	groupFile, err = readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}
	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
		if err != nil {
			return 0, 0, nil, err
		}
	}
	uid := uint32(execUser.Uid)
	gid := uint32(execUser.Gid)
	sgids := append(execUser.Sgids, addGroups...)
	var additionalGids []uint32
	for _, g := range sgids {
		additionalGids = append(additionalGids, uint32(g))
	}
	return uid, gid, additionalGids, nil
}

// setNamespace replaces the existing namespace of the same type in the spec,
// or appends ns if no namespace of that type is present yet.
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

// WithNamespaces sets the container's namespaces (user, network, ipc, pid,
// uts, cgroup) according to the HostConfig modes. When a namespace is joined
// from another container, its /proc/<pid>/ns/* path is used; host modes
// remove the namespace entry entirely.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDs()
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		default:
			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
		}

		// pid
		if c.HostConfig.PidMode.IsContainer() {
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, "uts")
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			cgroupNsMode := c.HostConfig.CgroupnsMode
			if !cgroupNsMode.Valid() {
				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
			}
			if cgroupNsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}
// specMapping converts idtools ID mappings into OCI LinuxIDMapping entries.
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
	var ids []specs.LinuxIDMapping
	for _, item := range s {
		ids = append(ids, specs.LinuxIDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}

// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
	if err != nil {
		return "", "", err
	}
	if len(mi) < 1 {
		return "", "", fmt.Errorf("Can't find mount point of %s", source)
	}

	// find the longest mount point: the longest parent mountpoint path is
	// the one the source actually lives on.
	var idx, maxlen int
	for i := range mi {
		if len(mi[i].Mountpoint) > maxlen {
			maxlen = len(mi[i].Mountpoint)
			idx = i
		}
	}
	return mi[idx].Mountpoint, mi[idx].Optional, nil
}

// Prefixes of the mountinfo "optional fields" that indicate shared/slave
// mount propagation (see mount_namespaces(7): "shared:N", "master:N").
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)

// hasMountInfoOption checks if any of the passed any of the given option values
// are set in the passed in option string.
func hasMountInfoOption(opts string, vals ...string) bool {
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}

// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure source mount point is shared.
	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
	}
	return nil
}

// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}

	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
	}
	return nil
}

// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}

	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	unprivilegedFlags := map[uint64]string{
		unix.MS_RDONLY:     "ro",
		unix.MS_NODEV:      "nodev",
		unix.MS_NOEXEC:     "noexec",
		unix.MS_NOSUID:     "nosuid",
		unix.MS_NOATIME:    "noatime",
		unix.MS_RELATIME:   "relatime",
		unix.MS_NODIRATIME: "nodiratime",
	}

	var flags []string
	for mask, flag := range unprivilegedFlags {
		if uint64(statfs.Flags)&mask == mask {
			flags = append(flags, flag)
		}
	}

	return flags, nil
}

// Translation tables between user-facing propagation-mode strings and the
// mount package's propagation flag constants.
var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)

// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
	for _, ss := range slice {
		if s == ss {
			return true
		}
	}
	return false
}

// WithMounts sets the container's mounts: it collects user/volume/ipc/tmpfs/
// secret mounts, merges them with the spec's default mounts, translates each
// into an OCI mount entry (handling propagation, read-only and user-namespace
// locked flags), and applies read-only/privileged post-processing.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// Named return err: clean up the secret dir on any later failure.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		// Note: defaultMounts filters s.Mounts in place by reusing its
		// backing array (zero-length reslice).
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				// tmpfs mounts: merge user-supplied options with safe defaults
				// and the parser's default propagation mode.
				data := m.Data
				parser := volumemounts.NewParser("linux")
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			for i, m := range s.Mounts {
				// These destinations must stay writable even on a read-only
				// rootfs; "continue" here skips to the next mount.
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil

	}
}

// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
	// Map the dotted sysctl name onto its /proc/sys path and stat it.
	f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
	_, err := os.Stat(f)
	return err == nil
}

// WithCommonOptions sets common docker options: rootfs path, process args
// (optionally wrapped with docker-init), working directory, environment,
// TTY, hostname/domainname, and safe default network sysctls.
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == nil {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS.Path(),
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace. It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				// Prepend docker-init; "--" separates init flags from the
				// container's own command.
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}

// WithCgroups sets the container's cgroups path. With the systemd driver the
// path has the "parent:scope-prefix:id" form; with cgroupfs it is a plain
// filesystem path under the parent. It also ensures the parent cgroup path
// exists (with daemon resource defaults applied).
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
			if daemon.configStore.Rootless {
				parent = "user.slice"
			}
		}

		// An explicit per-container parent wins over the daemon-wide setting.
		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath
		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return err
			}
			// Sanity-check that the daemon's own cgroup is resolvable; the
			// value itself is not needed here.
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return err
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		if err := daemon.initCgroupsPath(parentPath); err != nil {
			return fmt.Errorf("linux init cgroups path: %v", err)
		}
		return nil
	}
}

// WithDevices sets the container's devices and the matching device-cgroup
// permission rules. Privileged containers (outside a user namespace) get all
// host devices plus an allow-all cgroup rule; otherwise only the explicitly
// mapped devices and cgroup rules are added.
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		devPermissions := s.Linux.Resources.Devices

		if c.HostConfig.Privileged && !rsystem.RunningInUserNS() {
			hostDevices, err := devices.HostDevices()
			if err != nil {
				return err
			}
			for _, d := range hostDevices {
				devs = append(devs, oci.Device(d))
			}

			// adding device mappings in privileged containers
			for _, deviceMapping := range c.HostConfig.Devices {
				// issue a warning that custom cgroup permissions are ignored in privileged mode
				if deviceMapping.CgroupPermissions != "rwm" {
					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
				}
				// issue a warning that the device path already exists via /dev mounting in privileged mode
				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
					continue
				}
				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
				if err != nil {
					return err
				}
				devs = append(devs, d...)
			}

			// Privileged: replace any previous rules with a single allow-all.
			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = devPermissions

		// Device requests (e.g. driver-backed requests) are delegated to the
		// daemon's device handlers.
		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}

// WithResources applies the container resources (memory, CPU, block I/O and
// pids limits) to the spec. Device cgroup rules already present on the spec
// (set by WithDevices) are carried over into the new resources struct.
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}
		blkioWeight := r.BlkioWeight

		specResources := &specs.LinuxResources{
			Memory: memoryRes,
			CPU:    cpuRes,
			BlockIO: &specs.LinuxBlockIO{
				Weight:                  &blkioWeight,
				WeightDevice:            weightDevices,
				ThrottleReadBpsDevice:   readBpsDevice,
				ThrottleWriteBpsDevice:  writeBpsDevice,
				ThrottleReadIOPSDevice:  readIOpsDevice,
				ThrottleWriteIOPSDevice: writeIOpsDevice,
			},
			Pids: getPidsLimit(r),
		}

		// Preserve device cgroup rules that earlier opts (WithDevices) set.
		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
			specResources.Devices = s.Linux.Resources.Devices
		}

		s.Linux.Resources = specResources
		return nil
	}
}

// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// We merge the sysctls injected above with the HostConfig (latter takes
		// precedence for backwards-compatibility reasons).
		for k, v := range c.HostConfig.Sysctls {
			s.Linux.Sysctl[k] = v
		}
		return nil
	}
}

// WithUser sets the container's user (uid, gid and supplementary groups) as
// resolved from the container's own passwd/group files.
func WithUser(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		uid, gid, additionalGids, err := getUser(c, c.Config.User)
		if err != nil {
			return err
		}
		s.Process.User.UID = uid
		s.Process.User.GID = gid
		s.Process.User.AdditionalGids = additionalGids
		return nil
	}
}

// createSpec builds the OCI runtime spec for the container by applying the
// spec opts above, in order, to the default spec. Opt ordering matters: e.g.
// WithDevices must run before WithResources, and WithCommonOptions before
// WithSysctls (HostConfig sysctls override the injected defaults).
func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		WithCommonOptions(daemon, c),
		WithCgroups(daemon, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		WithUser(c),
		WithRlimits(daemon, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		WithMounts(daemon, c),
		WithLibnetwork(daemon, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
	)
	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}

	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	if daemon.configStore.Rootless {
		opts = append(opts, WithRootless(daemon))
	}
	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
		ID: c.ID,
	}, &s, opts...)
}

// clearReadOnly removes the "ro" option from a mount, leaving all other
// options intact.
func clearReadOnly(m *specs.Mount) {
	var opt []string
	for _, o := range m.Options {
		if o != "ro" {
			opt = append(opt, o)
		}
	}
	m.Options = opt
}

// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig.
// Daemon defaults are only added for ulimit names the HostConfig does not
// already specify; existing entries always win.
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
	ulimits := c.Ulimits
	// Merge ulimits with daemon defaults
	ulIdx := make(map[string]struct{})
	for _, ul := range ulimits {
		ulIdx[ul.Name] = struct{}{}
	}
	for name, ul := range daemon.configStore.Ulimits {
		if _, exists := ulIdx[name]; !exists {
			ulimits = append(ulimits, ul)
		}
	}
	c.Ulimits = ulimits
}