github.com/ssdev-go/moby@v17.12.1-ce-rc2+incompatible/daemon/oci_linux.go (about) 1 package daemon 2 3 import ( 4 "fmt" 5 "io" 6 "os" 7 "os/exec" 8 "path/filepath" 9 "regexp" 10 "sort" 11 "strconv" 12 "strings" 13 14 containertypes "github.com/docker/docker/api/types/container" 15 "github.com/docker/docker/container" 16 "github.com/docker/docker/daemon/caps" 17 daemonconfig "github.com/docker/docker/daemon/config" 18 "github.com/docker/docker/oci" 19 "github.com/docker/docker/pkg/idtools" 20 "github.com/docker/docker/pkg/mount" 21 "github.com/docker/docker/volume" 22 "github.com/opencontainers/runc/libcontainer/apparmor" 23 "github.com/opencontainers/runc/libcontainer/cgroups" 24 "github.com/opencontainers/runc/libcontainer/devices" 25 "github.com/opencontainers/runc/libcontainer/user" 26 specs "github.com/opencontainers/runtime-spec/specs-go" 27 "github.com/pkg/errors" 28 "github.com/sirupsen/logrus" 29 "golang.org/x/sys/unix" 30 ) 31 32 // nolint: gosimple 33 var ( 34 deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$") 35 ) 36 37 func setResources(s *specs.Spec, r containertypes.Resources) error { 38 weightDevices, err := getBlkioWeightDevices(r) 39 if err != nil { 40 return err 41 } 42 readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps) 43 if err != nil { 44 return err 45 } 46 writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps) 47 if err != nil { 48 return err 49 } 50 readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps) 51 if err != nil { 52 return err 53 } 54 writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps) 55 if err != nil { 56 return err 57 } 58 59 memoryRes := getMemoryResources(r) 60 cpuRes, err := getCPUResources(r) 61 if err != nil { 62 return err 63 } 64 blkioWeight := r.BlkioWeight 65 66 specResources := &specs.LinuxResources{ 67 Memory: memoryRes, 68 CPU: cpuRes, 69 BlockIO: &specs.LinuxBlockIO{ 70 Weight: &blkioWeight, 71 WeightDevice: weightDevices, 72 ThrottleReadBpsDevice: readBpsDevice, 73 ThrottleWriteBpsDevice: writeBpsDevice, 74 ThrottleReadIOPSDevice: readIOpsDevice, 75 ThrottleWriteIOPSDevice: writeIOpsDevice, 76 }, 77 Pids: &specs.LinuxPids{ 78 Limit: r.PidsLimit, 79 }, 80 } 81 82 if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 { 83 specResources.Devices = s.Linux.Resources.Devices 84 } 85 86 s.Linux.Resources = specResources 87 return nil 88 } 89 90 func setDevices(s *specs.Spec, c *container.Container) error { 91 // Build lists of devices allowed and created within the container. 92 var devs []specs.LinuxDevice 93 devPermissions := s.Linux.Resources.Devices 94 if c.HostConfig.Privileged { 95 hostDevices, err := devices.HostDevices() 96 if err != nil { 97 return err 98 } 99 for _, d := range hostDevices { 100 devs = append(devs, oci.Device(d)) 101 } 102 devPermissions = []specs.LinuxDeviceCgroup{ 103 { 104 Allow: true, 105 Access: "rwm", 106 }, 107 } 108 } else { 109 for _, deviceMapping := range c.HostConfig.Devices { 110 d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions) 111 if err != nil { 112 return err 113 } 114 devs = append(devs, d...) 115 devPermissions = append(devPermissions, dPermissions...) 116 } 117 118 for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules { 119 ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1) 120 if len(ss[0]) != 5 { 121 return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule) 122 } 123 matches := ss[0] 124 125 dPermissions := specs.LinuxDeviceCgroup{ 126 Allow: true, 127 Type: matches[1], 128 Access: matches[4], 129 } 130 if matches[2] == "*" { 131 major := int64(-1) 132 dPermissions.Major = &major 133 } else { 134 major, err := strconv.ParseInt(matches[2], 10, 64) 135 if err != nil { 136 return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule) 137 } 138 dPermissions.Major = &major 139 } 140 if matches[3] == "*" { 141 minor := int64(-1) 142 dPermissions.Minor = &minor 143 } else { 144 minor, err := strconv.ParseInt(matches[3], 10, 64) 145 if err != nil { 146 return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule) 147 } 148 dPermissions.Minor = &minor 149 } 150 devPermissions = append(devPermissions, dPermissions) 151 } 152 } 153 154 s.Linux.Devices = append(s.Linux.Devices, devs...) 155 s.Linux.Resources.Devices = devPermissions 156 return nil 157 } 158 159 func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error { 160 var rlimits []specs.POSIXRlimit 161 162 // We want to leave the original HostConfig alone so make a copy here 163 hostConfig := *c.HostConfig 164 // Merge with the daemon defaults 165 daemon.mergeUlimits(&hostConfig) 166 for _, ul := range hostConfig.Ulimits { 167 rlimits = append(rlimits, specs.POSIXRlimit{ 168 Type: "RLIMIT_" + strings.ToUpper(ul.Name), 169 Soft: uint64(ul.Soft), 170 Hard: uint64(ul.Hard), 171 }) 172 } 173 174 s.Process.Rlimits = rlimits 175 return nil 176 } 177 178 func setUser(s *specs.Spec, c *container.Container) error { 179 uid, gid, additionalGids, err := getUser(c, c.Config.User) 180 if err != nil { 181 return err 182 } 183 s.Process.User.UID = uid 184 s.Process.User.GID = gid 185 s.Process.User.AdditionalGids = additionalGids 186 return nil 187 } 188 189 func readUserFile(c *container.Container, p string) (io.ReadCloser, error) { 190 fp, err := c.GetResourcePath(p) 191 if err != nil { 192 return nil, err 193 } 194 return os.Open(fp) 195 } 196 197 func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) { 198 passwdPath, err := user.GetPasswdPath() 199 if err != nil { 200 return 0, 0, nil, err 201 } 202 groupPath, err := user.GetGroupPath() 203 if err != nil { 204 return 0, 0, nil, err 205 } 206 passwdFile, err := readUserFile(c, passwdPath) 207 if err == nil { 208 defer passwdFile.Close() 209 } 210 groupFile, err := readUserFile(c, groupPath) 211 if err == nil { 212 defer groupFile.Close() 213 } 214 215 execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile) 216 if err != nil { 217 return 0, 0, nil, err 218 } 219 220 // todo: fix this double read by a change to libcontainer/user pkg 221 groupFile, err = readUserFile(c, groupPath) 222 if err == nil { 223 defer groupFile.Close() 224 } 225 var addGroups []int 226 if len(c.HostConfig.GroupAdd) > 0 { 227 addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile) 228 if err != nil { 229 return 0, 0, nil, err 230 } 231 } 232 uid := uint32(execUser.Uid) 233 gid := uint32(execUser.Gid) 234 sgids := append(execUser.Sgids, addGroups...) 235 var additionalGids []uint32 236 for _, g := range sgids { 237 additionalGids = append(additionalGids, uint32(g)) 238 } 239 return uid, gid, additionalGids, nil 240 } 241 242 func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { 243 for i, n := range s.Linux.Namespaces { 244 if n.Type == ns.Type { 245 s.Linux.Namespaces[i] = ns 246 return 247 } 248 } 249 s.Linux.Namespaces = append(s.Linux.Namespaces, ns) 250 } 251 252 func setCapabilities(s *specs.Spec, c *container.Container) error { 253 var caplist []string 254 var err error 255 if c.HostConfig.Privileged { 256 caplist = caps.GetAllCapabilities() 257 } else { 258 caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Effective, c.HostConfig.CapAdd, c.HostConfig.CapDrop) 259 if err != nil { 260 return err 261 } 262 } 263 s.Process.Capabilities.Effective = caplist 264 s.Process.Capabilities.Bounding = caplist 265 s.Process.Capabilities.Permitted = caplist 266 s.Process.Capabilities.Inheritable = caplist 267 return nil 268 } 269 270 func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error { 271 userNS := false 272 // user 273 if c.HostConfig.UsernsMode.IsPrivate() { 274 uidMap := daemon.idMappings.UIDs() 275 if uidMap != nil { 276 userNS = true 277 ns := specs.LinuxNamespace{Type: "user"} 278 setNamespace(s, ns) 279 s.Linux.UIDMappings = specMapping(uidMap) 280 s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs()) 281 } 282 } 283 // network 284 if !c.Config.NetworkDisabled { 285 ns := specs.LinuxNamespace{Type: "network"} 286 parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2) 287 if parts[0] == "container" { 288 nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer()) 289 if err != nil { 290 return err 291 } 292 ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()) 293 if userNS { 294 // to share a net namespace, they must also share a user namespace 295 nsUser := specs.LinuxNamespace{Type: "user"} 296 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()) 297 setNamespace(s, nsUser) 298 } 299 } else if c.HostConfig.NetworkMode.IsHost() { 300 ns.Path = c.NetworkSettings.SandboxKey 301 } 302 setNamespace(s, ns) 303 } 304 305 // ipc 306 ipcMode := c.HostConfig.IpcMode 307 switch { 308 case ipcMode.IsContainer(): 309 ns := specs.LinuxNamespace{Type: "ipc"} 310 ic, err := daemon.getIpcContainer(ipcMode.Container()) 311 if err != nil { 312 return err 313 } 314 ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()) 315 setNamespace(s, ns) 316 if userNS { 317 // to share an IPC namespace, they must also share a user namespace 318 nsUser := specs.LinuxNamespace{Type: "user"} 319 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()) 320 setNamespace(s, nsUser) 321 } 322 case ipcMode.IsHost(): 323 oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc")) 324 case ipcMode.IsEmpty(): 325 // A container was created by an older version of the daemon. 326 // The default behavior used to be what is now called "shareable". 327 fallthrough 328 case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone(): 329 ns := specs.LinuxNamespace{Type: "ipc"} 330 setNamespace(s, ns) 331 default: 332 return fmt.Errorf("Invalid IPC mode: %v", ipcMode) 333 } 334 335 // pid 336 if c.HostConfig.PidMode.IsContainer() { 337 ns := specs.LinuxNamespace{Type: "pid"} 338 pc, err := daemon.getPidContainer(c) 339 if err != nil { 340 return err 341 } 342 ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()) 343 setNamespace(s, ns) 344 if userNS { 345 // to share a PID namespace, they must also share a user namespace 346 nsUser := specs.LinuxNamespace{Type: "user"} 347 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()) 348 setNamespace(s, nsUser) 349 } 350 } else if c.HostConfig.PidMode.IsHost() { 351 oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid")) 352 } else { 353 ns := specs.LinuxNamespace{Type: "pid"} 354 setNamespace(s, ns) 355 } 356 // uts 357 if c.HostConfig.UTSMode.IsHost() { 358 oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts")) 359 s.Hostname = "" 360 } 361 362 return nil 363 } 364 365 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping { 366 var ids []specs.LinuxIDMapping 367 for _, item := range s { 368 ids = append(ids, specs.LinuxIDMapping{ 369 HostID: uint32(item.HostID), 370 ContainerID: uint32(item.ContainerID), 371 Size: uint32(item.Size), 372 }) 373 } 374 return ids 375 } 376 377 func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info { 378 for _, m := range mountinfo { 379 if m.Mountpoint == dir { 380 return m 381 } 382 } 383 return nil 384 } 385 386 // Get the source mount point of directory passed in as argument. Also return 387 // optional fields. 388 func getSourceMount(source string) (string, string, error) { 389 // Ensure any symlinks are resolved. 390 sourcePath, err := filepath.EvalSymlinks(source) 391 if err != nil { 392 return "", "", err 393 } 394 395 mountinfos, err := mount.GetMounts() 396 if err != nil { 397 return "", "", err 398 } 399 400 mountinfo := getMountInfo(mountinfos, sourcePath) 401 if mountinfo != nil { 402 return sourcePath, mountinfo.Optional, nil 403 } 404 405 path := sourcePath 406 for { 407 path = filepath.Dir(path) 408 409 mountinfo = getMountInfo(mountinfos, path) 410 if mountinfo != nil { 411 return path, mountinfo.Optional, nil 412 } 413 414 if path == "/" { 415 break 416 } 417 } 418 419 // If we are here, we did not find parent mount. Something is wrong. 420 return "", "", fmt.Errorf("Could not find source mount of %s", source) 421 } 422 423 const ( 424 sharedPropagationOption = "shared:" 425 slavePropagationOption = "master:" 426 ) 427 428 // hasMountinfoOption checks if any of the passed any of the given option values 429 // are set in the passed in option string. 430 func hasMountinfoOption(opts string, vals ...string) bool { 431 for _, opt := range strings.Split(opts, " ") { 432 for _, val := range vals { 433 if strings.HasPrefix(opt, val) { 434 return true 435 } 436 } 437 } 438 return false 439 } 440 441 // Ensure mount point on which path is mounted, is shared. 442 func ensureShared(path string) error { 443 sourceMount, optionalOpts, err := getSourceMount(path) 444 if err != nil { 445 return err 446 } 447 // Make sure source mount point is shared. 448 if !hasMountinfoOption(optionalOpts, sharedPropagationOption) { 449 return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount) 450 } 451 return nil 452 } 453 454 // Ensure mount point on which path is mounted, is either shared or slave. 455 func ensureSharedOrSlave(path string) error { 456 sourceMount, optionalOpts, err := getSourceMount(path) 457 if err != nil { 458 return err 459 } 460 461 if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) { 462 return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount) 463 } 464 return nil 465 } 466 467 // Get the set of mount flags that are set on the mount that contains the given 468 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that 469 // bind-mounting "with options" will not fail with user namespaces, due to 470 // kernel restrictions that require user namespace mounts to preserve 471 // CL_UNPRIVILEGED locked flags. 472 func getUnprivilegedMountFlags(path string) ([]string, error) { 473 var statfs unix.Statfs_t 474 if err := unix.Statfs(path, &statfs); err != nil { 475 return nil, err 476 } 477 478 // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. 479 unprivilegedFlags := map[uint64]string{ 480 unix.MS_RDONLY: "ro", 481 unix.MS_NODEV: "nodev", 482 unix.MS_NOEXEC: "noexec", 483 unix.MS_NOSUID: "nosuid", 484 unix.MS_NOATIME: "noatime", 485 unix.MS_RELATIME: "relatime", 486 unix.MS_NODIRATIME: "nodiratime", 487 } 488 489 var flags []string 490 for mask, flag := range unprivilegedFlags { 491 if uint64(statfs.Flags)&mask == mask { 492 flags = append(flags, flag) 493 } 494 } 495 496 return flags, nil 497 } 498 499 var ( 500 mountPropagationMap = map[string]int{ 501 "private": mount.PRIVATE, 502 "rprivate": mount.RPRIVATE, 503 "shared": mount.SHARED, 504 "rshared": mount.RSHARED, 505 "slave": mount.SLAVE, 506 "rslave": mount.RSLAVE, 507 } 508 509 mountPropagationReverseMap = map[int]string{ 510 mount.PRIVATE: "private", 511 mount.RPRIVATE: "rprivate", 512 mount.SHARED: "shared", 513 mount.RSHARED: "rshared", 514 mount.SLAVE: "slave", 515 mount.RSLAVE: "rslave", 516 } 517 ) 518 519 // inSlice tests whether a string is contained in a slice of strings or not. 520 // Comparison is case sensitive 521 func inSlice(slice []string, s string) bool { 522 for _, ss := range slice { 523 if s == ss { 524 return true 525 } 526 } 527 return false 528 } 529 530 func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error { 531 userMounts := make(map[string]struct{}) 532 for _, m := range mounts { 533 userMounts[m.Destination] = struct{}{} 534 } 535 536 // Copy all mounts from spec to defaultMounts, except for 537 // - mounts overriden by a user supplied mount; 538 // - all mounts under /dev if a user supplied /dev is present; 539 // - /dev/shm, in case IpcMode is none. 540 // While at it, also 541 // - set size for /dev/shm from shmsize. 542 var defaultMounts []specs.Mount 543 _, mountDev := userMounts["/dev"] 544 for _, m := range s.Mounts { 545 if _, ok := userMounts[m.Destination]; ok { 546 // filter out mount overridden by a user supplied mount 547 continue 548 } 549 if mountDev && strings.HasPrefix(m.Destination, "/dev/") { 550 // filter out everything under /dev if /dev is user-mounted 551 continue 552 } 553 554 if m.Destination == "/dev/shm" { 555 if c.HostConfig.IpcMode.IsNone() { 556 // filter out /dev/shm for "none" IpcMode 557 continue 558 } 559 // set size for /dev/shm mount from spec 560 sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10) 561 m.Options = append(m.Options, sizeOpt) 562 } 563 564 defaultMounts = append(defaultMounts, m) 565 } 566 567 s.Mounts = defaultMounts 568 for _, m := range mounts { 569 for _, cm := range s.Mounts { 570 if cm.Destination == m.Destination { 571 return duplicateMountPointError(m.Destination) 572 } 573 } 574 575 if m.Source == "tmpfs" { 576 data := m.Data 577 parser := volume.NewParser("linux") 578 options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())} 579 if data != "" { 580 options = append(options, strings.Split(data, ",")...) 581 } 582 583 merged, err := mount.MergeTmpfsOptions(options) 584 if err != nil { 585 return err 586 } 587 588 s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged}) 589 continue 590 } 591 592 mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"} 593 594 // Determine property of RootPropagation based on volume 595 // properties. If a volume is shared, then keep root propagation 596 // shared. This should work for slave and private volumes too. 597 // 598 // For slave volumes, it can be either [r]shared/[r]slave. 599 // 600 // For private volumes any root propagation value should work. 601 pFlag := mountPropagationMap[m.Propagation] 602 switch pFlag { 603 case mount.SHARED, mount.RSHARED: 604 if err := ensureShared(m.Source); err != nil { 605 return err 606 } 607 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 608 if rootpg != mount.SHARED && rootpg != mount.RSHARED { 609 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] 610 } 611 case mount.SLAVE, mount.RSLAVE: 612 var fallback bool 613 if err := ensureSharedOrSlave(m.Source); err != nil { 614 // For backwards compatability purposes, treat mounts from the daemon root 615 // as special since we automatically add rslave propagation to these mounts 616 // when the user did not set anything, so we should fallback to the old 617 // behavior which is to use private propagation which is normally the 618 // default. 619 if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) { 620 return err 621 } 622 623 cm, ok := c.MountPoints[m.Destination] 624 if !ok { 625 return err 626 } 627 if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" { 628 // This means the user explicitly set a propagation, do not fallback in that case. 629 return err 630 } 631 fallback = true 632 logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root") 633 } 634 if !fallback { 635 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 636 if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { 637 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] 638 } 639 } 640 } 641 642 opts := []string{"rbind"} 643 if !m.Writable { 644 opts = append(opts, "ro") 645 } 646 if pFlag != 0 { 647 opts = append(opts, mountPropagationReverseMap[pFlag]) 648 } 649 650 // If we are using user namespaces, then we must make sure that we 651 // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source 652 // "mount" when we bind-mount. The reason for this is that at the point 653 // when runc sets up the root filesystem, it is already inside a user 654 // namespace, and thus cannot change any flags that are locked. 655 if daemon.configStore.RemappedRoot != "" { 656 unprivOpts, err := getUnprivilegedMountFlags(m.Source) 657 if err != nil { 658 return err 659 } 660 opts = append(opts, unprivOpts...) 661 } 662 663 mt.Options = opts 664 s.Mounts = append(s.Mounts, mt) 665 } 666 667 if s.Root.Readonly { 668 for i, m := range s.Mounts { 669 switch m.Destination { 670 case "/proc", "/dev/pts", "/dev/mqueue", "/dev": 671 continue 672 } 673 if _, ok := userMounts[m.Destination]; !ok { 674 if !inSlice(m.Options, "ro") { 675 s.Mounts[i].Options = append(s.Mounts[i].Options, "ro") 676 } 677 } 678 } 679 } 680 681 if c.HostConfig.Privileged { 682 if !s.Root.Readonly { 683 // clear readonly for /sys 684 for i := range s.Mounts { 685 if s.Mounts[i].Destination == "/sys" { 686 clearReadOnly(&s.Mounts[i]) 687 } 688 } 689 } 690 s.Linux.ReadonlyPaths = nil 691 s.Linux.MaskedPaths = nil 692 } 693 694 // TODO: until a kernel/mount solution exists for handling remount in a user namespace, 695 // we must clear the readonly flag for the cgroups mount (@mrunalp concurs) 696 if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged { 697 for i, m := range s.Mounts { 698 if m.Type == "cgroup" { 699 clearReadOnly(&s.Mounts[i]) 700 } 701 } 702 } 703 704 return nil 705 } 706 707 func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error { 708 linkedEnv, err := daemon.setupLinkedContainers(c) 709 if err != nil { 710 return err 711 } 712 s.Root = &specs.Root{ 713 Path: c.BaseFS.Path(), 714 Readonly: c.HostConfig.ReadonlyRootfs, 715 } 716 if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil { 717 return err 718 } 719 cwd := c.Config.WorkingDir 720 if len(cwd) == 0 { 721 cwd = "/" 722 } 723 s.Process.Args = append([]string{c.Path}, c.Args...) 724 725 // only add the custom init if it is specified and the container is running in its 726 // own private pid namespace. It does not make sense to add if it is running in the 727 // host namespace or another container's pid namespace where we already have an init 728 if c.HostConfig.PidMode.IsPrivate() { 729 if (c.HostConfig.Init != nil && *c.HostConfig.Init) || 730 (c.HostConfig.Init == nil && daemon.configStore.Init) { 731 s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...) 732 var path string 733 if daemon.configStore.InitPath == "" { 734 path, err = exec.LookPath(daemonconfig.DefaultInitBinary) 735 if err != nil { 736 return err 737 } 738 } 739 if daemon.configStore.InitPath != "" { 740 path = daemon.configStore.InitPath 741 } 742 s.Mounts = append(s.Mounts, specs.Mount{ 743 Destination: "/dev/init", 744 Type: "bind", 745 Source: path, 746 Options: []string{"bind", "ro"}, 747 }) 748 } 749 } 750 s.Process.Cwd = cwd 751 s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv) 752 s.Process.Terminal = c.Config.Tty 753 s.Hostname = c.FullHostname() 754 755 return nil 756 } 757 758 func (daemon *Daemon) createSpec(c *container.Container) (*specs.Spec, error) { 759 s := oci.DefaultSpec() 760 if err := daemon.populateCommonSpec(&s, c); err != nil { 761 return nil, err 762 } 763 764 var cgroupsPath string 765 scopePrefix := "docker" 766 parent := "/docker" 767 useSystemd := UsingSystemd(daemon.configStore) 768 if useSystemd { 769 parent = "system.slice" 770 } 771 772 if c.HostConfig.CgroupParent != "" { 773 parent = c.HostConfig.CgroupParent 774 } else if daemon.configStore.CgroupParent != "" { 775 parent = daemon.configStore.CgroupParent 776 } 777 778 if useSystemd { 779 cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID 780 logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath) 781 } else { 782 cgroupsPath = filepath.Join(parent, c.ID) 783 } 784 s.Linux.CgroupsPath = cgroupsPath 785 786 if err := setResources(&s, c.HostConfig.Resources); err != nil { 787 return nil, fmt.Errorf("linux runtime spec resources: %v", err) 788 } 789 s.Linux.Sysctl = c.HostConfig.Sysctls 790 791 p := s.Linux.CgroupsPath 792 if useSystemd { 793 initPath, err := cgroups.GetInitCgroup("cpu") 794 if err != nil { 795 return nil, err 796 } 797 _, err = cgroups.GetOwnCgroup("cpu") 798 if err != nil { 799 return nil, err 800 } 801 p = filepath.Join(initPath, s.Linux.CgroupsPath) 802 } 803 804 // Clean path to guard against things like ../../../BAD 805 parentPath := filepath.Dir(p) 806 if !filepath.IsAbs(parentPath) { 807 parentPath = filepath.Clean("/" + parentPath) 808 } 809 810 if err := daemon.initCgroupsPath(parentPath); err != nil { 811 return nil, fmt.Errorf("linux init cgroups path: %v", err) 812 } 813 if err := setDevices(&s, c); err != nil { 814 return nil, fmt.Errorf("linux runtime spec devices: %v", err) 815 } 816 if err := daemon.setRlimits(&s, c); err != nil { 817 return nil, fmt.Errorf("linux runtime spec rlimits: %v", err) 818 } 819 if err := setUser(&s, c); err != nil { 820 return nil, fmt.Errorf("linux spec user: %v", err) 821 } 822 if err := setNamespaces(daemon, &s, c); err != nil { 823 return nil, fmt.Errorf("linux spec namespaces: %v", err) 824 } 825 if err := setCapabilities(&s, c); err != nil { 826 return nil, fmt.Errorf("linux spec capabilities: %v", err) 827 } 828 if err := setSeccomp(daemon, &s, c); err != nil { 829 return nil, fmt.Errorf("linux seccomp: %v", err) 830 } 831 832 if err := daemon.setupIpcDirs(c); err != nil { 833 return nil, err 834 } 835 836 if err := daemon.setupSecretDir(c); err != nil { 837 return nil, err 838 } 839 840 if err := daemon.setupConfigDir(c); err != nil { 841 return nil, err 842 } 843 844 ms, err := daemon.setupMounts(c) 845 if err != nil { 846 return nil, err 847 } 848 849 if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() { 850 ms = append(ms, c.IpcMounts()...) 851 } 852 853 tmpfsMounts, err := c.TmpfsMounts() 854 if err != nil { 855 return nil, err 856 } 857 ms = append(ms, tmpfsMounts...) 858 859 if m := c.SecretMounts(); m != nil { 860 ms = append(ms, m...) 861 } 862 863 ms = append(ms, c.ConfigMounts()...) 864 865 sort.Sort(mounts(ms)) 866 if err := setMounts(daemon, &s, c, ms); err != nil { 867 return nil, fmt.Errorf("linux mounts: %v", err) 868 } 869 870 for _, ns := range s.Linux.Namespaces { 871 if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled { 872 target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")) 873 if err != nil { 874 return nil, err 875 } 876 877 s.Hooks = &specs.Hooks{ 878 Prestart: []specs.Hook{{ 879 Path: target, // FIXME: cross-platform 880 Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()}, 881 }}, 882 } 883 } 884 } 885 886 if apparmor.IsEnabled() { 887 var appArmorProfile string 888 if c.AppArmorProfile != "" { 889 appArmorProfile = c.AppArmorProfile 890 } else if c.HostConfig.Privileged { 891 appArmorProfile = "unconfined" 892 } else { 893 appArmorProfile = "docker-default" 894 } 895 896 if appArmorProfile == "docker-default" { 897 // Unattended upgrades and other fun services can unload AppArmor 898 // profiles inadvertently. Since we cannot store our profile in 899 // /etc/apparmor.d, nor can we practically add other ways of 900 // telling the system to keep our profile loaded, in order to make 901 // sure that we keep the default profile enabled we dynamically 902 // reload it if necessary. 903 if err := ensureDefaultAppArmorProfile(); err != nil { 904 return nil, err 905 } 906 } 907 908 s.Process.ApparmorProfile = appArmorProfile 909 } 910 s.Process.SelinuxLabel = c.GetProcessLabel() 911 s.Process.NoNewPrivileges = c.NoNewPrivileges 912 s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj 913 s.Linux.MountLabel = c.MountLabel 914 915 return &s, nil 916 } 917 918 func clearReadOnly(m *specs.Mount) { 919 var opt []string 920 for _, o := range m.Options { 921 if o != "ro" { 922 opt = append(opt, o) 923 } 924 } 925 m.Options = opt 926 } 927 928 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig 929 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) { 930 ulimits := c.Ulimits 931 // Merge ulimits with daemon defaults 932 ulIdx := make(map[string]struct{}) 933 for _, ul := range ulimits { 934 ulIdx[ul.Name] = struct{}{} 935 } 936 for name, ul := range daemon.configStore.Ulimits { 937 if _, exists := ulIdx[name]; !exists { 938 ulimits = append(ulimits, ul) 939 } 940 } 941 c.Ulimits = ulimits 942 }