github.com/rhatdan/docker@v0.7.7-0.20180119204836-47a0dcbcd20a/daemon/oci_linux.go (about) 1 package daemon 2 3 import ( 4 "fmt" 5 "io" 6 "os" 7 "os/exec" 8 "path/filepath" 9 "regexp" 10 "sort" 11 "strconv" 12 "strings" 13 14 containertypes "github.com/docker/docker/api/types/container" 15 "github.com/docker/docker/container" 16 "github.com/docker/docker/daemon/caps" 17 daemonconfig "github.com/docker/docker/daemon/config" 18 "github.com/docker/docker/oci" 19 "github.com/docker/docker/pkg/idtools" 20 "github.com/docker/docker/pkg/mount" 21 "github.com/docker/docker/volume" 22 "github.com/opencontainers/runc/libcontainer/apparmor" 23 "github.com/opencontainers/runc/libcontainer/cgroups" 24 "github.com/opencontainers/runc/libcontainer/devices" 25 "github.com/opencontainers/runc/libcontainer/user" 26 specs "github.com/opencontainers/runtime-spec/specs-go" 27 "github.com/sirupsen/logrus" 28 "golang.org/x/sys/unix" 29 ) 30 31 // nolint: gosimple 32 var ( 33 deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$") 34 ) 35 36 func setResources(s *specs.Spec, r containertypes.Resources) error { 37 weightDevices, err := getBlkioWeightDevices(r) 38 if err != nil { 39 return err 40 } 41 readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps) 42 if err != nil { 43 return err 44 } 45 writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps) 46 if err != nil { 47 return err 48 } 49 readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps) 50 if err != nil { 51 return err 52 } 53 writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps) 54 if err != nil { 55 return err 56 } 57 58 memoryRes := getMemoryResources(r) 59 cpuRes, err := getCPUResources(r) 60 if err != nil { 61 return err 62 } 63 blkioWeight := r.BlkioWeight 64 65 specResources := &specs.LinuxResources{ 66 Memory: memoryRes, 67 CPU: cpuRes, 68 BlockIO: &specs.LinuxBlockIO{ 69 Weight: &blkioWeight, 70 WeightDevice: weightDevices, 71 ThrottleReadBpsDevice: readBpsDevice, 72 ThrottleWriteBpsDevice: writeBpsDevice, 73 ThrottleReadIOPSDevice: readIOpsDevice, 74 ThrottleWriteIOPSDevice: writeIOpsDevice, 75 }, 76 Pids: &specs.LinuxPids{ 77 Limit: r.PidsLimit, 78 }, 79 } 80 81 if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 { 82 specResources.Devices = s.Linux.Resources.Devices 83 } 84 85 s.Linux.Resources = specResources 86 return nil 87 } 88 89 func setDevices(s *specs.Spec, c *container.Container) error { 90 // Build lists of devices allowed and created within the container. 91 var devs []specs.LinuxDevice 92 devPermissions := s.Linux.Resources.Devices 93 if c.HostConfig.Privileged { 94 hostDevices, err := devices.HostDevices() 95 if err != nil { 96 return err 97 } 98 for _, d := range hostDevices { 99 devs = append(devs, oci.Device(d)) 100 } 101 devPermissions = []specs.LinuxDeviceCgroup{ 102 { 103 Allow: true, 104 Access: "rwm", 105 }, 106 } 107 } else { 108 for _, deviceMapping := range c.HostConfig.Devices { 109 d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions) 110 if err != nil { 111 return err 112 } 113 devs = append(devs, d...) 114 devPermissions = append(devPermissions, dPermissions...) 115 } 116 117 for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules { 118 ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1) 119 if len(ss[0]) != 5 { 120 return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule) 121 } 122 matches := ss[0] 123 124 dPermissions := specs.LinuxDeviceCgroup{ 125 Allow: true, 126 Type: matches[1], 127 Access: matches[4], 128 } 129 if matches[2] == "*" { 130 major := int64(-1) 131 dPermissions.Major = &major 132 } else { 133 major, err := strconv.ParseInt(matches[2], 10, 64) 134 if err != nil { 135 return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule) 136 } 137 dPermissions.Major = &major 138 } 139 if matches[3] == "*" { 140 minor := int64(-1) 141 dPermissions.Minor = &minor 142 } else { 143 minor, err := strconv.ParseInt(matches[3], 10, 64) 144 if err != nil { 145 return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule) 146 } 147 dPermissions.Minor = &minor 148 } 149 devPermissions = append(devPermissions, dPermissions) 150 } 151 } 152 153 s.Linux.Devices = append(s.Linux.Devices, devs...) 154 s.Linux.Resources.Devices = devPermissions 155 return nil 156 } 157 158 func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error { 159 var rlimits []specs.POSIXRlimit 160 161 // We want to leave the original HostConfig alone so make a copy here 162 hostConfig := *c.HostConfig 163 // Merge with the daemon defaults 164 daemon.mergeUlimits(&hostConfig) 165 for _, ul := range hostConfig.Ulimits { 166 rlimits = append(rlimits, specs.POSIXRlimit{ 167 Type: "RLIMIT_" + strings.ToUpper(ul.Name), 168 Soft: uint64(ul.Soft), 169 Hard: uint64(ul.Hard), 170 }) 171 } 172 173 s.Process.Rlimits = rlimits 174 return nil 175 } 176 177 func setUser(s *specs.Spec, c *container.Container) error { 178 uid, gid, additionalGids, err := getUser(c, c.Config.User) 179 if err != nil { 180 return err 181 } 182 s.Process.User.UID = uid 183 s.Process.User.GID = gid 184 s.Process.User.AdditionalGids = additionalGids 185 return nil 186 } 187 188 func readUserFile(c *container.Container, p string) (io.ReadCloser, error) { 189 fp, err := c.GetResourcePath(p) 190 if err != nil { 191 return nil, err 192 } 193 return os.Open(fp) 194 } 195 196 func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) { 197 passwdPath, err := user.GetPasswdPath() 198 if err != nil { 199 return 0, 0, nil, err 200 } 201 groupPath, err := user.GetGroupPath() 202 if err != nil { 203 return 0, 0, nil, err 204 } 205 passwdFile, err := readUserFile(c, passwdPath) 206 if err == nil { 207 defer passwdFile.Close() 208 } 209 groupFile, err := readUserFile(c, groupPath) 210 if err == nil { 211 defer groupFile.Close() 212 } 213 214 execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile) 215 if err != nil { 216 return 0, 0, nil, err 217 } 218 219 // todo: fix this double read by a change to libcontainer/user pkg 220 groupFile, err = readUserFile(c, groupPath) 221 if err == nil { 222 defer groupFile.Close() 223 } 224 var addGroups []int 225 if len(c.HostConfig.GroupAdd) > 0 { 226 addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile) 227 if err != nil { 228 return 0, 0, nil, err 229 } 230 } 231 uid := uint32(execUser.Uid) 232 gid := uint32(execUser.Gid) 233 sgids := append(execUser.Sgids, addGroups...) 234 var additionalGids []uint32 235 for _, g := range sgids { 236 additionalGids = append(additionalGids, uint32(g)) 237 } 238 return uid, gid, additionalGids, nil 239 } 240 241 func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { 242 for i, n := range s.Linux.Namespaces { 243 if n.Type == ns.Type { 244 s.Linux.Namespaces[i] = ns 245 return 246 } 247 } 248 s.Linux.Namespaces = append(s.Linux.Namespaces, ns) 249 } 250 251 func setCapabilities(s *specs.Spec, c *container.Container) error { 252 var caplist []string 253 var err error 254 if c.HostConfig.Privileged { 255 caplist = caps.GetAllCapabilities() 256 } else { 257 caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Effective, c.HostConfig.CapAdd, c.HostConfig.CapDrop) 258 if err != nil { 259 return err 260 } 261 } 262 s.Process.Capabilities.Effective = caplist 263 s.Process.Capabilities.Bounding = caplist 264 s.Process.Capabilities.Permitted = caplist 265 s.Process.Capabilities.Inheritable = caplist 266 return nil 267 } 268 269 func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error { 270 userNS := false 271 // user 272 if c.HostConfig.UsernsMode.IsPrivate() { 273 uidMap := daemon.idMappings.UIDs() 274 if uidMap != nil { 275 userNS = true 276 ns := specs.LinuxNamespace{Type: "user"} 277 setNamespace(s, ns) 278 s.Linux.UIDMappings = specMapping(uidMap) 279 s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs()) 280 } 281 } 282 // network 283 if !c.Config.NetworkDisabled { 284 ns := specs.LinuxNamespace{Type: "network"} 285 parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2) 286 if parts[0] == "container" { 287 nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer()) 288 if err != nil { 289 return err 290 } 291 ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()) 292 if userNS { 293 // to share a net namespace, they must also share a user namespace 294 nsUser := specs.LinuxNamespace{Type: "user"} 295 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()) 296 setNamespace(s, nsUser) 297 } 298 } else if c.HostConfig.NetworkMode.IsHost() { 299 ns.Path = c.NetworkSettings.SandboxKey 300 } 301 setNamespace(s, ns) 302 } 303 304 // ipc 305 ipcMode := c.HostConfig.IpcMode 306 switch { 307 case ipcMode.IsContainer(): 308 ns := specs.LinuxNamespace{Type: "ipc"} 309 ic, err := daemon.getIpcContainer(ipcMode.Container()) 310 if err != nil { 311 return err 312 } 313 ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()) 314 setNamespace(s, ns) 315 if userNS { 316 // to share an IPC namespace, they must also share a user namespace 317 nsUser := specs.LinuxNamespace{Type: "user"} 318 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()) 319 setNamespace(s, nsUser) 320 } 321 case ipcMode.IsHost(): 322 oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc")) 323 case ipcMode.IsEmpty(): 324 // A container was created by an older version of the daemon. 325 // The default behavior used to be what is now called "shareable". 326 fallthrough 327 case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone(): 328 ns := specs.LinuxNamespace{Type: "ipc"} 329 setNamespace(s, ns) 330 default: 331 return fmt.Errorf("Invalid IPC mode: %v", ipcMode) 332 } 333 334 // pid 335 if c.HostConfig.PidMode.IsContainer() { 336 ns := specs.LinuxNamespace{Type: "pid"} 337 pc, err := daemon.getPidContainer(c) 338 if err != nil { 339 return err 340 } 341 ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()) 342 setNamespace(s, ns) 343 if userNS { 344 // to share a PID namespace, they must also share a user namespace 345 nsUser := specs.LinuxNamespace{Type: "user"} 346 nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()) 347 setNamespace(s, nsUser) 348 } 349 } else if c.HostConfig.PidMode.IsHost() { 350 oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid")) 351 } else { 352 ns := specs.LinuxNamespace{Type: "pid"} 353 setNamespace(s, ns) 354 } 355 // uts 356 if c.HostConfig.UTSMode.IsHost() { 357 oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts")) 358 s.Hostname = "" 359 } 360 361 return nil 362 } 363 364 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping { 365 var ids []specs.LinuxIDMapping 366 for _, item := range s { 367 ids = append(ids, specs.LinuxIDMapping{ 368 HostID: uint32(item.HostID), 369 ContainerID: uint32(item.ContainerID), 370 Size: uint32(item.Size), 371 }) 372 } 373 return ids 374 } 375 376 func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info { 377 for _, m := range mountinfo { 378 if m.Mountpoint == dir { 379 return m 380 } 381 } 382 return nil 383 } 384 385 // Get the source mount point of directory passed in as argument. Also return 386 // optional fields. 387 func getSourceMount(source string) (string, string, error) { 388 // Ensure any symlinks are resolved. 389 sourcePath, err := filepath.EvalSymlinks(source) 390 if err != nil { 391 return "", "", err 392 } 393 394 mountinfos, err := mount.GetMounts() 395 if err != nil { 396 return "", "", err 397 } 398 399 mountinfo := getMountInfo(mountinfos, sourcePath) 400 if mountinfo != nil { 401 return sourcePath, mountinfo.Optional, nil 402 } 403 404 path := sourcePath 405 for { 406 path = filepath.Dir(path) 407 408 mountinfo = getMountInfo(mountinfos, path) 409 if mountinfo != nil { 410 return path, mountinfo.Optional, nil 411 } 412 413 if path == "/" { 414 break 415 } 416 } 417 418 // If we are here, we did not find parent mount. Something is wrong. 419 return "", "", fmt.Errorf("Could not find source mount of %s", source) 420 } 421 422 // Ensure mount point on which path is mounted, is shared. 423 func ensureShared(path string) error { 424 sharedMount := false 425 426 sourceMount, optionalOpts, err := getSourceMount(path) 427 if err != nil { 428 return err 429 } 430 // Make sure source mount point is shared. 431 optsSplit := strings.Split(optionalOpts, " ") 432 for _, opt := range optsSplit { 433 if strings.HasPrefix(opt, "shared:") { 434 sharedMount = true 435 break 436 } 437 } 438 439 if !sharedMount { 440 return fmt.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount) 441 } 442 return nil 443 } 444 445 // Ensure mount point on which path is mounted, is either shared or slave. 446 func ensureSharedOrSlave(path string) error { 447 sharedMount := false 448 slaveMount := false 449 450 sourceMount, optionalOpts, err := getSourceMount(path) 451 if err != nil { 452 return err 453 } 454 // Make sure source mount point is shared. 455 optsSplit := strings.Split(optionalOpts, " ") 456 for _, opt := range optsSplit { 457 if strings.HasPrefix(opt, "shared:") { 458 sharedMount = true 459 break 460 } else if strings.HasPrefix(opt, "master:") { 461 slaveMount = true 462 break 463 } 464 } 465 466 if !sharedMount && !slaveMount { 467 return fmt.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount) 468 } 469 return nil 470 } 471 472 // Get the set of mount flags that are set on the mount that contains the given 473 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that 474 // bind-mounting "with options" will not fail with user namespaces, due to 475 // kernel restrictions that require user namespace mounts to preserve 476 // CL_UNPRIVILEGED locked flags. 477 func getUnprivilegedMountFlags(path string) ([]string, error) { 478 var statfs unix.Statfs_t 479 if err := unix.Statfs(path, &statfs); err != nil { 480 return nil, err 481 } 482 483 // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. 484 unprivilegedFlags := map[uint64]string{ 485 unix.MS_RDONLY: "ro", 486 unix.MS_NODEV: "nodev", 487 unix.MS_NOEXEC: "noexec", 488 unix.MS_NOSUID: "nosuid", 489 unix.MS_NOATIME: "noatime", 490 unix.MS_RELATIME: "relatime", 491 unix.MS_NODIRATIME: "nodiratime", 492 } 493 494 var flags []string 495 for mask, flag := range unprivilegedFlags { 496 if uint64(statfs.Flags)&mask == mask { 497 flags = append(flags, flag) 498 } 499 } 500 501 return flags, nil 502 } 503 504 var ( 505 mountPropagationMap = map[string]int{ 506 "private": mount.PRIVATE, 507 "rprivate": mount.RPRIVATE, 508 "shared": mount.SHARED, 509 "rshared": mount.RSHARED, 510 "slave": mount.SLAVE, 511 "rslave": mount.RSLAVE, 512 } 513 514 mountPropagationReverseMap = map[int]string{ 515 mount.PRIVATE: "private", 516 mount.RPRIVATE: "rprivate", 517 mount.SHARED: "shared", 518 mount.RSHARED: "rshared", 519 mount.SLAVE: "slave", 520 mount.RSLAVE: "rslave", 521 } 522 ) 523 524 // inSlice tests whether a string is contained in a slice of strings or not. 525 // Comparison is case sensitive 526 func inSlice(slice []string, s string) bool { 527 for _, ss := range slice { 528 if s == ss { 529 return true 530 } 531 } 532 return false 533 } 534 535 func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error { 536 userMounts := make(map[string]struct{}) 537 for _, m := range mounts { 538 userMounts[m.Destination] = struct{}{} 539 } 540 541 // Copy all mounts from spec to defaultMounts, except for 542 // - mounts overriden by a user supplied mount; 543 // - all mounts under /dev if a user supplied /dev is present; 544 // - /dev/shm, in case IpcMode is none. 545 // While at it, also 546 // - set size for /dev/shm from shmsize. 547 var defaultMounts []specs.Mount 548 _, mountDev := userMounts["/dev"] 549 for _, m := range s.Mounts { 550 if _, ok := userMounts[m.Destination]; ok { 551 // filter out mount overridden by a user supplied mount 552 continue 553 } 554 if mountDev && strings.HasPrefix(m.Destination, "/dev/") { 555 // filter out everything under /dev if /dev is user-mounted 556 continue 557 } 558 559 if m.Destination == "/dev/shm" { 560 if c.HostConfig.IpcMode.IsNone() { 561 // filter out /dev/shm for "none" IpcMode 562 continue 563 } 564 // set size for /dev/shm mount from spec 565 sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10) 566 m.Options = append(m.Options, sizeOpt) 567 } 568 569 defaultMounts = append(defaultMounts, m) 570 } 571 572 s.Mounts = defaultMounts 573 for _, m := range mounts { 574 for _, cm := range s.Mounts { 575 if cm.Destination == m.Destination { 576 return duplicateMountPointError(m.Destination) 577 } 578 } 579 580 if m.Source == "tmpfs" { 581 data := m.Data 582 parser := volume.NewParser("linux") 583 options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())} 584 if data != "" { 585 options = append(options, strings.Split(data, ",")...) 586 } 587 588 merged, err := mount.MergeTmpfsOptions(options) 589 if err != nil { 590 return err 591 } 592 593 s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged}) 594 continue 595 } 596 597 mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"} 598 599 // Determine property of RootPropagation based on volume 600 // properties. If a volume is shared, then keep root propagation 601 // shared. This should work for slave and private volumes too. 602 // 603 // For slave volumes, it can be either [r]shared/[r]slave. 604 // 605 // For private volumes any root propagation value should work. 606 pFlag := mountPropagationMap[m.Propagation] 607 if pFlag == mount.SHARED || pFlag == mount.RSHARED { 608 if err := ensureShared(m.Source); err != nil { 609 return err 610 } 611 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 612 if rootpg != mount.SHARED && rootpg != mount.RSHARED { 613 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] 614 } 615 } else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE { 616 if err := ensureSharedOrSlave(m.Source); err != nil { 617 return err 618 } 619 rootpg := mountPropagationMap[s.Linux.RootfsPropagation] 620 if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { 621 s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] 622 } 623 } 624 625 opts := []string{"rbind"} 626 if !m.Writable { 627 opts = append(opts, "ro") 628 } 629 if pFlag != 0 { 630 opts = append(opts, mountPropagationReverseMap[pFlag]) 631 } 632 633 // If we are using user namespaces, then we must make sure that we 634 // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source 635 // "mount" when we bind-mount. The reason for this is that at the point 636 // when runc sets up the root filesystem, it is already inside a user 637 // namespace, and thus cannot change any flags that are locked. 638 if daemon.configStore.RemappedRoot != "" { 639 unprivOpts, err := getUnprivilegedMountFlags(m.Source) 640 if err != nil { 641 return err 642 } 643 opts = append(opts, unprivOpts...) 644 } 645 646 mt.Options = opts 647 s.Mounts = append(s.Mounts, mt) 648 } 649 650 if s.Root.Readonly { 651 for i, m := range s.Mounts { 652 switch m.Destination { 653 case "/proc", "/dev/pts", "/dev/mqueue", "/dev": 654 continue 655 } 656 if _, ok := userMounts[m.Destination]; !ok { 657 if !inSlice(m.Options, "ro") { 658 s.Mounts[i].Options = append(s.Mounts[i].Options, "ro") 659 } 660 } 661 } 662 } 663 664 if c.HostConfig.Privileged { 665 if !s.Root.Readonly { 666 // clear readonly for /sys 667 for i := range s.Mounts { 668 if s.Mounts[i].Destination == "/sys" { 669 clearReadOnly(&s.Mounts[i]) 670 } 671 } 672 } 673 s.Linux.ReadonlyPaths = nil 674 s.Linux.MaskedPaths = nil 675 } 676 677 // TODO: until a kernel/mount solution exists for handling remount in a user namespace, 678 // we must clear the readonly flag for the cgroups mount (@mrunalp concurs) 679 if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged { 680 for i, m := range s.Mounts { 681 if m.Type == "cgroup" { 682 clearReadOnly(&s.Mounts[i]) 683 } 684 } 685 } 686 687 return nil 688 } 689 690 func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error { 691 linkedEnv, err := daemon.setupLinkedContainers(c) 692 if err != nil { 693 return err 694 } 695 s.Root = &specs.Root{ 696 Path: c.BaseFS.Path(), 697 Readonly: c.HostConfig.ReadonlyRootfs, 698 } 699 if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil { 700 return err 701 } 702 cwd := c.Config.WorkingDir 703 if len(cwd) == 0 { 704 cwd = "/" 705 } 706 s.Process.Args = append([]string{c.Path}, c.Args...) 707 708 // only add the custom init if it is specified and the container is running in its 709 // own private pid namespace. It does not make sense to add if it is running in the 710 // host namespace or another container's pid namespace where we already have an init 711 if c.HostConfig.PidMode.IsPrivate() { 712 if (c.HostConfig.Init != nil && *c.HostConfig.Init) || 713 (c.HostConfig.Init == nil && daemon.configStore.Init) { 714 s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...) 715 var path string 716 if daemon.configStore.InitPath == "" { 717 path, err = exec.LookPath(daemonconfig.DefaultInitBinary) 718 if err != nil { 719 return err 720 } 721 } 722 if daemon.configStore.InitPath != "" { 723 path = daemon.configStore.InitPath 724 } 725 s.Mounts = append(s.Mounts, specs.Mount{ 726 Destination: "/dev/init", 727 Type: "bind", 728 Source: path, 729 Options: []string{"bind", "ro"}, 730 }) 731 } 732 } 733 s.Process.Cwd = cwd 734 s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv) 735 s.Process.Terminal = c.Config.Tty 736 s.Hostname = c.FullHostname() 737 738 return nil 739 } 740 741 func (daemon *Daemon) createSpec(c *container.Container) (*specs.Spec, error) { 742 s := oci.DefaultSpec() 743 if err := daemon.populateCommonSpec(&s, c); err != nil { 744 return nil, err 745 } 746 747 var cgroupsPath string 748 scopePrefix := "docker" 749 parent := "/docker" 750 useSystemd := UsingSystemd(daemon.configStore) 751 if useSystemd { 752 parent = "system.slice" 753 } 754 755 if c.HostConfig.CgroupParent != "" { 756 parent = c.HostConfig.CgroupParent 757 } else if daemon.configStore.CgroupParent != "" { 758 parent = daemon.configStore.CgroupParent 759 } 760 761 if useSystemd { 762 cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID 763 logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath) 764 } else { 765 cgroupsPath = filepath.Join(parent, c.ID) 766 } 767 s.Linux.CgroupsPath = cgroupsPath 768 769 if err := setResources(&s, c.HostConfig.Resources); err != nil { 770 return nil, fmt.Errorf("linux runtime spec resources: %v", err) 771 } 772 s.Linux.Sysctl = c.HostConfig.Sysctls 773 774 p := s.Linux.CgroupsPath 775 if useSystemd { 776 initPath, err := cgroups.GetInitCgroup("cpu") 777 if err != nil { 778 return nil, err 779 } 780 _, err = cgroups.GetOwnCgroup("cpu") 781 if err != nil { 782 return nil, err 783 } 784 p = filepath.Join(initPath, s.Linux.CgroupsPath) 785 } 786 787 // Clean path to guard against things like ../../../BAD 788 parentPath := filepath.Dir(p) 789 if !filepath.IsAbs(parentPath) { 790 parentPath = filepath.Clean("/" + parentPath) 791 } 792 793 if err := daemon.initCgroupsPath(parentPath); err != nil { 794 return nil, fmt.Errorf("linux init cgroups path: %v", err) 795 } 796 if err := setDevices(&s, c); err != nil { 797 return nil, fmt.Errorf("linux runtime spec devices: %v", err) 798 } 799 if err := daemon.setRlimits(&s, c); err != nil { 800 return nil, fmt.Errorf("linux runtime spec rlimits: %v", err) 801 } 802 if err := setUser(&s, c); err != nil { 803 return nil, fmt.Errorf("linux spec user: %v", err) 804 } 805 if err := setNamespaces(daemon, &s, c); err != nil { 806 return nil, fmt.Errorf("linux spec namespaces: %v", err) 807 } 808 if err := setCapabilities(&s, c); err != nil { 809 return nil, fmt.Errorf("linux spec capabilities: %v", err) 810 } 811 if err := setSeccomp(daemon, &s, c); err != nil { 812 return nil, fmt.Errorf("linux seccomp: %v", err) 813 } 814 815 if err := daemon.setupIpcDirs(c); err != nil { 816 return nil, err 817 } 818 819 if err := daemon.setupSecretDir(c); err != nil { 820 return nil, err 821 } 822 823 if err := daemon.setupConfigDir(c); err != nil { 824 return nil, err 825 } 826 827 ms, err := daemon.setupMounts(c) 828 if err != nil { 829 return nil, err 830 } 831 832 if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() { 833 ms = append(ms, c.IpcMounts()...) 834 } 835 836 tmpfsMounts, err := c.TmpfsMounts() 837 if err != nil { 838 return nil, err 839 } 840 ms = append(ms, tmpfsMounts...) 841 842 if m := c.SecretMounts(); m != nil { 843 ms = append(ms, m...) 844 } 845 846 ms = append(ms, c.ConfigMounts()...) 847 848 sort.Sort(mounts(ms)) 849 if err := setMounts(daemon, &s, c, ms); err != nil { 850 return nil, fmt.Errorf("linux mounts: %v", err) 851 } 852 853 for _, ns := range s.Linux.Namespaces { 854 if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled { 855 target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")) 856 if err != nil { 857 return nil, err 858 } 859 860 s.Hooks = &specs.Hooks{ 861 Prestart: []specs.Hook{{ 862 Path: target, // FIXME: cross-platform 863 Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()}, 864 }}, 865 } 866 } 867 } 868 869 if apparmor.IsEnabled() { 870 var appArmorProfile string 871 if c.AppArmorProfile != "" { 872 appArmorProfile = c.AppArmorProfile 873 } else if c.HostConfig.Privileged { 874 appArmorProfile = "unconfined" 875 } else { 876 appArmorProfile = "docker-default" 877 } 878 879 if appArmorProfile == "docker-default" { 880 // Unattended upgrades and other fun services can unload AppArmor 881 // profiles inadvertently. Since we cannot store our profile in 882 // /etc/apparmor.d, nor can we practically add other ways of 883 // telling the system to keep our profile loaded, in order to make 884 // sure that we keep the default profile enabled we dynamically 885 // reload it if necessary. 886 if err := ensureDefaultAppArmorProfile(); err != nil { 887 return nil, err 888 } 889 } 890 891 s.Process.ApparmorProfile = appArmorProfile 892 } 893 s.Process.SelinuxLabel = c.GetProcessLabel() 894 s.Process.NoNewPrivileges = c.NoNewPrivileges 895 s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj 896 s.Linux.MountLabel = c.MountLabel 897 898 return &s, nil 899 } 900 901 func clearReadOnly(m *specs.Mount) { 902 var opt []string 903 for _, o := range m.Options { 904 if o != "ro" { 905 opt = append(opt, o) 906 } 907 } 908 m.Options = opt 909 } 910 911 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig 912 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) { 913 ulimits := c.Ulimits 914 // Merge ulimits with daemon defaults 915 ulIdx := make(map[string]struct{}) 916 for _, ul := range ulimits { 917 ulIdx[ul.Name] = struct{}{} 918 } 919 for name, ul := range daemon.configStore.Ulimits { 920 if _, exists := ulIdx[name]; !exists { 921 ulimits = append(ulimits, ul) 922 } 923 } 924 c.Ulimits = ulimits 925 }