github.com/Heebron/moby@v0.0.0-20221111184709-6eab4f55faf7/daemon/oci_linux.go (about) 1 package daemon // import "github.com/docker/docker/daemon" 2 3 import ( 4 "context" 5 "fmt" 6 "os" 7 "os/exec" 8 "path/filepath" 9 "sort" 10 "strconv" 11 "strings" 12 13 cdcgroups "github.com/containerd/cgroups" 14 "github.com/containerd/containerd/containers" 15 coci "github.com/containerd/containerd/oci" 16 "github.com/containerd/containerd/pkg/apparmor" 17 "github.com/containerd/containerd/pkg/userns" 18 containertypes "github.com/docker/docker/api/types/container" 19 "github.com/docker/docker/container" 20 dconfig "github.com/docker/docker/daemon/config" 21 "github.com/docker/docker/errdefs" 22 "github.com/docker/docker/oci" 23 "github.com/docker/docker/oci/caps" 24 "github.com/docker/docker/pkg/idtools" 25 "github.com/docker/docker/pkg/stringid" 26 "github.com/docker/docker/rootless/specconv" 27 volumemounts "github.com/docker/docker/volume/mounts" 28 "github.com/moby/sys/mount" 29 "github.com/moby/sys/mountinfo" 30 "github.com/opencontainers/runc/libcontainer/cgroups" 31 "github.com/opencontainers/runc/libcontainer/user" 32 specs "github.com/opencontainers/runtime-spec/specs-go" 33 "github.com/pkg/errors" 34 "github.com/sirupsen/logrus" 35 "golang.org/x/sys/unix" 36 ) 37 38 const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary 39 40 // WithRlimits sets the container's rlimits along with merging the daemon's rlimits 41 func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts { 42 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 43 var rlimits []specs.POSIXRlimit 44 45 // We want to leave the original HostConfig alone so make a copy here 46 hostConfig := *c.HostConfig 47 // Merge with the daemon defaults 48 daemon.mergeUlimits(&hostConfig) 49 for _, ul := range hostConfig.Ulimits { 50 rlimits = append(rlimits, specs.POSIXRlimit{ 51 Type: "RLIMIT_" + strings.ToUpper(ul.Name), 52 Soft: uint64(ul.Soft), 53 
Hard: uint64(ul.Hard), 54 }) 55 } 56 57 s.Process.Rlimits = rlimits 58 return nil 59 } 60 } 61 62 // WithLibnetwork sets the libnetwork hook 63 func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts { 64 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 65 if s.Hooks == nil { 66 s.Hooks = &specs.Hooks{} 67 } 68 for _, ns := range s.Linux.Namespaces { 69 if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled { 70 target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe") 71 shortNetCtlrID := stringid.TruncateID(daemon.netController.ID()) 72 s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ 73 Path: target, 74 Args: []string{ 75 "libnetwork-setkey", 76 "-exec-root=" + daemon.configStore.GetExecRoot(), 77 c.ID, 78 shortNetCtlrID, 79 }, 80 }) 81 } 82 } 83 return nil 84 } 85 } 86 87 // WithRootless sets the spec to the rootless configuration 88 func WithRootless(daemon *Daemon) coci.SpecOpts { 89 return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 90 var v2Controllers []string 91 if daemon.getCgroupDriver() == cgroupSystemdDriver { 92 if cdcgroups.Mode() != cdcgroups.Unified { 93 return errors.New("rootless systemd driver doesn't support cgroup v1") 94 } 95 rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID") 96 if rootlesskitParentEUID == "" { 97 return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)") 98 } 99 euid, err := strconv.Atoi(rootlesskitParentEUID) 100 if err != nil { 101 return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value") 102 } 103 controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid) 104 controllersFile, err := os.ReadFile(controllersPath) 105 if err != nil { 106 return err 107 } 108 v2Controllers = strings.Fields(string(controllersFile)) 109 } 110 return specconv.ToRootless(s, v2Controllers) 111 } 112 } 113 
// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.OOMScoreAdj = score
		return nil
	}
}

// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}

// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if apparmor.HostSupports() {
			var appArmorProfile string
			if c.AppArmorProfile != "" {
				// Explicit profile requested by the user wins.
				appArmorProfile = c.AppArmorProfile
			} else if c.HostConfig.Privileged {
				appArmorProfile = unconfinedAppArmorProfile
			} else {
				appArmorProfile = defaultAppArmorProfile
			}

			if appArmorProfile == defaultAppArmorProfile {
				// Unattended upgrades and other fun services can unload AppArmor
				// profiles inadvertently. Since we cannot store our profile in
				// /etc/apparmor.d, nor can we practically add other ways of
				// telling the system to keep our profile loaded, in order to make
				// sure that we keep the default profile enabled we dynamically
				// reload it if necessary.
				if err := ensureDefaultAppArmorProfile(); err != nil {
					return err
				}
			}
			s.Process.ApparmorProfile = appArmorProfile
		}
		return nil
	}
}

// WithCapabilities sets the container's capabilties
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}

// resourcePath resolves the path returned by getPath relative to the
// container's root filesystem (scoped via GetResourcePath).
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}

// getUser resolves username against the container's own /etc/passwd and
// /etc/group files, returning the UID/GID plus any additional groups
// (supplementary groups from passwd/group, and HostConfig.GroupAdd).
func getUser(c *container.Container, username string) (specs.User, error) {
	var usr specs.User
	passwdPath, err := resourcePath(c, user.GetPasswdPath)
	if err != nil {
		return usr, err
	}
	groupPath, err := resourcePath(c, user.GetGroupPath)
	if err != nil {
		return usr, err
	}
	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
	if err != nil {
		return usr, err
	}
	usr.UID = uint32(execUser.Uid)
	usr.GID = uint32(execUser.Gid)
	// The primary GID is always included in the additional gids.
	usr.AdditionalGids = []uint32{usr.GID}

	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
		if err != nil {
			return usr, err
		}
	}
	for _, g := range append(execUser.Sgids, addGroups...) {
		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
	}
	return usr, nil
}

// setNamespace replaces an existing namespace entry of the same type in
// the spec, or appends ns if no entry of that type exists yet.
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDMaps
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				// Join the target container's network namespace.
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		}

		// pid
		if !c.HostConfig.PidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
		}
		if c.HostConfig.PidMode.IsContainer() {
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, "uts")
			// The host's hostname is used; clear any spec-level override.
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			if c.HostConfig.CgroupnsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}

// specMapping converts the daemon's idtools ID maps into OCI spec
// LinuxIDMapping entries.
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
	var ids []specs.LinuxIDMapping
	for _, item := range s {
		ids = append(ids, specs.LinuxIDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}

// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
	if err != nil {
		return "", "", err
	}
	if len(mi) < 1 {
		return "", "", fmt.Errorf("Can't find mount point of %s", source)
	}

	// find the longest mount point
	var idx, maxlen int
	for i := range mi {
		if len(mi[i].Mountpoint) > maxlen {
			maxlen = len(mi[i].Mountpoint)
			idx = i
		}
	}
	return mi[idx].Mountpoint, mi[idx].Optional, nil
}

// Prefixes of the mountinfo "optional fields" that indicate mount
// propagation state (see proc(5), /proc/pid/mountinfo).
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)

// hasMountInfoOption checks if any of the given option values
// are set in the passed in option string.
func hasMountInfoOption(opts string, vals ...string) bool {
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}

// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure source mount point is shared.
	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
	}
	return nil
}

// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}

	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
	}
	return nil
}

// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}

	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	unprivilegedFlags := map[uint64]string{
		unix.MS_RDONLY:     "ro",
		unix.MS_NODEV:      "nodev",
		unix.MS_NOEXEC:     "noexec",
		unix.MS_NOSUID:     "nosuid",
		unix.MS_NOATIME:    "noatime",
		unix.MS_RELATIME:   "relatime",
		unix.MS_NODIRATIME: "nodiratime",
	}

	var flags []string
	for mask, flag := range unprivilegedFlags {
		if uint64(statfs.Flags)&mask == mask {
			flags = append(flags, flag)
		}
	}

	return flags, nil
}

// Lookup tables between Docker propagation mode strings and the
// moby/sys/mount flag constants, in both directions.
var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)

// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
	for _, ss := range slice {
		if s == ss {
			return true
		}
	}
	return false
}

// WithMounts sets the container's mounts
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// The named return err makes this defer clean up the secret dir
		// on any failure below, including the setupSecretDir call itself.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		// - mounts overridden by a user supplied mount;
		// - all mounts under /dev if a user supplied /dev is present;
		// - /dev/shm, in case IpcMode is none.
		// While at it, also
		// - set size for /dev/shm from shmsize.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}

// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
714 func sysctlExists(s string) bool { 715 f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/")) 716 _, err := os.Stat(f) 717 return err == nil 718 } 719 720 // WithCommonOptions sets common docker options 721 func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts { 722 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 723 if c.BaseFS == "" && !daemon.UsesSnapshotter() { 724 return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty") 725 } 726 linkedEnv, err := daemon.setupLinkedContainers(c) 727 if err != nil { 728 return err 729 } 730 if !daemon.UsesSnapshotter() { 731 s.Root = &specs.Root{ 732 Path: c.BaseFS, 733 Readonly: c.HostConfig.ReadonlyRootfs, 734 } 735 } 736 if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil { 737 return err 738 } 739 cwd := c.Config.WorkingDir 740 if len(cwd) == 0 { 741 cwd = "/" 742 } 743 s.Process.Args = append([]string{c.Path}, c.Args...) 744 745 // only add the custom init if it is specified and the container is running in its 746 // own private pid namespace. It does not make sense to add if it is running in the 747 // host namespace or another container's pid namespace where we already have an init 748 if c.HostConfig.PidMode.IsPrivate() { 749 if (c.HostConfig.Init != nil && *c.HostConfig.Init) || 750 (c.HostConfig.Init == nil && daemon.configStore.Init) { 751 s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...) 
752 path := daemon.configStore.InitPath 753 if path == "" { 754 path, err = exec.LookPath(dconfig.DefaultInitBinary) 755 if err != nil { 756 return err 757 } 758 } 759 s.Mounts = append(s.Mounts, specs.Mount{ 760 Destination: inContainerInitPath, 761 Type: "bind", 762 Source: path, 763 Options: []string{"bind", "ro"}, 764 }) 765 } 766 } 767 s.Process.Cwd = cwd 768 s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv) 769 s.Process.Terminal = c.Config.Tty 770 771 s.Hostname = c.Config.Hostname 772 setLinuxDomainname(c, s) 773 774 // Add default sysctls that are generally safe and useful; currently we 775 // grant the capabilities to allow these anyway. You can override if 776 // you want to restore the original behaviour. 777 // We do not set network sysctls if network namespace is host, or if we are 778 // joining an existing namespace, only if we create a new net namespace. 779 if c.HostConfig.NetworkMode.IsPrivate() { 780 // We cannot set up ping socket support in a user namespace 781 userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate() 782 if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") { 783 // allow unprivileged ICMP echo sockets without CAP_NET_RAW 784 s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647" 785 } 786 // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE 787 if sysctlExists("net.ipv4.ip_unprivileged_port_start") { 788 s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0" 789 } 790 } 791 792 return nil 793 } 794 } 795 796 // WithCgroups sets the container's cgroups 797 func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts { 798 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 799 var cgroupsPath string 800 scopePrefix := "docker" 801 parent := "/docker" 802 useSystemd := UsingSystemd(daemon.configStore) 803 if useSystemd { 804 parent = "system.slice" 805 if 
daemon.configStore.Rootless { 806 parent = "user.slice" 807 } 808 } 809 810 if c.HostConfig.CgroupParent != "" { 811 parent = c.HostConfig.CgroupParent 812 } else if daemon.configStore.CgroupParent != "" { 813 parent = daemon.configStore.CgroupParent 814 } 815 816 if useSystemd { 817 cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID 818 logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath) 819 } else { 820 cgroupsPath = filepath.Join(parent, c.ID) 821 } 822 s.Linux.CgroupsPath = cgroupsPath 823 824 // the rest is only needed for CPU RT controller 825 826 if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 { 827 return nil 828 } 829 830 p := cgroupsPath 831 if useSystemd { 832 initPath, err := cgroups.GetInitCgroup("cpu") 833 if err != nil { 834 return errors.Wrap(err, "unable to init CPU RT controller") 835 } 836 _, err = cgroups.GetOwnCgroup("cpu") 837 if err != nil { 838 return errors.Wrap(err, "unable to init CPU RT controller") 839 } 840 p = filepath.Join(initPath, s.Linux.CgroupsPath) 841 } 842 843 // Clean path to guard against things like ../../../BAD 844 parentPath := filepath.Dir(p) 845 if !filepath.IsAbs(parentPath) { 846 parentPath = filepath.Clean("/" + parentPath) 847 } 848 849 mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu") 850 if err != nil { 851 return errors.Wrap(err, "unable to init CPU RT controller") 852 } 853 // When docker is run inside docker, the root is based of the host cgroup. 854 // Should this be handled in runc/libcontainer/cgroups ? 
855 if strings.HasPrefix(root, "/docker/") { 856 root = "/" 857 } 858 mnt = filepath.Join(mnt, root) 859 860 if err := daemon.initCPURtController(mnt, parentPath); err != nil { 861 return errors.Wrap(err, "unable to init CPU RT controller") 862 } 863 return nil 864 } 865 } 866 867 // WithDevices sets the container's devices 868 func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts { 869 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 870 // Build lists of devices allowed and created within the container. 871 var devs []specs.LinuxDevice 872 devPermissions := s.Linux.Resources.Devices 873 874 if c.HostConfig.Privileged { 875 hostDevices, err := coci.HostDevices() 876 if err != nil { 877 return err 878 } 879 devs = append(devs, hostDevices...) 880 881 // adding device mappings in privileged containers 882 for _, deviceMapping := range c.HostConfig.Devices { 883 // issue a warning that custom cgroup permissions are ignored in privileged mode 884 if deviceMapping.CgroupPermissions != "rwm" { 885 logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost) 886 } 887 // issue a warning that the device path already exists via /dev mounting in privileged mode 888 if deviceMapping.PathOnHost == deviceMapping.PathInContainer { 889 logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer) 890 continue 891 } 892 d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm") 893 if err != nil { 894 return err 895 } 896 devs = append(devs, d...) 
897 } 898 899 devPermissions = []specs.LinuxDeviceCgroup{ 900 { 901 Allow: true, 902 Access: "rwm", 903 }, 904 } 905 } else { 906 for _, deviceMapping := range c.HostConfig.Devices { 907 d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions) 908 if err != nil { 909 return err 910 } 911 devs = append(devs, d...) 912 devPermissions = append(devPermissions, dPermissions...) 913 } 914 915 var err error 916 devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules) 917 if err != nil { 918 return err 919 } 920 } 921 922 s.Linux.Devices = append(s.Linux.Devices, devs...) 923 s.Linux.Resources.Devices = devPermissions 924 925 for _, req := range c.HostConfig.DeviceRequests { 926 if err := daemon.handleDevice(req, s); err != nil { 927 return err 928 } 929 } 930 return nil 931 } 932 } 933 934 // WithResources applies the container resources 935 func WithResources(c *container.Container) coci.SpecOpts { 936 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 937 r := c.HostConfig.Resources 938 weightDevices, err := getBlkioWeightDevices(r) 939 if err != nil { 940 return err 941 } 942 readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps) 943 if err != nil { 944 return err 945 } 946 writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps) 947 if err != nil { 948 return err 949 } 950 readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps) 951 if err != nil { 952 return err 953 } 954 writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps) 955 if err != nil { 956 return err 957 } 958 959 memoryRes := getMemoryResources(r) 960 cpuRes, err := getCPUResources(r) 961 if err != nil { 962 return err 963 } 964 blkioWeight := r.BlkioWeight 965 966 specResources := &specs.LinuxResources{ 967 Memory: memoryRes, 968 CPU: cpuRes, 969 BlockIO: &specs.LinuxBlockIO{ 
970 Weight: &blkioWeight, 971 WeightDevice: weightDevices, 972 ThrottleReadBpsDevice: readBpsDevice, 973 ThrottleWriteBpsDevice: writeBpsDevice, 974 ThrottleReadIOPSDevice: readIOpsDevice, 975 ThrottleWriteIOPSDevice: writeIOpsDevice, 976 }, 977 Pids: getPidsLimit(r), 978 } 979 980 if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 { 981 specResources.Devices = s.Linux.Resources.Devices 982 } 983 984 s.Linux.Resources = specResources 985 return nil 986 } 987 } 988 989 // WithSysctls sets the container's sysctls 990 func WithSysctls(c *container.Container) coci.SpecOpts { 991 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 992 // We merge the sysctls injected above with the HostConfig (latter takes 993 // precedence for backwards-compatibility reasons). 994 for k, v := range c.HostConfig.Sysctls { 995 s.Linux.Sysctl[k] = v 996 } 997 return nil 998 } 999 } 1000 1001 // WithUser sets the container's user 1002 func WithUser(c *container.Container) coci.SpecOpts { 1003 return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { 1004 var err error 1005 s.Process.User, err = getUser(c, c.Config.User) 1006 return err 1007 } 1008 } 1009 1010 func (daemon *Daemon) createSpec(ctx context.Context, c *container.Container) (retSpec *specs.Spec, err error) { 1011 var ( 1012 opts []coci.SpecOpts 1013 s = oci.DefaultSpec() 1014 ) 1015 opts = append(opts, 1016 WithCommonOptions(daemon, c), 1017 WithCgroups(daemon, c), 1018 WithResources(c), 1019 WithSysctls(c), 1020 WithDevices(daemon, c), 1021 WithUser(c), 1022 WithRlimits(daemon, c), 1023 WithNamespaces(daemon, c), 1024 WithCapabilities(c), 1025 WithSeccomp(daemon, c), 1026 WithMounts(daemon, c), 1027 WithLibnetwork(daemon, c), 1028 WithApparmor(c), 1029 WithSelinux(c), 1030 WithOOMScore(&c.HostConfig.OomScoreAdj), 1031 ) 1032 if c.NoNewPrivileges { 1033 opts = append(opts, coci.WithNoNewPrivileges) 1034 } 1035 if c.Config.Tty { 1036 
opts = append(opts, WithConsoleSize(c)) 1037 } 1038 // Set the masked and readonly paths with regard to the host config options if they are set. 1039 if c.HostConfig.MaskedPaths != nil { 1040 opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths)) 1041 } 1042 if c.HostConfig.ReadonlyPaths != nil { 1043 opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths)) 1044 } 1045 if daemon.configStore.Rootless { 1046 opts = append(opts, WithRootless(daemon)) 1047 } 1048 1049 var snapshotter, snapshotKey string 1050 if daemon.UsesSnapshotter() { 1051 snapshotter = daemon.imageService.StorageDriver() 1052 snapshotKey = c.ID 1053 } 1054 1055 return &s, coci.ApplyOpts(ctx, nil, &containers.Container{ 1056 ID: c.ID, 1057 Snapshotter: snapshotter, 1058 SnapshotKey: snapshotKey, 1059 }, &s, opts...) 1060 } 1061 1062 func clearReadOnly(m *specs.Mount) { 1063 var opt []string 1064 for _, o := range m.Options { 1065 if o != "ro" { 1066 opt = append(opt, o) 1067 } 1068 } 1069 m.Options = opt 1070 } 1071 1072 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig 1073 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) { 1074 ulimits := c.Ulimits 1075 // Merge ulimits with daemon defaults 1076 ulIdx := make(map[string]struct{}) 1077 for _, ul := range ulimits { 1078 ulIdx[ul.Name] = struct{}{} 1079 } 1080 for name, ul := range daemon.configStore.Ulimits { 1081 if _, exists := ulIdx[name]; !exists { 1082 ulimits = append(ulimits, ul) 1083 } 1084 } 1085 c.Ulimits = ulimits 1086 }