package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	cdcgroups "github.com/containerd/cgroups/v3"
	"github.com/containerd/containerd/containers"
	coci "github.com/containerd/containerd/oci"
	"github.com/containerd/containerd/pkg/apparmor"
	"github.com/containerd/containerd/pkg/userns"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/container"
	dconfig "github.com/docker/docker/daemon/config"
	"github.com/docker/docker/errdefs"
	"github.com/docker/docker/oci"
	"github.com/docker/docker/oci/caps"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/rootless/specconv"
	"github.com/docker/docker/pkg/stringid"
	volumemounts "github.com/docker/docker/volume/mounts"
	"github.com/moby/sys/mount"
	"github.com/moby/sys/mountinfo"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/user"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

// inContainerInitPath is the path inside the container at which the
// docker-init binary is bind-mounted when an init process is requested.
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary

// WithRlimits sets the container's rlimits along with merging the daemon's rlimits
func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var rlimits []specs.POSIXRlimit

		// We want to leave the original HostConfig alone so make a copy here
		hostConfig := *c.HostConfig
		// Merge with the daemon defaults
		daemon.mergeUlimits(&hostConfig)
		for _, ul := range hostConfig.Ulimits {
			// OCI rlimit names are the upper-cased ulimit name with an
			// "RLIMIT_" prefix (e.g. "nofile" -> "RLIMIT_NOFILE").
			rlimits = append(rlimits, specs.POSIXRlimit{
				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
				Soft: uint64(ul.Soft),
				Hard: uint64(ul.Hard),
			})
		}

		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.Rlimits = rlimits
		return nil
	}
}

// WithLibnetwork sets the libnetwork hook
func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Hooks == nil {
			s.Hooks = &specs.Hooks{}
		}
		// Only install the hook when the container gets a fresh network
		// namespace (type "network" with no explicit path) and networking
		// is not disabled.
		for _, ns := range s.Linux.Namespaces {
			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
				// Re-exec the current daemon binary (via /proc/<pid>/exe)
				// as the prestart hook, in "libnetwork-setkey" mode.
				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
					Path: target,
					Args: []string{
						"libnetwork-setkey",
						"-exec-root=" + daemon.configStore.GetExecRoot(),
						c.ID,
						shortNetCtlrID,
					},
				})
			}
		}
		return nil
	}
}

// WithRootless sets the spec to the rootless configuration
func WithRootless(daemon *Daemon) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var v2Controllers []string
		if daemon.getCgroupDriver() == cgroupSystemdDriver {
			// The systemd cgroup driver in rootless mode requires cgroup v2
			// (unified hierarchy) and a RootlessKit parent that exports its EUID.
			if cdcgroups.Mode() != cdcgroups.Unified {
				return errors.New("rootless systemd driver doesn't support cgroup v1")
			}
			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
			if rootlesskitParentEUID == "" {
				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
			}
			euid, err := strconv.Atoi(rootlesskitParentEUID)
			if err != nil {
				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
			}
			// Read the delegated controllers from the parent user's systemd slice.
			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
			controllersFile, err := os.ReadFile(controllersPath)
			if err != nil {
				return err
			}
			v2Controllers = strings.Fields(string(controllersFile))
		}
		return specconv.ToRootless(s, v2Controllers)
	}
}

// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.OOMScoreAdj = score
		return nil
	}
}

// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}

// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if apparmor.HostSupports() {
			// Profile precedence: explicit per-container profile, then
			// "unconfined" for privileged containers, then the default.
			var appArmorProfile string
			if c.AppArmorProfile != "" {
				appArmorProfile = c.AppArmorProfile
			} else if c.HostConfig.Privileged {
				appArmorProfile = unconfinedAppArmorProfile
			} else {
				appArmorProfile = defaultAppArmorProfile
			}

			if appArmorProfile == defaultAppArmorProfile {
				// Unattended upgrades and other fun services can unload AppArmor
				// profiles inadvertently. Since we cannot store our profile in
				// /etc/apparmor.d, nor can we practically add other ways of
				// telling the system to keep our profile loaded, in order to make
				// sure that we keep the default profile enabled we dynamically
				// reload it if necessary.
				if err := ensureDefaultAppArmorProfile(); err != nil {
					return err
				}
			}
			if s.Process == nil {
				s.Process = &specs.Process{}
			}
			s.Process.ApparmorProfile = appArmorProfile
		}
		return nil
	}
}

// WithCapabilities sets the container's capabilities
func WithCapabilities(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Start from the daemon's default capability set, then apply the
		// user's --cap-add/--cap-drop (privileged grants all).
		capabilities, err := caps.TweakCapabilities(
			caps.DefaultCapabilities(),
			c.HostConfig.CapAdd,
			c.HostConfig.CapDrop,
			c.HostConfig.Privileged,
		)
		if err != nil {
			return err
		}
		return oci.SetCapabilities(s, capabilities)
	}
}

// resourcePath resolves the path returned by getPath relative to the
// container's root filesystem.
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
	p, err := getPath()
	if err != nil {
		return "", err
	}
	return c.GetResourcePath(p)
}

// getUser resolves username (a name, UID, or UID:GID form accepted by
// libcontainer's user package) against the container's own /etc/passwd and
// /etc/group, and returns the resulting UID/GID plus supplementary groups
// (including any HostConfig.GroupAdd entries).
func getUser(c *container.Container, username string) (specs.User, error) {
	var usr specs.User
	passwdPath, err := resourcePath(c, user.GetPasswdPath)
	if err != nil {
		return usr, err
	}
	groupPath, err := resourcePath(c, user.GetGroupPath)
	if err != nil {
		return usr, err
	}
	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
	if err != nil {
		return usr, err
	}
	usr.UID = uint32(execUser.Uid)
	usr.GID = uint32(execUser.Gid)
	// The primary GID is also listed as a supplementary group.
	usr.AdditionalGids = []uint32{usr.GID}

	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
		if err != nil {
			return usr, err
		}
	}
	for _, g := range append(execUser.Sgids, addGroups...) {
		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
	}
	return usr, nil
}

// setNamespace adds ns to the spec, replacing any existing namespace entry
// of the same type so each type appears at most once.
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
	if s.Linux == nil {
		s.Linux = &specs.Linux{}
	}

	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDMaps
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			if c.HostConfig.NetworkMode.IsContainer() {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		}

		// pid
		if !c.HostConfig.PidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
		}
		if c.HostConfig.PidMode.IsContainer() {
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			// Sharing the host UTS namespace: the container must not set
			// its own hostname.
			oci.RemoveNamespace(s, "uts")
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			if c.HostConfig.CgroupnsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}

// specMapping converts the daemon's ID mappings to the OCI spec's
// LinuxIDMapping form.
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
	var ids []specs.LinuxIDMapping
	for _, item := range s {
		ids = append(ids, specs.LinuxIDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}

// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
	if err != nil {
		return "", "", err
	}
	if len(mi) < 1 {
		return "", "", fmt.Errorf("Can't find mount point of %s", source)
	}

	// find the longest mount point: of all parent mounts of sourcePath,
	// the one with the longest mountpoint path is the closest ancestor.
	var idx, maxlen int
	for i := range mi {
		if len(mi[i].Mountpoint) > maxlen {
			maxlen = len(mi[i].Mountpoint)
			idx = i
		}
	}
	return mi[idx].Mountpoint, mi[idx].Optional, nil
}

// Prefixes used in mountinfo "optional fields" to identify propagation modes.
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)

// hasMountInfoOption checks if any of the given option values
// are set in the passed in option string.
func hasMountInfoOption(opts string, vals ...string) bool {
	// opts is a space-separated list of mountinfo optional fields
	// (e.g. "shared:42 master:7"); match on prefixes.
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}

// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure source mount point is shared.
	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
	}
	return nil
}

// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string) error {
	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}

	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
	}
	return nil
}

// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}

	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	unprivilegedFlags := map[uint64]string{
		unix.MS_RDONLY:     "ro",
		unix.MS_NODEV:      "nodev",
		unix.MS_NOEXEC:     "noexec",
		unix.MS_NOSUID:     "nosuid",
		unix.MS_NOATIME:    "noatime",
		unix.MS_RELATIME:   "relatime",
		unix.MS_NODIRATIME: "nodiratime",
	}

	var flags []string
	for mask, flag := range unprivilegedFlags {
		if uint64(statfs.Flags)&mask == mask {
			flags = append(flags, flag)
		}
	}

	return flags, nil
}

// Mappings between user-facing propagation mode strings and the mount
// package's flag constants, in both directions.
var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)

// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
	for _, ss := range slice {
		if s == ss {
			return true
		}
	}
	return false
}

// WithMounts sets the container's mounts
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// Clean up the secret dir if anything below fails; this inspects
		// the named return value err, so it covers all later error paths.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		// - mounts overridden by a user supplied mount;
		// - all mounts under /dev if a user supplied /dev is present;
		// - /dev/shm, in case IpcMode is none.
		// While at it, also
		// - set size for /dev/shm from shmsize.
		//
		// defaultMounts aliases s.Mounts' backing array: this filters
		// in place, which is safe because we only keep elements taken
		// from s.Mounts itself, in order.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				// NOTE(review): s.Linux is dereferenced on the next line, so
				// the nil check below it can never trigger; it appears to
				// rely on s.Linux always being set earlier — confirm.
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					if s.Linux == nil {
						s.Linux = &specs.Linux{}
					}
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					// Same dereference-before-nil-check pattern as above.
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						if s.Linux == nil {
							s.Linux = &specs.Linux{}
						}
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			// Propagate the read-only rootfs to all mounts except the
			// special filesystems below and any user-supplied mounts.
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			if s.Linux != nil {
				s.Linux.ReadonlyPaths = nil
				s.Linux.MaskedPaths = nil
			}
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}

// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
	// Map a dotted sysctl name to its /proc/sys path, e.g.
	// "net.ipv4.ping_group_range" -> /proc/sys/net/ipv4/ping_group_range.
	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
	_, err := os.Stat(f)
	return err == nil
}

// WithCommonOptions sets common docker options
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == "" {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS,
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace. It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				// Prepend the init binary; "--" separates init args from the
				// container's own command.
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path, err := daemon.configStore.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
				if err != nil {
					return err
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}

// WithCgroups sets the container's cgroups
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
			if daemon.configStore.Rootless {
				parent = "user.slice"
			}
		}

		// An explicit cgroup parent (per-container, then daemon-wide)
		// overrides the defaults above.
		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			// systemd expects the "slice:prefix:name" form.
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		s.Linux.CgroupsPath = cgroupsPath

		// the rest is only needed for CPU RT controller

		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
			return nil
		}

		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
		if err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		// When docker is run inside docker, the root is based of the host cgroup.
		// Should this be handled in runc/libcontainer/cgroups ?
		if strings.HasPrefix(root, "/docker/") {
			root = "/"
		}
		mnt = filepath.Join(mnt, root)

		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		return nil
	}
}

// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		// NOTE(review): this dereferences s.Linux.Resources before the nil
		// checks further down; it appears to rely on both being pre-populated
		// by an earlier SpecOpt — confirm.
		devPermissions := s.Linux.Resources.Devices

		if c.HostConfig.Privileged {
			hostDevices, err := coci.HostDevices()
			if err != nil {
				return err
			}
			devs = append(devs, hostDevices...)

			// adding device mappings in privileged containers
			for _, deviceMapping := range c.HostConfig.Devices {
				// issue a warning that custom cgroup permissions are ignored in privileged mode
				if deviceMapping.CgroupPermissions != "rwm" {
					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
				}
				// issue a warning that the device path already exists via /dev mounting in privileged mode
				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
					continue
				}
				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
				if err != nil {
					return err
				}
				devs = append(devs, d...)
			}

			// Privileged containers get a single allow-everything cgroup rule.
			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}
		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)

		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}

// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}

		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}
		s.Linux.Resources.Memory = memoryRes
		s.Linux.Resources.CPU = cpuRes
		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
			WeightDevice:            weightDevices,
			ThrottleReadBpsDevice:   readBpsDevice,
			ThrottleWriteBpsDevice:  writeBpsDevice,
			ThrottleReadIOPSDevice:  readIOpsDevice,
			ThrottleWriteIOPSDevice: writeIOpsDevice,
		}
		if r.BlkioWeight != 0 {
			// Copy to a local so the spec doesn't point into HostConfig.
			w := r.BlkioWeight
			s.Linux.Resources.BlockIO.Weight = &w
		}
		s.Linux.Resources.Pids = getPidsLimit(r)

		return nil
	}
}

// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if len(c.HostConfig.Sysctls) == 0 {
			return nil
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		if s.Linux.Sysctl == nil {
			s.Linux.Sysctl = make(map[string]string)
		}
		// We merge the sysctls injected above with the HostConfig (latter takes
		// precedence for backwards-compatibility reasons).
		for k, v := range c.HostConfig.Sysctls {
			s.Linux.Sysctl[k] = v
		}
		return nil
	}
}

// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		var err error
		s.Process.User, err = getUser(c, c.Config.User)
		return err
	}
}

// createSpec builds the OCI runtime spec for container c by starting from
// oci.DefaultSpec() and applying the SpecOpts below in order. Ordering
// matters: e.g. WithSysctls documents that it merges over sysctls injected
// by earlier opts.
func (daemon *Daemon) createSpec(ctx context.Context, c *container.Container) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		WithCommonOptions(daemon, c),
		WithCgroups(daemon, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		WithRlimits(daemon, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		WithMounts(daemon, c),
		WithLibnetwork(daemon, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
		coci.WithAnnotations(c.HostConfig.Annotations),
		WithUser(c),
	)

	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}
	if c.Config.Tty {
		opts = append(opts, WithConsoleSize(c))
	}
	// Set the masked and readonly paths with regard to the host config options if they are set.
1086 if c.HostConfig.MaskedPaths != nil { 1087 opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths)) 1088 } 1089 if c.HostConfig.ReadonlyPaths != nil { 1090 opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths)) 1091 } 1092 if daemon.configStore.Rootless { 1093 opts = append(opts, WithRootless(daemon)) 1094 } 1095 1096 var snapshotter, snapshotKey string 1097 if daemon.UsesSnapshotter() { 1098 snapshotter = daemon.imageService.StorageDriver() 1099 snapshotKey = c.ID 1100 } 1101 1102 return &s, coci.ApplyOpts(ctx, daemon.containerdCli, &containers.Container{ 1103 ID: c.ID, 1104 Snapshotter: snapshotter, 1105 SnapshotKey: snapshotKey, 1106 }, &s, opts...) 1107 } 1108 1109 func clearReadOnly(m *specs.Mount) { 1110 var opt []string 1111 for _, o := range m.Options { 1112 if o != "ro" { 1113 opt = append(opt, o) 1114 } 1115 } 1116 m.Options = opt 1117 } 1118 1119 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig 1120 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) { 1121 ulimits := c.Ulimits 1122 // Merge ulimits with daemon defaults 1123 ulIdx := make(map[string]struct{}) 1124 for _, ul := range ulimits { 1125 ulIdx[ul.Name] = struct{}{} 1126 } 1127 for name, ul := range daemon.configStore.Ulimits { 1128 if _, exists := ulIdx[name]; !exists { 1129 ulimits = append(ulimits, ul) 1130 } 1131 } 1132 c.Ulimits = ulimits 1133 }