github.com/sijibomii/docker@v0.0.0-20231230191044-5cf6ca554647/daemon/oci_linux.go

package daemon

import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/caps"
	"github.com/docker/docker/libcontainerd"
	"github.com/docker/docker/oci"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/mount"
	"github.com/docker/docker/pkg/stringutils"
	"github.com/docker/docker/pkg/symlink"
	"github.com/docker/docker/volume"
	containertypes "github.com/docker/engine-api/types/container"
	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/devices"
	"github.com/opencontainers/runc/libcontainer/user"
	"github.com/opencontainers/specs/specs-go"
)

func setResources(s *specs.Spec, r containertypes.Resources) error {
	weightDevices, err := getBlkioWeightDevices(r)
	if err != nil {
		return err
	}
	readBpsDevice, err := getBlkioReadBpsDevices(r)
	if err != nil {
		return err
	}
	writeBpsDevice, err := getBlkioWriteBpsDevices(r)
	if err != nil {
		return err
	}
	readIOpsDevice, err := getBlkioReadIOpsDevices(r)
	if err != nil {
		return err
	}
	writeIOpsDevice, err := getBlkioWriteIOpsDevices(r)
	if err != nil {
		return err
	}

	memoryRes := getMemoryResources(r)
	cpuRes := getCPUResources(r)
	blkioWeight := r.BlkioWeight

	specResources := &specs.Resources{
		Memory: memoryRes,
		CPU:    cpuRes,
		BlockIO: &specs.BlockIO{
			Weight:                  &blkioWeight,
			WeightDevice:            weightDevices,
			ThrottleReadBpsDevice:   readBpsDevice,
			ThrottleWriteBpsDevice:  writeBpsDevice,
			ThrottleReadIOPSDevice:  readIOpsDevice,
			ThrottleWriteIOPSDevice: writeIOpsDevice,
		},
		DisableOOMKiller: r.OomKillDisable,
		Pids: &specs.Pids{
			Limit: &r.PidsLimit,
		},
	}

	if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
		specResources.Devices = s.Linux.Resources.Devices
	}

	s.Linux.Resources = specResources
	return nil
}

func setDevices(s *specs.Spec, c *container.Container) error {
	// Build lists of devices allowed and created within the container.
	var devs []specs.Device
	devPermissions := s.Linux.Resources.Devices
	if c.HostConfig.Privileged {
		hostDevices, err := devices.HostDevices()
		if err != nil {
			return err
		}
		for _, d := range hostDevices {
			devs = append(devs, specDevice(d))
		}
		rwm := "rwm"
		devPermissions = []specs.DeviceCgroup{
			{
				Allow:  true,
				Access: &rwm,
			},
		}
	} else {
		for _, deviceMapping := range c.HostConfig.Devices {
			d, dPermissions, err := getDevicesFromPath(deviceMapping)
			if err != nil {
				return err
			}
			devs = append(devs, d...)
			devPermissions = append(devPermissions, dPermissions...)
		}
	}

	s.Linux.Devices = append(s.Linux.Devices, devs...)
	s.Linux.Resources.Devices = devPermissions
	return nil
}
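
// setRlimits merges the container's ulimits with the daemon's configured
// defaults (container-level settings take precedence by name) and converts
// them into OCI rlimit entries of the form RLIMIT_<NAME>.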
func setRlimits(daemon *Daemon, s *specs.Spec, c *container.Container) error {
	var rlimits []specs.Rlimit

	ulimits := c.HostConfig.Ulimits
	// Merge ulimits with daemon defaults
	ulIdx := make(map[string]struct{})
	for _, ul := range ulimits {
		ulIdx[ul.Name] = struct{}{}
	}
	for name, ul := range daemon.configStore.Ulimits {
		if _, exists := ulIdx[name]; !exists {
			ulimits = append(ulimits, ul)
		}
	}

	for _, ul := range ulimits {
		rlimits = append(rlimits, specs.Rlimit{
			Type: "RLIMIT_" + strings.ToUpper(ul.Name),
			Soft: uint64(ul.Soft),
			Hard: uint64(ul.Hard),
		})
	}

	s.Process.Rlimits = rlimits
	return nil
}

func setUser(s *specs.Spec, c *container.Container) error {
	uid, gid, additionalGids, err := getUser(c, c.Config.User)
	if err != nil {
		return err
	}
	s.Process.User.UID = uid
	s.Process.User.GID = gid
	s.Process.User.AdditionalGids = additionalGids
	return nil
}

func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
	fp, err := symlink.FollowSymlinkInScope(filepath.Join(c.BaseFS, p), c.BaseFS)
	if err != nil {
		return nil, err
	}
	return os.Open(fp)
}

func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return 0, 0, nil, err
	}
	groupPath, err := user.GetGroupPath()
	if err != nil {
		return 0, 0, nil, err
	}
	passwdFile, err := readUserFile(c, passwdPath)
	if err == nil {
		defer passwdFile.Close()
	}
	groupFile, err := readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}

	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
	if err != nil {
		return 0, 0, nil, err
	}

	// TODO: fix this double read with a change to the libcontainer/user pkg
	groupFile, err = readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}
	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
		if err != nil {
			return 0, 0, nil, err
		}
	}
	uid := uint32(execUser.Uid)
	gid := uint32(execUser.Gid)
	sgids := append(execUser.Sgids, addGroups...)
	var additionalGids []uint32
	for _, g := range sgids {
		additionalGids = append(additionalGids, uint32(g))
	}
	return uid, gid, additionalGids, nil
}
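
// setNamespace adds ns to the spec, replacing any existing namespace entry
// of the same type so that each namespace type is configured at most once.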
func setNamespace(s *specs.Spec, ns specs.Namespace) {
	for i, n := range s.Linux.Namespaces {
		if n.Type == ns.Type {
			s.Linux.Namespaces[i] = ns
			return
		}
	}
	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}

func setCapabilities(s *specs.Spec, c *container.Container) error {
	var caplist []string
	var err error
	if c.HostConfig.Privileged {
		caplist = caps.GetAllCapabilities()
	} else {
		caplist, err = caps.TweakCapabilities(s.Process.Capabilities, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
		if err != nil {
			return err
		}
	}
	s.Process.Capabilities = caplist
	return nil
}

func delNamespace(s *specs.Spec, nsType specs.NamespaceType) {
	idx := -1
	for i, n := range s.Linux.Namespaces {
		if n.Type == nsType {
			idx = i
		}
	}
	if idx >= 0 {
		s.Linux.Namespaces = append(s.Linux.Namespaces[:idx], s.Linux.Namespaces[idx+1:]...)
	}
}

func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
	userNS := false
	// user
	if c.HostConfig.UsernsMode.IsPrivate() {
		uidMap, gidMap := daemon.GetUIDGIDMaps()
		if uidMap != nil {
			userNS = true
			ns := specs.Namespace{Type: "user"}
			setNamespace(s, ns)
			s.Linux.UIDMappings = specMapping(uidMap)
			s.Linux.GIDMappings = specMapping(gidMap)
		}
	}
	// network
	if !c.Config.NetworkDisabled {
		ns := specs.Namespace{Type: "network"}
		parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
		if parts[0] == "container" {
			nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
			if userNS {
				// to share a net namespace, they must also share a user namespace
				nsUser := specs.Namespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.NetworkMode.IsHost() {
			ns.Path = c.NetworkSettings.SandboxKey
		}
		setNamespace(s, ns)
	}
	// ipc
	if c.HostConfig.IpcMode.IsContainer() {
		ns := specs.Namespace{Type: "ipc"}
		ic, err := daemon.getIpcContainer(c)
		if err != nil {
			return err
		}
		ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
		setNamespace(s, ns)
		if userNS {
			// to share an IPC namespace, they must also share a user namespace
			nsUser := specs.Namespace{Type: "user"}
			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
			setNamespace(s, nsUser)
		}
	} else if c.HostConfig.IpcMode.IsHost() {
		delNamespace(s, specs.NamespaceType("ipc"))
	} else {
		ns := specs.Namespace{Type: "ipc"}
		setNamespace(s, ns)
	}
	// pid
	if c.HostConfig.PidMode.IsHost() {
		delNamespace(s, specs.NamespaceType("pid"))
	}
	// uts
	if c.HostConfig.UTSMode.IsHost() {
		delNamespace(s, specs.NamespaceType("uts"))
		s.Hostname = ""
	}

	return nil
}

func specMapping(s []idtools.IDMap) []specs.IDMapping {
	var ids []specs.IDMapping
	for _, item := range s {
		ids = append(ids, specs.IDMapping{
			HostID:      uint32(item.HostID),
			ContainerID: uint32(item.ContainerID),
			Size:        uint32(item.Size),
		})
	}
	return ids
}
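
// getMountInfo returns the mount table entry whose mountpoint matches dir,
// or nil if dir is not itself a mountpoint.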
func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
	for _, m := range mountinfo {
		if m.Mountpoint == dir {
			return m
		}
	}
	return nil
}

// Get the source mount point of the directory passed in as an argument.
// Also return optional fields.
func getSourceMount(source string) (string, string, error) {
	// Ensure any symlinks are resolved.
	sourcePath, err := filepath.EvalSymlinks(source)
	if err != nil {
		return "", "", err
	}

	mountinfos, err := mount.GetMounts()
	if err != nil {
		return "", "", err
	}

	mountinfo := getMountInfo(mountinfos, sourcePath)
	if mountinfo != nil {
		return sourcePath, mountinfo.Optional, nil
	}

	path := sourcePath
	for {
		path = filepath.Dir(path)

		mountinfo = getMountInfo(mountinfos, path)
		if mountinfo != nil {
			return path, mountinfo.Optional, nil
		}

		if path == "/" {
			break
		}
	}

	// If we are here, we did not find the parent mount. Something is wrong.
	return "", "", fmt.Errorf("Could not find source mount of %s", source)
}

// Ensure that the mount point on which path is mounted is shared.
func ensureShared(path string) error {
	sharedMount := false

	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure the source mount point is shared.
	optsSplit := strings.Split(optionalOpts, " ")
	for _, opt := range optsSplit {
		if strings.HasPrefix(opt, "shared:") {
			sharedMount = true
			break
		}
	}

	if !sharedMount {
		return fmt.Errorf("Path %s is mounted on %s but it is not a shared mount.", path, sourceMount)
	}
	return nil
}

// Ensure that the mount point on which path is mounted is either shared or slave.
func ensureSharedOrSlave(path string) error {
	sharedMount := false
	slaveMount := false

	sourceMount, optionalOpts, err := getSourceMount(path)
	if err != nil {
		return err
	}
	// Make sure the source mount point is shared or slave.
	optsSplit := strings.Split(optionalOpts, " ")
	for _, opt := range optsSplit {
		if strings.HasPrefix(opt, "shared:") {
			sharedMount = true
			break
		} else if strings.HasPrefix(opt, "master:") {
			slaveMount = true
			break
		}
	}

	if !sharedMount && !slaveMount {
		return fmt.Errorf("Path %s is mounted on %s but it is not a shared or slave mount.", path, sourceMount)
	}
	return nil
}
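
// mountPropagationMap and mountPropagationReverseMap translate between the
// propagation names used in volume specifications (for example "rslave" in
// a bind mount such as -v /host:/path:rslave) and the flag constants in the
// mount package.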
var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)

func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
	userMounts := make(map[string]struct{})
	for _, m := range mounts {
		userMounts[m.Destination] = struct{}{}
	}

	// Filter out mounts that are overridden by user-supplied mounts
	var defaultMounts []specs.Mount
	_, mountDev := userMounts["/dev"]
	for _, m := range s.Mounts {
		if _, ok := userMounts[m.Destination]; !ok {
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				continue
			}
			defaultMounts = append(defaultMounts, m)
		}
	}

	s.Mounts = defaultMounts
	for _, m := range mounts {
		for _, cm := range s.Mounts {
			if cm.Destination == m.Destination {
				return fmt.Errorf("Duplicate mount point '%s'", m.Destination)
			}
		}

		if m.Source == "tmpfs" {
			opt := []string{"noexec", "nosuid", "nodev", volume.DefaultPropagationMode}
			if m.Data != "" {
				opt = append(opt, strings.Split(m.Data, ",")...)
			} else {
				opt = append(opt, "size=65536k")
			}

			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: opt})
			continue
		}

		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

		// Determine the RootfsPropagation property based on volume
		// properties. If a volume is shared, then keep the root propagation
		// shared. This should work for slave and private volumes too.
		//
		// For slave volumes, it can be either [r]shared/[r]slave.
		//
		// For private volumes any root propagation value should work.
		pFlag := mountPropagationMap[m.Propagation]
		if pFlag == mount.SHARED || pFlag == mount.RSHARED {
			if err := ensureShared(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
			}
		} else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
			if err := ensureSharedOrSlave(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
			}
		}

		opts := []string{"rbind"}
		if !m.Writable {
			opts = append(opts, "ro")
		}
		if pFlag != 0 {
			opts = append(opts, mountPropagationReverseMap[pFlag])
		}

		mt.Options = opts
		s.Mounts = append(s.Mounts, mt)
	}

	if s.Root.Readonly {
		for i, m := range s.Mounts {
			switch m.Destination {
			case "/proc", "/dev/pts", "/dev/mqueue": // /dev is remounted by runc
				continue
			}
			if _, ok := userMounts[m.Destination]; !ok {
				if !stringutils.InSlice(m.Options, "ro") {
					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
				}
			}
		}
	}

	if c.HostConfig.Privileged {
		if !s.Root.Readonly {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}
		s.Linux.ReadonlyPaths = nil
		s.Linux.MaskedPaths = nil
	}

	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
	if uidMap, _ := daemon.GetUIDGIDMaps(); uidMap != nil || c.HostConfig.Privileged {
		for i, m := range s.Mounts {
			if m.Type == "cgroup" {
				clearReadOnly(&s.Mounts[i])
			}
		}
	}

	return nil
}
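
// populateCommonSpec fills in the parts of the spec that every container
// shares: the root filesystem path and read-only flag, the process
// arguments, environment, working directory and terminal settings, and the
// hostname.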
func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
	linkedEnv, err := daemon.setupLinkedContainers(c)
	if err != nil {
		return err
	}
	s.Root = specs.Root{
		Path:     c.BaseFS,
		Readonly: c.HostConfig.ReadonlyRootfs,
	}
	rootUID, rootGID := daemon.GetRemappedUIDGID()
	if err := c.SetupWorkingDirectory(rootUID, rootGID); err != nil {
		return err
	}
	cwd := c.Config.WorkingDir
	if len(cwd) == 0 {
		cwd = "/"
	}
	s.Process.Args = append([]string{c.Path}, c.Args...)
	s.Process.Cwd = cwd
	s.Process.Env = c.CreateDaemonEnvironment(linkedEnv)
	s.Process.Terminal = c.Config.Tty
	s.Hostname = c.FullHostname()

	return nil
}
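
// createSpec builds the complete OCI runtime spec for c on top of
// oci.DefaultSpec(), wiring in resources, devices, rlimits, user,
// namespaces, capabilities, seccomp, and mounts. Under systemd cgroups the
// cgroups path takes the "slice:prefix:name" form (for example
// "system.slice:docker:<id>"); otherwise it is a plain path such as
// "/docker/<id>".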
func (daemon *Daemon) createSpec(c *container.Container) (*libcontainerd.Spec, error) {
	s := oci.DefaultSpec()
	if err := daemon.populateCommonSpec(&s, c); err != nil {
		return nil, err
	}

	var cgroupsPath string
	scopePrefix := "docker"
	parent := "/docker"
	useSystemd := UsingSystemd(daemon.configStore)
	if useSystemd {
		parent = "system.slice"
	}

	if c.HostConfig.CgroupParent != "" {
		parent = c.HostConfig.CgroupParent
	} else if daemon.configStore.CgroupParent != "" {
		parent = daemon.configStore.CgroupParent
	}

	if useSystemd {
		cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
		logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
	} else {
		cgroupsPath = filepath.Join(parent, c.ID)
	}
	s.Linux.CgroupsPath = &cgroupsPath

	if err := setResources(&s, c.HostConfig.Resources); err != nil {
		return nil, fmt.Errorf("linux runtime spec resources: %v", err)
	}
	s.Linux.Resources.OOMScoreAdj = &c.HostConfig.OomScoreAdj
	s.Linux.Sysctl = c.HostConfig.Sysctls
	if err := setDevices(&s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec devices: %v", err)
	}
	if err := setRlimits(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
	}
	if err := setUser(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec user: %v", err)
	}
	if err := setNamespaces(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux spec namespaces: %v", err)
	}
	if err := setCapabilities(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec capabilities: %v", err)
	}
	if err := setSeccomp(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux seccomp: %v", err)
	}

	if err := daemon.setupIpcDirs(c); err != nil {
		return nil, err
	}

	mounts, err := daemon.setupMounts(c)
	if err != nil {
		return nil, err
	}
	mounts = append(mounts, c.IpcMounts()...)
	mounts = append(mounts, c.TmpfsMounts()...)
	if err := setMounts(daemon, &s, c, mounts); err != nil {
		return nil, fmt.Errorf("linux mounts: %v", err)
	}

	for _, ns := range s.Linux.Namespaces {
		if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
			target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"))
			if err != nil {
				return nil, err
			}

			s.Hooks = specs.Hooks{
				Prestart: []specs.Hook{{
					Path: target, // FIXME: cross-platform
					Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
				}},
			}
		}
	}

	if apparmor.IsEnabled() {
		appArmorProfile := "docker-default"
		if len(c.AppArmorProfile) > 0 {
			appArmorProfile = c.AppArmorProfile
		} else if c.HostConfig.Privileged {
			appArmorProfile = "unconfined"
		}
		s.Process.ApparmorProfile = appArmorProfile
	}
	s.Process.SelinuxLabel = c.GetProcessLabel()
	s.Process.NoNewPrivileges = c.NoNewPrivileges

	return (*libcontainerd.Spec)(&s), nil
}

func clearReadOnly(m *specs.Mount) {
	var opt []string
	for _, o := range m.Options {
		if o != "ro" {
			opt = append(opt, o)
		}
	}
	m.Options = opt
}
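
// Illustrative call sequence (a minimal sketch; "ctr" is a hypothetical
// *container.Container that has already been registered with the daemon):
//
//	spec, err := daemon.createSpec(ctr)
//	if err != nil {
//		return err
//	}
//	// The resulting *libcontainerd.Spec is then handed to libcontainerd
//	// when the container is started.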