github.com/moby/docker@v26.1.3+incompatible/daemon/daemon_unix.go (about) 1 //go:build linux || freebsd 2 3 package daemon // import "github.com/docker/docker/daemon" 4 5 import ( 6 "bufio" 7 "context" 8 "fmt" 9 "net" 10 "os" 11 "path/filepath" 12 "runtime" 13 "runtime/debug" 14 "strconv" 15 "strings" 16 "sync" 17 "syscall" 18 "time" 19 20 "github.com/containerd/cgroups/v3" 21 "github.com/containerd/containerd/pkg/userns" 22 "github.com/containerd/log" 23 "github.com/docker/docker/api/types/blkiodev" 24 pblkiodev "github.com/docker/docker/api/types/blkiodev" 25 containertypes "github.com/docker/docker/api/types/container" 26 "github.com/docker/docker/api/types/network" 27 "github.com/docker/docker/container" 28 "github.com/docker/docker/daemon/config" 29 "github.com/docker/docker/daemon/initlayer" 30 "github.com/docker/docker/errdefs" 31 "github.com/docker/docker/libcontainerd/remote" 32 "github.com/docker/docker/libnetwork" 33 nwconfig "github.com/docker/docker/libnetwork/config" 34 "github.com/docker/docker/libnetwork/drivers/bridge" 35 "github.com/docker/docker/libnetwork/netlabel" 36 "github.com/docker/docker/libnetwork/options" 37 lntypes "github.com/docker/docker/libnetwork/types" 38 "github.com/docker/docker/opts" 39 "github.com/docker/docker/pkg/idtools" 40 "github.com/docker/docker/pkg/parsers" 41 "github.com/docker/docker/pkg/parsers/kernel" 42 "github.com/docker/docker/pkg/sysinfo" 43 "github.com/docker/docker/runconfig" 44 volumemounts "github.com/docker/docker/volume/mounts" 45 "github.com/moby/sys/mount" 46 specs "github.com/opencontainers/runtime-spec/specs-go" 47 "github.com/opencontainers/selinux/go-selinux" 48 "github.com/opencontainers/selinux/go-selinux/label" 49 "github.com/pkg/errors" 50 "github.com/vishvananda/netlink" 51 "golang.org/x/sys/unix" 52 ) 53 54 const ( 55 isWindows = false 56 57 // These values were used to adjust the CPU-shares for older API versions, 58 // but were not used for validation. 59 // 60 // TODO(thaJeztah): validate min/max values for CPU-shares, similar to Windows: https://github.com/moby/moby/issues/47340 61 // https://github.com/moby/moby/blob/27e85c7b6885c2d21ae90791136d9aba78b83d01/daemon/daemon_windows.go#L97-L99 62 // 63 // See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269 64 // linuxMinCPUShares = 2 65 // linuxMaxCPUShares = 262144 66 67 // It's not kernel limit, we want this 6M limit to account for overhead during startup, and to supply a reasonable functional container 68 linuxMinMemory = 6291456 69 // constants for remapped root settings 70 defaultIDSpecifier = "default" 71 defaultRemappedID = "dockremap" 72 73 // constant for cgroup drivers 74 cgroupFsDriver = "cgroupfs" 75 cgroupSystemdDriver = "systemd" 76 cgroupNoneDriver = "none" 77 ) 78 79 type containerGetter interface { 80 GetContainer(string) (*container.Container, error) 81 } 82 83 func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory { 84 memory := specs.LinuxMemory{} 85 86 if config.Memory > 0 { 87 memory.Limit = &config.Memory 88 } 89 90 if config.MemoryReservation > 0 { 91 memory.Reservation = &config.MemoryReservation 92 } 93 94 if config.MemorySwap > 0 { 95 memory.Swap = &config.MemorySwap 96 } 97 98 if config.MemorySwappiness != nil { 99 swappiness := uint64(*config.MemorySwappiness) 100 memory.Swappiness = &swappiness 101 } 102 103 if config.OomKillDisable != nil { 104 memory.DisableOOMKiller = config.OomKillDisable 105 } 106 107 if config.KernelMemory != 0 { //nolint:staticcheck // ignore SA1019: memory.Kernel is deprecated: kernel-memory limits are not supported in cgroups v2, and were obsoleted in [kernel v5.4]. This field should no longer be used, as it may be ignored by runtimes. 108 memory.Kernel = &config.KernelMemory //nolint:staticcheck // ignore SA1019: memory.Kernel is deprecated: kernel-memory limits are not supported in cgroups v2, and were obsoleted in [kernel v5.4]. This field should no longer be used, as it may be ignored by runtimes. 109 } 110 111 if config.KernelMemoryTCP != 0 { 112 memory.KernelTCP = &config.KernelMemoryTCP 113 } 114 115 if memory != (specs.LinuxMemory{}) { 116 return &memory 117 } 118 return nil 119 } 120 121 func getPidsLimit(config containertypes.Resources) *specs.LinuxPids { 122 if config.PidsLimit == nil { 123 return nil 124 } 125 if *config.PidsLimit <= 0 { 126 // docker API allows 0 and negative values to unset this to be consistent 127 // with default values. When updating values, runc requires -1 to unset 128 // the previous limit. 129 return &specs.LinuxPids{Limit: -1} 130 } 131 return &specs.LinuxPids{Limit: *config.PidsLimit} 132 } 133 134 func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { 135 cpu := specs.LinuxCPU{} 136 137 if config.CPUShares < 0 { 138 return nil, fmt.Errorf("shares: invalid argument") 139 } 140 if config.CPUShares > 0 { 141 shares := uint64(config.CPUShares) 142 cpu.Shares = &shares 143 } 144 145 if config.CpusetCpus != "" { 146 cpu.Cpus = config.CpusetCpus 147 } 148 149 if config.CpusetMems != "" { 150 cpu.Mems = config.CpusetMems 151 } 152 153 if config.NanoCPUs > 0 { 154 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 155 period := uint64(100 * time.Millisecond / time.Microsecond) 156 quota := config.NanoCPUs * int64(period) / 1e9 157 cpu.Period = &period 158 cpu.Quota = "a 159 } 160 161 if config.CPUPeriod != 0 { 162 period := uint64(config.CPUPeriod) 163 cpu.Period = &period 164 } 165 166 if config.CPUQuota != 0 { 167 q := config.CPUQuota 168 cpu.Quota = &q 169 } 170 171 if config.CPURealtimePeriod != 0 { 172 period := uint64(config.CPURealtimePeriod) 173 cpu.RealtimePeriod = &period 174 } 175 176 if config.CPURealtimeRuntime != 0 { 177 c := config.CPURealtimeRuntime 178 cpu.RealtimeRuntime = &c 179 } 180 181 if cpu != (specs.LinuxCPU{}) { 182 return &cpu, nil 183 } 184 return nil, nil 185 } 186 187 func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) { 188 var stat unix.Stat_t 189 var blkioWeightDevices []specs.LinuxWeightDevice 190 191 for _, weightDevice := range config.BlkioWeightDevice { 192 if err := unix.Stat(weightDevice.Path, &stat); err != nil { 193 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: weightDevice.Path, Err: err}) 194 } 195 weight := weightDevice.Weight 196 d := specs.LinuxWeightDevice{Weight: &weight} 197 // The type is 32bit on mips. 198 d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert 199 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert 200 blkioWeightDevices = append(blkioWeightDevices, d) 201 } 202 203 return blkioWeightDevices, nil 204 } 205 206 func (daemon *Daemon) parseSecurityOpt(cfg *config.Config, securityOptions *container.SecurityOptions, hostConfig *containertypes.HostConfig) error { 207 securityOptions.NoNewPrivileges = cfg.NoNewPrivileges 208 return parseSecurityOpt(securityOptions, hostConfig) 209 } 210 211 func parseSecurityOpt(securityOptions *container.SecurityOptions, config *containertypes.HostConfig) error { 212 var ( 213 labelOpts []string 214 err error 215 ) 216 217 for _, opt := range config.SecurityOpt { 218 if opt == "no-new-privileges" { 219 securityOptions.NoNewPrivileges = true 220 continue 221 } 222 if opt == "disable" { 223 labelOpts = append(labelOpts, "disable") 224 continue 225 } 226 227 var k, v string 228 var ok bool 229 if strings.Contains(opt, "=") { 230 k, v, ok = strings.Cut(opt, "=") 231 } else if strings.Contains(opt, ":") { 232 k, v, ok = strings.Cut(opt, ":") 233 log.G(context.TODO()).Warn("Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead.") 234 } 235 if !ok { 236 return fmt.Errorf("invalid --security-opt 1: %q", opt) 237 } 238 239 switch k { 240 case "label": 241 labelOpts = append(labelOpts, v) 242 case "apparmor": 243 securityOptions.AppArmorProfile = v 244 case "seccomp": 245 securityOptions.SeccompProfile = v 246 case "no-new-privileges": 247 noNewPrivileges, err := strconv.ParseBool(v) 248 if err != nil { 249 return fmt.Errorf("invalid --security-opt 2: %q", opt) 250 } 251 securityOptions.NoNewPrivileges = noNewPrivileges 252 default: 253 return fmt.Errorf("invalid --security-opt 2: %q", opt) 254 } 255 } 256 257 securityOptions.ProcessLabel, securityOptions.MountLabel, err = label.InitLabels(labelOpts) 258 return err 259 } 260 261 func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThrottleDevice, error) { 262 var throttleDevices []specs.LinuxThrottleDevice 263 var stat unix.Stat_t 264 265 for _, d := range devs { 266 if err := unix.Stat(d.Path, &stat); err != nil { 267 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: d.Path, Err: err}) 268 } 269 d := specs.LinuxThrottleDevice{Rate: d.Rate} 270 // the type is 32bit on mips 271 d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert 272 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert 273 throttleDevices = append(throttleDevices, d) 274 } 275 276 return throttleDevices, nil 277 } 278 279 // adjustParallelLimit takes a number of objects and a proposed limit and 280 // figures out if it's reasonable (and adjusts it accordingly). This is only 281 // used for daemon startup, which does a lot of parallel loading of containers 282 // (and if we exceed RLIMIT_NOFILE then we're in trouble). 283 func adjustParallelLimit(n int, limit int) int { 284 // Rule-of-thumb overhead factor (how many files will each goroutine open 285 // simultaneously). Yes, this is ugly but to be frank this whole thing is 286 // ugly. 287 const overhead = 2 288 289 // On Linux, we need to ensure that parallelStartupJobs doesn't cause us to 290 // exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it 291 // and give a warning (since in theory the user should increase their 292 // ulimits to the largest possible value for dockerd). 293 var rlim unix.Rlimit 294 if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil { 295 log.G(context.TODO()).Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err) 296 return limit 297 } 298 softRlimit := int(rlim.Cur) 299 300 // Much fewer containers than RLIMIT_NOFILE. No need to adjust anything. 301 if softRlimit > overhead*n { 302 return limit 303 } 304 305 // RLIMIT_NOFILE big enough, no need to adjust anything. 306 if softRlimit > overhead*limit { 307 return limit 308 } 309 310 log.G(context.TODO()).Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit) 311 return softRlimit / overhead 312 } 313 314 // adaptContainerSettings is called during container creation to modify any 315 // settings necessary in the HostConfig structure. 316 func (daemon *Daemon) adaptContainerSettings(daemonCfg *config.Config, hostConfig *containertypes.HostConfig) error { 317 if hostConfig.Memory > 0 && hostConfig.MemorySwap == 0 { 318 // By default, MemorySwap is set to twice the size of Memory. 319 hostConfig.MemorySwap = hostConfig.Memory * 2 320 } 321 if hostConfig.ShmSize == 0 { 322 hostConfig.ShmSize = config.DefaultShmSize 323 if daemonCfg != nil { 324 hostConfig.ShmSize = int64(daemonCfg.ShmSize) 325 } 326 } 327 // Set default IPC mode, if unset for container 328 if hostConfig.IpcMode.IsEmpty() { 329 m := config.DefaultIpcMode 330 if daemonCfg != nil { 331 m = containertypes.IpcMode(daemonCfg.IpcMode) 332 } 333 hostConfig.IpcMode = m 334 } 335 336 // Set default cgroup namespace mode, if unset for container 337 if hostConfig.CgroupnsMode.IsEmpty() { 338 // for cgroup v2: unshare cgroupns even for privileged containers 339 // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 340 if hostConfig.Privileged && cgroups.Mode() != cgroups.Unified { 341 hostConfig.CgroupnsMode = containertypes.CgroupnsModeHost 342 } else { 343 m := containertypes.CgroupnsModeHost 344 if cgroups.Mode() == cgroups.Unified { 345 m = containertypes.CgroupnsModePrivate 346 } 347 if daemonCfg != nil { 348 m = containertypes.CgroupnsMode(daemonCfg.CgroupNamespaceMode) 349 } 350 hostConfig.CgroupnsMode = m 351 } 352 } 353 354 adaptSharedNamespaceContainer(daemon, hostConfig) 355 356 var err error 357 secOpts, err := daemon.generateSecurityOpt(hostConfig) 358 if err != nil { 359 return err 360 } 361 hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, secOpts...) 362 if hostConfig.OomKillDisable == nil { 363 defaultOomKillDisable := false 364 hostConfig.OomKillDisable = &defaultOomKillDisable 365 } 366 367 return nil 368 } 369 370 // adaptSharedNamespaceContainer replaces container name with its ID in hostConfig. 371 // To be more precisely, it modifies `container:name` to `container:ID` of PidMode, IpcMode 372 // and NetworkMode. 373 // 374 // When a container shares its namespace with another container, use ID can keep the namespace 375 // sharing connection between the two containers even the another container is renamed. 376 func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) { 377 containerPrefix := "container:" 378 if hostConfig.PidMode.IsContainer() { 379 pidContainer := hostConfig.PidMode.Container() 380 // if there is any error returned here, we just ignore it and leave it to be 381 // handled in the following logic 382 if c, err := daemon.GetContainer(pidContainer); err == nil { 383 hostConfig.PidMode = containertypes.PidMode(containerPrefix + c.ID) 384 } 385 } 386 if hostConfig.IpcMode.IsContainer() { 387 ipcContainer := hostConfig.IpcMode.Container() 388 if c, err := daemon.GetContainer(ipcContainer); err == nil { 389 hostConfig.IpcMode = containertypes.IpcMode(containerPrefix + c.ID) 390 } 391 } 392 if hostConfig.NetworkMode.IsContainer() { 393 netContainer := hostConfig.NetworkMode.ConnectedContainer() 394 if c, err := daemon.GetContainer(netContainer); err == nil { 395 hostConfig.NetworkMode = containertypes.NetworkMode(containerPrefix + c.ID) 396 } 397 } 398 } 399 400 // verifyPlatformContainerResources performs platform-specific validation of the container's resource-configuration 401 func verifyPlatformContainerResources(resources *containertypes.Resources, sysInfo *sysinfo.SysInfo, update bool) (warnings []string, err error) { 402 fixMemorySwappiness(resources) 403 404 // memory subsystem checks and adjustments 405 if resources.Memory != 0 && resources.Memory < linuxMinMemory { 406 return warnings, fmt.Errorf("Minimum memory limit allowed is 6MB") 407 } 408 if resources.Memory > 0 && !sysInfo.MemoryLimit { 409 warnings = append(warnings, "Your kernel does not support memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 410 resources.Memory = 0 411 resources.MemorySwap = -1 412 } 413 if resources.Memory > 0 && resources.MemorySwap != -1 && !sysInfo.SwapLimit { 414 warnings = append(warnings, "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.") 415 resources.MemorySwap = -1 416 } 417 if resources.Memory > 0 && resources.MemorySwap > 0 && resources.MemorySwap < resources.Memory { 418 return warnings, fmt.Errorf("Minimum memoryswap limit should be larger than memory limit, see usage") 419 } 420 if resources.Memory == 0 && resources.MemorySwap > 0 && !update { 421 return warnings, fmt.Errorf("You should always set the Memory limit when using Memoryswap limit, see usage") 422 } 423 if resources.MemorySwappiness != nil && !sysInfo.MemorySwappiness { 424 warnings = append(warnings, "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded.") 425 resources.MemorySwappiness = nil 426 } 427 if resources.MemorySwappiness != nil { 428 swappiness := *resources.MemorySwappiness 429 if swappiness < 0 || swappiness > 100 { 430 return warnings, fmt.Errorf("Invalid value: %v, valid memory swappiness range is 0-100", swappiness) 431 } 432 } 433 if resources.MemoryReservation > 0 && !sysInfo.MemoryReservation { 434 warnings = append(warnings, "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. Limitation discarded.") 435 resources.MemoryReservation = 0 436 } 437 if resources.MemoryReservation > 0 && resources.MemoryReservation < linuxMinMemory { 438 return warnings, fmt.Errorf("Minimum memory reservation allowed is 6MB") 439 } 440 if resources.Memory > 0 && resources.MemoryReservation > 0 && resources.Memory < resources.MemoryReservation { 441 return warnings, fmt.Errorf("Minimum memory limit can not be less than memory reservation limit, see usage") 442 } 443 if resources.KernelMemory > 0 { 444 // Kernel memory limit is not supported on cgroup v2. 445 // Even on cgroup v1, kernel memory limit (`kmem.limit_in_bytes`) has been deprecated since kernel 5.4. 446 // https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0b5adf44cae99b3ebcc7 447 if !sysInfo.KernelMemory { 448 warnings = append(warnings, "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 449 resources.KernelMemory = 0 450 } 451 if resources.KernelMemory > 0 && resources.KernelMemory < linuxMinMemory { 452 return warnings, fmt.Errorf("Minimum kernel memory limit allowed is 6MB") 453 } 454 if !kernel.CheckKernelVersion(4, 0, 0) { 455 warnings = append(warnings, "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable.") 456 } 457 } 458 if resources.OomKillDisable != nil && !sysInfo.OomKillDisable { 459 // only produce warnings if the setting wasn't to *disable* the OOM Kill; no point 460 // warning the caller if they already wanted the feature to be off 461 if *resources.OomKillDisable { 462 warnings = append(warnings, "Your kernel does not support OomKillDisable. OomKillDisable discarded.") 463 } 464 resources.OomKillDisable = nil 465 } 466 if resources.OomKillDisable != nil && *resources.OomKillDisable && resources.Memory == 0 { 467 warnings = append(warnings, "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources.") 468 } 469 if resources.PidsLimit != nil && !sysInfo.PidsLimit { 470 if *resources.PidsLimit > 0 { 471 warnings = append(warnings, "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded.") 472 } 473 resources.PidsLimit = nil 474 } 475 476 // cpu subsystem checks and adjustments 477 if resources.NanoCPUs > 0 && resources.CPUPeriod > 0 { 478 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Period cannot both be set") 479 } 480 if resources.NanoCPUs > 0 && resources.CPUQuota > 0 { 481 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Quota cannot both be set") 482 } 483 if resources.NanoCPUs > 0 && !sysInfo.CPUCfs { 484 return warnings, fmt.Errorf("NanoCPUs can not be set, as your kernel does not support CPU CFS scheduler or the cgroup is not mounted") 485 } 486 // The highest precision we could get on Linux is 0.001, by setting 487 // cpu.cfs_period_us=1000ms 488 // cpu.cfs_quota=1ms 489 // See the following link for details: 490 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 491 // Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error. 492 // The error message is 0.01 so that this is consistent with Windows 493 if resources.NanoCPUs < 0 || resources.NanoCPUs > int64(sysinfo.NumCPU())*1e9 { 494 return warnings, fmt.Errorf("Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available", sysinfo.NumCPU(), sysinfo.NumCPU()) 495 } 496 497 if resources.CPUShares > 0 && !sysInfo.CPUShares { 498 warnings = append(warnings, "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded.") 499 resources.CPUShares = 0 500 } 501 if (resources.CPUPeriod != 0 || resources.CPUQuota != 0) && !sysInfo.CPUCfs { 502 warnings = append(warnings, "Your kernel does not support CPU CFS scheduler. CPU period/quota discarded.") 503 resources.CPUPeriod = 0 504 resources.CPUQuota = 0 505 } 506 if resources.CPUPeriod != 0 && (resources.CPUPeriod < 1000 || resources.CPUPeriod > 1000000) { 507 return warnings, fmt.Errorf("CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)") 508 } 509 if resources.CPUQuota > 0 && resources.CPUQuota < 1000 { 510 return warnings, fmt.Errorf("CPU cfs quota can not be less than 1ms (i.e. 1000)") 511 } 512 if resources.CPUPercent > 0 { 513 warnings = append(warnings, fmt.Sprintf("%s does not support CPU percent. Percent discarded.", runtime.GOOS)) 514 resources.CPUPercent = 0 515 } 516 517 // cpuset subsystem checks and adjustments 518 if (resources.CpusetCpus != "" || resources.CpusetMems != "") && !sysInfo.Cpuset { 519 warnings = append(warnings, "Your kernel does not support cpuset or the cgroup is not mounted. Cpuset discarded.") 520 resources.CpusetCpus = "" 521 resources.CpusetMems = "" 522 } 523 cpusAvailable, err := sysInfo.IsCpusetCpusAvailable(resources.CpusetCpus) 524 if err != nil { 525 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset cpus", resources.CpusetCpus) 526 } 527 if !cpusAvailable { 528 return warnings, fmt.Errorf("Requested CPUs are not available - requested %s, available: %s", resources.CpusetCpus, sysInfo.Cpus) 529 } 530 memsAvailable, err := sysInfo.IsCpusetMemsAvailable(resources.CpusetMems) 531 if err != nil { 532 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset mems", resources.CpusetMems) 533 } 534 if !memsAvailable { 535 return warnings, fmt.Errorf("Requested memory nodes are not available - requested %s, available: %s", resources.CpusetMems, sysInfo.Mems) 536 } 537 538 // blkio subsystem checks and adjustments 539 if resources.BlkioWeight > 0 && !sysInfo.BlkioWeight { 540 warnings = append(warnings, "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded.") 541 resources.BlkioWeight = 0 542 } 543 if resources.BlkioWeight > 0 && (resources.BlkioWeight < 10 || resources.BlkioWeight > 1000) { 544 return warnings, fmt.Errorf("Range of blkio weight is from 10 to 1000") 545 } 546 if resources.IOMaximumBandwidth != 0 || resources.IOMaximumIOps != 0 { 547 return warnings, fmt.Errorf("Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps", runtime.GOOS) 548 } 549 if len(resources.BlkioWeightDevice) > 0 && !sysInfo.BlkioWeightDevice { 550 warnings = append(warnings, "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded.") 551 resources.BlkioWeightDevice = []*pblkiodev.WeightDevice{} 552 } 553 if len(resources.BlkioDeviceReadBps) > 0 && !sysInfo.BlkioReadBpsDevice { 554 warnings = append(warnings, "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. Block I/O BPS read limit discarded.") 555 resources.BlkioDeviceReadBps = []*pblkiodev.ThrottleDevice{} 556 } 557 if len(resources.BlkioDeviceWriteBps) > 0 && !sysInfo.BlkioWriteBpsDevice { 558 warnings = append(warnings, "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded.") 559 resources.BlkioDeviceWriteBps = []*pblkiodev.ThrottleDevice{} 560 } 561 if len(resources.BlkioDeviceReadIOps) > 0 && !sysInfo.BlkioReadIOpsDevice { 562 warnings = append(warnings, "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded.") 563 resources.BlkioDeviceReadIOps = []*pblkiodev.ThrottleDevice{} 564 } 565 if len(resources.BlkioDeviceWriteIOps) > 0 && !sysInfo.BlkioWriteIOpsDevice { 566 warnings = append(warnings, "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded.") 567 resources.BlkioDeviceWriteIOps = []*pblkiodev.ThrottleDevice{} 568 } 569 570 return warnings, nil 571 } 572 573 func cgroupDriver(cfg *config.Config) string { 574 if UsingSystemd(cfg) { 575 return cgroupSystemdDriver 576 } 577 if cfg.Rootless { 578 return cgroupNoneDriver 579 } 580 return cgroupFsDriver 581 } 582 583 // getCD gets the raw value of the native.cgroupdriver option, if set. 584 func getCD(config *config.Config) string { 585 for _, option := range config.ExecOptions { 586 key, val, err := parsers.ParseKeyValueOpt(option) 587 if err != nil || !strings.EqualFold(key, "native.cgroupdriver") { 588 continue 589 } 590 return val 591 } 592 return "" 593 } 594 595 // verifyCgroupDriver validates native.cgroupdriver 596 func verifyCgroupDriver(config *config.Config) error { 597 cd := getCD(config) 598 if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver { 599 return nil 600 } 601 if cd == cgroupNoneDriver { 602 return fmt.Errorf("native.cgroupdriver option %s is internally used and cannot be specified manually", cd) 603 } 604 return fmt.Errorf("native.cgroupdriver option %s not supported", cd) 605 } 606 607 // UsingSystemd returns true if cli option includes native.cgroupdriver=systemd 608 func UsingSystemd(config *config.Config) bool { 609 cd := getCD(config) 610 611 if cd == cgroupSystemdDriver { 612 return true 613 } 614 // On cgroup v2 hosts, default to systemd driver 615 if cd == "" && cgroups.Mode() == cgroups.Unified && isRunningSystemd() { 616 return true 617 } 618 return false 619 } 620 621 var ( 622 runningSystemd bool 623 detectSystemd sync.Once 624 ) 625 626 // isRunningSystemd checks whether the host was booted with systemd as its init 627 // system. This functions similarly to systemd's `sd_booted(3)`: internally, it 628 // checks whether /run/systemd/system/ exists and is a directory. 629 // http://www.freedesktop.org/software/systemd/man/sd_booted.html 630 // 631 // NOTE: This function comes from package github.com/coreos/go-systemd/util 632 // It was borrowed here to avoid a dependency on cgo. 633 func isRunningSystemd() bool { 634 detectSystemd.Do(func() { 635 fi, err := os.Lstat("/run/systemd/system") 636 if err != nil { 637 return 638 } 639 runningSystemd = fi.IsDir() 640 }) 641 return runningSystemd 642 } 643 644 // verifyPlatformContainerSettings performs platform-specific validation of the 645 // hostconfig and config structures. 646 func verifyPlatformContainerSettings(daemon *Daemon, daemonCfg *configStore, hostConfig *containertypes.HostConfig, update bool) (warnings []string, err error) { 647 if hostConfig == nil { 648 return nil, nil 649 } 650 sysInfo := daemon.RawSysInfo() 651 652 w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update) 653 654 // no matter err is nil or not, w could have data in itself. 655 warnings = append(warnings, w...) 656 657 if err != nil { 658 return warnings, err 659 } 660 661 if !hostConfig.IpcMode.Valid() { 662 return warnings, errors.Errorf("invalid IPC mode: %v", hostConfig.IpcMode) 663 } 664 if !hostConfig.PidMode.Valid() { 665 return warnings, errors.Errorf("invalid PID mode: %v", hostConfig.PidMode) 666 } 667 if hostConfig.ShmSize < 0 { 668 return warnings, fmt.Errorf("SHM size can not be less than 0") 669 } 670 if !hostConfig.UTSMode.Valid() { 671 return warnings, errors.Errorf("invalid UTS mode: %v", hostConfig.UTSMode) 672 } 673 674 if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 { 675 return warnings, fmt.Errorf("Invalid value %d, range for oom score adj is [-1000, 1000]", hostConfig.OomScoreAdj) 676 } 677 678 // ip-forwarding does not affect container with '--net=host' (or '--net=none') 679 if sysInfo.IPv4ForwardingDisabled && !(hostConfig.NetworkMode.IsHost() || hostConfig.NetworkMode.IsNone()) { 680 warnings = append(warnings, "IPv4 forwarding is disabled. Networking will not work.") 681 } 682 if hostConfig.NetworkMode.IsHost() && len(hostConfig.PortBindings) > 0 { 683 warnings = append(warnings, "Published ports are discarded when using host network mode") 684 } 685 686 // check for various conflicting options with user namespaces 687 if daemonCfg.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() { 688 if hostConfig.Privileged { 689 return warnings, fmt.Errorf("privileged mode is incompatible with user namespaces. You must run the container in the host namespace when running privileged mode") 690 } 691 if hostConfig.NetworkMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 692 return warnings, fmt.Errorf("cannot share the host's network namespace when user namespaces are enabled") 693 } 694 if hostConfig.PidMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 695 return warnings, fmt.Errorf("cannot share the host PID namespace when user namespaces are enabled") 696 } 697 } 698 if hostConfig.CgroupParent != "" && UsingSystemd(&daemonCfg.Config) { 699 // CgroupParent for systemd cgroup should be named as "xxx.slice" 700 if len(hostConfig.CgroupParent) <= 6 || !strings.HasSuffix(hostConfig.CgroupParent, ".slice") { 701 return warnings, fmt.Errorf(`cgroup-parent for systemd cgroup should be a valid slice named as "xxx.slice"`) 702 } 703 } 704 if hostConfig.Runtime == "" { 705 hostConfig.Runtime = daemonCfg.Runtimes.Default 706 } 707 708 if _, _, err := daemonCfg.Runtimes.Get(hostConfig.Runtime); err != nil { 709 return warnings, err 710 } 711 712 parser := volumemounts.NewParser() 713 for dest := range hostConfig.Tmpfs { 714 if err := parser.ValidateTmpfsMountDestination(dest); err != nil { 715 return warnings, err 716 } 717 } 718 719 if !hostConfig.CgroupnsMode.Valid() { 720 return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode) 721 } 722 if hostConfig.CgroupnsMode.IsPrivate() { 723 if !sysInfo.CgroupNamespaces { 724 warnings = append(warnings, "Your kernel does not support cgroup namespaces. Cgroup namespace setting discarded.") 725 } 726 } 727 728 return warnings, nil 729 } 730 731 // verifyDaemonSettings performs validation of daemon config struct 732 func verifyDaemonSettings(conf *config.Config) error { 733 if conf.ContainerdNamespace == conf.ContainerdPluginNamespace { 734 return errors.New("containers namespace and plugins namespace cannot be the same") 735 } 736 // Check for mutually incompatible config options 737 if conf.BridgeConfig.Iface != "" && conf.BridgeConfig.IP != "" { 738 return fmt.Errorf("You specified -b & --bip, mutually exclusive options. Please specify only one") 739 } 740 if !conf.BridgeConfig.EnableIPTables && !conf.BridgeConfig.InterContainerCommunication { 741 return fmt.Errorf("You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true") 742 } 743 if conf.BridgeConfig.EnableIP6Tables && !conf.Experimental { 744 return fmt.Errorf("ip6tables rules are only available if experimental features are enabled") 745 } 746 if !conf.BridgeConfig.EnableIPTables && conf.BridgeConfig.EnableIPMasq { 747 conf.BridgeConfig.EnableIPMasq = false 748 } 749 if err := verifyCgroupDriver(conf); err != nil { 750 return err 751 } 752 if conf.CgroupParent != "" && UsingSystemd(conf) { 753 if len(conf.CgroupParent) <= 6 || !strings.HasSuffix(conf.CgroupParent, ".slice") { 754 return fmt.Errorf(`cgroup-parent for systemd cgroup should be a valid slice named as "xxx.slice"`) 755 } 756 } 757 758 if conf.Rootless && UsingSystemd(conf) && cgroups.Mode() != cgroups.Unified { 759 return fmt.Errorf("exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode") 760 } 761 return nil 762 } 763 764 // checkSystem validates platform-specific requirements 765 func checkSystem() error { 766 return nil 767 } 768 769 // configureMaxThreads sets the Go runtime max threads threshold 770 // which is 90% of the kernel setting from /proc/sys/kernel/threads-max 771 func configureMaxThreads(config *config.Config) error { 772 mt, err := os.ReadFile("/proc/sys/kernel/threads-max") 773 if err != nil { 774 return err 775 } 776 mtint, err := strconv.Atoi(strings.TrimSpace(string(mt))) 777 if err != nil { 778 return err 779 } 780 maxThreads := (mtint / 100) * 90 781 debug.SetMaxThreads(maxThreads) 782 log.G(context.TODO()).Debugf("Golang's threads limit set to %d", maxThreads) 783 return nil 784 } 785 786 func overlaySupportsSelinux() (bool, error) { 787 f, err := os.Open("/proc/kallsyms") 788 if err != nil { 789 if os.IsNotExist(err) { 790 return false, nil 791 } 792 return false, err 793 } 794 defer f.Close() 795 796 s := bufio.NewScanner(f) 797 for s.Scan() { 798 if strings.HasSuffix(s.Text(), " security_inode_copy_up") { 799 return true, nil 800 } 801 } 802 803 return false, s.Err() 804 } 805 806 // configureKernelSecuritySupport configures and validates security support for the kernel 807 func configureKernelSecuritySupport(config *config.Config, driverName string) error { 808 if config.EnableSelinuxSupport { 809 if !selinux.GetEnabled() { 810 log.G(context.TODO()).Warn("Docker could not enable SELinux on the host system") 811 return nil 812 } 813 814 if driverName == "overlay2" || driverName == "overlayfs" { 815 // If driver is overlay2, make sure kernel 816 // supports selinux with overlay. 817 supported, err := overlaySupportsSelinux() 818 if err != nil { 819 return err 820 } 821 822 if !supported { 823 log.G(context.TODO()).Warnf("SELinux is not supported with the %v graph driver on this kernel", driverName) 824 } 825 } 826 } else { 827 selinux.SetDisabled() 828 } 829 return nil 830 } 831 832 // initNetworkController initializes the libnetwork controller and configures 833 // network settings. If there's active sandboxes, configuration changes will not 834 // take effect. 835 func (daemon *Daemon) initNetworkController(cfg *config.Config, activeSandboxes map[string]interface{}) error { 836 netOptions, err := daemon.networkOptions(cfg, daemon.PluginStore, activeSandboxes) 837 if err != nil { 838 return err 839 } 840 841 daemon.netController, err = libnetwork.New(netOptions...) 842 if err != nil { 843 return fmt.Errorf("error obtaining controller instance: %v", err) 844 } 845 846 if len(activeSandboxes) > 0 { 847 log.G(context.TODO()).Info("there are running containers, updated network configuration will not take affect") 848 } else if err := configureNetworking(daemon.netController, cfg); err != nil { 849 return err 850 } 851 852 // Set HostGatewayIP to the default bridge's IP if it is empty 853 setHostGatewayIP(daemon.netController, cfg) 854 return nil 855 } 856 857 func configureNetworking(controller *libnetwork.Controller, conf *config.Config) error { 858 // Create predefined network "none" 859 if n, _ := controller.NetworkByName(network.NetworkNone); n == nil { 860 if _, err := controller.NewNetwork("null", network.NetworkNone, "", libnetwork.NetworkOptionPersist(true)); err != nil { 861 return errors.Wrapf(err, `error creating default %q network`, network.NetworkNone) 862 } 863 } 864 865 // Create predefined network "host" 866 if n, _ := controller.NetworkByName(network.NetworkHost); n == nil { 867 if _, err := controller.NewNetwork("host", network.NetworkHost, "", libnetwork.NetworkOptionPersist(true)); err != nil { 868 return errors.Wrapf(err, `error creating default %q network`, network.NetworkHost) 869 } 870 } 871 872 // Clear stale bridge network 873 if n, err := controller.NetworkByName(network.NetworkBridge); err == nil { 874 if err = n.Delete(); err != nil { 875 return errors.Wrapf(err, `could not delete the default %q network`, network.NetworkBridge) 876 } 877 if len(conf.NetworkConfig.DefaultAddressPools.Value()) > 0 && !conf.LiveRestoreEnabled { 878 removeDefaultBridgeInterface() 879 } 880 } 881 882 if !conf.DisableBridge { 883 // Initialize default driver "bridge" 884 if err := initBridgeDriver(controller, conf.BridgeConfig); err != nil { 885 return err 886 } 887 } else { 888 removeDefaultBridgeInterface() 889 } 890 891 return nil 892 } 893 894 // setHostGatewayIP sets cfg.HostGatewayIP to the default bridge's IP if it is empty. 895 func setHostGatewayIP(controller *libnetwork.Controller, config *config.Config) { 896 if config.HostGatewayIP != nil { 897 return 898 } 899 if n, err := controller.NetworkByName(network.NetworkBridge); err == nil { 900 v4Info, v6Info := n.IpamInfo() 901 if len(v4Info) > 0 { 902 config.HostGatewayIP = v4Info[0].Gateway.IP 903 } else if len(v6Info) > 0 { 904 config.HostGatewayIP = v6Info[0].Gateway.IP 905 } 906 } 907 } 908 909 func driverOptions(config *config.Config) nwconfig.Option { 910 return nwconfig.OptionDriverConfig("bridge", options.Generic{ 911 netlabel.GenericData: options.Generic{ 912 "EnableIPForwarding": config.BridgeConfig.EnableIPForward, 913 "EnableIPTables": config.BridgeConfig.EnableIPTables, 914 "EnableIP6Tables": config.BridgeConfig.EnableIP6Tables, 915 "EnableUserlandProxy": config.BridgeConfig.EnableUserlandProxy, 916 "UserlandProxyPath": config.BridgeConfig.UserlandProxyPath, 917 }, 918 }) 919 } 920 921 func initBridgeDriver(controller *libnetwork.Controller, cfg config.BridgeConfig) error { 922 bridgeName := bridge.DefaultBridgeName 923 if cfg.Iface != "" { 924 bridgeName = cfg.Iface 925 } 926 netOption := map[string]string{ 927 bridge.BridgeName: bridgeName, 928 bridge.DefaultBridge: strconv.FormatBool(true), 929 netlabel.DriverMTU: strconv.Itoa(cfg.MTU), 930 bridge.EnableIPMasquerade: strconv.FormatBool(cfg.EnableIPMasq), 931 bridge.EnableICC: strconv.FormatBool(cfg.InterContainerCommunication), 932 } 933 934 // --ip processing 935 if cfg.DefaultIP != nil { 936 netOption[bridge.DefaultBindingIP] = cfg.DefaultIP.String() 937 } 938 939 ipamV4Conf := &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 940 941 // By default, libnetwork will request an arbitrary available address 942 // pool for the network from the configured IPAM allocator. 943 // Configure it to use the IPv4 network ranges of the existing bridge 944 // interface if one exists with IPv4 addresses assigned to it. 945 946 nwList, nw6List, err := ifaceAddrs(bridgeName) 947 if err != nil { 948 return errors.Wrap(err, "list bridge addresses failed") 949 } 950 951 if len(nwList) > 0 { 952 nw := nwList[0] 953 if len(nwList) > 1 && cfg.FixedCIDR != "" { 954 _, fCIDR, err := net.ParseCIDR(cfg.FixedCIDR) 955 if err != nil { 956 return errors.Wrap(err, "parse CIDR failed") 957 } 958 // Iterate through in case there are multiple addresses for the bridge 959 for _, entry := range nwList { 960 if fCIDR.Contains(entry.IP) { 961 nw = entry 962 break 963 } 964 } 965 } 966 967 ipamV4Conf.PreferredPool = lntypes.GetIPNetCanonical(nw).String() 968 hip, _ := lntypes.GetHostPartIP(nw.IP, nw.Mask) 969 if hip.IsGlobalUnicast() { 970 ipamV4Conf.Gateway = nw.IP.String() 971 } 972 } 973 974 if cfg.IP != "" { 975 ip, ipNet, err := net.ParseCIDR(cfg.IP) 976 if err != nil { 977 return err 978 } 979 ipamV4Conf.PreferredPool = ipNet.String() 980 ipamV4Conf.Gateway = ip.String() 981 } else if bridgeName == bridge.DefaultBridgeName && ipamV4Conf.PreferredPool != "" { 982 log.G(context.TODO()).Infof("Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address", bridgeName, ipamV4Conf.PreferredPool) 983 } 984 985 if cfg.FixedCIDR != "" { 986 _, fCIDR, err := net.ParseCIDR(cfg.FixedCIDR) 987 if err != nil { 988 return err 989 } 990 991 ipamV4Conf.SubPool = fCIDR.String() 992 if ipamV4Conf.PreferredPool == "" { 993 ipamV4Conf.PreferredPool = fCIDR.String() 994 } 995 } 996 997 if cfg.DefaultGatewayIPv4 != nil { 998 ipamV4Conf.AuxAddresses["DefaultGatewayIPv4"] = cfg.DefaultGatewayIPv4.String() 999 } 1000 1001 var ( 1002 deferIPv6Alloc bool 1003 ipamV6Conf *libnetwork.IpamConf 1004 ) 1005 1006 if cfg.EnableIPv6 && cfg.FixedCIDRv6 == "" { 1007 return errdefs.InvalidParameter(errors.New("IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6")) 1008 } else if cfg.FixedCIDRv6 != "" { 1009 _, fCIDRv6, err := net.ParseCIDR(cfg.FixedCIDRv6) 1010 if err != nil { 1011 return err 1012 } 1013 1014 // In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has 1015 // at least 48 host bits, we need to guarantee the current behavior where the containers' 1016 // IPv6 addresses will be constructed based on the containers' interface MAC address. 1017 // We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints 1018 // on this network until after the driver has created the endpoint and returned the 1019 // constructed address. Libnetwork will then reserve this address with the ipam driver. 1020 ones, _ := fCIDRv6.Mask.Size() 1021 deferIPv6Alloc = ones <= 80 1022 1023 ipamV6Conf = &libnetwork.IpamConf{ 1024 AuxAddresses: make(map[string]string), 1025 PreferredPool: fCIDRv6.String(), 1026 } 1027 1028 // In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6 1029 // address belongs to the same network, we need to inform libnetwork about it, so 1030 // that it can be reserved with IPAM and it will not be given away to somebody else 1031 for _, nw6 := range nw6List { 1032 if fCIDRv6.Contains(nw6.IP) { 1033 ipamV6Conf.Gateway = nw6.IP.String() 1034 break 1035 } 1036 } 1037 } 1038 1039 if cfg.DefaultGatewayIPv6 != nil { 1040 if ipamV6Conf == nil { 1041 ipamV6Conf = &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 1042 } 1043 ipamV6Conf.AuxAddresses["DefaultGatewayIPv6"] = cfg.DefaultGatewayIPv6.String() 1044 } 1045 1046 v4Conf := []*libnetwork.IpamConf{ipamV4Conf} 1047 v6Conf := []*libnetwork.IpamConf{} 1048 if ipamV6Conf != nil { 1049 v6Conf = append(v6Conf, ipamV6Conf) 1050 } 1051 // Initialize default network on "bridge" with the same name 1052 _, err = controller.NewNetwork("bridge", network.NetworkBridge, "", 1053 libnetwork.NetworkOptionEnableIPv6(cfg.EnableIPv6), 1054 libnetwork.NetworkOptionDriverOpts(netOption), 1055 libnetwork.NetworkOptionIpam("default", "", v4Conf, v6Conf, nil), 1056 libnetwork.NetworkOptionDeferIPv6Alloc(deferIPv6Alloc)) 1057 if err != nil { 1058 return fmt.Errorf(`error creating default %q network: %v`, network.NetworkBridge, err) 1059 } 1060 return nil 1061 } 1062 1063 // Remove default bridge interface if present (--bridge=none use case) 1064 func removeDefaultBridgeInterface() { 1065 if lnk, err := netlink.LinkByName(bridge.DefaultBridgeName); err == nil { 1066 if err := netlink.LinkDel(lnk); err != nil { 1067 log.G(context.TODO()).Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err) 1068 } 1069 } 1070 } 1071 1072 func setupInitLayer(idMapping idtools.IdentityMapping) func(string) error { 1073 return func(initPath string) error { 1074 return initlayer.Setup(initPath, idMapping.RootPair()) 1075 } 1076 } 1077 1078 // Parse the remapped root (user namespace) option, which can be one of: 1079 // 1080 // - username - valid username from /etc/passwd 1081 // - username:groupname - valid username; valid groupname from /etc/group 1082 // - uid - 32-bit unsigned int valid Linux UID value 1083 // - uid:gid - uid value; 32-bit unsigned int Linux GID value 1084 // 1085 // If no groupname is specified, and a username is specified, an attempt 1086 // will be made to lookup a gid for that username as a groupname 1087 // 1088 // If names are used, they are verified to exist in passwd/group 1089 func parseRemappedRoot(usergrp string) (string, string, error) { 1090 var ( 1091 userID, groupID int 1092 username, groupname string 1093 ) 1094 1095 idparts := strings.Split(usergrp, ":") 1096 if len(idparts) > 2 { 1097 return "", "", fmt.Errorf("Invalid user/group specification in --userns-remap: %q", usergrp) 1098 } 1099 1100 if uid, err := strconv.ParseInt(idparts[0], 10, 32); err == nil { 1101 // must be a uid; take it as valid 1102 userID = int(uid) 1103 luser, err := idtools.LookupUID(userID) 1104 if err != nil { 1105 return "", "", fmt.Errorf("Uid %d has no entry in /etc/passwd: %v", userID, err) 1106 } 1107 username = luser.Name 1108 if len(idparts) == 1 { 1109 // if the uid was numeric and no gid was specified, take the uid as the gid 1110 groupID = userID 1111 lgrp, err := idtools.LookupGID(groupID) 1112 if err != nil { 1113 return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err) 1114 } 1115 groupname = lgrp.Name 1116 } 1117 } else { 1118 lookupName := idparts[0] 1119 // special case: if the user specified "default", they want Docker to create or 1120 // use (after creation) the "dockremap" user/group for root remapping 1121 if lookupName == defaultIDSpecifier { 1122 lookupName = defaultRemappedID 1123 } 1124 luser, err := idtools.LookupUser(lookupName) 1125 if err != nil && idparts[0] != defaultIDSpecifier { 1126 // error if the name requested isn't the special "dockremap" ID 1127 return "", "", fmt.Errorf("Error during uid lookup for %q: %v", lookupName, err) 1128 } else if err != nil { 1129 // special case-- if the username == "default", then we have been asked 1130 // to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid} 1131 // ranges will be used for the user and group mappings in user namespaced containers 1132 _, _, err := idtools.AddNamespaceRangesUser(defaultRemappedID) 1133 if err == nil { 1134 return defaultRemappedID, defaultRemappedID, nil 1135 } 1136 return "", "", fmt.Errorf("Error during %q user creation: %v", defaultRemappedID, err) 1137 } 1138 username = luser.Name 1139 if len(idparts) == 1 { 1140 // we only have a string username, and no group specified; look up gid from username as group 1141 group, err := idtools.LookupGroup(lookupName) 1142 if err != nil { 1143 return "", "", fmt.Errorf("Error during gid lookup for %q: %v", lookupName, err) 1144 } 1145 groupname = group.Name 1146 } 1147 } 1148 1149 if len(idparts) == 2 { 1150 // groupname or gid is separately specified and must be resolved 1151 // to an unsigned 32-bit gid 1152 if gid, err := strconv.ParseInt(idparts[1], 10, 32); err == nil { 1153 // must be a gid, take it as valid 1154 groupID = int(gid) 1155 lgrp, err := idtools.LookupGID(groupID) 1156 if err != nil { 1157 return "", "", fmt.Errorf("Gid %d has no entry in /etc/passwd: %v", groupID, err) 1158 } 1159 groupname = lgrp.Name 1160 } else { 1161 // not a number; attempt a lookup 1162 if _, err := idtools.LookupGroup(idparts[1]); err != nil { 1163 return "", "", fmt.Errorf("Error during groupname lookup for %q: %v", idparts[1], err) 1164 } 1165 groupname = idparts[1] 1166 } 1167 } 1168 return username, groupname, nil 1169 } 1170 1171 func setupRemappedRoot(config *config.Config) (idtools.IdentityMapping, error) { 1172 if runtime.GOOS != "linux" && config.RemappedRoot != "" { 1173 return idtools.IdentityMapping{}, fmt.Errorf("User namespaces are only supported on Linux") 1174 } 1175 1176 // if the daemon was started with remapped root option, parse 1177 // the config option to the int uid,gid values 1178 if config.RemappedRoot != "" { 1179 username, groupname, err := parseRemappedRoot(config.RemappedRoot) 1180 if err != nil { 1181 return idtools.IdentityMapping{}, err 1182 } 1183 if username == "root" { 1184 // Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op 1185 // effectively 1186 log.G(context.TODO()).Warn("User namespaces: root cannot be remapped with itself; user namespaces are OFF") 1187 return idtools.IdentityMapping{}, nil 1188 } 1189 log.G(context.TODO()).Infof("User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s", username) 1190 // update remapped root setting now that we have resolved them to actual names 1191 config.RemappedRoot = fmt.Sprintf("%s:%s", username, groupname) 1192 1193 mappings, err := idtools.LoadIdentityMapping(username) 1194 if err != nil { 1195 return idtools.IdentityMapping{}, errors.Wrap(err, "Can't create ID mappings") 1196 } 1197 return mappings, nil 1198 } 1199 return idtools.IdentityMapping{}, nil 1200 } 1201 1202 func setupDaemonRoot(config *config.Config, rootDir string, remappedRoot idtools.Identity) error { 1203 config.Root = rootDir 1204 // the docker root metadata directory needs to have execute permissions for all users (g+x,o+x) 1205 // so that syscalls executing as non-root, operating on subdirectories of the graph root 1206 // (e.g. mounted layers of a container) can traverse this path. 1207 // The user namespace support will create subdirectories for the remapped root host uid:gid 1208 // pair owned by that same uid:gid pair for proper write access to those needed metadata and 1209 // layer content subtrees. 1210 if _, err := os.Stat(rootDir); err == nil { 1211 // root current exists; verify the access bits are correct by setting them 1212 if err = os.Chmod(rootDir, 0o711); err != nil { 1213 return err 1214 } 1215 } else if os.IsNotExist(err) { 1216 // no root exists yet, create it 0711 with root:root ownership 1217 if err := os.MkdirAll(rootDir, 0o711); err != nil { 1218 return err 1219 } 1220 } 1221 1222 id := idtools.Identity{UID: idtools.CurrentIdentity().UID, GID: remappedRoot.GID} 1223 // First make sure the current root dir has the correct perms. 1224 if err := idtools.MkdirAllAndChown(config.Root, 0o710, id); err != nil { 1225 return errors.Wrapf(err, "could not create or set daemon root permissions: %s", config.Root) 1226 } 1227 1228 // if user namespaces are enabled we will create a subtree underneath the specified root 1229 // with any/all specified remapped root uid/gid options on the daemon creating 1230 // a new subdirectory with ownership set to the remapped uid/gid (so as to allow 1231 // `chdir()` to work for containers namespaced to that uid/gid) 1232 if config.RemappedRoot != "" { 1233 config.Root = filepath.Join(rootDir, fmt.Sprintf("%d.%d", remappedRoot.UID, remappedRoot.GID)) 1234 log.G(context.TODO()).Debugf("Creating user namespaced daemon root: %s", config.Root) 1235 // Create the root directory if it doesn't exist 1236 if err := idtools.MkdirAllAndChown(config.Root, 0o710, id); err != nil { 1237 return fmt.Errorf("Cannot create daemon root: %s: %v", config.Root, err) 1238 } 1239 // we also need to verify that any pre-existing directories in the path to 1240 // the graphroot won't block access to remapped root--if any pre-existing directory 1241 // has strict permissions that don't allow "x", container start will fail, so 1242 // better to warn and fail now 1243 dirPath := config.Root 1244 for { 1245 dirPath = filepath.Dir(dirPath) 1246 if dirPath == "/" { 1247 break 1248 } 1249 if !canAccess(dirPath, remappedRoot) { 1250 return fmt.Errorf("a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' permissions on existing directories", config.Root) 1251 } 1252 } 1253 } 1254 1255 if err := setupDaemonRootPropagation(config); err != nil { 1256 log.G(context.TODO()).WithError(err).WithField("dir", config.Root).Warn("Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior") 1257 } 1258 return nil 1259 } 1260 1261 // canAccess takes a valid (existing) directory and a uid, gid pair and determines 1262 // if that uid, gid pair has access (execute bit) to the directory. 1263 // 1264 // Note: this is a very rudimentary check, and may not produce accurate results, 1265 // so should not be used for anything other than the current use, see: 1266 // https://github.com/moby/moby/issues/43724 1267 func canAccess(path string, pair idtools.Identity) bool { 1268 statInfo, err := os.Stat(path) 1269 if err != nil { 1270 return false 1271 } 1272 perms := statInfo.Mode().Perm() 1273 if perms&0o001 == 0o001 { 1274 // world access 1275 return true 1276 } 1277 ssi := statInfo.Sys().(*syscall.Stat_t) 1278 if ssi.Uid == uint32(pair.UID) && (perms&0o100 == 0o100) { 1279 // owner access. 1280 return true 1281 } 1282 if ssi.Gid == uint32(pair.GID) && (perms&0o010 == 0o010) { 1283 // group access. 1284 return true 1285 } 1286 return false 1287 } 1288 1289 func setupDaemonRootPropagation(cfg *config.Config) error { 1290 rootParentMount, mountOptions, err := getSourceMount(cfg.Root) 1291 if err != nil { 1292 return errors.Wrap(err, "error getting daemon root's parent mount") 1293 } 1294 1295 var cleanupOldFile bool 1296 cleanupFile := getUnmountOnShutdownPath(cfg) 1297 defer func() { 1298 if !cleanupOldFile { 1299 return 1300 } 1301 if err := os.Remove(cleanupFile); err != nil && !os.IsNotExist(err) { 1302 log.G(context.TODO()).WithError(err).WithField("file", cleanupFile).Warn("could not clean up old root propagation unmount file") 1303 } 1304 }() 1305 1306 if hasMountInfoOption(mountOptions, sharedPropagationOption, slavePropagationOption) { 1307 cleanupOldFile = true 1308 return nil 1309 } 1310 1311 if err := mount.MakeShared(cfg.Root); err != nil { 1312 return errors.Wrap(err, "could not setup daemon root propagation to shared") 1313 } 1314 1315 // check the case where this may have already been a mount to itself. 1316 // If so then the daemon only performed a remount and should not try to unmount this later. 1317 if rootParentMount == cfg.Root { 1318 cleanupOldFile = true 1319 return nil 1320 } 1321 1322 if err := os.MkdirAll(filepath.Dir(cleanupFile), 0o700); err != nil { 1323 return errors.Wrap(err, "error creating dir to store mount cleanup file") 1324 } 1325 1326 if err := os.WriteFile(cleanupFile, nil, 0o600); err != nil { 1327 return errors.Wrap(err, "error writing file to signal mount cleanup on shutdown") 1328 } 1329 return nil 1330 } 1331 1332 // getUnmountOnShutdownPath generates the path to used when writing the file that signals to the daemon that on shutdown 1333 // the daemon root should be unmounted. 1334 func getUnmountOnShutdownPath(config *config.Config) string { 1335 return filepath.Join(config.ExecRoot, "unmount-on-shutdown") 1336 } 1337 1338 // registerLinks registers network links between container and other containers 1339 // with the daemon using the specification in hostConfig. 1340 func (daemon *Daemon) registerLinks(container *container.Container, hostConfig *containertypes.HostConfig) error { 1341 if hostConfig == nil || hostConfig.NetworkMode.IsUserDefined() { 1342 return nil 1343 } 1344 1345 for _, l := range hostConfig.Links { 1346 name, alias, err := opts.ParseLink(l) 1347 if err != nil { 1348 return err 1349 } 1350 child, err := daemon.GetContainer(name) 1351 if err != nil { 1352 if errdefs.IsNotFound(err) { 1353 // Trying to link to a non-existing container is not valid, and 1354 // should return an "invalid parameter" error. Returning a "not 1355 // found" error here would make the client report the container's 1356 // image could not be found (see moby/moby#39823) 1357 err = errdefs.InvalidParameter(err) 1358 } 1359 return errors.Wrapf(err, "could not get container for %s", name) 1360 } 1361 for child.HostConfig.NetworkMode.IsContainer() { 1362 cid := child.HostConfig.NetworkMode.ConnectedContainer() 1363 child, err = daemon.GetContainer(cid) 1364 if err != nil { 1365 if errdefs.IsNotFound(err) { 1366 // Trying to link to a non-existing container is not valid, and 1367 // should return an "invalid parameter" error. Returning a "not 1368 // found" error here would make the client report the container's 1369 // image could not be found (see moby/moby#39823) 1370 err = errdefs.InvalidParameter(err) 1371 } 1372 return errors.Wrapf(err, "could not get container for %s", cid) 1373 } 1374 } 1375 if child.HostConfig.NetworkMode.IsHost() { 1376 return runconfig.ErrConflictHostNetworkAndLinks 1377 } 1378 if err := daemon.registerLink(container, child, alias); err != nil { 1379 return err 1380 } 1381 } 1382 1383 return nil 1384 } 1385 1386 // conditionalMountOnStart is a platform specific helper function during the 1387 // container start to call mount. 1388 func (daemon *Daemon) conditionalMountOnStart(container *container.Container) error { 1389 return daemon.Mount(container) 1390 } 1391 1392 // conditionalUnmountOnCleanup is a platform specific helper function called 1393 // during the cleanup of a container to unmount. 1394 func (daemon *Daemon) conditionalUnmountOnCleanup(container *container.Container) error { 1395 return daemon.Unmount(container) 1396 } 1397 1398 // setDefaultIsolation determines the default isolation mode for the 1399 // daemon to run in. This is only applicable on Windows 1400 func (daemon *Daemon) setDefaultIsolation(*config.Config) error { 1401 return nil 1402 } 1403 1404 // This is used to allow removal of mountpoints that may be mounted in other 1405 // namespaces on RHEL based kernels starting from RHEL 7.4. 1406 // Without this setting, removals on these RHEL based kernels may fail with 1407 // "device or resource busy". 1408 // This setting is not available in upstream kernels as it is not configurable, 1409 // but has been in the upstream kernels since 3.15. 1410 func setMayDetachMounts() error { 1411 f, err := os.OpenFile("/proc/sys/fs/may_detach_mounts", os.O_WRONLY, 0) 1412 if err != nil { 1413 if os.IsNotExist(err) { 1414 return nil 1415 } 1416 return errors.Wrap(err, "error opening may_detach_mounts kernel config file") 1417 } 1418 defer f.Close() 1419 1420 _, err = f.WriteString("1") 1421 if os.IsPermission(err) { 1422 // Setting may_detach_mounts does not work in an 1423 // unprivileged container. Ignore the error, but log 1424 // it if we appear not to be in that situation. 1425 if !userns.RunningInUserNS() { 1426 log.G(context.TODO()).Debugf("Permission denied writing %q to /proc/sys/fs/may_detach_mounts", "1") 1427 } 1428 return nil 1429 } 1430 return err 1431 } 1432 1433 func (daemon *Daemon) initCPURtController(cfg *config.Config, mnt, path string) error { 1434 if path == "/" || path == "." { 1435 return nil 1436 } 1437 1438 // Recursively create cgroup to ensure that the system and all parent cgroups have values set 1439 // for the period and runtime as this limits what the children can be set to. 1440 if err := daemon.initCPURtController(cfg, mnt, filepath.Dir(path)); err != nil { 1441 return err 1442 } 1443 1444 path = filepath.Join(mnt, path) 1445 if err := os.MkdirAll(path, 0o755); err != nil { 1446 return err 1447 } 1448 if err := maybeCreateCPURealTimeFile(cfg.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil { 1449 return err 1450 } 1451 return maybeCreateCPURealTimeFile(cfg.CPURealtimeRuntime, "cpu.rt_runtime_us", path) 1452 } 1453 1454 func maybeCreateCPURealTimeFile(configValue int64, file string, path string) error { 1455 if configValue == 0 { 1456 return nil 1457 } 1458 return os.WriteFile(filepath.Join(path, file), []byte(strconv.FormatInt(configValue, 10)), 0o700) 1459 } 1460 1461 func (daemon *Daemon) setupSeccompProfile(cfg *config.Config) error { 1462 switch profile := cfg.SeccompProfile; profile { 1463 case "", config.SeccompProfileDefault: 1464 daemon.seccompProfilePath = config.SeccompProfileDefault 1465 case config.SeccompProfileUnconfined: 1466 daemon.seccompProfilePath = config.SeccompProfileUnconfined 1467 default: 1468 daemon.seccompProfilePath = profile 1469 b, err := os.ReadFile(profile) 1470 if err != nil { 1471 return fmt.Errorf("opening seccomp profile (%s) failed: %v", profile, err) 1472 } 1473 daemon.seccompProfile = b 1474 } 1475 return nil 1476 } 1477 1478 func getSysInfo(cfg *config.Config) *sysinfo.SysInfo { 1479 var siOpts []sysinfo.Opt 1480 if cgroupDriver(cfg) == cgroupSystemdDriver { 1481 if euid := os.Getenv("ROOTLESSKIT_PARENT_EUID"); euid != "" { 1482 siOpts = append(siOpts, sysinfo.WithCgroup2GroupPath("/user.slice/user-"+euid+".slice")) 1483 } 1484 } 1485 return sysinfo.New(siOpts...) 1486 } 1487 1488 func (daemon *Daemon) initLibcontainerd(ctx context.Context, cfg *config.Config) error { 1489 var err error 1490 daemon.containerd, err = remote.NewClient( 1491 ctx, 1492 daemon.containerdClient, 1493 filepath.Join(cfg.ExecRoot, "containerd"), 1494 cfg.ContainerdNamespace, 1495 daemon, 1496 ) 1497 return err 1498 } 1499 1500 func recursiveUnmount(target string) error { 1501 return mount.RecursiveUnmount(target) 1502 }