github.com/rish1988/moby@v25.0.2+incompatible/daemon/daemon_unix.go (about) 1 //go:build linux || freebsd 2 3 package daemon // import "github.com/docker/docker/daemon" 4 5 import ( 6 "bufio" 7 "context" 8 "fmt" 9 "net" 10 "os" 11 "path/filepath" 12 "runtime" 13 "runtime/debug" 14 "strconv" 15 "strings" 16 "sync" 17 "syscall" 18 "time" 19 20 "github.com/containerd/cgroups/v3" 21 "github.com/containerd/containerd/pkg/userns" 22 "github.com/containerd/log" 23 "github.com/docker/docker/api/types/blkiodev" 24 pblkiodev "github.com/docker/docker/api/types/blkiodev" 25 containertypes "github.com/docker/docker/api/types/container" 26 "github.com/docker/docker/api/types/network" 27 "github.com/docker/docker/container" 28 "github.com/docker/docker/daemon/config" 29 "github.com/docker/docker/daemon/initlayer" 30 "github.com/docker/docker/errdefs" 31 "github.com/docker/docker/libcontainerd/remote" 32 "github.com/docker/docker/libnetwork" 33 nwconfig "github.com/docker/docker/libnetwork/config" 34 "github.com/docker/docker/libnetwork/drivers/bridge" 35 "github.com/docker/docker/libnetwork/netlabel" 36 "github.com/docker/docker/libnetwork/options" 37 lntypes "github.com/docker/docker/libnetwork/types" 38 "github.com/docker/docker/opts" 39 "github.com/docker/docker/pkg/idtools" 40 "github.com/docker/docker/pkg/parsers" 41 "github.com/docker/docker/pkg/parsers/kernel" 42 "github.com/docker/docker/pkg/sysinfo" 43 "github.com/docker/docker/runconfig" 44 volumemounts "github.com/docker/docker/volume/mounts" 45 "github.com/moby/sys/mount" 46 specs "github.com/opencontainers/runtime-spec/specs-go" 47 "github.com/opencontainers/selinux/go-selinux" 48 "github.com/opencontainers/selinux/go-selinux/label" 49 "github.com/pkg/errors" 50 "github.com/vishvananda/netlink" 51 "golang.org/x/sys/unix" 52 ) 53 54 const ( 55 isWindows = false 56 57 // See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269 58 linuxMinCPUShares = 2 59 linuxMaxCPUShares = 262144 60 // It's not kernel limit, we want this 6M limit to account for overhead during startup, and to supply a reasonable functional container 61 linuxMinMemory = 6291456 62 // constants for remapped root settings 63 defaultIDSpecifier = "default" 64 defaultRemappedID = "dockremap" 65 66 // constant for cgroup drivers 67 cgroupFsDriver = "cgroupfs" 68 cgroupSystemdDriver = "systemd" 69 cgroupNoneDriver = "none" 70 ) 71 72 type containerGetter interface { 73 GetContainer(string) (*container.Container, error) 74 } 75 76 func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory { 77 memory := specs.LinuxMemory{} 78 79 if config.Memory > 0 { 80 memory.Limit = &config.Memory 81 } 82 83 if config.MemoryReservation > 0 { 84 memory.Reservation = &config.MemoryReservation 85 } 86 87 if config.MemorySwap > 0 { 88 memory.Swap = &config.MemorySwap 89 } 90 91 if config.MemorySwappiness != nil { 92 swappiness := uint64(*config.MemorySwappiness) 93 memory.Swappiness = &swappiness 94 } 95 96 if config.OomKillDisable != nil { 97 memory.DisableOOMKiller = config.OomKillDisable 98 } 99 100 if config.KernelMemory != 0 { 101 memory.Kernel = &config.KernelMemory 102 } 103 104 if config.KernelMemoryTCP != 0 { 105 memory.KernelTCP = &config.KernelMemoryTCP 106 } 107 108 if memory != (specs.LinuxMemory{}) { 109 return &memory 110 } 111 return nil 112 } 113 114 func getPidsLimit(config containertypes.Resources) *specs.LinuxPids { 115 if config.PidsLimit == nil { 116 return nil 117 } 118 if *config.PidsLimit <= 0 { 119 // docker API allows 0 and negative values to unset this to be consistent 120 // with default values. When updating values, runc requires -1 to unset 121 // the previous limit. 122 return &specs.LinuxPids{Limit: -1} 123 } 124 return &specs.LinuxPids{Limit: *config.PidsLimit} 125 } 126 127 func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { 128 cpu := specs.LinuxCPU{} 129 130 if config.CPUShares < 0 { 131 return nil, fmt.Errorf("shares: invalid argument") 132 } 133 if config.CPUShares > 0 { 134 shares := uint64(config.CPUShares) 135 cpu.Shares = &shares 136 } 137 138 if config.CpusetCpus != "" { 139 cpu.Cpus = config.CpusetCpus 140 } 141 142 if config.CpusetMems != "" { 143 cpu.Mems = config.CpusetMems 144 } 145 146 if config.NanoCPUs > 0 { 147 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 148 period := uint64(100 * time.Millisecond / time.Microsecond) 149 quota := config.NanoCPUs * int64(period) / 1e9 150 cpu.Period = &period 151 cpu.Quota = "a 152 } 153 154 if config.CPUPeriod != 0 { 155 period := uint64(config.CPUPeriod) 156 cpu.Period = &period 157 } 158 159 if config.CPUQuota != 0 { 160 q := config.CPUQuota 161 cpu.Quota = &q 162 } 163 164 if config.CPURealtimePeriod != 0 { 165 period := uint64(config.CPURealtimePeriod) 166 cpu.RealtimePeriod = &period 167 } 168 169 if config.CPURealtimeRuntime != 0 { 170 c := config.CPURealtimeRuntime 171 cpu.RealtimeRuntime = &c 172 } 173 174 if cpu != (specs.LinuxCPU{}) { 175 return &cpu, nil 176 } 177 return nil, nil 178 } 179 180 func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) { 181 var stat unix.Stat_t 182 var blkioWeightDevices []specs.LinuxWeightDevice 183 184 for _, weightDevice := range config.BlkioWeightDevice { 185 if err := unix.Stat(weightDevice.Path, &stat); err != nil { 186 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: weightDevice.Path, Err: err}) 187 } 188 weight := weightDevice.Weight 189 d := specs.LinuxWeightDevice{Weight: &weight} 190 // The type is 32bit on mips. 191 d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert 192 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert 193 blkioWeightDevices = append(blkioWeightDevices, d) 194 } 195 196 return blkioWeightDevices, nil 197 } 198 199 func (daemon *Daemon) parseSecurityOpt(cfg *config.Config, securityOptions *container.SecurityOptions, hostConfig *containertypes.HostConfig) error { 200 securityOptions.NoNewPrivileges = cfg.NoNewPrivileges 201 return parseSecurityOpt(securityOptions, hostConfig) 202 } 203 204 func parseSecurityOpt(securityOptions *container.SecurityOptions, config *containertypes.HostConfig) error { 205 var ( 206 labelOpts []string 207 err error 208 ) 209 210 for _, opt := range config.SecurityOpt { 211 if opt == "no-new-privileges" { 212 securityOptions.NoNewPrivileges = true 213 continue 214 } 215 if opt == "disable" { 216 labelOpts = append(labelOpts, "disable") 217 continue 218 } 219 220 var k, v string 221 var ok bool 222 if strings.Contains(opt, "=") { 223 k, v, ok = strings.Cut(opt, "=") 224 } else if strings.Contains(opt, ":") { 225 k, v, ok = strings.Cut(opt, ":") 226 log.G(context.TODO()).Warn("Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead.") 227 } 228 if !ok { 229 return fmt.Errorf("invalid --security-opt 1: %q", opt) 230 } 231 232 switch k { 233 case "label": 234 labelOpts = append(labelOpts, v) 235 case "apparmor": 236 securityOptions.AppArmorProfile = v 237 case "seccomp": 238 securityOptions.SeccompProfile = v 239 case "no-new-privileges": 240 noNewPrivileges, err := strconv.ParseBool(v) 241 if err != nil { 242 return fmt.Errorf("invalid --security-opt 2: %q", opt) 243 } 244 securityOptions.NoNewPrivileges = noNewPrivileges 245 default: 246 return fmt.Errorf("invalid --security-opt 2: %q", opt) 247 } 248 } 249 250 securityOptions.ProcessLabel, securityOptions.MountLabel, err = label.InitLabels(labelOpts) 251 return err 252 } 253 254 func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThrottleDevice, error) { 255 var throttleDevices []specs.LinuxThrottleDevice 256 var stat unix.Stat_t 257 258 for _, d := range devs { 259 if err := unix.Stat(d.Path, &stat); err != nil { 260 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: d.Path, Err: err}) 261 } 262 d := specs.LinuxThrottleDevice{Rate: d.Rate} 263 // the type is 32bit on mips 264 d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert 265 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert 266 throttleDevices = append(throttleDevices, d) 267 } 268 269 return throttleDevices, nil 270 } 271 272 // adjustParallelLimit takes a number of objects and a proposed limit and 273 // figures out if it's reasonable (and adjusts it accordingly). This is only 274 // used for daemon startup, which does a lot of parallel loading of containers 275 // (and if we exceed RLIMIT_NOFILE then we're in trouble). 276 func adjustParallelLimit(n int, limit int) int { 277 // Rule-of-thumb overhead factor (how many files will each goroutine open 278 // simultaneously). Yes, this is ugly but to be frank this whole thing is 279 // ugly. 280 const overhead = 2 281 282 // On Linux, we need to ensure that parallelStartupJobs doesn't cause us to 283 // exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it 284 // and give a warning (since in theory the user should increase their 285 // ulimits to the largest possible value for dockerd). 286 var rlim unix.Rlimit 287 if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil { 288 log.G(context.TODO()).Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err) 289 return limit 290 } 291 softRlimit := int(rlim.Cur) 292 293 // Much fewer containers than RLIMIT_NOFILE. No need to adjust anything. 294 if softRlimit > overhead*n { 295 return limit 296 } 297 298 // RLIMIT_NOFILE big enough, no need to adjust anything. 299 if softRlimit > overhead*limit { 300 return limit 301 } 302 303 log.G(context.TODO()).Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit) 304 return softRlimit / overhead 305 } 306 307 // adaptContainerSettings is called during container creation to modify any 308 // settings necessary in the HostConfig structure. 309 func (daemon *Daemon) adaptContainerSettings(daemonCfg *config.Config, hostConfig *containertypes.HostConfig, adjustCPUShares bool) error { 310 if adjustCPUShares && hostConfig.CPUShares > 0 { 311 // Handle unsupported CPUShares 312 if hostConfig.CPUShares < linuxMinCPUShares { 313 log.G(context.TODO()).Warnf("Changing requested CPUShares of %d to minimum allowed of %d", hostConfig.CPUShares, linuxMinCPUShares) 314 hostConfig.CPUShares = linuxMinCPUShares 315 } else if hostConfig.CPUShares > linuxMaxCPUShares { 316 log.G(context.TODO()).Warnf("Changing requested CPUShares of %d to maximum allowed of %d", hostConfig.CPUShares, linuxMaxCPUShares) 317 hostConfig.CPUShares = linuxMaxCPUShares 318 } 319 } 320 if hostConfig.Memory > 0 && hostConfig.MemorySwap == 0 { 321 // By default, MemorySwap is set to twice the size of Memory. 322 hostConfig.MemorySwap = hostConfig.Memory * 2 323 } 324 if hostConfig.ShmSize == 0 { 325 hostConfig.ShmSize = config.DefaultShmSize 326 if daemonCfg != nil { 327 hostConfig.ShmSize = int64(daemonCfg.ShmSize) 328 } 329 } 330 // Set default IPC mode, if unset for container 331 if hostConfig.IpcMode.IsEmpty() { 332 m := config.DefaultIpcMode 333 if daemonCfg != nil { 334 m = containertypes.IpcMode(daemonCfg.IpcMode) 335 } 336 hostConfig.IpcMode = m 337 } 338 339 // Set default cgroup namespace mode, if unset for container 340 if hostConfig.CgroupnsMode.IsEmpty() { 341 // for cgroup v2: unshare cgroupns even for privileged containers 342 // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 343 if hostConfig.Privileged && cgroups.Mode() != cgroups.Unified { 344 hostConfig.CgroupnsMode = containertypes.CgroupnsModeHost 345 } else { 346 m := containertypes.CgroupnsModeHost 347 if cgroups.Mode() == cgroups.Unified { 348 m = containertypes.CgroupnsModePrivate 349 } 350 if daemonCfg != nil { 351 m = containertypes.CgroupnsMode(daemonCfg.CgroupNamespaceMode) 352 } 353 hostConfig.CgroupnsMode = m 354 } 355 } 356 357 adaptSharedNamespaceContainer(daemon, hostConfig) 358 359 var err error 360 secOpts, err := daemon.generateSecurityOpt(hostConfig) 361 if err != nil { 362 return err 363 } 364 hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, secOpts...) 365 if hostConfig.OomKillDisable == nil { 366 defaultOomKillDisable := false 367 hostConfig.OomKillDisable = &defaultOomKillDisable 368 } 369 370 return nil 371 } 372 373 // adaptSharedNamespaceContainer replaces container name with its ID in hostConfig. 374 // To be more precisely, it modifies `container:name` to `container:ID` of PidMode, IpcMode 375 // and NetworkMode. 376 // 377 // When a container shares its namespace with another container, use ID can keep the namespace 378 // sharing connection between the two containers even the another container is renamed. 379 func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) { 380 containerPrefix := "container:" 381 if hostConfig.PidMode.IsContainer() { 382 pidContainer := hostConfig.PidMode.Container() 383 // if there is any error returned here, we just ignore it and leave it to be 384 // handled in the following logic 385 if c, err := daemon.GetContainer(pidContainer); err == nil { 386 hostConfig.PidMode = containertypes.PidMode(containerPrefix + c.ID) 387 } 388 } 389 if hostConfig.IpcMode.IsContainer() { 390 ipcContainer := hostConfig.IpcMode.Container() 391 if c, err := daemon.GetContainer(ipcContainer); err == nil { 392 hostConfig.IpcMode = containertypes.IpcMode(containerPrefix + c.ID) 393 } 394 } 395 if hostConfig.NetworkMode.IsContainer() { 396 netContainer := hostConfig.NetworkMode.ConnectedContainer() 397 if c, err := daemon.GetContainer(netContainer); err == nil { 398 hostConfig.NetworkMode = containertypes.NetworkMode(containerPrefix + c.ID) 399 } 400 } 401 } 402 403 // verifyPlatformContainerResources performs platform-specific validation of the container's resource-configuration 404 func verifyPlatformContainerResources(resources *containertypes.Resources, sysInfo *sysinfo.SysInfo, update bool) (warnings []string, err error) { 405 fixMemorySwappiness(resources) 406 407 // memory subsystem checks and adjustments 408 if resources.Memory != 0 && resources.Memory < linuxMinMemory { 409 return warnings, fmt.Errorf("Minimum memory limit allowed is 6MB") 410 } 411 if resources.Memory > 0 && !sysInfo.MemoryLimit { 412 warnings = append(warnings, "Your kernel does not support memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 413 resources.Memory = 0 414 resources.MemorySwap = -1 415 } 416 if resources.Memory > 0 && resources.MemorySwap != -1 && !sysInfo.SwapLimit { 417 warnings = append(warnings, "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.") 418 resources.MemorySwap = -1 419 } 420 if resources.Memory > 0 && resources.MemorySwap > 0 && resources.MemorySwap < resources.Memory { 421 return warnings, fmt.Errorf("Minimum memoryswap limit should be larger than memory limit, see usage") 422 } 423 if resources.Memory == 0 && resources.MemorySwap > 0 && !update { 424 return warnings, fmt.Errorf("You should always set the Memory limit when using Memoryswap limit, see usage") 425 } 426 if resources.MemorySwappiness != nil && !sysInfo.MemorySwappiness { 427 warnings = append(warnings, "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded.") 428 resources.MemorySwappiness = nil 429 } 430 if resources.MemorySwappiness != nil { 431 swappiness := *resources.MemorySwappiness 432 if swappiness < 0 || swappiness > 100 { 433 return warnings, fmt.Errorf("Invalid value: %v, valid memory swappiness range is 0-100", swappiness) 434 } 435 } 436 if resources.MemoryReservation > 0 && !sysInfo.MemoryReservation { 437 warnings = append(warnings, "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. Limitation discarded.") 438 resources.MemoryReservation = 0 439 } 440 if resources.MemoryReservation > 0 && resources.MemoryReservation < linuxMinMemory { 441 return warnings, fmt.Errorf("Minimum memory reservation allowed is 6MB") 442 } 443 if resources.Memory > 0 && resources.MemoryReservation > 0 && resources.Memory < resources.MemoryReservation { 444 return warnings, fmt.Errorf("Minimum memory limit can not be less than memory reservation limit, see usage") 445 } 446 if resources.KernelMemory > 0 { 447 // Kernel memory limit is not supported on cgroup v2. 448 // Even on cgroup v1, kernel memory limit (`kmem.limit_in_bytes`) has been deprecated since kernel 5.4. 449 // https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0b5adf44cae99b3ebcc7 450 if !sysInfo.KernelMemory { 451 warnings = append(warnings, "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 452 resources.KernelMemory = 0 453 } 454 if resources.KernelMemory > 0 && resources.KernelMemory < linuxMinMemory { 455 return warnings, fmt.Errorf("Minimum kernel memory limit allowed is 6MB") 456 } 457 if !kernel.CheckKernelVersion(4, 0, 0) { 458 warnings = append(warnings, "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable.") 459 } 460 } 461 if resources.OomKillDisable != nil && !sysInfo.OomKillDisable { 462 // only produce warnings if the setting wasn't to *disable* the OOM Kill; no point 463 // warning the caller if they already wanted the feature to be off 464 if *resources.OomKillDisable { 465 warnings = append(warnings, "Your kernel does not support OomKillDisable. OomKillDisable discarded.") 466 } 467 resources.OomKillDisable = nil 468 } 469 if resources.OomKillDisable != nil && *resources.OomKillDisable && resources.Memory == 0 { 470 warnings = append(warnings, "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources.") 471 } 472 if resources.PidsLimit != nil && !sysInfo.PidsLimit { 473 if *resources.PidsLimit > 0 { 474 warnings = append(warnings, "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded.") 475 } 476 resources.PidsLimit = nil 477 } 478 479 // cpu subsystem checks and adjustments 480 if resources.NanoCPUs > 0 && resources.CPUPeriod > 0 { 481 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Period cannot both be set") 482 } 483 if resources.NanoCPUs > 0 && resources.CPUQuota > 0 { 484 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Quota cannot both be set") 485 } 486 if resources.NanoCPUs > 0 && !sysInfo.CPUCfs { 487 return warnings, fmt.Errorf("NanoCPUs can not be set, as your kernel does not support CPU CFS scheduler or the cgroup is not mounted") 488 } 489 // The highest precision we could get on Linux is 0.001, by setting 490 // cpu.cfs_period_us=1000ms 491 // cpu.cfs_quota=1ms 492 // See the following link for details: 493 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 494 // Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error. 495 // The error message is 0.01 so that this is consistent with Windows 496 if resources.NanoCPUs < 0 || resources.NanoCPUs > int64(sysinfo.NumCPU())*1e9 { 497 return warnings, fmt.Errorf("Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available", sysinfo.NumCPU(), sysinfo.NumCPU()) 498 } 499 500 if resources.CPUShares > 0 && !sysInfo.CPUShares { 501 warnings = append(warnings, "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded.") 502 resources.CPUShares = 0 503 } 504 if (resources.CPUPeriod != 0 || resources.CPUQuota != 0) && !sysInfo.CPUCfs { 505 warnings = append(warnings, "Your kernel does not support CPU CFS scheduler. CPU period/quota discarded.") 506 resources.CPUPeriod = 0 507 resources.CPUQuota = 0 508 } 509 if resources.CPUPeriod != 0 && (resources.CPUPeriod < 1000 || resources.CPUPeriod > 1000000) { 510 return warnings, fmt.Errorf("CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)") 511 } 512 if resources.CPUQuota > 0 && resources.CPUQuota < 1000 { 513 return warnings, fmt.Errorf("CPU cfs quota can not be less than 1ms (i.e. 1000)") 514 } 515 if resources.CPUPercent > 0 { 516 warnings = append(warnings, fmt.Sprintf("%s does not support CPU percent. Percent discarded.", runtime.GOOS)) 517 resources.CPUPercent = 0 518 } 519 520 // cpuset subsystem checks and adjustments 521 if (resources.CpusetCpus != "" || resources.CpusetMems != "") && !sysInfo.Cpuset { 522 warnings = append(warnings, "Your kernel does not support cpuset or the cgroup is not mounted. Cpuset discarded.") 523 resources.CpusetCpus = "" 524 resources.CpusetMems = "" 525 } 526 cpusAvailable, err := sysInfo.IsCpusetCpusAvailable(resources.CpusetCpus) 527 if err != nil { 528 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset cpus", resources.CpusetCpus) 529 } 530 if !cpusAvailable { 531 return warnings, fmt.Errorf("Requested CPUs are not available - requested %s, available: %s", resources.CpusetCpus, sysInfo.Cpus) 532 } 533 memsAvailable, err := sysInfo.IsCpusetMemsAvailable(resources.CpusetMems) 534 if err != nil { 535 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset mems", resources.CpusetMems) 536 } 537 if !memsAvailable { 538 return warnings, fmt.Errorf("Requested memory nodes are not available - requested %s, available: %s", resources.CpusetMems, sysInfo.Mems) 539 } 540 541 // blkio subsystem checks and adjustments 542 if resources.BlkioWeight > 0 && !sysInfo.BlkioWeight { 543 warnings = append(warnings, "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded.") 544 resources.BlkioWeight = 0 545 } 546 if resources.BlkioWeight > 0 && (resources.BlkioWeight < 10 || resources.BlkioWeight > 1000) { 547 return warnings, fmt.Errorf("Range of blkio weight is from 10 to 1000") 548 } 549 if resources.IOMaximumBandwidth != 0 || resources.IOMaximumIOps != 0 { 550 return warnings, fmt.Errorf("Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps", runtime.GOOS) 551 } 552 if len(resources.BlkioWeightDevice) > 0 && !sysInfo.BlkioWeightDevice { 553 warnings = append(warnings, "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded.") 554 resources.BlkioWeightDevice = []*pblkiodev.WeightDevice{} 555 } 556 if len(resources.BlkioDeviceReadBps) > 0 && !sysInfo.BlkioReadBpsDevice { 557 warnings = append(warnings, "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. Block I/O BPS read limit discarded.") 558 resources.BlkioDeviceReadBps = []*pblkiodev.ThrottleDevice{} 559 } 560 if len(resources.BlkioDeviceWriteBps) > 0 && !sysInfo.BlkioWriteBpsDevice { 561 warnings = append(warnings, "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded.") 562 resources.BlkioDeviceWriteBps = []*pblkiodev.ThrottleDevice{} 563 } 564 if len(resources.BlkioDeviceReadIOps) > 0 && !sysInfo.BlkioReadIOpsDevice { 565 warnings = append(warnings, "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded.") 566 resources.BlkioDeviceReadIOps = []*pblkiodev.ThrottleDevice{} 567 } 568 if len(resources.BlkioDeviceWriteIOps) > 0 && !sysInfo.BlkioWriteIOpsDevice { 569 warnings = append(warnings, "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded.") 570 resources.BlkioDeviceWriteIOps = []*pblkiodev.ThrottleDevice{} 571 } 572 573 return warnings, nil 574 } 575 576 func cgroupDriver(cfg *config.Config) string { 577 if UsingSystemd(cfg) { 578 return cgroupSystemdDriver 579 } 580 if cfg.Rootless { 581 return cgroupNoneDriver 582 } 583 return cgroupFsDriver 584 } 585 586 // getCD gets the raw value of the native.cgroupdriver option, if set. 587 func getCD(config *config.Config) string { 588 for _, option := range config.ExecOptions { 589 key, val, err := parsers.ParseKeyValueOpt(option) 590 if err != nil || !strings.EqualFold(key, "native.cgroupdriver") { 591 continue 592 } 593 return val 594 } 595 return "" 596 } 597 598 // verifyCgroupDriver validates native.cgroupdriver 599 func verifyCgroupDriver(config *config.Config) error { 600 cd := getCD(config) 601 if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver { 602 return nil 603 } 604 if cd == cgroupNoneDriver { 605 return fmt.Errorf("native.cgroupdriver option %s is internally used and cannot be specified manually", cd) 606 } 607 return fmt.Errorf("native.cgroupdriver option %s not supported", cd) 608 } 609 610 // UsingSystemd returns true if cli option includes native.cgroupdriver=systemd 611 func UsingSystemd(config *config.Config) bool { 612 cd := getCD(config) 613 614 if cd == cgroupSystemdDriver { 615 return true 616 } 617 // On cgroup v2 hosts, default to systemd driver 618 if cd == "" && cgroups.Mode() == cgroups.Unified && isRunningSystemd() { 619 return true 620 } 621 return false 622 } 623 624 var ( 625 runningSystemd bool 626 detectSystemd sync.Once 627 ) 628 629 // isRunningSystemd checks whether the host was booted with systemd as its init 630 // system. This functions similarly to systemd's `sd_booted(3)`: internally, it 631 // checks whether /run/systemd/system/ exists and is a directory. 632 // http://www.freedesktop.org/software/systemd/man/sd_booted.html 633 // 634 // NOTE: This function comes from package github.com/coreos/go-systemd/util 635 // It was borrowed here to avoid a dependency on cgo. 636 func isRunningSystemd() bool { 637 detectSystemd.Do(func() { 638 fi, err := os.Lstat("/run/systemd/system") 639 if err != nil { 640 return 641 } 642 runningSystemd = fi.IsDir() 643 }) 644 return runningSystemd 645 } 646 647 // verifyPlatformContainerSettings performs platform-specific validation of the 648 // hostconfig and config structures. 649 func verifyPlatformContainerSettings(daemon *Daemon, daemonCfg *configStore, hostConfig *containertypes.HostConfig, update bool) (warnings []string, err error) { 650 if hostConfig == nil { 651 return nil, nil 652 } 653 sysInfo := daemon.RawSysInfo() 654 655 w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update) 656 657 // no matter err is nil or not, w could have data in itself. 658 warnings = append(warnings, w...) 659 660 if err != nil { 661 return warnings, err 662 } 663 664 if !hostConfig.IpcMode.Valid() { 665 return warnings, errors.Errorf("invalid IPC mode: %v", hostConfig.IpcMode) 666 } 667 if !hostConfig.PidMode.Valid() { 668 return warnings, errors.Errorf("invalid PID mode: %v", hostConfig.PidMode) 669 } 670 if hostConfig.ShmSize < 0 { 671 return warnings, fmt.Errorf("SHM size can not be less than 0") 672 } 673 if !hostConfig.UTSMode.Valid() { 674 return warnings, errors.Errorf("invalid UTS mode: %v", hostConfig.UTSMode) 675 } 676 677 if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 { 678 return warnings, fmt.Errorf("Invalid value %d, range for oom score adj is [-1000, 1000]", hostConfig.OomScoreAdj) 679 } 680 681 // ip-forwarding does not affect container with '--net=host' (or '--net=none') 682 if sysInfo.IPv4ForwardingDisabled && !(hostConfig.NetworkMode.IsHost() || hostConfig.NetworkMode.IsNone()) { 683 warnings = append(warnings, "IPv4 forwarding is disabled. Networking will not work.") 684 } 685 if hostConfig.NetworkMode.IsHost() && len(hostConfig.PortBindings) > 0 { 686 warnings = append(warnings, "Published ports are discarded when using host network mode") 687 } 688 689 // check for various conflicting options with user namespaces 690 if daemonCfg.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() { 691 if hostConfig.Privileged { 692 return warnings, fmt.Errorf("privileged mode is incompatible with user namespaces. You must run the container in the host namespace when running privileged mode") 693 } 694 if hostConfig.NetworkMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 695 return warnings, fmt.Errorf("cannot share the host's network namespace when user namespaces are enabled") 696 } 697 if hostConfig.PidMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 698 return warnings, fmt.Errorf("cannot share the host PID namespace when user namespaces are enabled") 699 } 700 } 701 if hostConfig.CgroupParent != "" && UsingSystemd(&daemonCfg.Config) { 702 // CgroupParent for systemd cgroup should be named as "xxx.slice" 703 if len(hostConfig.CgroupParent) <= 6 || !strings.HasSuffix(hostConfig.CgroupParent, ".slice") { 704 return warnings, fmt.Errorf(`cgroup-parent for systemd cgroup should be a valid slice named as "xxx.slice"`) 705 } 706 } 707 if hostConfig.Runtime == "" { 708 hostConfig.Runtime = daemonCfg.Runtimes.Default 709 } 710 711 if _, _, err := daemonCfg.Runtimes.Get(hostConfig.Runtime); err != nil { 712 return warnings, err 713 } 714 715 parser := volumemounts.NewParser() 716 for dest := range hostConfig.Tmpfs { 717 if err := parser.ValidateTmpfsMountDestination(dest); err != nil { 718 return warnings, err 719 } 720 } 721 722 if !hostConfig.CgroupnsMode.Valid() { 723 return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode) 724 } 725 if hostConfig.CgroupnsMode.IsPrivate() { 726 if !sysInfo.CgroupNamespaces { 727 warnings = append(warnings, "Your kernel does not support cgroup namespaces. Cgroup namespace setting discarded.") 728 } 729 } 730 731 return warnings, nil 732 } 733 734 // verifyDaemonSettings performs validation of daemon config struct 735 func verifyDaemonSettings(conf *config.Config) error { 736 if conf.ContainerdNamespace == conf.ContainerdPluginNamespace { 737 return errors.New("containers namespace and plugins namespace cannot be the same") 738 } 739 // Check for mutually incompatible config options 740 if conf.BridgeConfig.Iface != "" && conf.BridgeConfig.IP != "" { 741 return fmt.Errorf("You specified -b & --bip, mutually exclusive options. Please specify only one") 742 } 743 if !conf.BridgeConfig.EnableIPTables && !conf.BridgeConfig.InterContainerCommunication { 744 return fmt.Errorf("You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true") 745 } 746 if conf.BridgeConfig.EnableIP6Tables && !conf.Experimental { 747 return fmt.Errorf("ip6tables rules are only available if experimental features are enabled") 748 } 749 if !conf.BridgeConfig.EnableIPTables && conf.BridgeConfig.EnableIPMasq { 750 conf.BridgeConfig.EnableIPMasq = false 751 } 752 if err := verifyCgroupDriver(conf); err != nil { 753 return err 754 } 755 if conf.CgroupParent != "" && UsingSystemd(conf) { 756 if len(conf.CgroupParent) <= 6 || !strings.HasSuffix(conf.CgroupParent, ".slice") { 757 return fmt.Errorf(`cgroup-parent for systemd cgroup should be a valid slice named as "xxx.slice"`) 758 } 759 } 760 761 if conf.Rootless && UsingSystemd(conf) && cgroups.Mode() != cgroups.Unified { 762 return fmt.Errorf("exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode") 763 } 764 return nil 765 } 766 767 // checkSystem validates platform-specific requirements 768 func checkSystem() error { 769 return nil 770 } 771 772 // configureMaxThreads sets the Go runtime max threads threshold 773 // which is 90% of the kernel setting from /proc/sys/kernel/threads-max 774 func configureMaxThreads(config *config.Config) error { 775 mt, err := os.ReadFile("/proc/sys/kernel/threads-max") 776 if err != nil { 777 return err 778 } 779 mtint, err := strconv.Atoi(strings.TrimSpace(string(mt))) 780 if err != nil { 781 return err 782 } 783 maxThreads := (mtint / 100) * 90 784 debug.SetMaxThreads(maxThreads) 785 log.G(context.TODO()).Debugf("Golang's threads limit set to %d", maxThreads) 786 return nil 787 } 788 789 func overlaySupportsSelinux() (bool, error) { 790 f, err := os.Open("/proc/kallsyms") 791 if err != nil { 792 if os.IsNotExist(err) { 793 return false, nil 794 } 795 return false, err 796 } 797 defer f.Close() 798 799 s := bufio.NewScanner(f) 800 for s.Scan() { 801 if strings.HasSuffix(s.Text(), " security_inode_copy_up") { 802 return true, nil 803 } 804 } 805 806 return false, s.Err() 807 } 808 809 // configureKernelSecuritySupport configures and validates security support for the kernel 810 func configureKernelSecuritySupport(config *config.Config, driverName string) error { 811 if config.EnableSelinuxSupport { 812 if !selinux.GetEnabled() { 813 log.G(context.TODO()).Warn("Docker could not enable SELinux on the host system") 814 return nil 815 } 816 817 if driverName == "overlay2" || driverName == "overlayfs" { 818 // If driver is overlay2, make sure kernel 819 // supports selinux with overlay. 820 supported, err := overlaySupportsSelinux() 821 if err != nil { 822 return err 823 } 824 825 if !supported { 826 log.G(context.TODO()).Warnf("SELinux is not supported with the %v graph driver on this kernel", driverName) 827 } 828 } 829 } else { 830 selinux.SetDisabled() 831 } 832 return nil 833 } 834 835 // initNetworkController initializes the libnetwork controller and configures 836 // network settings. If there's active sandboxes, configuration changes will not 837 // take effect. 838 func (daemon *Daemon) initNetworkController(cfg *config.Config, activeSandboxes map[string]interface{}) error { 839 netOptions, err := daemon.networkOptions(cfg, daemon.PluginStore, activeSandboxes) 840 if err != nil { 841 return err 842 } 843 844 daemon.netController, err = libnetwork.New(netOptions...) 845 if err != nil { 846 return fmt.Errorf("error obtaining controller instance: %v", err) 847 } 848 849 if len(activeSandboxes) > 0 { 850 log.G(context.TODO()).Info("there are running containers, updated network configuration will not take affect") 851 } else if err := configureNetworking(daemon.netController, cfg); err != nil { 852 return err 853 } 854 855 // Set HostGatewayIP to the default bridge's IP if it is empty 856 setHostGatewayIP(daemon.netController, cfg) 857 return nil 858 } 859 860 func configureNetworking(controller *libnetwork.Controller, conf *config.Config) error { 861 // Create predefined network "none" 862 if n, _ := controller.NetworkByName(network.NetworkNone); n == nil { 863 if _, err := controller.NewNetwork("null", network.NetworkNone, "", libnetwork.NetworkOptionPersist(true)); err != nil { 864 return errors.Wrapf(err, `error creating default %q network`, network.NetworkNone) 865 } 866 } 867 868 // Create predefined network "host" 869 if n, _ := controller.NetworkByName(network.NetworkHost); n == nil { 870 if _, err := controller.NewNetwork("host", network.NetworkHost, "", libnetwork.NetworkOptionPersist(true)); err != nil { 871 return errors.Wrapf(err, `error creating default %q network`, network.NetworkHost) 872 } 873 } 874 875 // Clear stale bridge network 876 if n, err := controller.NetworkByName(network.NetworkBridge); err == nil { 877 if err = n.Delete(); err != nil { 878 return errors.Wrapf(err, `could not delete the default %q network`, network.NetworkBridge) 879 } 880 if len(conf.NetworkConfig.DefaultAddressPools.Value()) > 0 && !conf.LiveRestoreEnabled { 881 removeDefaultBridgeInterface() 882 } 883 } 884 885 if !conf.DisableBridge { 886 // Initialize default driver "bridge" 887 if err := initBridgeDriver(controller, conf.BridgeConfig); err != nil { 888 return err 889 } 890 } else { 891 removeDefaultBridgeInterface() 892 } 893 894 return nil 895 } 896 897 // setHostGatewayIP sets cfg.HostGatewayIP to the default bridge's IP if it is empty. 898 func setHostGatewayIP(controller *libnetwork.Controller, config *config.Config) { 899 if config.HostGatewayIP != nil { 900 return 901 } 902 if n, err := controller.NetworkByName(network.NetworkBridge); err == nil { 903 v4Info, v6Info := n.IpamInfo() 904 if len(v4Info) > 0 { 905 config.HostGatewayIP = v4Info[0].Gateway.IP 906 } else if len(v6Info) > 0 { 907 config.HostGatewayIP = v6Info[0].Gateway.IP 908 } 909 } 910 } 911 912 func driverOptions(config *config.Config) nwconfig.Option { 913 return nwconfig.OptionDriverConfig("bridge", options.Generic{ 914 netlabel.GenericData: options.Generic{ 915 "EnableIPForwarding": config.BridgeConfig.EnableIPForward, 916 "EnableIPTables": config.BridgeConfig.EnableIPTables, 917 "EnableIP6Tables": config.BridgeConfig.EnableIP6Tables, 918 "EnableUserlandProxy": config.BridgeConfig.EnableUserlandProxy, 919 "UserlandProxyPath": config.BridgeConfig.UserlandProxyPath, 920 }, 921 }) 922 } 923 924 func initBridgeDriver(controller *libnetwork.Controller, cfg config.BridgeConfig) error { 925 bridgeName := bridge.DefaultBridgeName 926 if cfg.Iface != "" { 927 bridgeName = cfg.Iface 928 } 929 netOption := map[string]string{ 930 bridge.BridgeName: bridgeName, 931 bridge.DefaultBridge: strconv.FormatBool(true), 932 netlabel.DriverMTU: strconv.Itoa(cfg.MTU), 933 bridge.EnableIPMasquerade: strconv.FormatBool(cfg.EnableIPMasq), 934 bridge.EnableICC: strconv.FormatBool(cfg.InterContainerCommunication), 935 } 936 937 // --ip processing 938 if cfg.DefaultIP != nil { 939 netOption[bridge.DefaultBindingIP] = cfg.DefaultIP.String() 940 } 941 942 ipamV4Conf := &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 943 944 // By default, libnetwork will request an arbitrary available address 945 // pool for the network from the configured IPAM allocator. 946 // Configure it to use the IPv4 network ranges of the existing bridge 947 // interface if one exists with IPv4 addresses assigned to it. 948 949 nwList, nw6List, err := ifaceAddrs(bridgeName) 950 if err != nil { 951 return errors.Wrap(err, "list bridge addresses failed") 952 } 953 954 if len(nwList) > 0 { 955 nw := nwList[0] 956 if len(nwList) > 1 && cfg.FixedCIDR != "" { 957 _, fCIDR, err := net.ParseCIDR(cfg.FixedCIDR) 958 if err != nil { 959 return errors.Wrap(err, "parse CIDR failed") 960 } 961 // Iterate through in case there are multiple addresses for the bridge 962 for _, entry := range nwList { 963 if fCIDR.Contains(entry.IP) { 964 nw = entry 965 break 966 } 967 } 968 } 969 970 ipamV4Conf.PreferredPool = lntypes.GetIPNetCanonical(nw).String() 971 hip, _ := lntypes.GetHostPartIP(nw.IP, nw.Mask) 972 if hip.IsGlobalUnicast() { 973 ipamV4Conf.Gateway = nw.IP.String() 974 } 975 } 976 977 if cfg.IP != "" { 978 ip, ipNet, err := net.ParseCIDR(cfg.IP) 979 if err != nil { 980 return err 981 } 982 ipamV4Conf.PreferredPool = ipNet.String() 983 ipamV4Conf.Gateway = ip.String() 984 } else if bridgeName == bridge.DefaultBridgeName && ipamV4Conf.PreferredPool != "" { 985 log.G(context.TODO()).Infof("Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address", bridgeName, ipamV4Conf.PreferredPool) 986 } 987 988 if cfg.FixedCIDR != "" { 989 _, fCIDR, err := net.ParseCIDR(cfg.FixedCIDR) 990 if err != nil { 991 return err 992 } 993 994 ipamV4Conf.SubPool = fCIDR.String() 995 if ipamV4Conf.PreferredPool == "" { 996 ipamV4Conf.PreferredPool = fCIDR.String() 997 } 998 } 999 1000 if cfg.DefaultGatewayIPv4 != nil { 1001 ipamV4Conf.AuxAddresses["DefaultGatewayIPv4"] = cfg.DefaultGatewayIPv4.String() 1002 } 1003 1004 var ( 1005 deferIPv6Alloc bool 1006 ipamV6Conf *libnetwork.IpamConf 1007 ) 1008 1009 if cfg.EnableIPv6 && cfg.FixedCIDRv6 == "" { 1010 return errdefs.InvalidParameter(errors.New("IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6")) 1011 } else if cfg.FixedCIDRv6 != "" { 1012 _, fCIDRv6, err := net.ParseCIDR(cfg.FixedCIDRv6) 1013 if err != nil { 1014 return err 1015 } 1016 1017 // In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has 1018 // at least 48 host bits, we need to guarantee the current behavior where the containers' 1019 // IPv6 addresses will be constructed based on the containers' interface MAC address. 1020 // We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints 1021 // on this network until after the driver has created the endpoint and returned the 1022 // constructed address. Libnetwork will then reserve this address with the ipam driver. 1023 ones, _ := fCIDRv6.Mask.Size() 1024 deferIPv6Alloc = ones <= 80 1025 1026 ipamV6Conf = &libnetwork.IpamConf{ 1027 AuxAddresses: make(map[string]string), 1028 PreferredPool: fCIDRv6.String(), 1029 } 1030 1031 // In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6 1032 // address belongs to the same network, we need to inform libnetwork about it, so 1033 // that it can be reserved with IPAM and it will not be given away to somebody else 1034 for _, nw6 := range nw6List { 1035 if fCIDRv6.Contains(nw6.IP) { 1036 ipamV6Conf.Gateway = nw6.IP.String() 1037 break 1038 } 1039 } 1040 } 1041 1042 if cfg.DefaultGatewayIPv6 != nil { 1043 if ipamV6Conf == nil { 1044 ipamV6Conf = &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 1045 } 1046 ipamV6Conf.AuxAddresses["DefaultGatewayIPv6"] = cfg.DefaultGatewayIPv6.String() 1047 } 1048 1049 v4Conf := []*libnetwork.IpamConf{ipamV4Conf} 1050 v6Conf := []*libnetwork.IpamConf{} 1051 if ipamV6Conf != nil { 1052 v6Conf = append(v6Conf, ipamV6Conf) 1053 } 1054 // Initialize default network on "bridge" with the same name 1055 _, err = controller.NewNetwork("bridge", network.NetworkBridge, "", 1056 libnetwork.NetworkOptionEnableIPv6(cfg.EnableIPv6), 1057 libnetwork.NetworkOptionDriverOpts(netOption), 1058 libnetwork.NetworkOptionIpam("default", "", v4Conf, v6Conf, nil), 1059 libnetwork.NetworkOptionDeferIPv6Alloc(deferIPv6Alloc)) 1060 if err != nil { 1061 return fmt.Errorf(`error creating default %q network: %v`, network.NetworkBridge, err) 1062 } 1063 return nil 1064 } 1065 1066 // Remove default bridge interface if present (--bridge=none use case) 1067 func removeDefaultBridgeInterface() { 1068 if lnk, err := netlink.LinkByName(bridge.DefaultBridgeName); err == nil { 1069 if err := netlink.LinkDel(lnk); err != nil { 1070 log.G(context.TODO()).Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err) 1071 } 1072 } 1073 } 1074 1075 func setupInitLayer(idMapping idtools.IdentityMapping) func(string) error { 1076 return func(initPath string) error { 1077 return initlayer.Setup(initPath, idMapping.RootPair()) 1078 } 1079 } 1080 1081 // Parse the remapped root (user namespace) option, which can be one of: 1082 // 1083 // - username - valid username from /etc/passwd 1084 // - username:groupname - valid username; valid groupname from /etc/group 1085 // - uid - 32-bit unsigned int valid Linux UID value 1086 // - uid:gid - uid value; 32-bit unsigned int Linux GID value 1087 // 1088 // If no groupname is specified, and a username is specified, an attempt 1089 // will be made to lookup a gid for that username as a groupname 1090 // 1091 // If names are used, they are verified to exist in passwd/group 1092 func parseRemappedRoot(usergrp string) (string, string, error) { 1093 var ( 1094 userID, groupID int 1095 username, groupname string 1096 ) 1097 1098 idparts := strings.Split(usergrp, ":") 1099 if len(idparts) > 2 { 1100 return "", "", fmt.Errorf("Invalid user/group specification in --userns-remap: %q", usergrp) 1101 } 1102 1103 if uid, err := strconv.ParseInt(idparts[0], 10, 32); err == nil { 1104 // must be a uid; take it as valid 1105 userID = int(uid) 1106 luser, err := idtools.LookupUID(userID) 1107 if err != nil { 1108 return "", "", fmt.Errorf("Uid %d has no entry in /etc/passwd: %v", userID, err) 1109 } 1110 username = luser.Name 1111 if len(idparts) == 1 { 1112 // if the uid was numeric and no gid was specified, take the uid as the gid 1113 groupID = userID 1114 lgrp, err := idtools.LookupGID(groupID) 1115 if err != nil { 1116 return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err) 1117 } 1118 groupname = lgrp.Name 1119 } 1120 } else { 1121 lookupName := idparts[0] 1122 // special case: if the user specified "default", they want Docker to create or 1123 // use (after creation) the "dockremap" user/group for root remapping 1124 if lookupName == defaultIDSpecifier { 1125 lookupName = defaultRemappedID 1126 } 1127 luser, err := idtools.LookupUser(lookupName) 1128 if err != nil && idparts[0] != defaultIDSpecifier { 1129 // error if the name requested isn't the special "dockremap" ID 1130 return "", "", fmt.Errorf("Error during uid lookup for %q: %v", lookupName, err) 1131 } else if err != nil { 1132 // special case-- if the username == "default", then we have been asked 1133 // to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid} 1134 // ranges will be used for the user and group mappings in user namespaced containers 1135 _, _, err := idtools.AddNamespaceRangesUser(defaultRemappedID) 1136 if err == nil { 1137 return defaultRemappedID, defaultRemappedID, nil 1138 } 1139 return "", "", fmt.Errorf("Error during %q user creation: %v", defaultRemappedID, err) 1140 } 1141 username = luser.Name 1142 if len(idparts) == 1 { 1143 // we only have a string username, and no group specified; look up gid from username as group 1144 group, err := idtools.LookupGroup(lookupName) 1145 if err != nil { 1146 return "", "", fmt.Errorf("Error during gid lookup for %q: %v", lookupName, err) 1147 } 1148 groupname = group.Name 1149 } 1150 } 1151 1152 if len(idparts) == 2 { 1153 // groupname or gid is separately specified and must be resolved 1154 // to an unsigned 32-bit gid 1155 if gid, err := strconv.ParseInt(idparts[1], 10, 32); err == nil { 1156 // must be a gid, take it as valid 1157 groupID = int(gid) 1158 lgrp, err := idtools.LookupGID(groupID) 1159 if err != nil { 1160 return "", "", fmt.Errorf("Gid %d has no entry in /etc/passwd: %v", groupID, err) 1161 } 1162 groupname = lgrp.Name 1163 } else { 1164 // not a number; attempt a lookup 1165 if _, err := idtools.LookupGroup(idparts[1]); err != nil { 1166 return "", "", fmt.Errorf("Error during groupname lookup for %q: %v", idparts[1], err) 1167 } 1168 groupname = idparts[1] 1169 } 1170 } 1171 return username, groupname, nil 1172 } 1173 1174 func setupRemappedRoot(config *config.Config) (idtools.IdentityMapping, error) { 1175 if runtime.GOOS != "linux" && config.RemappedRoot != "" { 1176 return idtools.IdentityMapping{}, fmt.Errorf("User namespaces are only supported on Linux") 1177 } 1178 1179 // if the daemon was started with remapped root option, parse 1180 // the config option to the int uid,gid values 1181 if config.RemappedRoot != "" { 1182 username, groupname, err := parseRemappedRoot(config.RemappedRoot) 1183 if err != nil { 1184 return idtools.IdentityMapping{}, err 1185 } 1186 if username == "root" { 1187 // Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op 1188 // effectively 1189 log.G(context.TODO()).Warn("User namespaces: root cannot be remapped with itself; user namespaces are OFF") 1190 return idtools.IdentityMapping{}, nil 1191 } 1192 log.G(context.TODO()).Infof("User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s", username) 1193 // update remapped root setting now that we have resolved them to actual names 1194 config.RemappedRoot = fmt.Sprintf("%s:%s", username, groupname) 1195 1196 mappings, err := idtools.LoadIdentityMapping(username) 1197 if err != nil { 1198 return idtools.IdentityMapping{}, errors.Wrap(err, "Can't create ID mappings") 1199 } 1200 return mappings, nil 1201 } 1202 return idtools.IdentityMapping{}, nil 1203 } 1204 1205 func setupDaemonRoot(config *config.Config, rootDir string, remappedRoot idtools.Identity) error { 1206 config.Root = rootDir 1207 // the docker root metadata directory needs to have execute permissions for all users (g+x,o+x) 1208 // so that syscalls executing as non-root, operating on subdirectories of the graph root 1209 // (e.g. mounted layers of a container) can traverse this path. 1210 // The user namespace support will create subdirectories for the remapped root host uid:gid 1211 // pair owned by that same uid:gid pair for proper write access to those needed metadata and 1212 // layer content subtrees. 1213 if _, err := os.Stat(rootDir); err == nil { 1214 // root current exists; verify the access bits are correct by setting them 1215 if err = os.Chmod(rootDir, 0o711); err != nil { 1216 return err 1217 } 1218 } else if os.IsNotExist(err) { 1219 // no root exists yet, create it 0711 with root:root ownership 1220 if err := os.MkdirAll(rootDir, 0o711); err != nil { 1221 return err 1222 } 1223 } 1224 1225 id := idtools.Identity{UID: idtools.CurrentIdentity().UID, GID: remappedRoot.GID} 1226 // First make sure the current root dir has the correct perms. 1227 if err := idtools.MkdirAllAndChown(config.Root, 0o710, id); err != nil { 1228 return errors.Wrapf(err, "could not create or set daemon root permissions: %s", config.Root) 1229 } 1230 1231 // if user namespaces are enabled we will create a subtree underneath the specified root 1232 // with any/all specified remapped root uid/gid options on the daemon creating 1233 // a new subdirectory with ownership set to the remapped uid/gid (so as to allow 1234 // `chdir()` to work for containers namespaced to that uid/gid) 1235 if config.RemappedRoot != "" { 1236 config.Root = filepath.Join(rootDir, fmt.Sprintf("%d.%d", remappedRoot.UID, remappedRoot.GID)) 1237 log.G(context.TODO()).Debugf("Creating user namespaced daemon root: %s", config.Root) 1238 // Create the root directory if it doesn't exist 1239 if err := idtools.MkdirAllAndChown(config.Root, 0o710, id); err != nil { 1240 return fmt.Errorf("Cannot create daemon root: %s: %v", config.Root, err) 1241 } 1242 // we also need to verify that any pre-existing directories in the path to 1243 // the graphroot won't block access to remapped root--if any pre-existing directory 1244 // has strict permissions that don't allow "x", container start will fail, so 1245 // better to warn and fail now 1246 dirPath := config.Root 1247 for { 1248 dirPath = filepath.Dir(dirPath) 1249 if dirPath == "/" { 1250 break 1251 } 1252 if !canAccess(dirPath, remappedRoot) { 1253 return fmt.Errorf("a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' permissions on existing directories", config.Root) 1254 } 1255 } 1256 } 1257 1258 if err := setupDaemonRootPropagation(config); err != nil { 1259 log.G(context.TODO()).WithError(err).WithField("dir", config.Root).Warn("Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior") 1260 } 1261 return nil 1262 } 1263 1264 // canAccess takes a valid (existing) directory and a uid, gid pair and determines 1265 // if that uid, gid pair has access (execute bit) to the directory. 1266 // 1267 // Note: this is a very rudimentary check, and may not produce accurate results, 1268 // so should not be used for anything other than the current use, see: 1269 // https://github.com/moby/moby/issues/43724 1270 func canAccess(path string, pair idtools.Identity) bool { 1271 statInfo, err := os.Stat(path) 1272 if err != nil { 1273 return false 1274 } 1275 perms := statInfo.Mode().Perm() 1276 if perms&0o001 == 0o001 { 1277 // world access 1278 return true 1279 } 1280 ssi := statInfo.Sys().(*syscall.Stat_t) 1281 if ssi.Uid == uint32(pair.UID) && (perms&0o100 == 0o100) { 1282 // owner access. 1283 return true 1284 } 1285 if ssi.Gid == uint32(pair.GID) && (perms&0o010 == 0o010) { 1286 // group access. 1287 return true 1288 } 1289 return false 1290 } 1291 1292 func setupDaemonRootPropagation(cfg *config.Config) error { 1293 rootParentMount, mountOptions, err := getSourceMount(cfg.Root) 1294 if err != nil { 1295 return errors.Wrap(err, "error getting daemon root's parent mount") 1296 } 1297 1298 var cleanupOldFile bool 1299 cleanupFile := getUnmountOnShutdownPath(cfg) 1300 defer func() { 1301 if !cleanupOldFile { 1302 return 1303 } 1304 if err := os.Remove(cleanupFile); err != nil && !os.IsNotExist(err) { 1305 log.G(context.TODO()).WithError(err).WithField("file", cleanupFile).Warn("could not clean up old root propagation unmount file") 1306 } 1307 }() 1308 1309 if hasMountInfoOption(mountOptions, sharedPropagationOption, slavePropagationOption) { 1310 cleanupOldFile = true 1311 return nil 1312 } 1313 1314 if err := mount.MakeShared(cfg.Root); err != nil { 1315 return errors.Wrap(err, "could not setup daemon root propagation to shared") 1316 } 1317 1318 // check the case where this may have already been a mount to itself. 1319 // If so then the daemon only performed a remount and should not try to unmount this later. 1320 if rootParentMount == cfg.Root { 1321 cleanupOldFile = true 1322 return nil 1323 } 1324 1325 if err := os.MkdirAll(filepath.Dir(cleanupFile), 0o700); err != nil { 1326 return errors.Wrap(err, "error creating dir to store mount cleanup file") 1327 } 1328 1329 if err := os.WriteFile(cleanupFile, nil, 0o600); err != nil { 1330 return errors.Wrap(err, "error writing file to signal mount cleanup on shutdown") 1331 } 1332 return nil 1333 } 1334 1335 // getUnmountOnShutdownPath generates the path to used when writing the file that signals to the daemon that on shutdown 1336 // the daemon root should be unmounted. 1337 func getUnmountOnShutdownPath(config *config.Config) string { 1338 return filepath.Join(config.ExecRoot, "unmount-on-shutdown") 1339 } 1340 1341 // registerLinks registers network links between container and other containers 1342 // with the daemon using the specification in hostConfig. 1343 func (daemon *Daemon) registerLinks(container *container.Container, hostConfig *containertypes.HostConfig) error { 1344 if hostConfig == nil || hostConfig.NetworkMode.IsUserDefined() { 1345 return nil 1346 } 1347 1348 for _, l := range hostConfig.Links { 1349 name, alias, err := opts.ParseLink(l) 1350 if err != nil { 1351 return err 1352 } 1353 child, err := daemon.GetContainer(name) 1354 if err != nil { 1355 if errdefs.IsNotFound(err) { 1356 // Trying to link to a non-existing container is not valid, and 1357 // should return an "invalid parameter" error. Returning a "not 1358 // found" error here would make the client report the container's 1359 // image could not be found (see moby/moby#39823) 1360 err = errdefs.InvalidParameter(err) 1361 } 1362 return errors.Wrapf(err, "could not get container for %s", name) 1363 } 1364 for child.HostConfig.NetworkMode.IsContainer() { 1365 cid := child.HostConfig.NetworkMode.ConnectedContainer() 1366 child, err = daemon.GetContainer(cid) 1367 if err != nil { 1368 if errdefs.IsNotFound(err) { 1369 // Trying to link to a non-existing container is not valid, and 1370 // should return an "invalid parameter" error. Returning a "not 1371 // found" error here would make the client report the container's 1372 // image could not be found (see moby/moby#39823) 1373 err = errdefs.InvalidParameter(err) 1374 } 1375 return errors.Wrapf(err, "could not get container for %s", cid) 1376 } 1377 } 1378 if child.HostConfig.NetworkMode.IsHost() { 1379 return runconfig.ErrConflictHostNetworkAndLinks 1380 } 1381 if err := daemon.registerLink(container, child, alias); err != nil { 1382 return err 1383 } 1384 } 1385 1386 return nil 1387 } 1388 1389 // conditionalMountOnStart is a platform specific helper function during the 1390 // container start to call mount. 1391 func (daemon *Daemon) conditionalMountOnStart(container *container.Container) error { 1392 return daemon.Mount(container) 1393 } 1394 1395 // conditionalUnmountOnCleanup is a platform specific helper function called 1396 // during the cleanup of a container to unmount. 1397 func (daemon *Daemon) conditionalUnmountOnCleanup(container *container.Container) error { 1398 return daemon.Unmount(container) 1399 } 1400 1401 // setDefaultIsolation determines the default isolation mode for the 1402 // daemon to run in. This is only applicable on Windows 1403 func (daemon *Daemon) setDefaultIsolation(*config.Config) error { 1404 return nil 1405 } 1406 1407 // This is used to allow removal of mountpoints that may be mounted in other 1408 // namespaces on RHEL based kernels starting from RHEL 7.4. 1409 // Without this setting, removals on these RHEL based kernels may fail with 1410 // "device or resource busy". 1411 // This setting is not available in upstream kernels as it is not configurable, 1412 // but has been in the upstream kernels since 3.15. 1413 func setMayDetachMounts() error { 1414 f, err := os.OpenFile("/proc/sys/fs/may_detach_mounts", os.O_WRONLY, 0) 1415 if err != nil { 1416 if os.IsNotExist(err) { 1417 return nil 1418 } 1419 return errors.Wrap(err, "error opening may_detach_mounts kernel config file") 1420 } 1421 defer f.Close() 1422 1423 _, err = f.WriteString("1") 1424 if os.IsPermission(err) { 1425 // Setting may_detach_mounts does not work in an 1426 // unprivileged container. Ignore the error, but log 1427 // it if we appear not to be in that situation. 1428 if !userns.RunningInUserNS() { 1429 log.G(context.TODO()).Debugf("Permission denied writing %q to /proc/sys/fs/may_detach_mounts", "1") 1430 } 1431 return nil 1432 } 1433 return err 1434 } 1435 1436 func (daemon *Daemon) initCPURtController(cfg *config.Config, mnt, path string) error { 1437 if path == "/" || path == "." { 1438 return nil 1439 } 1440 1441 // Recursively create cgroup to ensure that the system and all parent cgroups have values set 1442 // for the period and runtime as this limits what the children can be set to. 1443 if err := daemon.initCPURtController(cfg, mnt, filepath.Dir(path)); err != nil { 1444 return err 1445 } 1446 1447 path = filepath.Join(mnt, path) 1448 if err := os.MkdirAll(path, 0o755); err != nil { 1449 return err 1450 } 1451 if err := maybeCreateCPURealTimeFile(cfg.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil { 1452 return err 1453 } 1454 return maybeCreateCPURealTimeFile(cfg.CPURealtimeRuntime, "cpu.rt_runtime_us", path) 1455 } 1456 1457 func maybeCreateCPURealTimeFile(configValue int64, file string, path string) error { 1458 if configValue == 0 { 1459 return nil 1460 } 1461 return os.WriteFile(filepath.Join(path, file), []byte(strconv.FormatInt(configValue, 10)), 0o700) 1462 } 1463 1464 func (daemon *Daemon) setupSeccompProfile(cfg *config.Config) error { 1465 switch profile := cfg.SeccompProfile; profile { 1466 case "", config.SeccompProfileDefault: 1467 daemon.seccompProfilePath = config.SeccompProfileDefault 1468 case config.SeccompProfileUnconfined: 1469 daemon.seccompProfilePath = config.SeccompProfileUnconfined 1470 default: 1471 daemon.seccompProfilePath = profile 1472 b, err := os.ReadFile(profile) 1473 if err != nil { 1474 return fmt.Errorf("opening seccomp profile (%s) failed: %v", profile, err) 1475 } 1476 daemon.seccompProfile = b 1477 } 1478 return nil 1479 } 1480 1481 func getSysInfo(cfg *config.Config) *sysinfo.SysInfo { 1482 var siOpts []sysinfo.Opt 1483 if cgroupDriver(cfg) == cgroupSystemdDriver { 1484 if euid := os.Getenv("ROOTLESSKIT_PARENT_EUID"); euid != "" { 1485 siOpts = append(siOpts, sysinfo.WithCgroup2GroupPath("/user.slice/user-"+euid+".slice")) 1486 } 1487 } 1488 return sysinfo.New(siOpts...) 1489 } 1490 1491 func (daemon *Daemon) initLibcontainerd(ctx context.Context, cfg *config.Config) error { 1492 var err error 1493 daemon.containerd, err = remote.NewClient( 1494 ctx, 1495 daemon.containerdClient, 1496 filepath.Join(cfg.ExecRoot, "containerd"), 1497 cfg.ContainerdNamespace, 1498 daemon, 1499 ) 1500 return err 1501 } 1502 1503 func recursiveUnmount(target string) error { 1504 return mount.RecursiveUnmount(target) 1505 }