github.com/afbjorklund/moby@v20.10.5+incompatible/daemon/daemon_unix.go (about) 1 // +build linux freebsd 2 3 package daemon // import "github.com/docker/docker/daemon" 4 5 import ( 6 "bufio" 7 "context" 8 "fmt" 9 "io/ioutil" 10 "net" 11 "os" 12 "path/filepath" 13 "runtime" 14 "runtime/debug" 15 "strconv" 16 "strings" 17 "time" 18 19 "github.com/containerd/cgroups" 20 statsV1 "github.com/containerd/cgroups/stats/v1" 21 statsV2 "github.com/containerd/cgroups/v2/stats" 22 "github.com/containerd/containerd/sys" 23 "github.com/docker/docker/api/types" 24 "github.com/docker/docker/api/types/blkiodev" 25 pblkiodev "github.com/docker/docker/api/types/blkiodev" 26 containertypes "github.com/docker/docker/api/types/container" 27 "github.com/docker/docker/container" 28 "github.com/docker/docker/daemon/config" 29 "github.com/docker/docker/daemon/initlayer" 30 "github.com/docker/docker/errdefs" 31 "github.com/docker/docker/opts" 32 "github.com/docker/docker/pkg/containerfs" 33 "github.com/docker/docker/pkg/idtools" 34 "github.com/docker/docker/pkg/parsers" 35 "github.com/docker/docker/pkg/parsers/kernel" 36 "github.com/docker/docker/pkg/sysinfo" 37 "github.com/docker/docker/runconfig" 38 volumemounts "github.com/docker/docker/volume/mounts" 39 "github.com/docker/libnetwork" 40 nwconfig "github.com/docker/libnetwork/config" 41 "github.com/docker/libnetwork/drivers/bridge" 42 "github.com/docker/libnetwork/netlabel" 43 "github.com/docker/libnetwork/netutils" 44 "github.com/docker/libnetwork/options" 45 lntypes "github.com/docker/libnetwork/types" 46 "github.com/moby/sys/mount" 47 specs "github.com/opencontainers/runtime-spec/specs-go" 48 "github.com/opencontainers/selinux/go-selinux" 49 "github.com/opencontainers/selinux/go-selinux/label" 50 "github.com/pkg/errors" 51 "github.com/sirupsen/logrus" 52 "github.com/vishvananda/netlink" 53 "golang.org/x/sys/unix" 54 ) 55 56 const ( 57 isWindows = false 58 59 // DefaultShimBinary is the default shim to be used by containerd if none 60 // is specified 61 DefaultShimBinary = "containerd-shim" 62 63 // DefaultRuntimeBinary is the default runtime to be used by 64 // containerd if none is specified 65 DefaultRuntimeBinary = "runc" 66 67 // See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269 68 linuxMinCPUShares = 2 69 linuxMaxCPUShares = 262144 70 platformSupported = true 71 // It's not kernel limit, we want this 6M limit to account for overhead during startup, and to supply a reasonable functional container 72 linuxMinMemory = 6291456 73 // constants for remapped root settings 74 defaultIDSpecifier = "default" 75 defaultRemappedID = "dockremap" 76 77 // constant for cgroup drivers 78 cgroupFsDriver = "cgroupfs" 79 cgroupSystemdDriver = "systemd" 80 cgroupNoneDriver = "none" 81 ) 82 83 type containerGetter interface { 84 GetContainer(string) (*container.Container, error) 85 } 86 87 func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory { 88 memory := specs.LinuxMemory{} 89 90 if config.Memory > 0 { 91 memory.Limit = &config.Memory 92 } 93 94 if config.MemoryReservation > 0 { 95 memory.Reservation = &config.MemoryReservation 96 } 97 98 if config.MemorySwap > 0 { 99 memory.Swap = &config.MemorySwap 100 } 101 102 if config.MemorySwappiness != nil { 103 swappiness := uint64(*config.MemorySwappiness) 104 memory.Swappiness = &swappiness 105 } 106 107 if config.OomKillDisable != nil { 108 memory.DisableOOMKiller = config.OomKillDisable 109 } 110 111 if config.KernelMemory != 0 { 112 memory.Kernel = &config.KernelMemory 113 } 114 115 if config.KernelMemoryTCP != 0 { 116 memory.KernelTCP = &config.KernelMemoryTCP 117 } 118 119 return &memory 120 } 121 122 func getPidsLimit(config containertypes.Resources) *specs.LinuxPids { 123 if config.PidsLimit == nil { 124 return nil 125 } 126 if *config.PidsLimit <= 0 { 127 // docker API allows 0 and negative values to unset this to be consistent 128 // with default values. When updating values, runc requires -1 to unset 129 // the previous limit. 130 return &specs.LinuxPids{Limit: -1} 131 } 132 return &specs.LinuxPids{Limit: *config.PidsLimit} 133 } 134 135 func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { 136 cpu := specs.LinuxCPU{} 137 138 if config.CPUShares < 0 { 139 return nil, fmt.Errorf("shares: invalid argument") 140 } 141 if config.CPUShares >= 0 { 142 shares := uint64(config.CPUShares) 143 cpu.Shares = &shares 144 } 145 146 if config.CpusetCpus != "" { 147 cpu.Cpus = config.CpusetCpus 148 } 149 150 if config.CpusetMems != "" { 151 cpu.Mems = config.CpusetMems 152 } 153 154 if config.NanoCPUs > 0 { 155 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 156 period := uint64(100 * time.Millisecond / time.Microsecond) 157 quota := config.NanoCPUs * int64(period) / 1e9 158 cpu.Period = &period 159 cpu.Quota = "a 160 } 161 162 if config.CPUPeriod != 0 { 163 period := uint64(config.CPUPeriod) 164 cpu.Period = &period 165 } 166 167 if config.CPUQuota != 0 { 168 q := config.CPUQuota 169 cpu.Quota = &q 170 } 171 172 if config.CPURealtimePeriod != 0 { 173 period := uint64(config.CPURealtimePeriod) 174 cpu.RealtimePeriod = &period 175 } 176 177 if config.CPURealtimeRuntime != 0 { 178 c := config.CPURealtimeRuntime 179 cpu.RealtimeRuntime = &c 180 } 181 182 return &cpu, nil 183 } 184 185 func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) { 186 var stat unix.Stat_t 187 var blkioWeightDevices []specs.LinuxWeightDevice 188 189 for _, weightDevice := range config.BlkioWeightDevice { 190 if err := unix.Stat(weightDevice.Path, &stat); err != nil { 191 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: weightDevice.Path, Err: err}) 192 } 193 weight := weightDevice.Weight 194 d := specs.LinuxWeightDevice{Weight: &weight} 195 // The type is 32bit on mips. 196 d.Major = int64(unix.Major(uint64(stat.Rdev))) // nolint: unconvert 197 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) // nolint: unconvert 198 blkioWeightDevices = append(blkioWeightDevices, d) 199 } 200 201 return blkioWeightDevices, nil 202 } 203 204 func (daemon *Daemon) parseSecurityOpt(container *container.Container, hostConfig *containertypes.HostConfig) error { 205 container.NoNewPrivileges = daemon.configStore.NoNewPrivileges 206 return parseSecurityOpt(container, hostConfig) 207 } 208 209 func parseSecurityOpt(container *container.Container, config *containertypes.HostConfig) error { 210 var ( 211 labelOpts []string 212 err error 213 ) 214 215 for _, opt := range config.SecurityOpt { 216 if opt == "no-new-privileges" { 217 container.NoNewPrivileges = true 218 continue 219 } 220 if opt == "disable" { 221 labelOpts = append(labelOpts, "disable") 222 continue 223 } 224 225 var con []string 226 if strings.Contains(opt, "=") { 227 con = strings.SplitN(opt, "=", 2) 228 } else if strings.Contains(opt, ":") { 229 con = strings.SplitN(opt, ":", 2) 230 logrus.Warn("Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead.") 231 } 232 if len(con) != 2 { 233 return fmt.Errorf("invalid --security-opt 1: %q", opt) 234 } 235 236 switch con[0] { 237 case "label": 238 labelOpts = append(labelOpts, con[1]) 239 case "apparmor": 240 container.AppArmorProfile = con[1] 241 case "seccomp": 242 container.SeccompProfile = con[1] 243 case "no-new-privileges": 244 noNewPrivileges, err := strconv.ParseBool(con[1]) 245 if err != nil { 246 return fmt.Errorf("invalid --security-opt 2: %q", opt) 247 } 248 container.NoNewPrivileges = noNewPrivileges 249 default: 250 return fmt.Errorf("invalid --security-opt 2: %q", opt) 251 } 252 } 253 254 container.ProcessLabel, container.MountLabel, err = label.InitLabels(labelOpts) 255 return err 256 } 257 258 func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThrottleDevice, error) { 259 var throttleDevices []specs.LinuxThrottleDevice 260 var stat unix.Stat_t 261 262 for _, d := range devs { 263 if err := unix.Stat(d.Path, &stat); err != nil { 264 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: d.Path, Err: err}) 265 } 266 d := specs.LinuxThrottleDevice{Rate: d.Rate} 267 // the type is 32bit on mips 268 d.Major = int64(unix.Major(uint64(stat.Rdev))) // nolint: unconvert 269 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) // nolint: unconvert 270 throttleDevices = append(throttleDevices, d) 271 } 272 273 return throttleDevices, nil 274 } 275 276 // adjustParallelLimit takes a number of objects and a proposed limit and 277 // figures out if it's reasonable (and adjusts it accordingly). This is only 278 // used for daemon startup, which does a lot of parallel loading of containers 279 // (and if we exceed RLIMIT_NOFILE then we're in trouble). 280 func adjustParallelLimit(n int, limit int) int { 281 // Rule-of-thumb overhead factor (how many files will each goroutine open 282 // simultaneously). Yes, this is ugly but to be frank this whole thing is 283 // ugly. 284 const overhead = 2 285 286 // On Linux, we need to ensure that parallelStartupJobs doesn't cause us to 287 // exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it 288 // and give a warning (since in theory the user should increase their 289 // ulimits to the largest possible value for dockerd). 290 var rlim unix.Rlimit 291 if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil { 292 logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err) 293 return limit 294 } 295 softRlimit := int(rlim.Cur) 296 297 // Much fewer containers than RLIMIT_NOFILE. No need to adjust anything. 298 if softRlimit > overhead*n { 299 return limit 300 } 301 302 // RLIMIT_NOFILE big enough, no need to adjust anything. 303 if softRlimit > overhead*limit { 304 return limit 305 } 306 307 logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit) 308 return softRlimit / overhead 309 } 310 311 func checkKernel() error { 312 // Check for unsupported kernel versions 313 // FIXME: it would be cleaner to not test for specific versions, but rather 314 // test for specific functionalities. 315 // Unfortunately we can't test for the feature "does not cause a kernel panic" 316 // without actually causing a kernel panic, so we need this workaround until 317 // the circumstances of pre-3.10 crashes are clearer. 318 // For details see https://github.com/docker/docker/issues/407 319 // Docker 1.11 and above doesn't actually run on kernels older than 3.4, 320 // due to containerd-shim usage of PR_SET_CHILD_SUBREAPER (introduced in 3.4). 321 if !kernel.CheckKernelVersion(3, 10, 0) { 322 v, _ := kernel.GetKernelVersion() 323 if os.Getenv("DOCKER_NOWARN_KERNEL_VERSION") == "" { 324 logrus.Fatalf("Your Linux kernel version %s is not supported for running docker. Please upgrade your kernel to 3.10.0 or newer.", v.String()) 325 } 326 } 327 return nil 328 } 329 330 // adaptContainerSettings is called during container creation to modify any 331 // settings necessary in the HostConfig structure. 332 func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConfig, adjustCPUShares bool) error { 333 if adjustCPUShares && hostConfig.CPUShares > 0 { 334 // Handle unsupported CPUShares 335 if hostConfig.CPUShares < linuxMinCPUShares { 336 logrus.Warnf("Changing requested CPUShares of %d to minimum allowed of %d", hostConfig.CPUShares, linuxMinCPUShares) 337 hostConfig.CPUShares = linuxMinCPUShares 338 } else if hostConfig.CPUShares > linuxMaxCPUShares { 339 logrus.Warnf("Changing requested CPUShares of %d to maximum allowed of %d", hostConfig.CPUShares, linuxMaxCPUShares) 340 hostConfig.CPUShares = linuxMaxCPUShares 341 } 342 } 343 if hostConfig.Memory > 0 && hostConfig.MemorySwap == 0 { 344 // By default, MemorySwap is set to twice the size of Memory. 345 hostConfig.MemorySwap = hostConfig.Memory * 2 346 } 347 if hostConfig.ShmSize == 0 { 348 hostConfig.ShmSize = config.DefaultShmSize 349 if daemon.configStore != nil { 350 hostConfig.ShmSize = int64(daemon.configStore.ShmSize) 351 } 352 } 353 // Set default IPC mode, if unset for container 354 if hostConfig.IpcMode.IsEmpty() { 355 m := config.DefaultIpcMode 356 if daemon.configStore != nil { 357 m = daemon.configStore.IpcMode 358 } 359 hostConfig.IpcMode = containertypes.IpcMode(m) 360 } 361 362 // Set default cgroup namespace mode, if unset for container 363 if hostConfig.CgroupnsMode.IsEmpty() { 364 // for cgroup v2: unshare cgroupns even for privileged containers 365 // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 366 if hostConfig.Privileged && cgroups.Mode() != cgroups.Unified { 367 hostConfig.CgroupnsMode = containertypes.CgroupnsMode("host") 368 } else { 369 m := "host" 370 if cgroups.Mode() == cgroups.Unified { 371 m = "private" 372 } 373 if daemon.configStore != nil { 374 m = daemon.configStore.CgroupNamespaceMode 375 } 376 hostConfig.CgroupnsMode = containertypes.CgroupnsMode(m) 377 } 378 } 379 380 adaptSharedNamespaceContainer(daemon, hostConfig) 381 382 var err error 383 secOpts, err := daemon.generateSecurityOpt(hostConfig) 384 if err != nil { 385 return err 386 } 387 hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, secOpts...) 388 if hostConfig.OomKillDisable == nil { 389 defaultOomKillDisable := false 390 hostConfig.OomKillDisable = &defaultOomKillDisable 391 } 392 393 return nil 394 } 395 396 // adaptSharedNamespaceContainer replaces container name with its ID in hostConfig. 397 // To be more precisely, it modifies `container:name` to `container:ID` of PidMode, IpcMode 398 // and NetworkMode. 399 // 400 // When a container shares its namespace with another container, use ID can keep the namespace 401 // sharing connection between the two containers even the another container is renamed. 402 func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) { 403 containerPrefix := "container:" 404 if hostConfig.PidMode.IsContainer() { 405 pidContainer := hostConfig.PidMode.Container() 406 // if there is any error returned here, we just ignore it and leave it to be 407 // handled in the following logic 408 if c, err := daemon.GetContainer(pidContainer); err == nil { 409 hostConfig.PidMode = containertypes.PidMode(containerPrefix + c.ID) 410 } 411 } 412 if hostConfig.IpcMode.IsContainer() { 413 ipcContainer := hostConfig.IpcMode.Container() 414 if c, err := daemon.GetContainer(ipcContainer); err == nil { 415 hostConfig.IpcMode = containertypes.IpcMode(containerPrefix + c.ID) 416 } 417 } 418 if hostConfig.NetworkMode.IsContainer() { 419 netContainer := hostConfig.NetworkMode.ConnectedContainer() 420 if c, err := daemon.GetContainer(netContainer); err == nil { 421 hostConfig.NetworkMode = containertypes.NetworkMode(containerPrefix + c.ID) 422 } 423 } 424 } 425 426 // verifyPlatformContainerResources performs platform-specific validation of the container's resource-configuration 427 func verifyPlatformContainerResources(resources *containertypes.Resources, sysInfo *sysinfo.SysInfo, update bool) (warnings []string, err error) { 428 fixMemorySwappiness(resources) 429 430 // memory subsystem checks and adjustments 431 if resources.Memory != 0 && resources.Memory < linuxMinMemory { 432 return warnings, fmt.Errorf("Minimum memory limit allowed is 6MB") 433 } 434 if resources.Memory > 0 && !sysInfo.MemoryLimit { 435 warnings = append(warnings, "Your kernel does not support memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 436 resources.Memory = 0 437 resources.MemorySwap = -1 438 } 439 if resources.Memory > 0 && resources.MemorySwap != -1 && !sysInfo.SwapLimit { 440 warnings = append(warnings, "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.") 441 resources.MemorySwap = -1 442 } 443 if resources.Memory > 0 && resources.MemorySwap > 0 && resources.MemorySwap < resources.Memory { 444 return warnings, fmt.Errorf("Minimum memoryswap limit should be larger than memory limit, see usage") 445 } 446 if resources.Memory == 0 && resources.MemorySwap > 0 && !update { 447 return warnings, fmt.Errorf("You should always set the Memory limit when using Memoryswap limit, see usage") 448 } 449 if resources.MemorySwappiness != nil && !sysInfo.MemorySwappiness { 450 warnings = append(warnings, "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded.") 451 resources.MemorySwappiness = nil 452 } 453 if resources.MemorySwappiness != nil { 454 swappiness := *resources.MemorySwappiness 455 if swappiness < 0 || swappiness > 100 { 456 return warnings, fmt.Errorf("Invalid value: %v, valid memory swappiness range is 0-100", swappiness) 457 } 458 } 459 if resources.MemoryReservation > 0 && !sysInfo.MemoryReservation { 460 warnings = append(warnings, "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. Limitation discarded.") 461 resources.MemoryReservation = 0 462 } 463 if resources.MemoryReservation > 0 && resources.MemoryReservation < linuxMinMemory { 464 return warnings, fmt.Errorf("Minimum memory reservation allowed is 6MB") 465 } 466 if resources.Memory > 0 && resources.MemoryReservation > 0 && resources.Memory < resources.MemoryReservation { 467 return warnings, fmt.Errorf("Minimum memory limit can not be less than memory reservation limit, see usage") 468 } 469 if resources.KernelMemory > 0 { 470 // Kernel memory limit is not supported on cgroup v2. 471 // Even on cgroup v1, kernel memory limit (`kmem.limit_in_bytes`) has been deprecated since kernel 5.4. 472 // https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0b5adf44cae99b3ebcc7 473 warnings = append(warnings, "Specifying a kernel memory limit is deprecated and will be removed in a future release.") 474 } 475 if resources.KernelMemory > 0 && !sysInfo.KernelMemory { 476 warnings = append(warnings, "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 477 resources.KernelMemory = 0 478 } 479 if resources.KernelMemory > 0 && resources.KernelMemory < linuxMinMemory { 480 return warnings, fmt.Errorf("Minimum kernel memory limit allowed is 4MB") 481 } 482 if resources.KernelMemory > 0 && !kernel.CheckKernelVersion(4, 0, 0) { 483 warnings = append(warnings, "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable.") 484 } 485 if resources.OomKillDisable != nil && !sysInfo.OomKillDisable { 486 // only produce warnings if the setting wasn't to *disable* the OOM Kill; no point 487 // warning the caller if they already wanted the feature to be off 488 if *resources.OomKillDisable { 489 warnings = append(warnings, "Your kernel does not support OomKillDisable. OomKillDisable discarded.") 490 } 491 resources.OomKillDisable = nil 492 } 493 if resources.OomKillDisable != nil && *resources.OomKillDisable && resources.Memory == 0 { 494 warnings = append(warnings, "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources.") 495 } 496 if resources.PidsLimit != nil && !sysInfo.PidsLimit { 497 if *resources.PidsLimit > 0 { 498 warnings = append(warnings, "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded.") 499 } 500 resources.PidsLimit = nil 501 } 502 503 // cpu subsystem checks and adjustments 504 if resources.NanoCPUs > 0 && resources.CPUPeriod > 0 { 505 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Period cannot both be set") 506 } 507 if resources.NanoCPUs > 0 && resources.CPUQuota > 0 { 508 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Quota cannot both be set") 509 } 510 if resources.NanoCPUs > 0 && !sysInfo.CPUCfs { 511 return warnings, fmt.Errorf("NanoCPUs can not be set, as your kernel does not support CPU CFS scheduler or the cgroup is not mounted") 512 } 513 // The highest precision we could get on Linux is 0.001, by setting 514 // cpu.cfs_period_us=1000ms 515 // cpu.cfs_quota=1ms 516 // See the following link for details: 517 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 518 // Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error. 519 // The error message is 0.01 so that this is consistent with Windows 520 if resources.NanoCPUs < 0 || resources.NanoCPUs > int64(sysinfo.NumCPU())*1e9 { 521 return warnings, fmt.Errorf("Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available", sysinfo.NumCPU(), sysinfo.NumCPU()) 522 } 523 524 if resources.CPUShares > 0 && !sysInfo.CPUShares { 525 warnings = append(warnings, "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded.") 526 resources.CPUShares = 0 527 } 528 if (resources.CPUPeriod != 0 || resources.CPUQuota != 0) && !sysInfo.CPUCfs { 529 warnings = append(warnings, "Your kernel does not support CPU CFS scheduler. CPU period/quota discarded.") 530 resources.CPUPeriod = 0 531 resources.CPUQuota = 0 532 } 533 if resources.CPUPeriod != 0 && (resources.CPUPeriod < 1000 || resources.CPUPeriod > 1000000) { 534 return warnings, fmt.Errorf("CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)") 535 } 536 if resources.CPUQuota > 0 && resources.CPUQuota < 1000 { 537 return warnings, fmt.Errorf("CPU cfs quota can not be less than 1ms (i.e. 1000)") 538 } 539 if resources.CPUPercent > 0 { 540 warnings = append(warnings, fmt.Sprintf("%s does not support CPU percent. Percent discarded.", runtime.GOOS)) 541 resources.CPUPercent = 0 542 } 543 544 // cpuset subsystem checks and adjustments 545 if (resources.CpusetCpus != "" || resources.CpusetMems != "") && !sysInfo.Cpuset { 546 warnings = append(warnings, "Your kernel does not support cpuset or the cgroup is not mounted. Cpuset discarded.") 547 resources.CpusetCpus = "" 548 resources.CpusetMems = "" 549 } 550 cpusAvailable, err := sysInfo.IsCpusetCpusAvailable(resources.CpusetCpus) 551 if err != nil { 552 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset cpus", resources.CpusetCpus) 553 } 554 if !cpusAvailable { 555 return warnings, fmt.Errorf("Requested CPUs are not available - requested %s, available: %s", resources.CpusetCpus, sysInfo.Cpus) 556 } 557 memsAvailable, err := sysInfo.IsCpusetMemsAvailable(resources.CpusetMems) 558 if err != nil { 559 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset mems", resources.CpusetMems) 560 } 561 if !memsAvailable { 562 return warnings, fmt.Errorf("Requested memory nodes are not available - requested %s, available: %s", resources.CpusetMems, sysInfo.Mems) 563 } 564 565 // blkio subsystem checks and adjustments 566 if resources.BlkioWeight > 0 && !sysInfo.BlkioWeight { 567 warnings = append(warnings, "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded.") 568 resources.BlkioWeight = 0 569 } 570 if resources.BlkioWeight > 0 && (resources.BlkioWeight < 10 || resources.BlkioWeight > 1000) { 571 return warnings, fmt.Errorf("Range of blkio weight is from 10 to 1000") 572 } 573 if resources.IOMaximumBandwidth != 0 || resources.IOMaximumIOps != 0 { 574 return warnings, fmt.Errorf("Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps", runtime.GOOS) 575 } 576 if len(resources.BlkioWeightDevice) > 0 && !sysInfo.BlkioWeightDevice { 577 warnings = append(warnings, "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded.") 578 resources.BlkioWeightDevice = []*pblkiodev.WeightDevice{} 579 } 580 if len(resources.BlkioDeviceReadBps) > 0 && !sysInfo.BlkioReadBpsDevice { 581 warnings = append(warnings, "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. Block I/O BPS read limit discarded.") 582 resources.BlkioDeviceReadBps = []*pblkiodev.ThrottleDevice{} 583 } 584 if len(resources.BlkioDeviceWriteBps) > 0 && !sysInfo.BlkioWriteBpsDevice { 585 warnings = append(warnings, "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded.") 586 resources.BlkioDeviceWriteBps = []*pblkiodev.ThrottleDevice{} 587 588 } 589 if len(resources.BlkioDeviceReadIOps) > 0 && !sysInfo.BlkioReadIOpsDevice { 590 warnings = append(warnings, "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded.") 591 resources.BlkioDeviceReadIOps = []*pblkiodev.ThrottleDevice{} 592 } 593 if len(resources.BlkioDeviceWriteIOps) > 0 && !sysInfo.BlkioWriteIOpsDevice { 594 warnings = append(warnings, "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded.") 595 resources.BlkioDeviceWriteIOps = []*pblkiodev.ThrottleDevice{} 596 } 597 598 return warnings, nil 599 } 600 601 func (daemon *Daemon) getCgroupDriver() string { 602 if UsingSystemd(daemon.configStore) { 603 return cgroupSystemdDriver 604 } 605 if daemon.Rootless() { 606 return cgroupNoneDriver 607 } 608 return cgroupFsDriver 609 } 610 611 // getCD gets the raw value of the native.cgroupdriver option, if set. 612 func getCD(config *config.Config) string { 613 for _, option := range config.ExecOptions { 614 key, val, err := parsers.ParseKeyValueOpt(option) 615 if err != nil || !strings.EqualFold(key, "native.cgroupdriver") { 616 continue 617 } 618 return val 619 } 620 return "" 621 } 622 623 // VerifyCgroupDriver validates native.cgroupdriver 624 func VerifyCgroupDriver(config *config.Config) error { 625 cd := getCD(config) 626 if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver { 627 return nil 628 } 629 if cd == cgroupNoneDriver { 630 return fmt.Errorf("native.cgroupdriver option %s is internally used and cannot be specified manually", cd) 631 } 632 return fmt.Errorf("native.cgroupdriver option %s not supported", cd) 633 } 634 635 // UsingSystemd returns true if cli option includes native.cgroupdriver=systemd 636 func UsingSystemd(config *config.Config) bool { 637 if getCD(config) == cgroupSystemdDriver { 638 return true 639 } 640 // On cgroup v2 hosts, default to systemd driver 641 if getCD(config) == "" && cgroups.Mode() == cgroups.Unified && IsRunningSystemd() { 642 return true 643 } 644 return false 645 } 646 647 // IsRunningSystemd is from https://github.com/opencontainers/runc/blob/46be7b612e2533c494e6a251111de46d8e286ed5/libcontainer/cgroups/systemd/common.go#L27-L33 648 func IsRunningSystemd() bool { 649 fi, err := os.Lstat("/run/systemd/system") 650 if err != nil { 651 return false 652 } 653 return fi.IsDir() 654 } 655 656 // verifyPlatformContainerSettings performs platform-specific validation of the 657 // hostconfig and config structures. 658 func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.HostConfig, update bool) (warnings []string, err error) { 659 if hostConfig == nil { 660 return nil, nil 661 } 662 sysInfo := daemon.RawSysInfo(true) 663 664 w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update) 665 666 // no matter err is nil or not, w could have data in itself. 667 warnings = append(warnings, w...) 668 669 if err != nil { 670 return warnings, err 671 } 672 673 if hostConfig.ShmSize < 0 { 674 return warnings, fmt.Errorf("SHM size can not be less than 0") 675 } 676 677 if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 { 678 return warnings, fmt.Errorf("Invalid value %d, range for oom score adj is [-1000, 1000]", hostConfig.OomScoreAdj) 679 } 680 681 // ip-forwarding does not affect container with '--net=host' (or '--net=none') 682 if sysInfo.IPv4ForwardingDisabled && !(hostConfig.NetworkMode.IsHost() || hostConfig.NetworkMode.IsNone()) { 683 warnings = append(warnings, "IPv4 forwarding is disabled. Networking will not work.") 684 } 685 if hostConfig.NetworkMode.IsHost() && len(hostConfig.PortBindings) > 0 { 686 warnings = append(warnings, "Published ports are discarded when using host network mode") 687 } 688 689 // check for various conflicting options with user namespaces 690 if daemon.configStore.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() { 691 if hostConfig.Privileged { 692 return warnings, fmt.Errorf("privileged mode is incompatible with user namespaces. You must run the container in the host namespace when running privileged mode") 693 } 694 if hostConfig.NetworkMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 695 return warnings, fmt.Errorf("cannot share the host's network namespace when user namespaces are enabled") 696 } 697 if hostConfig.PidMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 698 return warnings, fmt.Errorf("cannot share the host PID namespace when user namespaces are enabled") 699 } 700 } 701 if hostConfig.CgroupParent != "" && UsingSystemd(daemon.configStore) { 702 // CgroupParent for systemd cgroup should be named as "xxx.slice" 703 if len(hostConfig.CgroupParent) <= 6 || !strings.HasSuffix(hostConfig.CgroupParent, ".slice") { 704 return warnings, fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"") 705 } 706 } 707 if hostConfig.Runtime == "" { 708 hostConfig.Runtime = daemon.configStore.GetDefaultRuntimeName() 709 } 710 711 if rt := daemon.configStore.GetRuntime(hostConfig.Runtime); rt == nil { 712 return warnings, fmt.Errorf("Unknown runtime specified %s", hostConfig.Runtime) 713 } 714 715 parser := volumemounts.NewParser(runtime.GOOS) 716 for dest := range hostConfig.Tmpfs { 717 if err := parser.ValidateTmpfsMountDestination(dest); err != nil { 718 return warnings, err 719 } 720 } 721 722 if !hostConfig.CgroupnsMode.Valid() { 723 return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode) 724 } 725 if hostConfig.CgroupnsMode.IsPrivate() { 726 if !sysInfo.CgroupNamespaces { 727 warnings = append(warnings, "Your kernel does not support cgroup namespaces. Cgroup namespace setting discarded.") 728 } 729 } 730 731 if hostConfig.Runtime == config.LinuxV1RuntimeName || (hostConfig.Runtime == "" && daemon.configStore.DefaultRuntime == config.LinuxV1RuntimeName) { 732 warnings = append(warnings, fmt.Sprintf("Configured runtime %q is deprecated and will be removed in the next release.", config.LinuxV1RuntimeName)) 733 } 734 735 return warnings, nil 736 } 737 738 // verifyDaemonSettings performs validation of daemon config struct 739 func verifyDaemonSettings(conf *config.Config) error { 740 if conf.ContainerdNamespace == conf.ContainerdPluginNamespace { 741 return errors.New("containers namespace and plugins namespace cannot be the same") 742 } 743 // Check for mutually incompatible config options 744 if conf.BridgeConfig.Iface != "" && conf.BridgeConfig.IP != "" { 745 return fmt.Errorf("You specified -b & --bip, mutually exclusive options. Please specify only one") 746 } 747 if !conf.BridgeConfig.EnableIPTables && !conf.BridgeConfig.InterContainerCommunication { 748 return fmt.Errorf("You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true") 749 } 750 if conf.BridgeConfig.EnableIP6Tables && !conf.Experimental { 751 return fmt.Errorf("ip6tables rules are only available if experimental features are enabled") 752 } 753 if !conf.BridgeConfig.EnableIPTables && conf.BridgeConfig.EnableIPMasq { 754 conf.BridgeConfig.EnableIPMasq = false 755 } 756 if err := VerifyCgroupDriver(conf); err != nil { 757 return err 758 } 759 if conf.CgroupParent != "" && UsingSystemd(conf) { 760 if len(conf.CgroupParent) <= 6 || !strings.HasSuffix(conf.CgroupParent, ".slice") { 761 return fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"") 762 } 763 } 764 765 if conf.Rootless && UsingSystemd(conf) && cgroups.Mode() != cgroups.Unified { 766 return fmt.Errorf("exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode") 767 } 768 769 configureRuntimes(conf) 770 if rtName := conf.GetDefaultRuntimeName(); rtName != "" { 771 if conf.GetRuntime(rtName) == nil { 772 return fmt.Errorf("specified default runtime '%s' does not exist", rtName) 773 } 774 if rtName == config.LinuxV1RuntimeName { 775 logrus.Warnf("Configured default runtime %q is deprecated and will be removed in the next release.", config.LinuxV1RuntimeName) 776 } 777 } 778 return nil 779 } 780 781 // checkSystem validates platform-specific requirements 782 func checkSystem() error { 783 return checkKernel() 784 } 785 786 // configureMaxThreads sets the Go runtime max threads threshold 787 // which is 90% of the kernel setting from /proc/sys/kernel/threads-max 788 func configureMaxThreads(config *config.Config) error { 789 mt, err := ioutil.ReadFile("/proc/sys/kernel/threads-max") 790 if err != nil { 791 return err 792 } 793 mtint, err := strconv.Atoi(strings.TrimSpace(string(mt))) 794 if err != nil { 795 return err 796 } 797 maxThreads := (mtint / 100) * 90 798 debug.SetMaxThreads(maxThreads) 799 logrus.Debugf("Golang's threads limit set to %d", maxThreads) 800 return nil 801 } 802 803 func overlaySupportsSelinux() (bool, error) { 804 f, err := os.Open("/proc/kallsyms") 805 if err != nil { 806 if os.IsNotExist(err) { 807 return false, nil 808 } 809 return false, err 810 } 811 defer f.Close() 812 813 s := bufio.NewScanner(f) 814 for s.Scan() { 815 if strings.HasSuffix(s.Text(), " security_inode_copy_up") { 816 return true, nil 817 } 818 } 819 820 return false, s.Err() 821 } 822 823 // configureKernelSecuritySupport configures and validates security support for the kernel 824 func configureKernelSecuritySupport(config *config.Config, driverName string) error { 825 if config.EnableSelinuxSupport { 826 if !selinux.GetEnabled() { 827 logrus.Warn("Docker could not enable SELinux on the host system") 828 return nil 829 } 830 831 if driverName == "overlay" || driverName == "overlay2" { 832 // If driver is overlay or overlay2, make sure kernel 833 // supports selinux with overlay. 834 supported, err := overlaySupportsSelinux() 835 if err != nil { 836 return err 837 } 838 839 if !supported { 840 logrus.Warnf("SELinux is not supported with the %v graph driver on this kernel", driverName) 841 } 842 } 843 } else { 844 selinux.SetDisabled() 845 } 846 return nil 847 } 848 849 func (daemon *Daemon) initNetworkController(config *config.Config, activeSandboxes map[string]interface{}) (libnetwork.NetworkController, error) { 850 netOptions, err := daemon.networkOptions(config, daemon.PluginStore, activeSandboxes) 851 if err != nil { 852 return nil, err 853 } 854 855 controller, err := libnetwork.New(netOptions...) 856 if err != nil { 857 return nil, fmt.Errorf("error obtaining controller instance: %v", err) 858 } 859 860 if len(activeSandboxes) > 0 { 861 logrus.Info("There are old running containers, the network config will not take affect") 862 return controller, nil 863 } 864 865 // Initialize default network on "null" 866 if n, _ := controller.NetworkByName("none"); n == nil { 867 if _, err := controller.NewNetwork("null", "none", "", libnetwork.NetworkOptionPersist(true)); err != nil { 868 return nil, fmt.Errorf("Error creating default \"null\" network: %v", err) 869 } 870 } 871 872 // Initialize default network on "host" 873 if n, _ := controller.NetworkByName("host"); n == nil { 874 if _, err := controller.NewNetwork("host", "host", "", libnetwork.NetworkOptionPersist(true)); err != nil { 875 return nil, fmt.Errorf("Error creating default \"host\" network: %v", err) 876 } 877 } 878 879 // Clear stale bridge network 880 if n, err := controller.NetworkByName("bridge"); err == nil { 881 if err = n.Delete(); err != nil { 882 return nil, fmt.Errorf("could not delete the default bridge network: %v", err) 883 } 884 if len(config.NetworkConfig.DefaultAddressPools.Value()) > 0 && !daemon.configStore.LiveRestoreEnabled { 885 removeDefaultBridgeInterface() 886 } 887 } 888 889 if !config.DisableBridge { 890 // Initialize default driver "bridge" 891 if err := initBridgeDriver(controller, config); err != nil { 892 return nil, err 893 } 894 } else { 895 removeDefaultBridgeInterface() 896 } 897 898 // Set HostGatewayIP to the default bridge's IP if it is empty 899 if daemon.configStore.HostGatewayIP == nil && controller != nil { 900 if n, err := controller.NetworkByName("bridge"); err == nil { 901 v4Info, v6Info := n.Info().IpamInfo() 902 var gateway net.IP 903 if len(v4Info) > 0 { 904 gateway = v4Info[0].Gateway.IP 905 } else if len(v6Info) > 0 { 906 gateway = v6Info[0].Gateway.IP 907 } 908 daemon.configStore.HostGatewayIP = gateway 909 } 910 } 911 return controller, nil 912 } 913 914 func driverOptions(config *config.Config) []nwconfig.Option { 915 bridgeConfig := options.Generic{ 916 "EnableIPForwarding": config.BridgeConfig.EnableIPForward, 917 "EnableIPTables": config.BridgeConfig.EnableIPTables, 918 "EnableIP6Tables": config.BridgeConfig.EnableIP6Tables, 919 "EnableUserlandProxy": config.BridgeConfig.EnableUserlandProxy, 920 "UserlandProxyPath": config.BridgeConfig.UserlandProxyPath} 921 bridgeOption := options.Generic{netlabel.GenericData: bridgeConfig} 922 923 dOptions := []nwconfig.Option{} 924 dOptions = append(dOptions, nwconfig.OptionDriverConfig("bridge", bridgeOption)) 925 return dOptions 926 } 927 928 func initBridgeDriver(controller libnetwork.NetworkController, config *config.Config) error { 929 bridgeName := bridge.DefaultBridgeName 930 if config.BridgeConfig.Iface != "" { 931 bridgeName = config.BridgeConfig.Iface 932 } 933 netOption := map[string]string{ 934 bridge.BridgeName: bridgeName, 935 bridge.DefaultBridge: strconv.FormatBool(true), 936 netlabel.DriverMTU: strconv.Itoa(config.Mtu), 937 bridge.EnableIPMasquerade: strconv.FormatBool(config.BridgeConfig.EnableIPMasq), 938 bridge.EnableICC: strconv.FormatBool(config.BridgeConfig.InterContainerCommunication), 939 } 940 941 // --ip processing 942 if config.BridgeConfig.DefaultIP != nil { 943 netOption[bridge.DefaultBindingIP] = config.BridgeConfig.DefaultIP.String() 944 } 945 946 ipamV4Conf := &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 947 948 nwList, nw6List, err := netutils.ElectInterfaceAddresses(bridgeName) 949 if err != nil { 950 return errors.Wrap(err, "list bridge addresses failed") 951 } 952 953 nw := nwList[0] 954 if len(nwList) > 1 && config.BridgeConfig.FixedCIDR != "" { 955 _, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR) 956 if err != nil { 957 return errors.Wrap(err, "parse CIDR failed") 958 } 959 // Iterate through in case there are multiple addresses for the bridge 960 for _, entry := range nwList { 961 if fCIDR.Contains(entry.IP) { 962 nw = entry 963 break 964 } 965 } 966 } 967 968 ipamV4Conf.PreferredPool = lntypes.GetIPNetCanonical(nw).String() 969 hip, _ := lntypes.GetHostPartIP(nw.IP, nw.Mask) 970 if hip.IsGlobalUnicast() { 971 ipamV4Conf.Gateway = nw.IP.String() 972 } 973 974 if config.BridgeConfig.IP != "" { 975 ip, ipNet, err := net.ParseCIDR(config.BridgeConfig.IP) 976 if err != nil { 977 return err 978 } 979 ipamV4Conf.PreferredPool = ipNet.String() 980 ipamV4Conf.Gateway = ip.String() 981 } else if bridgeName == bridge.DefaultBridgeName && ipamV4Conf.PreferredPool != "" { 982 logrus.Infof("Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address", bridgeName, ipamV4Conf.PreferredPool) 983 } 984 985 if config.BridgeConfig.FixedCIDR != "" { 986 _, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR) 987 if err != nil { 988 return err 989 } 990 991 ipamV4Conf.SubPool = fCIDR.String() 992 } 993 994 if config.BridgeConfig.DefaultGatewayIPv4 != nil { 995 ipamV4Conf.AuxAddresses["DefaultGatewayIPv4"] = config.BridgeConfig.DefaultGatewayIPv4.String() 996 } 997 998 var ( 999 deferIPv6Alloc bool 1000 ipamV6Conf *libnetwork.IpamConf 1001 ) 1002 1003 if config.BridgeConfig.EnableIPv6 && config.BridgeConfig.FixedCIDRv6 == "" { 1004 return errdefs.InvalidParameter(errors.New("IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6")) 1005 } else if config.BridgeConfig.FixedCIDRv6 != "" { 1006 _, fCIDRv6, err := net.ParseCIDR(config.BridgeConfig.FixedCIDRv6) 1007 if err != nil { 1008 return err 1009 } 1010 1011 // In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has 1012 // at least 48 host bits, we need to guarantee the current behavior where the containers' 1013 // IPv6 addresses will be constructed based on the containers' interface MAC address. 1014 // We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints 1015 // on this network until after the driver has created the endpoint and returned the 1016 // constructed address. Libnetwork will then reserve this address with the ipam driver. 1017 ones, _ := fCIDRv6.Mask.Size() 1018 deferIPv6Alloc = ones <= 80 1019 1020 ipamV6Conf = &libnetwork.IpamConf{ 1021 AuxAddresses: make(map[string]string), 1022 PreferredPool: fCIDRv6.String(), 1023 } 1024 1025 // In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6 1026 // address belongs to the same network, we need to inform libnetwork about it, so 1027 // that it can be reserved with IPAM and it will not be given away to somebody else 1028 for _, nw6 := range nw6List { 1029 if fCIDRv6.Contains(nw6.IP) { 1030 ipamV6Conf.Gateway = nw6.IP.String() 1031 break 1032 } 1033 } 1034 } 1035 1036 if config.BridgeConfig.DefaultGatewayIPv6 != nil { 1037 if ipamV6Conf == nil { 1038 ipamV6Conf = &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 1039 } 1040 ipamV6Conf.AuxAddresses["DefaultGatewayIPv6"] = config.BridgeConfig.DefaultGatewayIPv6.String() 1041 } 1042 1043 v4Conf := []*libnetwork.IpamConf{ipamV4Conf} 1044 v6Conf := []*libnetwork.IpamConf{} 1045 if ipamV6Conf != nil { 1046 v6Conf = append(v6Conf, ipamV6Conf) 1047 } 1048 // Initialize default network on "bridge" with the same name 1049 _, err = controller.NewNetwork("bridge", "bridge", "", 1050 libnetwork.NetworkOptionEnableIPv6(config.BridgeConfig.EnableIPv6), 1051 libnetwork.NetworkOptionDriverOpts(netOption), 1052 libnetwork.NetworkOptionIpam("default", "", v4Conf, v6Conf, nil), 1053 libnetwork.NetworkOptionDeferIPv6Alloc(deferIPv6Alloc)) 1054 if err != nil { 1055 return fmt.Errorf("Error creating default \"bridge\" network: %v", err) 1056 } 1057 return nil 1058 } 1059 1060 // Remove default bridge interface if present (--bridge=none use case) 1061 func removeDefaultBridgeInterface() { 1062 if lnk, err := netlink.LinkByName(bridge.DefaultBridgeName); err == nil { 1063 if err := netlink.LinkDel(lnk); err != nil { 1064 logrus.Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err) 1065 } 1066 } 1067 } 1068 1069 func setupInitLayer(idMapping *idtools.IdentityMapping) func(containerfs.ContainerFS) error { 1070 return func(initPath containerfs.ContainerFS) error { 1071 return initlayer.Setup(initPath, idMapping.RootPair()) 1072 } 1073 } 1074 1075 // Parse the remapped root (user namespace) option, which can be one of: 1076 // username - valid username from /etc/passwd 1077 // username:groupname - valid username; valid groupname from /etc/group 1078 // uid - 32-bit unsigned int valid Linux UID value 1079 // uid:gid - uid value; 32-bit unsigned int Linux GID value 1080 // 1081 // If no groupname is specified, and a username is specified, an attempt 1082 // will be made to lookup a gid for that username as a groupname 1083 // 1084 // If names are used, they are verified to exist in passwd/group 1085 func parseRemappedRoot(usergrp string) (string, string, error) { 1086 1087 var ( 1088 userID, groupID int 1089 username, groupname string 1090 ) 1091 1092 idparts := strings.Split(usergrp, ":") 1093 if len(idparts) > 2 { 1094 return "", "", fmt.Errorf("Invalid user/group specification in --userns-remap: %q", usergrp) 1095 } 1096 1097 if uid, err := strconv.ParseInt(idparts[0], 10, 32); err == nil { 1098 // must be a uid; take it as valid 1099 userID = int(uid) 1100 luser, err := idtools.LookupUID(userID) 1101 if err != nil { 1102 return "", "", fmt.Errorf("Uid %d has no entry in /etc/passwd: %v", userID, err) 1103 } 1104 username = luser.Name 1105 if len(idparts) == 1 { 1106 // if the uid was numeric and no gid was specified, take the uid as the gid 1107 groupID = userID 1108 lgrp, err := idtools.LookupGID(groupID) 1109 if err != nil { 1110 return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err) 1111 } 1112 groupname = lgrp.Name 1113 } 1114 } else { 1115 lookupName := idparts[0] 1116 // special case: if the user specified "default", they want Docker to create or 1117 // use (after creation) the "dockremap" user/group for root remapping 1118 if lookupName == defaultIDSpecifier { 1119 lookupName = defaultRemappedID 1120 } 1121 luser, err := idtools.LookupUser(lookupName) 1122 if err != nil && idparts[0] != defaultIDSpecifier { 1123 // error if the name requested isn't the special "dockremap" ID 1124 return "", "", fmt.Errorf("Error during uid lookup for %q: %v", lookupName, err) 1125 } else if err != nil { 1126 // special case-- if the username == "default", then we have been asked 1127 // to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid} 1128 // ranges will be used for the user and group mappings in user namespaced containers 1129 _, _, err := idtools.AddNamespaceRangesUser(defaultRemappedID) 1130 if err == nil { 1131 return defaultRemappedID, defaultRemappedID, nil 1132 } 1133 return "", "", fmt.Errorf("Error during %q user creation: %v", defaultRemappedID, err) 1134 } 1135 username = luser.Name 1136 if len(idparts) == 1 { 1137 // we only have a string username, and no group specified; look up gid from username as group 1138 group, err := idtools.LookupGroup(lookupName) 1139 if err != nil { 1140 return "", "", fmt.Errorf("Error during gid lookup for %q: %v", lookupName, err) 1141 } 1142 groupname = group.Name 1143 } 1144 } 1145 1146 if len(idparts) == 2 { 1147 // groupname or gid is separately specified and must be resolved 1148 // to an unsigned 32-bit gid 1149 if gid, err := strconv.ParseInt(idparts[1], 10, 32); err == nil { 1150 // must be a gid, take it as valid 1151 groupID = int(gid) 1152 lgrp, err := idtools.LookupGID(groupID) 1153 if err != nil { 1154 return "", "", fmt.Errorf("Gid %d has no entry in /etc/passwd: %v", groupID, err) 1155 } 1156 groupname = lgrp.Name 1157 } else { 1158 // not a number; attempt a lookup 1159 if _, err := idtools.LookupGroup(idparts[1]); err != nil { 1160 return "", "", fmt.Errorf("Error during groupname lookup for %q: %v", idparts[1], err) 1161 } 1162 groupname = idparts[1] 1163 } 1164 } 1165 return username, groupname, nil 1166 } 1167 1168 func setupRemappedRoot(config *config.Config) (*idtools.IdentityMapping, error) { 1169 if runtime.GOOS != "linux" && config.RemappedRoot != "" { 1170 return nil, fmt.Errorf("User namespaces are only supported on Linux") 1171 } 1172 1173 // if the daemon was started with remapped root option, parse 1174 // the config option to the int uid,gid values 1175 if config.RemappedRoot != "" { 1176 username, groupname, err := parseRemappedRoot(config.RemappedRoot) 1177 if err != nil { 1178 return nil, err 1179 } 1180 if username == "root" { 1181 // Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op 1182 // effectively 1183 logrus.Warn("User namespaces: root cannot be remapped with itself; user namespaces are OFF") 1184 return &idtools.IdentityMapping{}, nil 1185 } 1186 logrus.Infof("User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s", username) 1187 // update remapped root setting now that we have resolved them to actual names 1188 config.RemappedRoot = fmt.Sprintf("%s:%s", username, groupname) 1189 1190 mappings, err := idtools.NewIdentityMapping(username) 1191 if err != nil { 1192 return nil, errors.Wrap(err, "Can't create ID mappings") 1193 } 1194 return mappings, nil 1195 } 1196 return &idtools.IdentityMapping{}, nil 1197 } 1198 1199 func setupDaemonRoot(config *config.Config, rootDir string, remappedRoot idtools.Identity) error { 1200 config.Root = rootDir 1201 // the docker root metadata directory needs to have execute permissions for all users (g+x,o+x) 1202 // so that syscalls executing as non-root, operating on subdirectories of the graph root 1203 // (e.g. mounted layers of a container) can traverse this path. 1204 // The user namespace support will create subdirectories for the remapped root host uid:gid 1205 // pair owned by that same uid:gid pair for proper write access to those needed metadata and 1206 // layer content subtrees. 1207 if _, err := os.Stat(rootDir); err == nil { 1208 // root current exists; verify the access bits are correct by setting them 1209 if err = os.Chmod(rootDir, 0711); err != nil { 1210 return err 1211 } 1212 } else if os.IsNotExist(err) { 1213 // no root exists yet, create it 0711 with root:root ownership 1214 if err := os.MkdirAll(rootDir, 0711); err != nil { 1215 return err 1216 } 1217 } 1218 1219 // if user namespaces are enabled we will create a subtree underneath the specified root 1220 // with any/all specified remapped root uid/gid options on the daemon creating 1221 // a new subdirectory with ownership set to the remapped uid/gid (so as to allow 1222 // `chdir()` to work for containers namespaced to that uid/gid) 1223 if config.RemappedRoot != "" { 1224 id := idtools.CurrentIdentity() 1225 // First make sure the current root dir has the correct perms. 1226 if err := idtools.MkdirAllAndChown(config.Root, 0701, id); err != nil { 1227 return errors.Wrapf(err, "could not create or set daemon root permissions: %s", config.Root) 1228 } 1229 1230 config.Root = filepath.Join(rootDir, fmt.Sprintf("%d.%d", remappedRoot.UID, remappedRoot.GID)) 1231 logrus.Debugf("Creating user namespaced daemon root: %s", config.Root) 1232 // Create the root directory if it doesn't exist 1233 if err := idtools.MkdirAllAndChown(config.Root, 0701, id); err != nil { 1234 return fmt.Errorf("Cannot create daemon root: %s: %v", config.Root, err) 1235 } 1236 // we also need to verify that any pre-existing directories in the path to 1237 // the graphroot won't block access to remapped root--if any pre-existing directory 1238 // has strict permissions that don't allow "x", container start will fail, so 1239 // better to warn and fail now 1240 dirPath := config.Root 1241 for { 1242 dirPath = filepath.Dir(dirPath) 1243 if dirPath == "/" { 1244 break 1245 } 1246 if !idtools.CanAccess(dirPath, remappedRoot) { 1247 return fmt.Errorf("a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' permissions on existing directories", config.Root) 1248 } 1249 } 1250 } 1251 1252 if err := setupDaemonRootPropagation(config); err != nil { 1253 logrus.WithError(err).WithField("dir", config.Root).Warn("Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior") 1254 } 1255 return nil 1256 } 1257 1258 func setupDaemonRootPropagation(cfg *config.Config) error { 1259 rootParentMount, mountOptions, err := getSourceMount(cfg.Root) 1260 if err != nil { 1261 return errors.Wrap(err, "error getting daemon root's parent mount") 1262 } 1263 1264 var cleanupOldFile bool 1265 cleanupFile := getUnmountOnShutdownPath(cfg) 1266 defer func() { 1267 if !cleanupOldFile { 1268 return 1269 } 1270 if err := os.Remove(cleanupFile); err != nil && !os.IsNotExist(err) { 1271 logrus.WithError(err).WithField("file", cleanupFile).Warn("could not clean up old root propagation unmount file") 1272 } 1273 }() 1274 1275 if hasMountInfoOption(mountOptions, sharedPropagationOption, slavePropagationOption) { 1276 cleanupOldFile = true 1277 return nil 1278 } 1279 1280 if err := mount.MakeShared(cfg.Root); err != nil { 1281 return errors.Wrap(err, "could not setup daemon root propagation to shared") 1282 } 1283 1284 // check the case where this may have already been a mount to itself. 1285 // If so then the daemon only performed a remount and should not try to unmount this later. 1286 if rootParentMount == cfg.Root { 1287 cleanupOldFile = true 1288 return nil 1289 } 1290 1291 if err := os.MkdirAll(filepath.Dir(cleanupFile), 0700); err != nil { 1292 return errors.Wrap(err, "error creating dir to store mount cleanup file") 1293 } 1294 1295 if err := ioutil.WriteFile(cleanupFile, nil, 0600); err != nil { 1296 return errors.Wrap(err, "error writing file to signal mount cleanup on shutdown") 1297 } 1298 return nil 1299 } 1300 1301 // getUnmountOnShutdownPath generates the path to used when writing the file that signals to the daemon that on shutdown 1302 // the daemon root should be unmounted. 1303 func getUnmountOnShutdownPath(config *config.Config) string { 1304 return filepath.Join(config.ExecRoot, "unmount-on-shutdown") 1305 } 1306 1307 // registerLinks writes the links to a file. 1308 func (daemon *Daemon) registerLinks(container *container.Container, hostConfig *containertypes.HostConfig) error { 1309 if hostConfig == nil || hostConfig.NetworkMode.IsUserDefined() { 1310 return nil 1311 } 1312 1313 for _, l := range hostConfig.Links { 1314 name, alias, err := opts.ParseLink(l) 1315 if err != nil { 1316 return err 1317 } 1318 child, err := daemon.GetContainer(name) 1319 if err != nil { 1320 if errdefs.IsNotFound(err) { 1321 // Trying to link to a non-existing container is not valid, and 1322 // should return an "invalid parameter" error. Returning a "not 1323 // found" error here would make the client report the container's 1324 // image could not be found (see moby/moby#39823) 1325 err = errdefs.InvalidParameter(err) 1326 } 1327 return errors.Wrapf(err, "could not get container for %s", name) 1328 } 1329 for child.HostConfig.NetworkMode.IsContainer() { 1330 parts := strings.SplitN(string(child.HostConfig.NetworkMode), ":", 2) 1331 child, err = daemon.GetContainer(parts[1]) 1332 if err != nil { 1333 if errdefs.IsNotFound(err) { 1334 // Trying to link to a non-existing container is not valid, and 1335 // should return an "invalid parameter" error. Returning a "not 1336 // found" error here would make the client report the container's 1337 // image could not be found (see moby/moby#39823) 1338 err = errdefs.InvalidParameter(err) 1339 } 1340 return errors.Wrapf(err, "Could not get container for %s", parts[1]) 1341 } 1342 } 1343 if child.HostConfig.NetworkMode.IsHost() { 1344 return runconfig.ErrConflictHostNetworkAndLinks 1345 } 1346 if err := daemon.registerLink(container, child, alias); err != nil { 1347 return err 1348 } 1349 } 1350 1351 // After we load all the links into the daemon 1352 // set them to nil on the hostconfig 1353 _, err := container.WriteHostConfig() 1354 return err 1355 } 1356 1357 // conditionalMountOnStart is a platform specific helper function during the 1358 // container start to call mount. 1359 func (daemon *Daemon) conditionalMountOnStart(container *container.Container) error { 1360 return daemon.Mount(container) 1361 } 1362 1363 // conditionalUnmountOnCleanup is a platform specific helper function called 1364 // during the cleanup of a container to unmount. 1365 func (daemon *Daemon) conditionalUnmountOnCleanup(container *container.Container) error { 1366 return daemon.Unmount(container) 1367 } 1368 1369 func copyBlkioEntry(entries []*statsV1.BlkIOEntry) []types.BlkioStatEntry { 1370 out := make([]types.BlkioStatEntry, len(entries)) 1371 for i, re := range entries { 1372 out[i] = types.BlkioStatEntry{ 1373 Major: re.Major, 1374 Minor: re.Minor, 1375 Op: re.Op, 1376 Value: re.Value, 1377 } 1378 } 1379 return out 1380 } 1381 1382 func (daemon *Daemon) stats(c *container.Container) (*types.StatsJSON, error) { 1383 if !c.IsRunning() { 1384 return nil, errNotRunning(c.ID) 1385 } 1386 cs, err := daemon.containerd.Stats(context.Background(), c.ID) 1387 if err != nil { 1388 if strings.Contains(err.Error(), "container not found") { 1389 return nil, containerNotFound(c.ID) 1390 } 1391 return nil, err 1392 } 1393 s := &types.StatsJSON{} 1394 s.Read = cs.Read 1395 stats := cs.Metrics 1396 switch t := stats.(type) { 1397 case *statsV1.Metrics: 1398 return daemon.statsV1(s, t) 1399 case *statsV2.Metrics: 1400 return daemon.statsV2(s, t) 1401 default: 1402 return nil, errors.Errorf("unexpected type of metrics %+v", t) 1403 } 1404 } 1405 1406 func (daemon *Daemon) statsV1(s *types.StatsJSON, stats *statsV1.Metrics) (*types.StatsJSON, error) { 1407 if stats.Blkio != nil { 1408 s.BlkioStats = types.BlkioStats{ 1409 IoServiceBytesRecursive: copyBlkioEntry(stats.Blkio.IoServiceBytesRecursive), 1410 IoServicedRecursive: copyBlkioEntry(stats.Blkio.IoServicedRecursive), 1411 IoQueuedRecursive: copyBlkioEntry(stats.Blkio.IoQueuedRecursive), 1412 IoServiceTimeRecursive: copyBlkioEntry(stats.Blkio.IoServiceTimeRecursive), 1413 IoWaitTimeRecursive: copyBlkioEntry(stats.Blkio.IoWaitTimeRecursive), 1414 IoMergedRecursive: copyBlkioEntry(stats.Blkio.IoMergedRecursive), 1415 IoTimeRecursive: copyBlkioEntry(stats.Blkio.IoTimeRecursive), 1416 SectorsRecursive: copyBlkioEntry(stats.Blkio.SectorsRecursive), 1417 } 1418 } 1419 if stats.CPU != nil { 1420 s.CPUStats = types.CPUStats{ 1421 CPUUsage: types.CPUUsage{ 1422 TotalUsage: stats.CPU.Usage.Total, 1423 PercpuUsage: stats.CPU.Usage.PerCPU, 1424 UsageInKernelmode: stats.CPU.Usage.Kernel, 1425 UsageInUsermode: stats.CPU.Usage.User, 1426 }, 1427 ThrottlingData: types.ThrottlingData{ 1428 Periods: stats.CPU.Throttling.Periods, 1429 ThrottledPeriods: stats.CPU.Throttling.ThrottledPeriods, 1430 ThrottledTime: stats.CPU.Throttling.ThrottledTime, 1431 }, 1432 } 1433 } 1434 1435 if stats.Memory != nil { 1436 raw := make(map[string]uint64) 1437 raw["cache"] = stats.Memory.Cache 1438 raw["rss"] = stats.Memory.RSS 1439 raw["rss_huge"] = stats.Memory.RSSHuge 1440 raw["mapped_file"] = stats.Memory.MappedFile 1441 raw["dirty"] = stats.Memory.Dirty 1442 raw["writeback"] = stats.Memory.Writeback 1443 raw["pgpgin"] = stats.Memory.PgPgIn 1444 raw["pgpgout"] = stats.Memory.PgPgOut 1445 raw["pgfault"] = stats.Memory.PgFault 1446 raw["pgmajfault"] = stats.Memory.PgMajFault 1447 raw["inactive_anon"] = stats.Memory.InactiveAnon 1448 raw["active_anon"] = stats.Memory.ActiveAnon 1449 raw["inactive_file"] = stats.Memory.InactiveFile 1450 raw["active_file"] = stats.Memory.ActiveFile 1451 raw["unevictable"] = stats.Memory.Unevictable 1452 raw["hierarchical_memory_limit"] = stats.Memory.HierarchicalMemoryLimit 1453 raw["hierarchical_memsw_limit"] = stats.Memory.HierarchicalSwapLimit 1454 raw["total_cache"] = stats.Memory.TotalCache 1455 raw["total_rss"] = stats.Memory.TotalRSS 1456 raw["total_rss_huge"] = stats.Memory.TotalRSSHuge 1457 raw["total_mapped_file"] = stats.Memory.TotalMappedFile 1458 raw["total_dirty"] = stats.Memory.TotalDirty 1459 raw["total_writeback"] = stats.Memory.TotalWriteback 1460 raw["total_pgpgin"] = stats.Memory.TotalPgPgIn 1461 raw["total_pgpgout"] = stats.Memory.TotalPgPgOut 1462 raw["total_pgfault"] = stats.Memory.TotalPgFault 1463 raw["total_pgmajfault"] = stats.Memory.TotalPgMajFault 1464 raw["total_inactive_anon"] = stats.Memory.TotalInactiveAnon 1465 raw["total_active_anon"] = stats.Memory.TotalActiveAnon 1466 raw["total_inactive_file"] = stats.Memory.TotalInactiveFile 1467 raw["total_active_file"] = stats.Memory.TotalActiveFile 1468 raw["total_unevictable"] = stats.Memory.TotalUnevictable 1469 1470 if stats.Memory.Usage != nil { 1471 s.MemoryStats = types.MemoryStats{ 1472 Stats: raw, 1473 Usage: stats.Memory.Usage.Usage, 1474 MaxUsage: stats.Memory.Usage.Max, 1475 Limit: stats.Memory.Usage.Limit, 1476 Failcnt: stats.Memory.Usage.Failcnt, 1477 } 1478 } else { 1479 s.MemoryStats = types.MemoryStats{ 1480 Stats: raw, 1481 } 1482 } 1483 1484 // if the container does not set memory limit, use the machineMemory 1485 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1486 s.MemoryStats.Limit = daemon.machineMemory 1487 } 1488 } 1489 1490 if stats.Pids != nil { 1491 s.PidsStats = types.PidsStats{ 1492 Current: stats.Pids.Current, 1493 Limit: stats.Pids.Limit, 1494 } 1495 } 1496 1497 return s, nil 1498 } 1499 1500 func (daemon *Daemon) statsV2(s *types.StatsJSON, stats *statsV2.Metrics) (*types.StatsJSON, error) { 1501 if stats.Io != nil { 1502 var isbr []types.BlkioStatEntry 1503 for _, re := range stats.Io.Usage { 1504 isbr = append(isbr, 1505 types.BlkioStatEntry{ 1506 Major: re.Major, 1507 Minor: re.Minor, 1508 Op: "read", 1509 Value: re.Rbytes, 1510 }, 1511 types.BlkioStatEntry{ 1512 Major: re.Major, 1513 Minor: re.Minor, 1514 Op: "write", 1515 Value: re.Wbytes, 1516 }, 1517 ) 1518 } 1519 s.BlkioStats = types.BlkioStats{ 1520 IoServiceBytesRecursive: isbr, 1521 // Other fields are unsupported 1522 } 1523 } 1524 1525 if stats.CPU != nil { 1526 s.CPUStats = types.CPUStats{ 1527 CPUUsage: types.CPUUsage{ 1528 TotalUsage: stats.CPU.UsageUsec * 1000, 1529 // PercpuUsage is not supported 1530 UsageInKernelmode: stats.CPU.SystemUsec * 1000, 1531 UsageInUsermode: stats.CPU.UserUsec * 1000, 1532 }, 1533 ThrottlingData: types.ThrottlingData{ 1534 Periods: stats.CPU.NrPeriods, 1535 ThrottledPeriods: stats.CPU.NrThrottled, 1536 ThrottledTime: stats.CPU.ThrottledUsec * 1000, 1537 }, 1538 } 1539 } 1540 1541 if stats.Memory != nil { 1542 raw := make(map[string]uint64) 1543 raw["anon"] = stats.Memory.Anon 1544 raw["file"] = stats.Memory.File 1545 raw["kernel_stack"] = stats.Memory.KernelStack 1546 raw["slab"] = stats.Memory.Slab 1547 raw["sock"] = stats.Memory.Sock 1548 raw["shmem"] = stats.Memory.Shmem 1549 raw["file_mapped"] = stats.Memory.FileMapped 1550 raw["file_dirty"] = stats.Memory.FileDirty 1551 raw["file_writeback"] = stats.Memory.FileWriteback 1552 raw["anon_thp"] = stats.Memory.AnonThp 1553 raw["inactive_anon"] = stats.Memory.InactiveAnon 1554 raw["active_anon"] = stats.Memory.ActiveAnon 1555 raw["inactive_file"] = stats.Memory.InactiveFile 1556 raw["active_file"] = stats.Memory.ActiveFile 1557 raw["unevictable"] = stats.Memory.Unevictable 1558 raw["slab_reclaimable"] = stats.Memory.SlabReclaimable 1559 raw["slab_unreclaimable"] = stats.Memory.SlabUnreclaimable 1560 raw["pgfault"] = stats.Memory.Pgfault 1561 raw["pgmajfault"] = stats.Memory.Pgmajfault 1562 raw["workingset_refault"] = stats.Memory.WorkingsetRefault 1563 raw["workingset_activate"] = stats.Memory.WorkingsetActivate 1564 raw["workingset_nodereclaim"] = stats.Memory.WorkingsetNodereclaim 1565 raw["pgrefill"] = stats.Memory.Pgrefill 1566 raw["pgscan"] = stats.Memory.Pgscan 1567 raw["pgsteal"] = stats.Memory.Pgsteal 1568 raw["pgactivate"] = stats.Memory.Pgactivate 1569 raw["pgdeactivate"] = stats.Memory.Pgdeactivate 1570 raw["pglazyfree"] = stats.Memory.Pglazyfree 1571 raw["pglazyfreed"] = stats.Memory.Pglazyfreed 1572 raw["thp_fault_alloc"] = stats.Memory.ThpFaultAlloc 1573 raw["thp_collapse_alloc"] = stats.Memory.ThpCollapseAlloc 1574 s.MemoryStats = types.MemoryStats{ 1575 // Stats is not compatible with v1 1576 Stats: raw, 1577 Usage: stats.Memory.Usage, 1578 // MaxUsage is not supported 1579 Limit: stats.Memory.UsageLimit, 1580 } 1581 // if the container does not set memory limit, use the machineMemory 1582 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1583 s.MemoryStats.Limit = daemon.machineMemory 1584 } 1585 if stats.MemoryEvents != nil { 1586 // Failcnt is set to the "oom" field of the "memory.events" file. 1587 // See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html 1588 s.MemoryStats.Failcnt = stats.MemoryEvents.Oom 1589 } 1590 } 1591 1592 if stats.Pids != nil { 1593 s.PidsStats = types.PidsStats{ 1594 Current: stats.Pids.Current, 1595 Limit: stats.Pids.Limit, 1596 } 1597 } 1598 1599 return s, nil 1600 } 1601 1602 // setDefaultIsolation determines the default isolation mode for the 1603 // daemon to run in. This is only applicable on Windows 1604 func (daemon *Daemon) setDefaultIsolation() error { 1605 return nil 1606 } 1607 1608 // setupDaemonProcess sets various settings for the daemon's process 1609 func setupDaemonProcess(config *config.Config) error { 1610 // setup the daemons oom_score_adj 1611 if err := setupOOMScoreAdj(config.OOMScoreAdjust); err != nil { 1612 return err 1613 } 1614 if err := setMayDetachMounts(); err != nil { 1615 logrus.WithError(err).Warn("Could not set may_detach_mounts kernel parameter") 1616 } 1617 return nil 1618 } 1619 1620 // This is used to allow removal of mountpoints that may be mounted in other 1621 // namespaces on RHEL based kernels starting from RHEL 7.4. 1622 // Without this setting, removals on these RHEL based kernels may fail with 1623 // "device or resource busy". 1624 // This setting is not available in upstream kernels as it is not configurable, 1625 // but has been in the upstream kernels since 3.15. 1626 func setMayDetachMounts() error { 1627 f, err := os.OpenFile("/proc/sys/fs/may_detach_mounts", os.O_WRONLY, 0) 1628 if err != nil { 1629 if os.IsNotExist(err) { 1630 return nil 1631 } 1632 return errors.Wrap(err, "error opening may_detach_mounts kernel config file") 1633 } 1634 defer f.Close() 1635 1636 _, err = f.WriteString("1") 1637 if os.IsPermission(err) { 1638 // Setting may_detach_mounts does not work in an 1639 // unprivileged container. Ignore the error, but log 1640 // it if we appear not to be in that situation. 1641 if !sys.RunningInUserNS() { 1642 logrus.Debugf("Permission denied writing %q to /proc/sys/fs/may_detach_mounts", "1") 1643 } 1644 return nil 1645 } 1646 return err 1647 } 1648 1649 func setupOOMScoreAdj(score int) error { 1650 if score == 0 { 1651 return nil 1652 } 1653 f, err := os.OpenFile("/proc/self/oom_score_adj", os.O_WRONLY, 0) 1654 if err != nil { 1655 return err 1656 } 1657 defer f.Close() 1658 stringScore := strconv.Itoa(score) 1659 _, err = f.WriteString(stringScore) 1660 if os.IsPermission(err) { 1661 // Setting oom_score_adj does not work in an 1662 // unprivileged container. Ignore the error, but log 1663 // it if we appear not to be in that situation. 1664 if !sys.RunningInUserNS() { 1665 logrus.Debugf("Permission denied writing %q to /proc/self/oom_score_adj", stringScore) 1666 } 1667 return nil 1668 } 1669 1670 return err 1671 } 1672 1673 func (daemon *Daemon) initCPURtController(mnt, path string) error { 1674 if path == "/" || path == "." { 1675 return nil 1676 } 1677 1678 // Recursively create cgroup to ensure that the system and all parent cgroups have values set 1679 // for the period and runtime as this limits what the children can be set to. 1680 if err := daemon.initCPURtController(mnt, filepath.Dir(path)); err != nil { 1681 return err 1682 } 1683 1684 path = filepath.Join(mnt, path) 1685 if err := os.MkdirAll(path, 0755); err != nil { 1686 return err 1687 } 1688 if err := maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil { 1689 return err 1690 } 1691 return maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimeRuntime, "cpu.rt_runtime_us", path) 1692 } 1693 1694 func maybeCreateCPURealTimeFile(configValue int64, file string, path string) error { 1695 if configValue == 0 { 1696 return nil 1697 } 1698 return ioutil.WriteFile(filepath.Join(path, file), []byte(strconv.FormatInt(configValue, 10)), 0700) 1699 } 1700 1701 func (daemon *Daemon) setupSeccompProfile() error { 1702 if daemon.configStore.SeccompProfile != "" { 1703 daemon.seccompProfilePath = daemon.configStore.SeccompProfile 1704 b, err := ioutil.ReadFile(daemon.configStore.SeccompProfile) 1705 if err != nil { 1706 return fmt.Errorf("opening seccomp profile (%s) failed: %v", daemon.configStore.SeccompProfile, err) 1707 } 1708 daemon.seccompProfile = b 1709 } 1710 return nil 1711 } 1712 1713 // RawSysInfo returns *sysinfo.SysInfo . 1714 func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo { 1715 var opts []sysinfo.Opt 1716 if daemon.getCgroupDriver() == cgroupSystemdDriver { 1717 rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID") 1718 if rootlesskitParentEUID != "" { 1719 groupPath := fmt.Sprintf("/user.slice/user-%s.slice", rootlesskitParentEUID) 1720 opts = append(opts, sysinfo.WithCgroup2GroupPath(groupPath)) 1721 } 1722 } 1723 return sysinfo.New(quiet, opts...) 1724 } 1725 1726 func recursiveUnmount(target string) error { 1727 return mount.RecursiveUnmount(target) 1728 }