github.com/docker/docker@v299999999.0.0-20200612211812-aaf470eca7b5+incompatible/daemon/daemon_unix.go (about) 1 // +build linux freebsd 2 3 package daemon // import "github.com/docker/docker/daemon" 4 5 import ( 6 "bufio" 7 "context" 8 "fmt" 9 "io/ioutil" 10 "net" 11 "os" 12 "path/filepath" 13 "runtime" 14 "runtime/debug" 15 "strconv" 16 "strings" 17 "time" 18 19 statsV1 "github.com/containerd/cgroups/stats/v1" 20 statsV2 "github.com/containerd/cgroups/v2/stats" 21 "github.com/docker/docker/api/types" 22 "github.com/docker/docker/api/types/blkiodev" 23 pblkiodev "github.com/docker/docker/api/types/blkiodev" 24 containertypes "github.com/docker/docker/api/types/container" 25 "github.com/docker/docker/container" 26 "github.com/docker/docker/daemon/config" 27 "github.com/docker/docker/daemon/initlayer" 28 "github.com/docker/docker/errdefs" 29 "github.com/docker/docker/opts" 30 "github.com/docker/docker/pkg/containerfs" 31 "github.com/docker/docker/pkg/idtools" 32 "github.com/docker/docker/pkg/ioutils" 33 "github.com/docker/docker/pkg/parsers" 34 "github.com/docker/docker/pkg/parsers/kernel" 35 "github.com/docker/docker/pkg/sysinfo" 36 "github.com/docker/docker/runconfig" 37 volumemounts "github.com/docker/docker/volume/mounts" 38 "github.com/docker/libnetwork" 39 nwconfig "github.com/docker/libnetwork/config" 40 "github.com/docker/libnetwork/drivers/bridge" 41 "github.com/docker/libnetwork/netlabel" 42 "github.com/docker/libnetwork/netutils" 43 "github.com/docker/libnetwork/options" 44 lntypes "github.com/docker/libnetwork/types" 45 "github.com/moby/sys/mount" 46 "github.com/opencontainers/runc/libcontainer/cgroups" 47 rsystem "github.com/opencontainers/runc/libcontainer/system" 48 specs "github.com/opencontainers/runtime-spec/specs-go" 49 "github.com/opencontainers/selinux/go-selinux/label" 50 "github.com/pkg/errors" 51 "github.com/sirupsen/logrus" 52 "github.com/vishvananda/netlink" 53 "golang.org/x/sys/unix" 54 ) 55 56 const ( 57 isWindows = false 58 59 // 
DefaultShimBinary is the default shim to be used by containerd if none 60 // is specified 61 DefaultShimBinary = "containerd-shim" 62 63 // DefaultRuntimeBinary is the default runtime to be used by 64 // containerd if none is specified 65 DefaultRuntimeBinary = "runc" 66 67 // See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269 68 linuxMinCPUShares = 2 69 linuxMaxCPUShares = 262144 70 platformSupported = true 71 // It's not kernel limit, we want this 4M limit to supply a reasonable functional container 72 linuxMinMemory = 4194304 73 // constants for remapped root settings 74 defaultIDSpecifier = "default" 75 defaultRemappedID = "dockremap" 76 77 // constant for cgroup drivers 78 cgroupFsDriver = "cgroupfs" 79 cgroupSystemdDriver = "systemd" 80 cgroupNoneDriver = "none" 81 82 // DefaultRuntimeName is the default runtime to be used by 83 // containerd if none is specified 84 DefaultRuntimeName = "runc" 85 ) 86 87 type containerGetter interface { 88 GetContainer(string) (*container.Container, error) 89 } 90 91 func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory { 92 memory := specs.LinuxMemory{} 93 94 if config.Memory > 0 { 95 memory.Limit = &config.Memory 96 } 97 98 if config.MemoryReservation > 0 { 99 memory.Reservation = &config.MemoryReservation 100 } 101 102 if config.MemorySwap > 0 { 103 memory.Swap = &config.MemorySwap 104 } 105 106 if config.MemorySwappiness != nil { 107 swappiness := uint64(*config.MemorySwappiness) 108 memory.Swappiness = &swappiness 109 } 110 111 if config.OomKillDisable != nil { 112 memory.DisableOOMKiller = config.OomKillDisable 113 } 114 115 if config.KernelMemory != 0 { 116 memory.Kernel = &config.KernelMemory 117 } 118 119 if config.KernelMemoryTCP != 0 { 120 memory.KernelTCP = &config.KernelMemoryTCP 121 } 122 123 return &memory 124 } 125 126 func getPidsLimit(config containertypes.Resources) *specs.LinuxPids { 127 if 
config.PidsLimit == nil { 128 return nil 129 } 130 if *config.PidsLimit <= 0 { 131 // docker API allows 0 and negative values to unset this to be consistent 132 // with default values. When updating values, runc requires -1 to unset 133 // the previous limit. 134 return &specs.LinuxPids{Limit: -1} 135 } 136 return &specs.LinuxPids{Limit: *config.PidsLimit} 137 } 138 139 func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { 140 cpu := specs.LinuxCPU{} 141 142 if config.CPUShares < 0 { 143 return nil, fmt.Errorf("shares: invalid argument") 144 } 145 if config.CPUShares >= 0 { 146 shares := uint64(config.CPUShares) 147 cpu.Shares = &shares 148 } 149 150 if config.CpusetCpus != "" { 151 cpu.Cpus = config.CpusetCpus 152 } 153 154 if config.CpusetMems != "" { 155 cpu.Mems = config.CpusetMems 156 } 157 158 if config.NanoCPUs > 0 { 159 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 160 period := uint64(100 * time.Millisecond / time.Microsecond) 161 quota := config.NanoCPUs * int64(period) / 1e9 162 cpu.Period = &period 163 cpu.Quota = "a 164 } 165 166 if config.CPUPeriod != 0 { 167 period := uint64(config.CPUPeriod) 168 cpu.Period = &period 169 } 170 171 if config.CPUQuota != 0 { 172 q := config.CPUQuota 173 cpu.Quota = &q 174 } 175 176 if config.CPURealtimePeriod != 0 { 177 period := uint64(config.CPURealtimePeriod) 178 cpu.RealtimePeriod = &period 179 } 180 181 if config.CPURealtimeRuntime != 0 { 182 c := config.CPURealtimeRuntime 183 cpu.RealtimeRuntime = &c 184 } 185 186 return &cpu, nil 187 } 188 189 func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) { 190 var stat unix.Stat_t 191 var blkioWeightDevices []specs.LinuxWeightDevice 192 193 for _, weightDevice := range config.BlkioWeightDevice { 194 if err := unix.Stat(weightDevice.Path, &stat); err != nil { 195 return nil, err 196 } 197 weight := weightDevice.Weight 198 d := specs.LinuxWeightDevice{Weight: &weight} 199 // The type 
is 32bit on mips. 200 d.Major = int64(unix.Major(uint64(stat.Rdev))) // nolint: unconvert 201 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) // nolint: unconvert 202 blkioWeightDevices = append(blkioWeightDevices, d) 203 } 204 205 return blkioWeightDevices, nil 206 } 207 208 func (daemon *Daemon) parseSecurityOpt(container *container.Container, hostConfig *containertypes.HostConfig) error { 209 container.NoNewPrivileges = daemon.configStore.NoNewPrivileges 210 return parseSecurityOpt(container, hostConfig) 211 } 212 213 func parseSecurityOpt(container *container.Container, config *containertypes.HostConfig) error { 214 var ( 215 labelOpts []string 216 err error 217 ) 218 219 for _, opt := range config.SecurityOpt { 220 if opt == "no-new-privileges" { 221 container.NoNewPrivileges = true 222 continue 223 } 224 if opt == "disable" { 225 labelOpts = append(labelOpts, "disable") 226 continue 227 } 228 229 var con []string 230 if strings.Contains(opt, "=") { 231 con = strings.SplitN(opt, "=", 2) 232 } else if strings.Contains(opt, ":") { 233 con = strings.SplitN(opt, ":", 2) 234 logrus.Warn("Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead.") 235 } 236 if len(con) != 2 { 237 return fmt.Errorf("invalid --security-opt 1: %q", opt) 238 } 239 240 switch con[0] { 241 case "label": 242 labelOpts = append(labelOpts, con[1]) 243 case "apparmor": 244 container.AppArmorProfile = con[1] 245 case "seccomp": 246 container.SeccompProfile = con[1] 247 case "no-new-privileges": 248 noNewPrivileges, err := strconv.ParseBool(con[1]) 249 if err != nil { 250 return fmt.Errorf("invalid --security-opt 2: %q", opt) 251 } 252 container.NoNewPrivileges = noNewPrivileges 253 default: 254 return fmt.Errorf("invalid --security-opt 2: %q", opt) 255 } 256 } 257 258 container.ProcessLabel, container.MountLabel, err = label.InitLabels(labelOpts) 259 return err 260 } 261 262 func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) 
([]specs.LinuxThrottleDevice, error) { 263 var throttleDevices []specs.LinuxThrottleDevice 264 var stat unix.Stat_t 265 266 for _, d := range devs { 267 if err := unix.Stat(d.Path, &stat); err != nil { 268 return nil, err 269 } 270 d := specs.LinuxThrottleDevice{Rate: d.Rate} 271 // the type is 32bit on mips 272 d.Major = int64(unix.Major(uint64(stat.Rdev))) // nolint: unconvert 273 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) // nolint: unconvert 274 throttleDevices = append(throttleDevices, d) 275 } 276 277 return throttleDevices, nil 278 } 279 280 // adjustParallelLimit takes a number of objects and a proposed limit and 281 // figures out if it's reasonable (and adjusts it accordingly). This is only 282 // used for daemon startup, which does a lot of parallel loading of containers 283 // (and if we exceed RLIMIT_NOFILE then we're in trouble). 284 func adjustParallelLimit(n int, limit int) int { 285 // Rule-of-thumb overhead factor (how many files will each goroutine open 286 // simultaneously). Yes, this is ugly but to be frank this whole thing is 287 // ugly. 288 const overhead = 2 289 290 // On Linux, we need to ensure that parallelStartupJobs doesn't cause us to 291 // exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it 292 // and give a warning (since in theory the user should increase their 293 // ulimits to the largest possible value for dockerd). 294 var rlim unix.Rlimit 295 if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil { 296 logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err) 297 return limit 298 } 299 softRlimit := int(rlim.Cur) 300 301 // Much fewer containers than RLIMIT_NOFILE. No need to adjust anything. 302 if softRlimit > overhead*n { 303 return limit 304 } 305 306 // RLIMIT_NOFILE big enough, no need to adjust anything. 
307 if softRlimit > overhead*limit { 308 return limit 309 } 310 311 logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit) 312 return softRlimit / overhead 313 } 314 315 func checkKernel() error { 316 // Check for unsupported kernel versions 317 // FIXME: it would be cleaner to not test for specific versions, but rather 318 // test for specific functionalities. 319 // Unfortunately we can't test for the feature "does not cause a kernel panic" 320 // without actually causing a kernel panic, so we need this workaround until 321 // the circumstances of pre-3.10 crashes are clearer. 322 // For details see https://github.com/docker/docker/issues/407 323 // Docker 1.11 and above doesn't actually run on kernels older than 3.4, 324 // due to containerd-shim usage of PR_SET_CHILD_SUBREAPER (introduced in 3.4). 325 if !kernel.CheckKernelVersion(3, 10, 0) { 326 v, _ := kernel.GetKernelVersion() 327 if os.Getenv("DOCKER_NOWARN_KERNEL_VERSION") == "" { 328 logrus.Fatalf("Your Linux kernel version %s is not supported for running docker. Please upgrade your kernel to 3.10.0 or newer.", v.String()) 329 } 330 } 331 return nil 332 } 333 334 // adaptContainerSettings is called during container creation to modify any 335 // settings necessary in the HostConfig structure. 
336 func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConfig, adjustCPUShares bool) error { 337 if adjustCPUShares && hostConfig.CPUShares > 0 { 338 // Handle unsupported CPUShares 339 if hostConfig.CPUShares < linuxMinCPUShares { 340 logrus.Warnf("Changing requested CPUShares of %d to minimum allowed of %d", hostConfig.CPUShares, linuxMinCPUShares) 341 hostConfig.CPUShares = linuxMinCPUShares 342 } else if hostConfig.CPUShares > linuxMaxCPUShares { 343 logrus.Warnf("Changing requested CPUShares of %d to maximum allowed of %d", hostConfig.CPUShares, linuxMaxCPUShares) 344 hostConfig.CPUShares = linuxMaxCPUShares 345 } 346 } 347 if hostConfig.Memory > 0 && hostConfig.MemorySwap == 0 { 348 // By default, MemorySwap is set to twice the size of Memory. 349 hostConfig.MemorySwap = hostConfig.Memory * 2 350 } 351 if hostConfig.ShmSize == 0 { 352 hostConfig.ShmSize = config.DefaultShmSize 353 if daemon.configStore != nil { 354 hostConfig.ShmSize = int64(daemon.configStore.ShmSize) 355 } 356 } 357 // Set default IPC mode, if unset for container 358 if hostConfig.IpcMode.IsEmpty() { 359 m := config.DefaultIpcMode 360 if daemon.configStore != nil { 361 m = daemon.configStore.IpcMode 362 } 363 hostConfig.IpcMode = containertypes.IpcMode(m) 364 } 365 366 // Set default cgroup namespace mode, if unset for container 367 if hostConfig.CgroupnsMode.IsEmpty() { 368 // for cgroup v2: unshare cgroupns even for privileged containers 369 // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 370 if hostConfig.Privileged && !cgroups.IsCgroup2UnifiedMode() { 371 hostConfig.CgroupnsMode = containertypes.CgroupnsMode("host") 372 } else { 373 m := "host" 374 if cgroups.IsCgroup2UnifiedMode() { 375 m = "private" 376 } 377 if daemon.configStore != nil { 378 m = daemon.configStore.CgroupNamespaceMode 379 } 380 hostConfig.CgroupnsMode = containertypes.CgroupnsMode(m) 381 } 382 } 383 384 adaptSharedNamespaceContainer(daemon, hostConfig) 385 386 var err 
error 387 secOpts, err := daemon.generateSecurityOpt(hostConfig) 388 if err != nil { 389 return err 390 } 391 hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, secOpts...) 392 if hostConfig.OomKillDisable == nil { 393 defaultOomKillDisable := false 394 hostConfig.OomKillDisable = &defaultOomKillDisable 395 } 396 397 return nil 398 } 399 400 // adaptSharedNamespaceContainer replaces container name with its ID in hostConfig. 401 // To be more precisely, it modifies `container:name` to `container:ID` of PidMode, IpcMode 402 // and NetworkMode. 403 // 404 // When a container shares its namespace with another container, use ID can keep the namespace 405 // sharing connection between the two containers even the another container is renamed. 406 func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) { 407 containerPrefix := "container:" 408 if hostConfig.PidMode.IsContainer() { 409 pidContainer := hostConfig.PidMode.Container() 410 // if there is any error returned here, we just ignore it and leave it to be 411 // handled in the following logic 412 if c, err := daemon.GetContainer(pidContainer); err == nil { 413 hostConfig.PidMode = containertypes.PidMode(containerPrefix + c.ID) 414 } 415 } 416 if hostConfig.IpcMode.IsContainer() { 417 ipcContainer := hostConfig.IpcMode.Container() 418 if c, err := daemon.GetContainer(ipcContainer); err == nil { 419 hostConfig.IpcMode = containertypes.IpcMode(containerPrefix + c.ID) 420 } 421 } 422 if hostConfig.NetworkMode.IsContainer() { 423 netContainer := hostConfig.NetworkMode.ConnectedContainer() 424 if c, err := daemon.GetContainer(netContainer); err == nil { 425 hostConfig.NetworkMode = containertypes.NetworkMode(containerPrefix + c.ID) 426 } 427 } 428 } 429 430 // verifyPlatformContainerResources performs platform-specific validation of the container's resource-configuration 431 func verifyPlatformContainerResources(resources *containertypes.Resources, sysInfo *sysinfo.SysInfo, 
update bool) (warnings []string, err error) { 432 fixMemorySwappiness(resources) 433 434 // memory subsystem checks and adjustments 435 if resources.Memory != 0 && resources.Memory < linuxMinMemory { 436 return warnings, fmt.Errorf("Minimum memory limit allowed is 4MB") 437 } 438 if resources.Memory > 0 && !sysInfo.MemoryLimit { 439 warnings = append(warnings, "Your kernel does not support memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 440 resources.Memory = 0 441 resources.MemorySwap = -1 442 } 443 if resources.Memory > 0 && resources.MemorySwap != -1 && !sysInfo.SwapLimit { 444 warnings = append(warnings, "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.") 445 resources.MemorySwap = -1 446 } 447 if resources.Memory > 0 && resources.MemorySwap > 0 && resources.MemorySwap < resources.Memory { 448 return warnings, fmt.Errorf("Minimum memoryswap limit should be larger than memory limit, see usage") 449 } 450 if resources.Memory == 0 && resources.MemorySwap > 0 && !update { 451 return warnings, fmt.Errorf("You should always set the Memory limit when using Memoryswap limit, see usage") 452 } 453 if resources.MemorySwappiness != nil && !sysInfo.MemorySwappiness { 454 warnings = append(warnings, "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded.") 455 resources.MemorySwappiness = nil 456 } 457 if resources.MemorySwappiness != nil { 458 swappiness := *resources.MemorySwappiness 459 if swappiness < 0 || swappiness > 100 { 460 return warnings, fmt.Errorf("Invalid value: %v, valid memory swappiness range is 0-100", swappiness) 461 } 462 } 463 if resources.MemoryReservation > 0 && !sysInfo.MemoryReservation { 464 warnings = append(warnings, "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. 
Limitation discarded.") 465 resources.MemoryReservation = 0 466 } 467 if resources.MemoryReservation > 0 && resources.MemoryReservation < linuxMinMemory { 468 return warnings, fmt.Errorf("Minimum memory reservation allowed is 4MB") 469 } 470 if resources.Memory > 0 && resources.MemoryReservation > 0 && resources.Memory < resources.MemoryReservation { 471 return warnings, fmt.Errorf("Minimum memory limit can not be less than memory reservation limit, see usage") 472 } 473 if resources.KernelMemory > 0 && !sysInfo.KernelMemory { 474 warnings = append(warnings, "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 475 resources.KernelMemory = 0 476 } 477 if resources.KernelMemory > 0 && resources.KernelMemory < linuxMinMemory { 478 return warnings, fmt.Errorf("Minimum kernel memory limit allowed is 4MB") 479 } 480 if resources.KernelMemory > 0 && !kernel.CheckKernelVersion(4, 0, 0) { 481 warnings = append(warnings, "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable.") 482 } 483 if resources.OomKillDisable != nil && !sysInfo.OomKillDisable { 484 // only produce warnings if the setting wasn't to *disable* the OOM Kill; no point 485 // warning the caller if they already wanted the feature to be off 486 if *resources.OomKillDisable { 487 warnings = append(warnings, "Your kernel does not support OomKillDisable. 
OomKillDisable discarded.") 488 } 489 resources.OomKillDisable = nil 490 } 491 if resources.OomKillDisable != nil && *resources.OomKillDisable && resources.Memory == 0 { 492 warnings = append(warnings, "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources.") 493 } 494 if resources.PidsLimit != nil && !sysInfo.PidsLimit { 495 if *resources.PidsLimit > 0 { 496 warnings = append(warnings, "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded.") 497 } 498 resources.PidsLimit = nil 499 } 500 501 // cpu subsystem checks and adjustments 502 if resources.NanoCPUs > 0 && resources.CPUPeriod > 0 { 503 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Period cannot both be set") 504 } 505 if resources.NanoCPUs > 0 && resources.CPUQuota > 0 { 506 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Quota cannot both be set") 507 } 508 if resources.NanoCPUs > 0 && (!sysInfo.CPUCfsPeriod || !sysInfo.CPUCfsQuota) { 509 return warnings, fmt.Errorf("NanoCPUs can not be set, as your kernel does not support CPU cfs period/quota or the cgroup is not mounted") 510 } 511 // The highest precision we could get on Linux is 0.001, by setting 512 // cpu.cfs_period_us=1000ms 513 // cpu.cfs_quota=1ms 514 // See the following link for details: 515 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 516 // Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error. 
517 // The error message is 0.01 so that this is consistent with Windows 518 if resources.NanoCPUs < 0 || resources.NanoCPUs > int64(sysinfo.NumCPU())*1e9 { 519 return warnings, fmt.Errorf("Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available", sysinfo.NumCPU(), sysinfo.NumCPU()) 520 } 521 522 if resources.CPUShares > 0 && !sysInfo.CPUShares { 523 warnings = append(warnings, "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded.") 524 resources.CPUShares = 0 525 } 526 if resources.CPUPeriod > 0 && !sysInfo.CPUCfsPeriod { 527 warnings = append(warnings, "Your kernel does not support CPU cfs period or the cgroup is not mounted. Period discarded.") 528 resources.CPUPeriod = 0 529 } 530 if resources.CPUPeriod != 0 && (resources.CPUPeriod < 1000 || resources.CPUPeriod > 1000000) { 531 return warnings, fmt.Errorf("CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)") 532 } 533 if resources.CPUQuota > 0 && !sysInfo.CPUCfsQuota { 534 warnings = append(warnings, "Your kernel does not support CPU cfs quota or the cgroup is not mounted. Quota discarded.") 535 resources.CPUQuota = 0 536 } 537 if resources.CPUQuota > 0 && resources.CPUQuota < 1000 { 538 return warnings, fmt.Errorf("CPU cfs quota can not be less than 1ms (i.e. 1000)") 539 } 540 if resources.CPUPercent > 0 { 541 warnings = append(warnings, fmt.Sprintf("%s does not support CPU percent. Percent discarded.", runtime.GOOS)) 542 resources.CPUPercent = 0 543 } 544 545 // cpuset subsystem checks and adjustments 546 if (resources.CpusetCpus != "" || resources.CpusetMems != "") && !sysInfo.Cpuset { 547 warnings = append(warnings, "Your kernel does not support cpuset or the cgroup is not mounted. 
Cpuset discarded.") 548 resources.CpusetCpus = "" 549 resources.CpusetMems = "" 550 } 551 cpusAvailable, err := sysInfo.IsCpusetCpusAvailable(resources.CpusetCpus) 552 if err != nil { 553 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset cpus", resources.CpusetCpus) 554 } 555 if !cpusAvailable { 556 return warnings, fmt.Errorf("Requested CPUs are not available - requested %s, available: %s", resources.CpusetCpus, sysInfo.Cpus) 557 } 558 memsAvailable, err := sysInfo.IsCpusetMemsAvailable(resources.CpusetMems) 559 if err != nil { 560 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset mems", resources.CpusetMems) 561 } 562 if !memsAvailable { 563 return warnings, fmt.Errorf("Requested memory nodes are not available - requested %s, available: %s", resources.CpusetMems, sysInfo.Mems) 564 } 565 566 // blkio subsystem checks and adjustments 567 if resources.BlkioWeight > 0 && !sysInfo.BlkioWeight { 568 warnings = append(warnings, "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded.") 569 resources.BlkioWeight = 0 570 } 571 if resources.BlkioWeight > 0 && (resources.BlkioWeight < 10 || resources.BlkioWeight > 1000) { 572 return warnings, fmt.Errorf("Range of blkio weight is from 10 to 1000") 573 } 574 if resources.IOMaximumBandwidth != 0 || resources.IOMaximumIOps != 0 { 575 return warnings, fmt.Errorf("Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps", runtime.GOOS) 576 } 577 if len(resources.BlkioWeightDevice) > 0 && !sysInfo.BlkioWeightDevice { 578 warnings = append(warnings, "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded.") 579 resources.BlkioWeightDevice = []*pblkiodev.WeightDevice{} 580 } 581 if len(resources.BlkioDeviceReadBps) > 0 && !sysInfo.BlkioReadBpsDevice { 582 warnings = append(warnings, "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. 
Block I/O BPS read limit discarded.") 583 resources.BlkioDeviceReadBps = []*pblkiodev.ThrottleDevice{} 584 } 585 if len(resources.BlkioDeviceWriteBps) > 0 && !sysInfo.BlkioWriteBpsDevice { 586 warnings = append(warnings, "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded.") 587 resources.BlkioDeviceWriteBps = []*pblkiodev.ThrottleDevice{} 588 589 } 590 if len(resources.BlkioDeviceReadIOps) > 0 && !sysInfo.BlkioReadIOpsDevice { 591 warnings = append(warnings, "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded.") 592 resources.BlkioDeviceReadIOps = []*pblkiodev.ThrottleDevice{} 593 } 594 if len(resources.BlkioDeviceWriteIOps) > 0 && !sysInfo.BlkioWriteIOpsDevice { 595 warnings = append(warnings, "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded.") 596 resources.BlkioDeviceWriteIOps = []*pblkiodev.ThrottleDevice{} 597 } 598 599 return warnings, nil 600 } 601 602 func (daemon *Daemon) getCgroupDriver() string { 603 if UsingSystemd(daemon.configStore) { 604 return cgroupSystemdDriver 605 } 606 if daemon.Rootless() { 607 return cgroupNoneDriver 608 } 609 return cgroupFsDriver 610 } 611 612 // getCD gets the raw value of the native.cgroupdriver option, if set. 
613 func getCD(config *config.Config) string { 614 for _, option := range config.ExecOptions { 615 key, val, err := parsers.ParseKeyValueOpt(option) 616 if err != nil || !strings.EqualFold(key, "native.cgroupdriver") { 617 continue 618 } 619 return val 620 } 621 return "" 622 } 623 624 // VerifyCgroupDriver validates native.cgroupdriver 625 func VerifyCgroupDriver(config *config.Config) error { 626 cd := getCD(config) 627 if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver { 628 return nil 629 } 630 if cd == cgroupNoneDriver { 631 return fmt.Errorf("native.cgroupdriver option %s is internally used and cannot be specified manually", cd) 632 } 633 return fmt.Errorf("native.cgroupdriver option %s not supported", cd) 634 } 635 636 // UsingSystemd returns true if cli option includes native.cgroupdriver=systemd 637 func UsingSystemd(config *config.Config) bool { 638 if getCD(config) == cgroupSystemdDriver { 639 return true 640 } 641 // On cgroup v2 hosts, default to systemd driver 642 if getCD(config) == "" && cgroups.IsCgroup2UnifiedMode() && IsRunningSystemd() { 643 return true 644 } 645 return false 646 } 647 648 // IsRunningSystemd is from https://github.com/opencontainers/runc/blob/46be7b612e2533c494e6a251111de46d8e286ed5/libcontainer/cgroups/systemd/common.go#L27-L33 649 func IsRunningSystemd() bool { 650 fi, err := os.Lstat("/run/systemd/system") 651 if err != nil { 652 return false 653 } 654 return fi.IsDir() 655 } 656 657 // verifyPlatformContainerSettings performs platform-specific validation of the 658 // hostconfig and config structures. 659 func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.HostConfig, update bool) (warnings []string, err error) { 660 if hostConfig == nil { 661 return nil, nil 662 } 663 sysInfo := daemon.RawSysInfo(true) 664 665 w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update) 666 667 // no matter err is nil or not, w could have data in itself. 
668 warnings = append(warnings, w...) 669 670 if err != nil { 671 return warnings, err 672 } 673 674 if hostConfig.ShmSize < 0 { 675 return warnings, fmt.Errorf("SHM size can not be less than 0") 676 } 677 678 if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 { 679 return warnings, fmt.Errorf("Invalid value %d, range for oom score adj is [-1000, 1000]", hostConfig.OomScoreAdj) 680 } 681 682 // ip-forwarding does not affect container with '--net=host' (or '--net=none') 683 if sysInfo.IPv4ForwardingDisabled && !(hostConfig.NetworkMode.IsHost() || hostConfig.NetworkMode.IsNone()) { 684 warnings = append(warnings, "IPv4 forwarding is disabled. Networking will not work.") 685 } 686 if hostConfig.NetworkMode.IsHost() && len(hostConfig.PortBindings) > 0 { 687 warnings = append(warnings, "Published ports are discarded when using host network mode") 688 } 689 690 // check for various conflicting options with user namespaces 691 if daemon.configStore.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() { 692 if hostConfig.Privileged { 693 return warnings, fmt.Errorf("privileged mode is incompatible with user namespaces. 
You must run the container in the host namespace when running privileged mode") 694 } 695 if hostConfig.NetworkMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 696 return warnings, fmt.Errorf("cannot share the host's network namespace when user namespaces are enabled") 697 } 698 if hostConfig.PidMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 699 return warnings, fmt.Errorf("cannot share the host PID namespace when user namespaces are enabled") 700 } 701 } 702 if hostConfig.CgroupParent != "" && UsingSystemd(daemon.configStore) { 703 // CgroupParent for systemd cgroup should be named as "xxx.slice" 704 if len(hostConfig.CgroupParent) <= 6 || !strings.HasSuffix(hostConfig.CgroupParent, ".slice") { 705 return warnings, fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"") 706 } 707 } 708 if hostConfig.Runtime == "" { 709 hostConfig.Runtime = daemon.configStore.GetDefaultRuntimeName() 710 } 711 712 if rt := daemon.configStore.GetRuntime(hostConfig.Runtime); rt == nil { 713 return warnings, fmt.Errorf("Unknown runtime specified %s", hostConfig.Runtime) 714 } 715 716 parser := volumemounts.NewParser(runtime.GOOS) 717 for dest := range hostConfig.Tmpfs { 718 if err := parser.ValidateTmpfsMountDestination(dest); err != nil { 719 return warnings, err 720 } 721 } 722 723 if !hostConfig.CgroupnsMode.Valid() { 724 return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode) 725 } 726 if hostConfig.CgroupnsMode.IsPrivate() { 727 if !sysInfo.CgroupNamespaces { 728 warnings = append(warnings, "Your kernel does not support cgroup namespaces. 
Cgroup namespace setting discarded.")
		}
	}

	return warnings, nil
}

// loadRuntimes (re)generates the runtime wrapper scripts for the runtimes
// listed in the daemon's current configuration.
func (daemon *Daemon) loadRuntimes() error {
	return daemon.initRuntimes(daemon.configStore.Runtimes)
}

// initRuntimes writes one shell wrapper script per runtime that has extra
// arguments configured, and then swaps the freshly generated directory into
// place as "<root>/runtimes".
//
// The scripts are first written into a temporary directory; the swap itself
// (rename current -> "-old", rename tmp -> current, remove "-old") happens in
// the deferred function so that it only runs when script generation succeeded.
// On any generation error the temporary directory is removed instead.
//
// err is a named result on purpose: the deferred function both reads it (to
// decide between cleanup and swap) and writes it (to report swap failures).
func (daemon *Daemon) initRuntimes(runtimes map[string]types.Runtime) (err error) {
	runtimeDir := filepath.Join(daemon.configStore.Root, "runtimes")
	oldRuntimeDir := runtimeDir + "-old"
	// Remove old temp directory if any
	os.RemoveAll(oldRuntimeDir)
	tmpDir, err := ioutils.TempDir(daemon.configStore.Root, "gen-runtimes")
	if err != nil {
		return errors.Wrap(err, "failed to get temp dir to generate runtime scripts")
	}
	defer func() {
		if err != nil {
			// Generation failed: drop the half-written temp dir and keep
			// whatever runtimes dir is currently in place.
			if err1 := os.RemoveAll(tmpDir); err1 != nil {
				logrus.WithError(err1).WithField("dir", tmpDir).
					Warn("failed to remove tmp dir")
			}
			return
		}

		if err = os.Rename(runtimeDir, oldRuntimeDir); err != nil {
			return
		}
		if err = os.Rename(tmpDir, runtimeDir); err != nil {
			err = errors.Wrap(err, "failed to setup runtimes dir, new containers may not start")
			return
		}
		// NOTE: err is the named result, so a failure here is logged and is
		// also what the caller receives.
		if err = os.RemoveAll(oldRuntimeDir); err != nil {
			// Report the directory that actually failed to be removed; tmpDir
			// no longer exists at this point (it was renamed to runtimeDir).
			logrus.WithError(err).WithField("dir", oldRuntimeDir).
				Warn("failed to remove old runtimes dir")
		}
	}()

	for name, rt := range runtimes {
		// Runtimes without extra args are invoked directly; no wrapper needed.
		if len(rt.Args) == 0 {
			continue
		}

		script := filepath.Join(tmpDir, name)
		content := fmt.Sprintf("#!/bin/sh\n%s %s $@\n", rt.Path, strings.Join(rt.Args, " "))
		if err := ioutil.WriteFile(script, []byte(content), 0700); err != nil {
			return err
		}
	}
	return nil
}

// verifyDaemonSettings performs validation of daemon config struct
//
// Besides rejecting incompatible option combinations it also normalises conf
// in place: IP masquerading is switched off when iptables is disabled, an
// empty DefaultRuntime is set to the stock runtime name, and the stock runtime
// entry is (re)registered in conf.Runtimes.
func verifyDaemonSettings(conf *config.Config) error {
	if conf.ContainerdNamespace == conf.ContainerdPluginNamespace {
		return errors.New("containers namespace and plugins namespace cannot be the same")
	}
	// Check for mutually incompatible config options
	if conf.BridgeConfig.Iface != "" && conf.BridgeConfig.IP != "" {
		return fmt.Errorf("You specified -b & --bip, mutually exclusive options. Please specify only one")
	}
	if !conf.BridgeConfig.EnableIPTables && !conf.BridgeConfig.InterContainerCommunication {
		return fmt.Errorf("You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true")
	}
	if !conf.BridgeConfig.EnableIPTables && conf.BridgeConfig.EnableIPMasq {
		conf.BridgeConfig.EnableIPMasq = false
	}
	if err := VerifyCgroupDriver(conf); err != nil {
		return err
	}
	if conf.CgroupParent != "" && UsingSystemd(conf) {
		// "<= 6" rejects a bare ".slice" (6 characters) with no name in front.
		if len(conf.CgroupParent) <= 6 || !strings.HasSuffix(conf.CgroupParent, ".slice") {
			return fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"")
		}
	}

	if conf.Rootless && UsingSystemd(conf) && !cgroups.IsCgroup2UnifiedMode() {
		return fmt.Errorf("exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode")
	}

	if conf.DefaultRuntime == "" {
		conf.DefaultRuntime = config.StockRuntimeName
	}
	if conf.Runtimes == nil {
		conf.Runtimes = make(map[string]types.Runtime)
	}
	conf.Runtimes[config.StockRuntimeName] = types.Runtime{Path: DefaultRuntimeName}

	return nil
}

// checkSystem validates platform-specific requirements
func checkSystem() error {
	return checkKernel()
}

// configureMaxThreads sets the Go runtime max threads threshold
// which is 90% of the kernel setting from /proc/sys/kernel/threads-max
//
// The config argument is not used by this implementation.
func configureMaxThreads(config *config.Config) error {
	mt, err := ioutil.ReadFile("/proc/sys/kernel/threads-max")
	if err != nil {
		return err
	}
	mtint, err := strconv.Atoi(strings.TrimSpace(string(mt)))
	if err != nil {
		return err
	}
	// Divide before multiplying: integer arithmetic, rounds down to a
	// multiple of 90.
	maxThreads := (mtint / 100) * 90
	debug.SetMaxThreads(maxThreads)
	logrus.Debugf("Golang's threads limit set to %d", maxThreads)
	return nil
}

// overlaySupportsSelinux reports whether /proc/kallsyms contains a line ending
// in " security_inode_copy_up". A missing /proc/kallsyms is reported as
// (false, nil); any other open or scan error is returned.
func overlaySupportsSelinux() (bool, error) {
	f, err := os.Open("/proc/kallsyms")
	if err != nil {
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	for s.Scan() {
		if strings.HasSuffix(s.Text(), " security_inode_copy_up") {
			return true, nil
		}
	}

	return false, s.Err()
}

// configureKernelSecuritySupport configures and validates security support for the kernel
//
// When SELinux support is requested but not available on the host, or is not
// supported with the overlay/overlay2 driver, only a warning is logged. When
// SELinux support is not requested it is explicitly disabled.
func configureKernelSecuritySupport(config *config.Config, driverName string) error {
	if config.EnableSelinuxSupport {
		if !selinuxEnabled() {
			logrus.Warn("Docker could not enable SELinux on the host system")
			return nil
		}

		if driverName == "overlay" || driverName == "overlay2" {
			// If driver is overlay or overlay2, make sure kernel
			// supports selinux with overlay.
			supported, err := overlaySupportsSelinux()
			if err != nil {
				return err
			}

			if !supported {
				logrus.Warnf("SELinux is not supported with the %v graph driver on this kernel", driverName)
			}
		}
	} else {
		selinuxSetDisabled()
	}
	return nil
}

// initNetworkController creates the libnetwork controller and, unless there
// are active sandboxes from previously running containers, (re)creates the
// default "none", "host" and "bridge" networks.
//
// If the daemon's HostGatewayIP is unset it is filled in from the gateway of
// the default bridge network (IPv4 preferred, IPv6 otherwise).
func (daemon *Daemon) initNetworkController(config *config.Config, activeSandboxes map[string]interface{}) (libnetwork.NetworkController, error) {
	netOptions, err := daemon.networkOptions(config, daemon.PluginStore, activeSandboxes)
	if err != nil {
		return nil, err
	}

	controller, err := libnetwork.New(netOptions...)
	if err != nil {
		return nil, fmt.Errorf("error obtaining controller instance: %v", err)
	}

	// With live containers still attached, leave the existing networks alone.
	if len(activeSandboxes) > 0 {
		logrus.Info("There are old running containers, the network config will not take affect")
		return controller, nil
	}

	// Initialize default network on "null"
	if n, _ := controller.NetworkByName("none"); n == nil {
		if _, err := controller.NewNetwork("null", "none", "", libnetwork.NetworkOptionPersist(true)); err != nil {
			return nil, fmt.Errorf("Error creating default \"null\" network: %v", err)
		}
	}

	// Initialize default network on "host"
	if n, _ := controller.NetworkByName("host"); n == nil {
		if _, err := controller.NewNetwork("host", "host", "", libnetwork.NetworkOptionPersist(true)); err != nil {
			return nil, fmt.Errorf("Error creating default \"host\" network: %v", err)
		}
	}

	// Clear stale bridge network
	if n, err := controller.NetworkByName("bridge"); err == nil {
		if err = n.Delete(); err != nil {
			return nil, fmt.Errorf("could not delete the default bridge network: %v", err)
		}
		if len(config.NetworkConfig.DefaultAddressPools.Value()) > 0 && !daemon.configStore.LiveRestoreEnabled {
			removeDefaultBridgeInterface()
		}
	}

	if !config.DisableBridge {
		// Initialize default driver "bridge"
		if err := initBridgeDriver(controller, config); err != nil {
			return nil, err
		}
	} else {
		removeDefaultBridgeInterface()
	}

	// Set HostGatewayIP to the default bridge's IP if it is empty
	if daemon.configStore.HostGatewayIP == nil && controller != nil {
		if n, err := controller.NetworkByName("bridge"); err == nil {
			v4Info, v6Info := n.Info().IpamInfo()
			var gateway net.IP
			if len(v4Info) > 0 {
				gateway = v4Info[0].Gateway.IP
			} else if len(v6Info) > 0 {
				gateway = v6Info[0].Gateway.IP
			}
			daemon.configStore.HostGatewayIP = gateway
		}
	}
	return controller, nil
}

955 func driverOptions(config *config.Config) []nwconfig.Option { 956 bridgeConfig := options.Generic{ 957 "EnableIPForwarding": config.BridgeConfig.EnableIPForward, 958 "EnableIPTables": config.BridgeConfig.EnableIPTables, 959 "EnableUserlandProxy": config.BridgeConfig.EnableUserlandProxy, 960 "UserlandProxyPath": config.BridgeConfig.UserlandProxyPath} 961 bridgeOption := options.Generic{netlabel.GenericData: bridgeConfig} 962 963 dOptions := []nwconfig.Option{} 964 dOptions = append(dOptions, nwconfig.OptionDriverConfig("bridge", bridgeOption)) 965 return dOptions 966 } 967 968 func initBridgeDriver(controller libnetwork.NetworkController, config *config.Config) error { 969 bridgeName := bridge.DefaultBridgeName 970 if config.BridgeConfig.Iface != "" { 971 bridgeName = config.BridgeConfig.Iface 972 } 973 netOption := map[string]string{ 974 bridge.BridgeName: bridgeName, 975 bridge.DefaultBridge: strconv.FormatBool(true), 976 netlabel.DriverMTU: strconv.Itoa(config.Mtu), 977 bridge.EnableIPMasquerade: strconv.FormatBool(config.BridgeConfig.EnableIPMasq), 978 bridge.EnableICC: strconv.FormatBool(config.BridgeConfig.InterContainerCommunication), 979 } 980 981 // --ip processing 982 if config.BridgeConfig.DefaultIP != nil { 983 netOption[bridge.DefaultBindingIP] = config.BridgeConfig.DefaultIP.String() 984 } 985 986 ipamV4Conf := &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 987 988 nwList, nw6List, err := netutils.ElectInterfaceAddresses(bridgeName) 989 if err != nil { 990 return errors.Wrap(err, "list bridge addresses failed") 991 } 992 993 nw := nwList[0] 994 if len(nwList) > 1 && config.BridgeConfig.FixedCIDR != "" { 995 _, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR) 996 if err != nil { 997 return errors.Wrap(err, "parse CIDR failed") 998 } 999 // Iterate through in case there are multiple addresses for the bridge 1000 for _, entry := range nwList { 1001 if fCIDR.Contains(entry.IP) { 1002 nw = entry 1003 break 1004 } 1005 } 1006 } 
1007 1008 ipamV4Conf.PreferredPool = lntypes.GetIPNetCanonical(nw).String() 1009 hip, _ := lntypes.GetHostPartIP(nw.IP, nw.Mask) 1010 if hip.IsGlobalUnicast() { 1011 ipamV4Conf.Gateway = nw.IP.String() 1012 } 1013 1014 if config.BridgeConfig.IP != "" { 1015 ip, ipNet, err := net.ParseCIDR(config.BridgeConfig.IP) 1016 if err != nil { 1017 return err 1018 } 1019 ipamV4Conf.PreferredPool = ipNet.String() 1020 ipamV4Conf.Gateway = ip.String() 1021 } else if bridgeName == bridge.DefaultBridgeName && ipamV4Conf.PreferredPool != "" { 1022 logrus.Infof("Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address", bridgeName, ipamV4Conf.PreferredPool) 1023 } 1024 1025 if config.BridgeConfig.FixedCIDR != "" { 1026 _, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR) 1027 if err != nil { 1028 return err 1029 } 1030 1031 ipamV4Conf.SubPool = fCIDR.String() 1032 } 1033 1034 if config.BridgeConfig.DefaultGatewayIPv4 != nil { 1035 ipamV4Conf.AuxAddresses["DefaultGatewayIPv4"] = config.BridgeConfig.DefaultGatewayIPv4.String() 1036 } 1037 1038 var ( 1039 deferIPv6Alloc bool 1040 ipamV6Conf *libnetwork.IpamConf 1041 ) 1042 1043 if config.BridgeConfig.EnableIPv6 && config.BridgeConfig.FixedCIDRv6 == "" { 1044 return errdefs.InvalidParameter(errors.New("IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6")) 1045 } else if config.BridgeConfig.FixedCIDRv6 != "" { 1046 _, fCIDRv6, err := net.ParseCIDR(config.BridgeConfig.FixedCIDRv6) 1047 if err != nil { 1048 return err 1049 } 1050 1051 // In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has 1052 // at least 48 host bits, we need to guarantee the current behavior where the containers' 1053 // IPv6 addresses will be constructed based on the containers' interface MAC address. 
1054 // We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints 1055 // on this network until after the driver has created the endpoint and returned the 1056 // constructed address. Libnetwork will then reserve this address with the ipam driver. 1057 ones, _ := fCIDRv6.Mask.Size() 1058 deferIPv6Alloc = ones <= 80 1059 1060 ipamV6Conf = &libnetwork.IpamConf{ 1061 AuxAddresses: make(map[string]string), 1062 PreferredPool: fCIDRv6.String(), 1063 } 1064 1065 // In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6 1066 // address belongs to the same network, we need to inform libnetwork about it, so 1067 // that it can be reserved with IPAM and it will not be given away to somebody else 1068 for _, nw6 := range nw6List { 1069 if fCIDRv6.Contains(nw6.IP) { 1070 ipamV6Conf.Gateway = nw6.IP.String() 1071 break 1072 } 1073 } 1074 } 1075 1076 if config.BridgeConfig.DefaultGatewayIPv6 != nil { 1077 if ipamV6Conf == nil { 1078 ipamV6Conf = &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 1079 } 1080 ipamV6Conf.AuxAddresses["DefaultGatewayIPv6"] = config.BridgeConfig.DefaultGatewayIPv6.String() 1081 } 1082 1083 v4Conf := []*libnetwork.IpamConf{ipamV4Conf} 1084 v6Conf := []*libnetwork.IpamConf{} 1085 if ipamV6Conf != nil { 1086 v6Conf = append(v6Conf, ipamV6Conf) 1087 } 1088 // Initialize default network on "bridge" with the same name 1089 _, err = controller.NewNetwork("bridge", "bridge", "", 1090 libnetwork.NetworkOptionEnableIPv6(config.BridgeConfig.EnableIPv6), 1091 libnetwork.NetworkOptionDriverOpts(netOption), 1092 libnetwork.NetworkOptionIpam("default", "", v4Conf, v6Conf, nil), 1093 libnetwork.NetworkOptionDeferIPv6Alloc(deferIPv6Alloc)) 1094 if err != nil { 1095 return fmt.Errorf("Error creating default \"bridge\" network: %v", err) 1096 } 1097 return nil 1098 } 1099 1100 // Remove default bridge interface if present (--bridge=none use case) 1101 func removeDefaultBridgeInterface() { 1102 if lnk, err := 
netlink.LinkByName(bridge.DefaultBridgeName); err == nil { 1103 if err := netlink.LinkDel(lnk); err != nil { 1104 logrus.Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err) 1105 } 1106 } 1107 } 1108 1109 func setupInitLayer(idMapping *idtools.IdentityMapping) func(containerfs.ContainerFS) error { 1110 return func(initPath containerfs.ContainerFS) error { 1111 return initlayer.Setup(initPath, idMapping.RootPair()) 1112 } 1113 } 1114 1115 // Parse the remapped root (user namespace) option, which can be one of: 1116 // username - valid username from /etc/passwd 1117 // username:groupname - valid username; valid groupname from /etc/group 1118 // uid - 32-bit unsigned int valid Linux UID value 1119 // uid:gid - uid value; 32-bit unsigned int Linux GID value 1120 // 1121 // If no groupname is specified, and a username is specified, an attempt 1122 // will be made to lookup a gid for that username as a groupname 1123 // 1124 // If names are used, they are verified to exist in passwd/group 1125 func parseRemappedRoot(usergrp string) (string, string, error) { 1126 1127 var ( 1128 userID, groupID int 1129 username, groupname string 1130 ) 1131 1132 idparts := strings.Split(usergrp, ":") 1133 if len(idparts) > 2 { 1134 return "", "", fmt.Errorf("Invalid user/group specification in --userns-remap: %q", usergrp) 1135 } 1136 1137 if uid, err := strconv.ParseInt(idparts[0], 10, 32); err == nil { 1138 // must be a uid; take it as valid 1139 userID = int(uid) 1140 luser, err := idtools.LookupUID(userID) 1141 if err != nil { 1142 return "", "", fmt.Errorf("Uid %d has no entry in /etc/passwd: %v", userID, err) 1143 } 1144 username = luser.Name 1145 if len(idparts) == 1 { 1146 // if the uid was numeric and no gid was specified, take the uid as the gid 1147 groupID = userID 1148 lgrp, err := idtools.LookupGID(groupID) 1149 if err != nil { 1150 return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err) 1151 } 1152 groupname = 
lgrp.Name 1153 } 1154 } else { 1155 lookupName := idparts[0] 1156 // special case: if the user specified "default", they want Docker to create or 1157 // use (after creation) the "dockremap" user/group for root remapping 1158 if lookupName == defaultIDSpecifier { 1159 lookupName = defaultRemappedID 1160 } 1161 luser, err := idtools.LookupUser(lookupName) 1162 if err != nil && idparts[0] != defaultIDSpecifier { 1163 // error if the name requested isn't the special "dockremap" ID 1164 return "", "", fmt.Errorf("Error during uid lookup for %q: %v", lookupName, err) 1165 } else if err != nil { 1166 // special case-- if the username == "default", then we have been asked 1167 // to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid} 1168 // ranges will be used for the user and group mappings in user namespaced containers 1169 _, _, err := idtools.AddNamespaceRangesUser(defaultRemappedID) 1170 if err == nil { 1171 return defaultRemappedID, defaultRemappedID, nil 1172 } 1173 return "", "", fmt.Errorf("Error during %q user creation: %v", defaultRemappedID, err) 1174 } 1175 username = luser.Name 1176 if len(idparts) == 1 { 1177 // we only have a string username, and no group specified; look up gid from username as group 1178 group, err := idtools.LookupGroup(lookupName) 1179 if err != nil { 1180 return "", "", fmt.Errorf("Error during gid lookup for %q: %v", lookupName, err) 1181 } 1182 groupname = group.Name 1183 } 1184 } 1185 1186 if len(idparts) == 2 { 1187 // groupname or gid is separately specified and must be resolved 1188 // to an unsigned 32-bit gid 1189 if gid, err := strconv.ParseInt(idparts[1], 10, 32); err == nil { 1190 // must be a gid, take it as valid 1191 groupID = int(gid) 1192 lgrp, err := idtools.LookupGID(groupID) 1193 if err != nil { 1194 return "", "", fmt.Errorf("Gid %d has no entry in /etc/passwd: %v", groupID, err) 1195 } 1196 groupname = lgrp.Name 1197 } else { 1198 // not a number; attempt a lookup 1199 if _, err := 
idtools.LookupGroup(idparts[1]); err != nil { 1200 return "", "", fmt.Errorf("Error during groupname lookup for %q: %v", idparts[1], err) 1201 } 1202 groupname = idparts[1] 1203 } 1204 } 1205 return username, groupname, nil 1206 } 1207 1208 func setupRemappedRoot(config *config.Config) (*idtools.IdentityMapping, error) { 1209 if runtime.GOOS != "linux" && config.RemappedRoot != "" { 1210 return nil, fmt.Errorf("User namespaces are only supported on Linux") 1211 } 1212 1213 // if the daemon was started with remapped root option, parse 1214 // the config option to the int uid,gid values 1215 if config.RemappedRoot != "" { 1216 username, groupname, err := parseRemappedRoot(config.RemappedRoot) 1217 if err != nil { 1218 return nil, err 1219 } 1220 if username == "root" { 1221 // Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op 1222 // effectively 1223 logrus.Warn("User namespaces: root cannot be remapped with itself; user namespaces are OFF") 1224 return &idtools.IdentityMapping{}, nil 1225 } 1226 logrus.Infof("User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s", username) 1227 // update remapped root setting now that we have resolved them to actual names 1228 config.RemappedRoot = fmt.Sprintf("%s:%s", username, groupname) 1229 1230 mappings, err := idtools.NewIdentityMapping(username) 1231 if err != nil { 1232 return nil, errors.Wrap(err, "Can't create ID mappings") 1233 } 1234 return mappings, nil 1235 } 1236 return &idtools.IdentityMapping{}, nil 1237 } 1238 1239 func setupDaemonRoot(config *config.Config, rootDir string, rootIdentity idtools.Identity) error { 1240 config.Root = rootDir 1241 // the docker root metadata directory needs to have execute permissions for all users (g+x,o+x) 1242 // so that syscalls executing as non-root, operating on subdirectories of the graph root 1243 // (e.g. mounted layers of a container) can traverse this path. 
1244 // The user namespace support will create subdirectories for the remapped root host uid:gid 1245 // pair owned by that same uid:gid pair for proper write access to those needed metadata and 1246 // layer content subtrees. 1247 if _, err := os.Stat(rootDir); err == nil { 1248 // root current exists; verify the access bits are correct by setting them 1249 if err = os.Chmod(rootDir, 0711); err != nil { 1250 return err 1251 } 1252 } else if os.IsNotExist(err) { 1253 // no root exists yet, create it 0711 with root:root ownership 1254 if err := os.MkdirAll(rootDir, 0711); err != nil { 1255 return err 1256 } 1257 } 1258 1259 // if user namespaces are enabled we will create a subtree underneath the specified root 1260 // with any/all specified remapped root uid/gid options on the daemon creating 1261 // a new subdirectory with ownership set to the remapped uid/gid (so as to allow 1262 // `chdir()` to work for containers namespaced to that uid/gid) 1263 if config.RemappedRoot != "" { 1264 config.Root = filepath.Join(rootDir, fmt.Sprintf("%d.%d", rootIdentity.UID, rootIdentity.GID)) 1265 logrus.Debugf("Creating user namespaced daemon root: %s", config.Root) 1266 // Create the root directory if it doesn't exist 1267 if err := idtools.MkdirAllAndChown(config.Root, 0700, rootIdentity); err != nil { 1268 return fmt.Errorf("Cannot create daemon root: %s: %v", config.Root, err) 1269 } 1270 // we also need to verify that any pre-existing directories in the path to 1271 // the graphroot won't block access to remapped root--if any pre-existing directory 1272 // has strict permissions that don't allow "x", container start will fail, so 1273 // better to warn and fail now 1274 dirPath := config.Root 1275 for { 1276 dirPath = filepath.Dir(dirPath) 1277 if dirPath == "/" { 1278 break 1279 } 1280 if !idtools.CanAccess(dirPath, rootIdentity) { 1281 return fmt.Errorf("a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' 
permissions on existing directories", config.Root) 1282 } 1283 } 1284 } 1285 1286 if err := setupDaemonRootPropagation(config); err != nil { 1287 logrus.WithError(err).WithField("dir", config.Root).Warn("Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior") 1288 } 1289 return nil 1290 } 1291 1292 func setupDaemonRootPropagation(cfg *config.Config) error { 1293 rootParentMount, mountOptions, err := getSourceMount(cfg.Root) 1294 if err != nil { 1295 return errors.Wrap(err, "error getting daemon root's parent mount") 1296 } 1297 1298 var cleanupOldFile bool 1299 cleanupFile := getUnmountOnShutdownPath(cfg) 1300 defer func() { 1301 if !cleanupOldFile { 1302 return 1303 } 1304 if err := os.Remove(cleanupFile); err != nil && !os.IsNotExist(err) { 1305 logrus.WithError(err).WithField("file", cleanupFile).Warn("could not clean up old root propagation unmount file") 1306 } 1307 }() 1308 1309 if hasMountInfoOption(mountOptions, sharedPropagationOption, slavePropagationOption) { 1310 cleanupOldFile = true 1311 return nil 1312 } 1313 1314 if err := mount.MakeShared(cfg.Root); err != nil { 1315 return errors.Wrap(err, "could not setup daemon root propagation to shared") 1316 } 1317 1318 // check the case where this may have already been a mount to itself. 1319 // If so then the daemon only performed a remount and should not try to unmount this later. 
1320 if rootParentMount == cfg.Root { 1321 cleanupOldFile = true 1322 return nil 1323 } 1324 1325 if err := os.MkdirAll(filepath.Dir(cleanupFile), 0700); err != nil { 1326 return errors.Wrap(err, "error creating dir to store mount cleanup file") 1327 } 1328 1329 if err := ioutil.WriteFile(cleanupFile, nil, 0600); err != nil { 1330 return errors.Wrap(err, "error writing file to signal mount cleanup on shutdown") 1331 } 1332 return nil 1333 } 1334 1335 // getUnmountOnShutdownPath generates the path to used when writing the file that signals to the daemon that on shutdown 1336 // the daemon root should be unmounted. 1337 func getUnmountOnShutdownPath(config *config.Config) string { 1338 return filepath.Join(config.ExecRoot, "unmount-on-shutdown") 1339 } 1340 1341 // registerLinks writes the links to a file. 1342 func (daemon *Daemon) registerLinks(container *container.Container, hostConfig *containertypes.HostConfig) error { 1343 if hostConfig == nil || hostConfig.NetworkMode.IsUserDefined() { 1344 return nil 1345 } 1346 1347 for _, l := range hostConfig.Links { 1348 name, alias, err := opts.ParseLink(l) 1349 if err != nil { 1350 return err 1351 } 1352 child, err := daemon.GetContainer(name) 1353 if err != nil { 1354 if errdefs.IsNotFound(err) { 1355 // Trying to link to a non-existing container is not valid, and 1356 // should return an "invalid parameter" error. 
Returning a "not 1357 // found" error here would make the client report the container's 1358 // image could not be found (see moby/moby#39823) 1359 err = errdefs.InvalidParameter(err) 1360 } 1361 return errors.Wrapf(err, "could not get container for %s", name) 1362 } 1363 for child.HostConfig.NetworkMode.IsContainer() { 1364 parts := strings.SplitN(string(child.HostConfig.NetworkMode), ":", 2) 1365 child, err = daemon.GetContainer(parts[1]) 1366 if err != nil { 1367 if errdefs.IsNotFound(err) { 1368 // Trying to link to a non-existing container is not valid, and 1369 // should return an "invalid parameter" error. Returning a "not 1370 // found" error here would make the client report the container's 1371 // image could not be found (see moby/moby#39823) 1372 err = errdefs.InvalidParameter(err) 1373 } 1374 return errors.Wrapf(err, "Could not get container for %s", parts[1]) 1375 } 1376 } 1377 if child.HostConfig.NetworkMode.IsHost() { 1378 return runconfig.ErrConflictHostNetworkAndLinks 1379 } 1380 if err := daemon.registerLink(container, child, alias); err != nil { 1381 return err 1382 } 1383 } 1384 1385 // After we load all the links into the daemon 1386 // set them to nil on the hostconfig 1387 _, err := container.WriteHostConfig() 1388 return err 1389 } 1390 1391 // conditionalMountOnStart is a platform specific helper function during the 1392 // container start to call mount. 1393 func (daemon *Daemon) conditionalMountOnStart(container *container.Container) error { 1394 return daemon.Mount(container) 1395 } 1396 1397 // conditionalUnmountOnCleanup is a platform specific helper function called 1398 // during the cleanup of a container to unmount. 
1399 func (daemon *Daemon) conditionalUnmountOnCleanup(container *container.Container) error { 1400 return daemon.Unmount(container) 1401 } 1402 1403 func copyBlkioEntry(entries []*statsV1.BlkIOEntry) []types.BlkioStatEntry { 1404 out := make([]types.BlkioStatEntry, len(entries)) 1405 for i, re := range entries { 1406 out[i] = types.BlkioStatEntry{ 1407 Major: re.Major, 1408 Minor: re.Minor, 1409 Op: re.Op, 1410 Value: re.Value, 1411 } 1412 } 1413 return out 1414 } 1415 1416 func (daemon *Daemon) stats(c *container.Container) (*types.StatsJSON, error) { 1417 if !c.IsRunning() { 1418 return nil, errNotRunning(c.ID) 1419 } 1420 cs, err := daemon.containerd.Stats(context.Background(), c.ID) 1421 if err != nil { 1422 if strings.Contains(err.Error(), "container not found") { 1423 return nil, containerNotFound(c.ID) 1424 } 1425 return nil, err 1426 } 1427 s := &types.StatsJSON{} 1428 s.Read = cs.Read 1429 stats := cs.Metrics 1430 switch t := stats.(type) { 1431 case *statsV1.Metrics: 1432 return daemon.statsV1(s, t) 1433 case *statsV2.Metrics: 1434 return daemon.statsV2(s, t) 1435 default: 1436 return nil, errors.Errorf("unexpected type of metrics %+v", t) 1437 } 1438 } 1439 1440 func (daemon *Daemon) statsV1(s *types.StatsJSON, stats *statsV1.Metrics) (*types.StatsJSON, error) { 1441 if stats.Blkio != nil { 1442 s.BlkioStats = types.BlkioStats{ 1443 IoServiceBytesRecursive: copyBlkioEntry(stats.Blkio.IoServiceBytesRecursive), 1444 IoServicedRecursive: copyBlkioEntry(stats.Blkio.IoServicedRecursive), 1445 IoQueuedRecursive: copyBlkioEntry(stats.Blkio.IoQueuedRecursive), 1446 IoServiceTimeRecursive: copyBlkioEntry(stats.Blkio.IoServiceTimeRecursive), 1447 IoWaitTimeRecursive: copyBlkioEntry(stats.Blkio.IoWaitTimeRecursive), 1448 IoMergedRecursive: copyBlkioEntry(stats.Blkio.IoMergedRecursive), 1449 IoTimeRecursive: copyBlkioEntry(stats.Blkio.IoTimeRecursive), 1450 SectorsRecursive: copyBlkioEntry(stats.Blkio.SectorsRecursive), 1451 } 1452 } 1453 if stats.CPU != nil { 1454 
s.CPUStats = types.CPUStats{ 1455 CPUUsage: types.CPUUsage{ 1456 TotalUsage: stats.CPU.Usage.Total, 1457 PercpuUsage: stats.CPU.Usage.PerCPU, 1458 UsageInKernelmode: stats.CPU.Usage.Kernel, 1459 UsageInUsermode: stats.CPU.Usage.User, 1460 }, 1461 ThrottlingData: types.ThrottlingData{ 1462 Periods: stats.CPU.Throttling.Periods, 1463 ThrottledPeriods: stats.CPU.Throttling.ThrottledPeriods, 1464 ThrottledTime: stats.CPU.Throttling.ThrottledTime, 1465 }, 1466 } 1467 } 1468 1469 if stats.Memory != nil { 1470 raw := make(map[string]uint64) 1471 raw["cache"] = stats.Memory.Cache 1472 raw["rss"] = stats.Memory.RSS 1473 raw["rss_huge"] = stats.Memory.RSSHuge 1474 raw["mapped_file"] = stats.Memory.MappedFile 1475 raw["dirty"] = stats.Memory.Dirty 1476 raw["writeback"] = stats.Memory.Writeback 1477 raw["pgpgin"] = stats.Memory.PgPgIn 1478 raw["pgpgout"] = stats.Memory.PgPgOut 1479 raw["pgfault"] = stats.Memory.PgFault 1480 raw["pgmajfault"] = stats.Memory.PgMajFault 1481 raw["inactive_anon"] = stats.Memory.InactiveAnon 1482 raw["active_anon"] = stats.Memory.ActiveAnon 1483 raw["inactive_file"] = stats.Memory.InactiveFile 1484 raw["active_file"] = stats.Memory.ActiveFile 1485 raw["unevictable"] = stats.Memory.Unevictable 1486 raw["hierarchical_memory_limit"] = stats.Memory.HierarchicalMemoryLimit 1487 raw["hierarchical_memsw_limit"] = stats.Memory.HierarchicalSwapLimit 1488 raw["total_cache"] = stats.Memory.TotalCache 1489 raw["total_rss"] = stats.Memory.TotalRSS 1490 raw["total_rss_huge"] = stats.Memory.TotalRSSHuge 1491 raw["total_mapped_file"] = stats.Memory.TotalMappedFile 1492 raw["total_dirty"] = stats.Memory.TotalDirty 1493 raw["total_writeback"] = stats.Memory.TotalWriteback 1494 raw["total_pgpgin"] = stats.Memory.TotalPgPgIn 1495 raw["total_pgpgout"] = stats.Memory.TotalPgPgOut 1496 raw["total_pgfault"] = stats.Memory.TotalPgFault 1497 raw["total_pgmajfault"] = stats.Memory.TotalPgMajFault 1498 raw["total_inactive_anon"] = stats.Memory.TotalInactiveAnon 1499 
raw["total_active_anon"] = stats.Memory.TotalActiveAnon 1500 raw["total_inactive_file"] = stats.Memory.TotalInactiveFile 1501 raw["total_active_file"] = stats.Memory.TotalActiveFile 1502 raw["total_unevictable"] = stats.Memory.TotalUnevictable 1503 1504 if stats.Memory.Usage != nil { 1505 s.MemoryStats = types.MemoryStats{ 1506 Stats: raw, 1507 Usage: stats.Memory.Usage.Usage, 1508 MaxUsage: stats.Memory.Usage.Max, 1509 Limit: stats.Memory.Usage.Limit, 1510 Failcnt: stats.Memory.Usage.Failcnt, 1511 } 1512 } else { 1513 s.MemoryStats = types.MemoryStats{ 1514 Stats: raw, 1515 } 1516 } 1517 1518 // if the container does not set memory limit, use the machineMemory 1519 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1520 s.MemoryStats.Limit = daemon.machineMemory 1521 } 1522 } 1523 1524 if stats.Pids != nil { 1525 s.PidsStats = types.PidsStats{ 1526 Current: stats.Pids.Current, 1527 Limit: stats.Pids.Limit, 1528 } 1529 } 1530 1531 return s, nil 1532 } 1533 1534 func (daemon *Daemon) statsV2(s *types.StatsJSON, stats *statsV2.Metrics) (*types.StatsJSON, error) { 1535 if stats.Io != nil { 1536 var isbr []types.BlkioStatEntry 1537 for _, re := range stats.Io.Usage { 1538 isbr = append(isbr, 1539 types.BlkioStatEntry{ 1540 Major: re.Major, 1541 Minor: re.Minor, 1542 Op: "read", 1543 Value: re.Rbytes, 1544 }, 1545 types.BlkioStatEntry{ 1546 Major: re.Major, 1547 Minor: re.Minor, 1548 Op: "write", 1549 Value: re.Wbytes, 1550 }, 1551 ) 1552 } 1553 s.BlkioStats = types.BlkioStats{ 1554 IoServiceBytesRecursive: isbr, 1555 // Other fields are unsupported 1556 } 1557 } 1558 1559 if stats.CPU != nil { 1560 s.CPUStats = types.CPUStats{ 1561 CPUUsage: types.CPUUsage{ 1562 TotalUsage: stats.CPU.UsageUsec * 1000, 1563 // PercpuUsage is not supported 1564 UsageInKernelmode: stats.CPU.SystemUsec * 1000, 1565 UsageInUsermode: stats.CPU.UserUsec * 1000, 1566 }, 1567 ThrottlingData: types.ThrottlingData{ 1568 Periods: stats.CPU.NrPeriods, 1569 ThrottledPeriods: 
stats.CPU.NrThrottled, 1570 ThrottledTime: stats.CPU.ThrottledUsec * 1000, 1571 }, 1572 } 1573 } 1574 1575 if stats.Memory != nil { 1576 raw := make(map[string]uint64) 1577 raw["anon"] = stats.Memory.Anon 1578 raw["file"] = stats.Memory.File 1579 raw["kernel_stack"] = stats.Memory.KernelStack 1580 raw["slab"] = stats.Memory.Slab 1581 raw["sock"] = stats.Memory.Sock 1582 raw["shmem"] = stats.Memory.Shmem 1583 raw["file_mapped"] = stats.Memory.FileMapped 1584 raw["file_dirty"] = stats.Memory.FileDirty 1585 raw["file_writeback"] = stats.Memory.FileWriteback 1586 raw["anon_thp"] = stats.Memory.AnonThp 1587 raw["inactive_anon"] = stats.Memory.InactiveAnon 1588 raw["active_anon"] = stats.Memory.ActiveAnon 1589 raw["inactive_file"] = stats.Memory.InactiveFile 1590 raw["active_file"] = stats.Memory.ActiveFile 1591 raw["unevictable"] = stats.Memory.Unevictable 1592 raw["slab_reclaimable"] = stats.Memory.SlabReclaimable 1593 raw["slab_unreclaimable"] = stats.Memory.SlabUnreclaimable 1594 raw["pgfault"] = stats.Memory.Pgfault 1595 raw["pgmajfault"] = stats.Memory.Pgmajfault 1596 raw["workingset_refault"] = stats.Memory.WorkingsetRefault 1597 raw["workingset_activate"] = stats.Memory.WorkingsetActivate 1598 raw["workingset_nodereclaim"] = stats.Memory.WorkingsetNodereclaim 1599 raw["pgrefill"] = stats.Memory.Pgrefill 1600 raw["pgscan"] = stats.Memory.Pgscan 1601 raw["pgsteal"] = stats.Memory.Pgsteal 1602 raw["pgactivate"] = stats.Memory.Pgactivate 1603 raw["pgdeactivate"] = stats.Memory.Pgdeactivate 1604 raw["pglazyfree"] = stats.Memory.Pglazyfree 1605 raw["pglazyfreed"] = stats.Memory.Pglazyfreed 1606 raw["thp_fault_alloc"] = stats.Memory.ThpFaultAlloc 1607 raw["thp_collapse_alloc"] = stats.Memory.ThpCollapseAlloc 1608 s.MemoryStats = types.MemoryStats{ 1609 // Stats is not compatible with v1 1610 Stats: raw, 1611 Usage: stats.Memory.Usage, 1612 // MaxUsage is not supported 1613 Limit: stats.Memory.UsageLimit, 1614 // TODO: Failcnt 1615 } 1616 // if the container does not set 
memory limit, use the machineMemory 1617 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1618 s.MemoryStats.Limit = daemon.machineMemory 1619 } 1620 } 1621 1622 if stats.Pids != nil { 1623 s.PidsStats = types.PidsStats{ 1624 Current: stats.Pids.Current, 1625 Limit: stats.Pids.Limit, 1626 } 1627 } 1628 1629 return s, nil 1630 } 1631 1632 // setDefaultIsolation determines the default isolation mode for the 1633 // daemon to run in. This is only applicable on Windows 1634 func (daemon *Daemon) setDefaultIsolation() error { 1635 return nil 1636 } 1637 1638 // setupDaemonProcess sets various settings for the daemon's process 1639 func setupDaemonProcess(config *config.Config) error { 1640 // setup the daemons oom_score_adj 1641 if err := setupOOMScoreAdj(config.OOMScoreAdjust); err != nil { 1642 return err 1643 } 1644 if err := setMayDetachMounts(); err != nil { 1645 logrus.WithError(err).Warn("Could not set may_detach_mounts kernel parameter") 1646 } 1647 return nil 1648 } 1649 1650 // This is used to allow removal of mountpoints that may be mounted in other 1651 // namespaces on RHEL based kernels starting from RHEL 7.4. 1652 // Without this setting, removals on these RHEL based kernels may fail with 1653 // "device or resource busy". 1654 // This setting is not available in upstream kernels as it is not configurable, 1655 // but has been in the upstream kernels since 3.15. 1656 func setMayDetachMounts() error { 1657 f, err := os.OpenFile("/proc/sys/fs/may_detach_mounts", os.O_WRONLY, 0) 1658 if err != nil { 1659 if os.IsNotExist(err) { 1660 return nil 1661 } 1662 return errors.Wrap(err, "error opening may_detach_mounts kernel config file") 1663 } 1664 defer f.Close() 1665 1666 _, err = f.WriteString("1") 1667 if os.IsPermission(err) { 1668 // Setting may_detach_mounts does not work in an 1669 // unprivileged container. Ignore the error, but log 1670 // it if we appear not to be in that situation. 
1671 if !rsystem.RunningInUserNS() { 1672 logrus.Debugf("Permission denied writing %q to /proc/sys/fs/may_detach_mounts", "1") 1673 } 1674 return nil 1675 } 1676 return err 1677 } 1678 1679 func setupOOMScoreAdj(score int) error { 1680 f, err := os.OpenFile("/proc/self/oom_score_adj", os.O_WRONLY, 0) 1681 if err != nil { 1682 return err 1683 } 1684 defer f.Close() 1685 stringScore := strconv.Itoa(score) 1686 _, err = f.WriteString(stringScore) 1687 if os.IsPermission(err) { 1688 // Setting oom_score_adj does not work in an 1689 // unprivileged container. Ignore the error, but log 1690 // it if we appear not to be in that situation. 1691 if !rsystem.RunningInUserNS() { 1692 logrus.Debugf("Permission denied writing %q to /proc/self/oom_score_adj", stringScore) 1693 } 1694 return nil 1695 } 1696 1697 return err 1698 } 1699 1700 func (daemon *Daemon) initCgroupsPath(path string) error { 1701 if path == "/" || path == "." { 1702 return nil 1703 } 1704 1705 if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 { 1706 return nil 1707 } 1708 1709 if cgroups.IsCgroup2UnifiedMode() { 1710 return fmt.Errorf("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2") 1711 } 1712 1713 // Recursively create cgroup to ensure that the system and all parent cgroups have values set 1714 // for the period and runtime as this limits what the children can be set to. 1715 daemon.initCgroupsPath(filepath.Dir(path)) 1716 1717 mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu") 1718 if err != nil { 1719 return err 1720 } 1721 // When docker is run inside docker, the root is based of the host cgroup. 1722 // Should this be handled in runc/libcontainer/cgroups ? 
1723 if strings.HasPrefix(root, "/docker/") { 1724 root = "/" 1725 } 1726 1727 path = filepath.Join(mnt, root, path) 1728 sysInfo := daemon.RawSysInfo(true) 1729 if err := maybeCreateCPURealTimeFile(sysInfo.CPURealtimePeriod, daemon.configStore.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil { 1730 return err 1731 } 1732 return maybeCreateCPURealTimeFile(sysInfo.CPURealtimeRuntime, daemon.configStore.CPURealtimeRuntime, "cpu.rt_runtime_us", path) 1733 } 1734 1735 func maybeCreateCPURealTimeFile(sysinfoPresent bool, configValue int64, file string, path string) error { 1736 if sysinfoPresent && configValue != 0 { 1737 if err := os.MkdirAll(path, 0755); err != nil { 1738 return err 1739 } 1740 if err := ioutil.WriteFile(filepath.Join(path, file), []byte(strconv.FormatInt(configValue, 10)), 0700); err != nil { 1741 return err 1742 } 1743 } 1744 return nil 1745 } 1746 1747 func (daemon *Daemon) setupSeccompProfile() error { 1748 if daemon.configStore.SeccompProfile != "" { 1749 daemon.seccompProfilePath = daemon.configStore.SeccompProfile 1750 b, err := ioutil.ReadFile(daemon.configStore.SeccompProfile) 1751 if err != nil { 1752 return fmt.Errorf("opening seccomp profile (%s) failed: %v", daemon.configStore.SeccompProfile, err) 1753 } 1754 daemon.seccompProfile = b 1755 } 1756 return nil 1757 } 1758 1759 func (daemon *Daemon) useShimV2() bool { 1760 return cgroups.IsCgroup2UnifiedMode() 1761 } 1762 1763 // RawSysInfo returns *sysinfo.SysInfo . 1764 func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo { 1765 var opts []sysinfo.Opt 1766 if daemon.getCgroupDriver() == cgroupSystemdDriver { 1767 rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID") 1768 if rootlesskitParentEUID != "" { 1769 groupPath := fmt.Sprintf("/user.slice/user-%s.slice", rootlesskitParentEUID) 1770 opts = append(opts, sysinfo.WithCgroup2GroupPath(groupPath)) 1771 } 1772 } 1773 return sysinfo.New(quiet, opts...) 1774 }