github.com/adityamillind98/moby@v23.0.0-rc.4+incompatible/daemon/daemon_unix.go (about) 1 //go:build linux || freebsd 2 // +build linux freebsd 3 4 package daemon // import "github.com/docker/docker/daemon" 5 6 import ( 7 "bufio" 8 "context" 9 "fmt" 10 "net" 11 "os" 12 "path/filepath" 13 "runtime" 14 "runtime/debug" 15 "strconv" 16 "strings" 17 "sync" 18 "time" 19 20 "github.com/containerd/cgroups" 21 statsV1 "github.com/containerd/cgroups/stats/v1" 22 statsV2 "github.com/containerd/cgroups/v2/stats" 23 "github.com/containerd/containerd/pkg/userns" 24 "github.com/docker/docker/api/types" 25 "github.com/docker/docker/api/types/blkiodev" 26 pblkiodev "github.com/docker/docker/api/types/blkiodev" 27 containertypes "github.com/docker/docker/api/types/container" 28 "github.com/docker/docker/container" 29 "github.com/docker/docker/daemon/config" 30 "github.com/docker/docker/daemon/initlayer" 31 "github.com/docker/docker/errdefs" 32 "github.com/docker/docker/libcontainerd/remote" 33 "github.com/docker/docker/libnetwork" 34 nwconfig "github.com/docker/docker/libnetwork/config" 35 "github.com/docker/docker/libnetwork/drivers/bridge" 36 "github.com/docker/docker/libnetwork/netlabel" 37 "github.com/docker/docker/libnetwork/netutils" 38 "github.com/docker/docker/libnetwork/options" 39 lntypes "github.com/docker/docker/libnetwork/types" 40 "github.com/docker/docker/opts" 41 "github.com/docker/docker/pkg/containerfs" 42 "github.com/docker/docker/pkg/idtools" 43 "github.com/docker/docker/pkg/parsers" 44 "github.com/docker/docker/pkg/parsers/kernel" 45 "github.com/docker/docker/pkg/sysinfo" 46 "github.com/docker/docker/runconfig" 47 volumemounts "github.com/docker/docker/volume/mounts" 48 "github.com/moby/sys/mount" 49 specs "github.com/opencontainers/runtime-spec/specs-go" 50 "github.com/opencontainers/selinux/go-selinux" 51 "github.com/opencontainers/selinux/go-selinux/label" 52 "github.com/pkg/errors" 53 "github.com/sirupsen/logrus" 54 "github.com/vishvananda/netlink" 55 
"golang.org/x/sys/unix" 56 ) 57 58 const ( 59 isWindows = false 60 61 // See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269 62 linuxMinCPUShares = 2 63 linuxMaxCPUShares = 262144 64 platformSupported = true 65 // It's not kernel limit, we want this 6M limit to account for overhead during startup, and to supply a reasonable functional container 66 linuxMinMemory = 6291456 67 // constants for remapped root settings 68 defaultIDSpecifier = "default" 69 defaultRemappedID = "dockremap" 70 71 // constant for cgroup drivers 72 cgroupFsDriver = "cgroupfs" 73 cgroupSystemdDriver = "systemd" 74 cgroupNoneDriver = "none" 75 ) 76 77 type containerGetter interface { 78 GetContainer(string) (*container.Container, error) 79 } 80 81 func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory { 82 memory := specs.LinuxMemory{} 83 84 if config.Memory > 0 { 85 memory.Limit = &config.Memory 86 } 87 88 if config.MemoryReservation > 0 { 89 memory.Reservation = &config.MemoryReservation 90 } 91 92 if config.MemorySwap > 0 { 93 memory.Swap = &config.MemorySwap 94 } 95 96 if config.MemorySwappiness != nil { 97 swappiness := uint64(*config.MemorySwappiness) 98 memory.Swappiness = &swappiness 99 } 100 101 if config.OomKillDisable != nil { 102 memory.DisableOOMKiller = config.OomKillDisable 103 } 104 105 if config.KernelMemory != 0 { 106 memory.Kernel = &config.KernelMemory 107 } 108 109 if config.KernelMemoryTCP != 0 { 110 memory.KernelTCP = &config.KernelMemoryTCP 111 } 112 113 return &memory 114 } 115 116 func getPidsLimit(config containertypes.Resources) *specs.LinuxPids { 117 if config.PidsLimit == nil { 118 return nil 119 } 120 if *config.PidsLimit <= 0 { 121 // docker API allows 0 and negative values to unset this to be consistent 122 // with default values. When updating values, runc requires -1 to unset 123 // the previous limit. 
124 return &specs.LinuxPids{Limit: -1} 125 } 126 return &specs.LinuxPids{Limit: *config.PidsLimit} 127 } 128 129 func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { 130 cpu := specs.LinuxCPU{} 131 132 if config.CPUShares < 0 { 133 return nil, fmt.Errorf("shares: invalid argument") 134 } 135 if config.CPUShares >= 0 { 136 shares := uint64(config.CPUShares) 137 cpu.Shares = &shares 138 } 139 140 if config.CpusetCpus != "" { 141 cpu.Cpus = config.CpusetCpus 142 } 143 144 if config.CpusetMems != "" { 145 cpu.Mems = config.CpusetMems 146 } 147 148 if config.NanoCPUs > 0 { 149 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 150 period := uint64(100 * time.Millisecond / time.Microsecond) 151 quota := config.NanoCPUs * int64(period) / 1e9 152 cpu.Period = &period 153 cpu.Quota = "a 154 } 155 156 if config.CPUPeriod != 0 { 157 period := uint64(config.CPUPeriod) 158 cpu.Period = &period 159 } 160 161 if config.CPUQuota != 0 { 162 q := config.CPUQuota 163 cpu.Quota = &q 164 } 165 166 if config.CPURealtimePeriod != 0 { 167 period := uint64(config.CPURealtimePeriod) 168 cpu.RealtimePeriod = &period 169 } 170 171 if config.CPURealtimeRuntime != 0 { 172 c := config.CPURealtimeRuntime 173 cpu.RealtimeRuntime = &c 174 } 175 176 return &cpu, nil 177 } 178 179 func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) { 180 var stat unix.Stat_t 181 var blkioWeightDevices []specs.LinuxWeightDevice 182 183 for _, weightDevice := range config.BlkioWeightDevice { 184 if err := unix.Stat(weightDevice.Path, &stat); err != nil { 185 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: weightDevice.Path, Err: err}) 186 } 187 weight := weightDevice.Weight 188 d := specs.LinuxWeightDevice{Weight: &weight} 189 // The type is 32bit on mips. 
190 d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert 191 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert 192 blkioWeightDevices = append(blkioWeightDevices, d) 193 } 194 195 return blkioWeightDevices, nil 196 } 197 198 func (daemon *Daemon) parseSecurityOpt(container *container.Container, hostConfig *containertypes.HostConfig) error { 199 container.NoNewPrivileges = daemon.configStore.NoNewPrivileges 200 return parseSecurityOpt(container, hostConfig) 201 } 202 203 func parseSecurityOpt(container *container.Container, config *containertypes.HostConfig) error { 204 var ( 205 labelOpts []string 206 err error 207 ) 208 209 for _, opt := range config.SecurityOpt { 210 if opt == "no-new-privileges" { 211 container.NoNewPrivileges = true 212 continue 213 } 214 if opt == "disable" { 215 labelOpts = append(labelOpts, "disable") 216 continue 217 } 218 219 var con []string 220 if strings.Contains(opt, "=") { 221 con = strings.SplitN(opt, "=", 2) 222 } else if strings.Contains(opt, ":") { 223 con = strings.SplitN(opt, ":", 2) 224 logrus.Warn("Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead.") 225 } 226 if len(con) != 2 { 227 return fmt.Errorf("invalid --security-opt 1: %q", opt) 228 } 229 230 switch con[0] { 231 case "label": 232 labelOpts = append(labelOpts, con[1]) 233 case "apparmor": 234 container.AppArmorProfile = con[1] 235 case "seccomp": 236 container.SeccompProfile = con[1] 237 case "no-new-privileges": 238 noNewPrivileges, err := strconv.ParseBool(con[1]) 239 if err != nil { 240 return fmt.Errorf("invalid --security-opt 2: %q", opt) 241 } 242 container.NoNewPrivileges = noNewPrivileges 243 default: 244 return fmt.Errorf("invalid --security-opt 2: %q", opt) 245 } 246 } 247 248 container.ProcessLabel, container.MountLabel, err = label.InitLabels(labelOpts) 249 return err 250 } 251 252 func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) 
([]specs.LinuxThrottleDevice, error) { 253 var throttleDevices []specs.LinuxThrottleDevice 254 var stat unix.Stat_t 255 256 for _, d := range devs { 257 if err := unix.Stat(d.Path, &stat); err != nil { 258 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: d.Path, Err: err}) 259 } 260 d := specs.LinuxThrottleDevice{Rate: d.Rate} 261 // the type is 32bit on mips 262 d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert 263 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert 264 throttleDevices = append(throttleDevices, d) 265 } 266 267 return throttleDevices, nil 268 } 269 270 // adjustParallelLimit takes a number of objects and a proposed limit and 271 // figures out if it's reasonable (and adjusts it accordingly). This is only 272 // used for daemon startup, which does a lot of parallel loading of containers 273 // (and if we exceed RLIMIT_NOFILE then we're in trouble). 274 func adjustParallelLimit(n int, limit int) int { 275 // Rule-of-thumb overhead factor (how many files will each goroutine open 276 // simultaneously). Yes, this is ugly but to be frank this whole thing is 277 // ugly. 278 const overhead = 2 279 280 // On Linux, we need to ensure that parallelStartupJobs doesn't cause us to 281 // exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it 282 // and give a warning (since in theory the user should increase their 283 // ulimits to the largest possible value for dockerd). 284 var rlim unix.Rlimit 285 if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil { 286 logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err) 287 return limit 288 } 289 softRlimit := int(rlim.Cur) 290 291 // Much fewer containers than RLIMIT_NOFILE. No need to adjust anything. 292 if softRlimit > overhead*n { 293 return limit 294 } 295 296 // RLIMIT_NOFILE big enough, no need to adjust anything. 
297 if softRlimit > overhead*limit { 298 return limit 299 } 300 301 logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit) 302 return softRlimit / overhead 303 } 304 305 // adaptContainerSettings is called during container creation to modify any 306 // settings necessary in the HostConfig structure. 307 func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConfig, adjustCPUShares bool) error { 308 if adjustCPUShares && hostConfig.CPUShares > 0 { 309 // Handle unsupported CPUShares 310 if hostConfig.CPUShares < linuxMinCPUShares { 311 logrus.Warnf("Changing requested CPUShares of %d to minimum allowed of %d", hostConfig.CPUShares, linuxMinCPUShares) 312 hostConfig.CPUShares = linuxMinCPUShares 313 } else if hostConfig.CPUShares > linuxMaxCPUShares { 314 logrus.Warnf("Changing requested CPUShares of %d to maximum allowed of %d", hostConfig.CPUShares, linuxMaxCPUShares) 315 hostConfig.CPUShares = linuxMaxCPUShares 316 } 317 } 318 if hostConfig.Memory > 0 && hostConfig.MemorySwap == 0 { 319 // By default, MemorySwap is set to twice the size of Memory. 
320 hostConfig.MemorySwap = hostConfig.Memory * 2 321 } 322 if hostConfig.ShmSize == 0 { 323 hostConfig.ShmSize = config.DefaultShmSize 324 if daemon.configStore != nil { 325 hostConfig.ShmSize = int64(daemon.configStore.ShmSize) 326 } 327 } 328 // Set default IPC mode, if unset for container 329 if hostConfig.IpcMode.IsEmpty() { 330 m := config.DefaultIpcMode 331 if daemon.configStore != nil { 332 m = containertypes.IpcMode(daemon.configStore.IpcMode) 333 } 334 hostConfig.IpcMode = m 335 } 336 337 // Set default cgroup namespace mode, if unset for container 338 if hostConfig.CgroupnsMode.IsEmpty() { 339 // for cgroup v2: unshare cgroupns even for privileged containers 340 // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 341 if hostConfig.Privileged && cgroups.Mode() != cgroups.Unified { 342 hostConfig.CgroupnsMode = containertypes.CgroupnsModeHost 343 } else { 344 m := containertypes.CgroupnsModeHost 345 if cgroups.Mode() == cgroups.Unified { 346 m = containertypes.CgroupnsModePrivate 347 } 348 if daemon.configStore != nil { 349 m = containertypes.CgroupnsMode(daemon.configStore.CgroupNamespaceMode) 350 } 351 hostConfig.CgroupnsMode = m 352 } 353 } 354 355 adaptSharedNamespaceContainer(daemon, hostConfig) 356 357 var err error 358 secOpts, err := daemon.generateSecurityOpt(hostConfig) 359 if err != nil { 360 return err 361 } 362 hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, secOpts...) 363 if hostConfig.OomKillDisable == nil { 364 defaultOomKillDisable := false 365 hostConfig.OomKillDisable = &defaultOomKillDisable 366 } 367 368 return nil 369 } 370 371 // adaptSharedNamespaceContainer replaces container name with its ID in hostConfig. 372 // To be more precisely, it modifies `container:name` to `container:ID` of PidMode, IpcMode 373 // and NetworkMode. 
374 // 375 // When a container shares its namespace with another container, use ID can keep the namespace 376 // sharing connection between the two containers even the another container is renamed. 377 func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) { 378 containerPrefix := "container:" 379 if hostConfig.PidMode.IsContainer() { 380 pidContainer := hostConfig.PidMode.Container() 381 // if there is any error returned here, we just ignore it and leave it to be 382 // handled in the following logic 383 if c, err := daemon.GetContainer(pidContainer); err == nil { 384 hostConfig.PidMode = containertypes.PidMode(containerPrefix + c.ID) 385 } 386 } 387 if hostConfig.IpcMode.IsContainer() { 388 ipcContainer := hostConfig.IpcMode.Container() 389 if c, err := daemon.GetContainer(ipcContainer); err == nil { 390 hostConfig.IpcMode = containertypes.IpcMode(containerPrefix + c.ID) 391 } 392 } 393 if hostConfig.NetworkMode.IsContainer() { 394 netContainer := hostConfig.NetworkMode.ConnectedContainer() 395 if c, err := daemon.GetContainer(netContainer); err == nil { 396 hostConfig.NetworkMode = containertypes.NetworkMode(containerPrefix + c.ID) 397 } 398 } 399 } 400 401 // verifyPlatformContainerResources performs platform-specific validation of the container's resource-configuration 402 func verifyPlatformContainerResources(resources *containertypes.Resources, sysInfo *sysinfo.SysInfo, update bool) (warnings []string, err error) { 403 fixMemorySwappiness(resources) 404 405 // memory subsystem checks and adjustments 406 if resources.Memory != 0 && resources.Memory < linuxMinMemory { 407 return warnings, fmt.Errorf("Minimum memory limit allowed is 6MB") 408 } 409 if resources.Memory > 0 && !sysInfo.MemoryLimit { 410 warnings = append(warnings, "Your kernel does not support memory limit capabilities or the cgroup is not mounted. 
Limitation discarded.") 411 resources.Memory = 0 412 resources.MemorySwap = -1 413 } 414 if resources.Memory > 0 && resources.MemorySwap != -1 && !sysInfo.SwapLimit { 415 warnings = append(warnings, "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.") 416 resources.MemorySwap = -1 417 } 418 if resources.Memory > 0 && resources.MemorySwap > 0 && resources.MemorySwap < resources.Memory { 419 return warnings, fmt.Errorf("Minimum memoryswap limit should be larger than memory limit, see usage") 420 } 421 if resources.Memory == 0 && resources.MemorySwap > 0 && !update { 422 return warnings, fmt.Errorf("You should always set the Memory limit when using Memoryswap limit, see usage") 423 } 424 if resources.MemorySwappiness != nil && !sysInfo.MemorySwappiness { 425 warnings = append(warnings, "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded.") 426 resources.MemorySwappiness = nil 427 } 428 if resources.MemorySwappiness != nil { 429 swappiness := *resources.MemorySwappiness 430 if swappiness < 0 || swappiness > 100 { 431 return warnings, fmt.Errorf("Invalid value: %v, valid memory swappiness range is 0-100", swappiness) 432 } 433 } 434 if resources.MemoryReservation > 0 && !sysInfo.MemoryReservation { 435 warnings = append(warnings, "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. 
Limitation discarded.") 436 resources.MemoryReservation = 0 437 } 438 if resources.MemoryReservation > 0 && resources.MemoryReservation < linuxMinMemory { 439 return warnings, fmt.Errorf("Minimum memory reservation allowed is 6MB") 440 } 441 if resources.Memory > 0 && resources.MemoryReservation > 0 && resources.Memory < resources.MemoryReservation { 442 return warnings, fmt.Errorf("Minimum memory limit can not be less than memory reservation limit, see usage") 443 } 444 if resources.KernelMemory > 0 { 445 // Kernel memory limit is not supported on cgroup v2. 446 // Even on cgroup v1, kernel memory limit (`kmem.limit_in_bytes`) has been deprecated since kernel 5.4. 447 // https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0b5adf44cae99b3ebcc7 448 if !sysInfo.KernelMemory { 449 warnings = append(warnings, "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 450 resources.KernelMemory = 0 451 } 452 if resources.KernelMemory > 0 && resources.KernelMemory < linuxMinMemory { 453 return warnings, fmt.Errorf("Minimum kernel memory limit allowed is 6MB") 454 } 455 if !kernel.CheckKernelVersion(4, 0, 0) { 456 warnings = append(warnings, "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable.") 457 } 458 } 459 if resources.OomKillDisable != nil && !sysInfo.OomKillDisable { 460 // only produce warnings if the setting wasn't to *disable* the OOM Kill; no point 461 // warning the caller if they already wanted the feature to be off 462 if *resources.OomKillDisable { 463 warnings = append(warnings, "Your kernel does not support OomKillDisable. 
OomKillDisable discarded.") 464 } 465 resources.OomKillDisable = nil 466 } 467 if resources.OomKillDisable != nil && *resources.OomKillDisable && resources.Memory == 0 { 468 warnings = append(warnings, "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources.") 469 } 470 if resources.PidsLimit != nil && !sysInfo.PidsLimit { 471 if *resources.PidsLimit > 0 { 472 warnings = append(warnings, "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded.") 473 } 474 resources.PidsLimit = nil 475 } 476 477 // cpu subsystem checks and adjustments 478 if resources.NanoCPUs > 0 && resources.CPUPeriod > 0 { 479 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Period cannot both be set") 480 } 481 if resources.NanoCPUs > 0 && resources.CPUQuota > 0 { 482 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Quota cannot both be set") 483 } 484 if resources.NanoCPUs > 0 && !sysInfo.CPUCfs { 485 return warnings, fmt.Errorf("NanoCPUs can not be set, as your kernel does not support CPU CFS scheduler or the cgroup is not mounted") 486 } 487 // The highest precision we could get on Linux is 0.001, by setting 488 // cpu.cfs_period_us=1000ms 489 // cpu.cfs_quota=1ms 490 // See the following link for details: 491 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 492 // Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error. 
493 // The error message is 0.01 so that this is consistent with Windows 494 if resources.NanoCPUs < 0 || resources.NanoCPUs > int64(sysinfo.NumCPU())*1e9 { 495 return warnings, fmt.Errorf("Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available", sysinfo.NumCPU(), sysinfo.NumCPU()) 496 } 497 498 if resources.CPUShares > 0 && !sysInfo.CPUShares { 499 warnings = append(warnings, "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded.") 500 resources.CPUShares = 0 501 } 502 if (resources.CPUPeriod != 0 || resources.CPUQuota != 0) && !sysInfo.CPUCfs { 503 warnings = append(warnings, "Your kernel does not support CPU CFS scheduler. CPU period/quota discarded.") 504 resources.CPUPeriod = 0 505 resources.CPUQuota = 0 506 } 507 if resources.CPUPeriod != 0 && (resources.CPUPeriod < 1000 || resources.CPUPeriod > 1000000) { 508 return warnings, fmt.Errorf("CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)") 509 } 510 if resources.CPUQuota > 0 && resources.CPUQuota < 1000 { 511 return warnings, fmt.Errorf("CPU cfs quota can not be less than 1ms (i.e. 1000)") 512 } 513 if resources.CPUPercent > 0 { 514 warnings = append(warnings, fmt.Sprintf("%s does not support CPU percent. Percent discarded.", runtime.GOOS)) 515 resources.CPUPercent = 0 516 } 517 518 // cpuset subsystem checks and adjustments 519 if (resources.CpusetCpus != "" || resources.CpusetMems != "") && !sysInfo.Cpuset { 520 warnings = append(warnings, "Your kernel does not support cpuset or the cgroup is not mounted. 
Cpuset discarded.") 521 resources.CpusetCpus = "" 522 resources.CpusetMems = "" 523 } 524 cpusAvailable, err := sysInfo.IsCpusetCpusAvailable(resources.CpusetCpus) 525 if err != nil { 526 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset cpus", resources.CpusetCpus) 527 } 528 if !cpusAvailable { 529 return warnings, fmt.Errorf("Requested CPUs are not available - requested %s, available: %s", resources.CpusetCpus, sysInfo.Cpus) 530 } 531 memsAvailable, err := sysInfo.IsCpusetMemsAvailable(resources.CpusetMems) 532 if err != nil { 533 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset mems", resources.CpusetMems) 534 } 535 if !memsAvailable { 536 return warnings, fmt.Errorf("Requested memory nodes are not available - requested %s, available: %s", resources.CpusetMems, sysInfo.Mems) 537 } 538 539 // blkio subsystem checks and adjustments 540 if resources.BlkioWeight > 0 && !sysInfo.BlkioWeight { 541 warnings = append(warnings, "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded.") 542 resources.BlkioWeight = 0 543 } 544 if resources.BlkioWeight > 0 && (resources.BlkioWeight < 10 || resources.BlkioWeight > 1000) { 545 return warnings, fmt.Errorf("Range of blkio weight is from 10 to 1000") 546 } 547 if resources.IOMaximumBandwidth != 0 || resources.IOMaximumIOps != 0 { 548 return warnings, fmt.Errorf("Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps", runtime.GOOS) 549 } 550 if len(resources.BlkioWeightDevice) > 0 && !sysInfo.BlkioWeightDevice { 551 warnings = append(warnings, "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded.") 552 resources.BlkioWeightDevice = []*pblkiodev.WeightDevice{} 553 } 554 if len(resources.BlkioDeviceReadBps) > 0 && !sysInfo.BlkioReadBpsDevice { 555 warnings = append(warnings, "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. 
Block I/O BPS read limit discarded.") 556 resources.BlkioDeviceReadBps = []*pblkiodev.ThrottleDevice{} 557 } 558 if len(resources.BlkioDeviceWriteBps) > 0 && !sysInfo.BlkioWriteBpsDevice { 559 warnings = append(warnings, "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded.") 560 resources.BlkioDeviceWriteBps = []*pblkiodev.ThrottleDevice{} 561 } 562 if len(resources.BlkioDeviceReadIOps) > 0 && !sysInfo.BlkioReadIOpsDevice { 563 warnings = append(warnings, "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded.") 564 resources.BlkioDeviceReadIOps = []*pblkiodev.ThrottleDevice{} 565 } 566 if len(resources.BlkioDeviceWriteIOps) > 0 && !sysInfo.BlkioWriteIOpsDevice { 567 warnings = append(warnings, "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded.") 568 resources.BlkioDeviceWriteIOps = []*pblkiodev.ThrottleDevice{} 569 } 570 571 return warnings, nil 572 } 573 574 func (daemon *Daemon) getCgroupDriver() string { 575 if UsingSystemd(daemon.configStore) { 576 return cgroupSystemdDriver 577 } 578 if daemon.Rootless() { 579 return cgroupNoneDriver 580 } 581 return cgroupFsDriver 582 } 583 584 // getCD gets the raw value of the native.cgroupdriver option, if set. 
585 func getCD(config *config.Config) string { 586 for _, option := range config.ExecOptions { 587 key, val, err := parsers.ParseKeyValueOpt(option) 588 if err != nil || !strings.EqualFold(key, "native.cgroupdriver") { 589 continue 590 } 591 return val 592 } 593 return "" 594 } 595 596 // verifyCgroupDriver validates native.cgroupdriver 597 func verifyCgroupDriver(config *config.Config) error { 598 cd := getCD(config) 599 if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver { 600 return nil 601 } 602 if cd == cgroupNoneDriver { 603 return fmt.Errorf("native.cgroupdriver option %s is internally used and cannot be specified manually", cd) 604 } 605 return fmt.Errorf("native.cgroupdriver option %s not supported", cd) 606 } 607 608 // UsingSystemd returns true if cli option includes native.cgroupdriver=systemd 609 func UsingSystemd(config *config.Config) bool { 610 cd := getCD(config) 611 612 if cd == cgroupSystemdDriver { 613 return true 614 } 615 // On cgroup v2 hosts, default to systemd driver 616 if cd == "" && cgroups.Mode() == cgroups.Unified && isRunningSystemd() { 617 return true 618 } 619 return false 620 } 621 622 var ( 623 runningSystemd bool 624 detectSystemd sync.Once 625 ) 626 627 // isRunningSystemd checks whether the host was booted with systemd as its init 628 // system. This functions similarly to systemd's `sd_booted(3)`: internally, it 629 // checks whether /run/systemd/system/ exists and is a directory. 630 // http://www.freedesktop.org/software/systemd/man/sd_booted.html 631 // 632 // NOTE: This function comes from package github.com/coreos/go-systemd/util 633 // It was borrowed here to avoid a dependency on cgo. 
634 func isRunningSystemd() bool { 635 detectSystemd.Do(func() { 636 fi, err := os.Lstat("/run/systemd/system") 637 if err != nil { 638 return 639 } 640 runningSystemd = fi.IsDir() 641 }) 642 return runningSystemd 643 } 644 645 // verifyPlatformContainerSettings performs platform-specific validation of the 646 // hostconfig and config structures. 647 func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.HostConfig, update bool) (warnings []string, err error) { 648 if hostConfig == nil { 649 return nil, nil 650 } 651 sysInfo := daemon.RawSysInfo() 652 653 w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update) 654 655 // no matter err is nil or not, w could have data in itself. 656 warnings = append(warnings, w...) 657 658 if err != nil { 659 return warnings, err 660 } 661 662 if !hostConfig.IpcMode.Valid() { 663 return warnings, errors.Errorf("invalid IPC mode: %v", hostConfig.IpcMode) 664 } 665 if !hostConfig.PidMode.Valid() { 666 return warnings, errors.Errorf("invalid PID mode: %v", hostConfig.PidMode) 667 } 668 if hostConfig.ShmSize < 0 { 669 return warnings, fmt.Errorf("SHM size can not be less than 0") 670 } 671 if !hostConfig.UTSMode.Valid() { 672 return warnings, errors.Errorf("invalid UTS mode: %v", hostConfig.UTSMode) 673 } 674 675 if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 { 676 return warnings, fmt.Errorf("Invalid value %d, range for oom score adj is [-1000, 1000]", hostConfig.OomScoreAdj) 677 } 678 679 // ip-forwarding does not affect container with '--net=host' (or '--net=none') 680 if sysInfo.IPv4ForwardingDisabled && !(hostConfig.NetworkMode.IsHost() || hostConfig.NetworkMode.IsNone()) { 681 warnings = append(warnings, "IPv4 forwarding is disabled. 
Networking will not work.") 682 } 683 if hostConfig.NetworkMode.IsHost() && len(hostConfig.PortBindings) > 0 { 684 warnings = append(warnings, "Published ports are discarded when using host network mode") 685 } 686 687 // check for various conflicting options with user namespaces 688 if daemon.configStore.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() { 689 if hostConfig.Privileged { 690 return warnings, fmt.Errorf("privileged mode is incompatible with user namespaces. You must run the container in the host namespace when running privileged mode") 691 } 692 if hostConfig.NetworkMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 693 return warnings, fmt.Errorf("cannot share the host's network namespace when user namespaces are enabled") 694 } 695 if hostConfig.PidMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 696 return warnings, fmt.Errorf("cannot share the host PID namespace when user namespaces are enabled") 697 } 698 } 699 if hostConfig.CgroupParent != "" && UsingSystemd(daemon.configStore) { 700 // CgroupParent for systemd cgroup should be named as "xxx.slice" 701 if len(hostConfig.CgroupParent) <= 6 || !strings.HasSuffix(hostConfig.CgroupParent, ".slice") { 702 return warnings, fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"") 703 } 704 } 705 if hostConfig.Runtime == "" { 706 hostConfig.Runtime = daemon.configStore.GetDefaultRuntimeName() 707 } 708 709 if _, err := daemon.getRuntime(hostConfig.Runtime); err != nil { 710 return warnings, err 711 } 712 713 parser := volumemounts.NewParser() 714 for dest := range hostConfig.Tmpfs { 715 if err := parser.ValidateTmpfsMountDestination(dest); err != nil { 716 return warnings, err 717 } 718 } 719 720 if !hostConfig.CgroupnsMode.Valid() { 721 return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode) 722 } 723 if hostConfig.CgroupnsMode.IsPrivate() { 724 if !sysInfo.CgroupNamespaces { 725 warnings = append(warnings, "Your kernel 
does not support cgroup namespaces. Cgroup namespace setting discarded.")
		}
	}

	return warnings, nil
}

// verifyDaemonSettings performs validation of daemon config struct.
//
// It rejects mutually incompatible options, validates the cgroup driver and
// the systemd cgroup parent, and checks that the default runtime is known.
// As a side effect it forces EnableIPMasq off when iptables is disabled and
// calls configureRuntimes on conf.
func verifyDaemonSettings(conf *config.Config) error {
	if conf.ContainerdNamespace == conf.ContainerdPluginNamespace {
		return errors.New("containers namespace and plugins namespace cannot be the same")
	}
	// Check for mutually incompatible config options
	if conf.BridgeConfig.Iface != "" && conf.BridgeConfig.IP != "" {
		return fmt.Errorf("You specified -b & --bip, mutually exclusive options. Please specify only one")
	}
	if !conf.BridgeConfig.EnableIPTables && !conf.BridgeConfig.InterContainerCommunication {
		return fmt.Errorf("You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true")
	}
	if conf.BridgeConfig.EnableIP6Tables && !conf.Experimental {
		return fmt.Errorf("ip6tables rules are only available if experimental features are enabled")
	}
	// IP masquerading cannot be enabled without iptables, so silently turn it off.
	if !conf.BridgeConfig.EnableIPTables && conf.BridgeConfig.EnableIPMasq {
		conf.BridgeConfig.EnableIPMasq = false
	}
	if err := verifyCgroupDriver(conf); err != nil {
		return err
	}
	if conf.CgroupParent != "" && UsingSystemd(conf) {
		// ".slice" alone is 6 characters, so a valid name must be longer than that.
		if len(conf.CgroupParent) <= 6 || !strings.HasSuffix(conf.CgroupParent, ".slice") {
			return fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"")
		}
	}

	if conf.Rootless && UsingSystemd(conf) && cgroups.Mode() != cgroups.Unified {
		return fmt.Errorf("exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode")
	}

	configureRuntimes(conf)
	if rtName := conf.GetDefaultRuntimeName(); rtName != "" {
		if conf.GetRuntime(rtName) == nil {
			if !config.IsPermissibleC8dRuntimeName(rtName) {
				return fmt.Errorf("specified default runtime '%s' does not exist", rtName)
			}
		}
	}
	return nil
}

// checkSystem validates platform-specific requirements
func checkSystem() error {
	return nil
}

// configureMaxThreads sets the Go runtime max threads threshold
// which is 90% of the kernel setting from /proc/sys/kernel/threads-max
//
// The config argument is not used by this implementation. An error is
// returned when the proc file cannot be read or does not contain an integer.
func configureMaxThreads(config *config.Config) error {
	mt, err := os.ReadFile("/proc/sys/kernel/threads-max")
	if err != nil {
		return err
	}
	mtint, err := strconv.Atoi(strings.TrimSpace(string(mt)))
	if err != nil {
		return err
	}
	maxThreads := (mtint / 100) * 90
	debug.SetMaxThreads(maxThreads)
	logrus.Debugf("Golang's threads limit set to %d", maxThreads)
	return nil
}

// overlaySupportsSelinux reports whether /proc/kallsyms lists the
// security_inode_copy_up symbol. A missing /proc/kallsyms yields (false, nil);
// any other open or scan error is returned to the caller.
func overlaySupportsSelinux() (bool, error) {
	f, err := os.Open("/proc/kallsyms")
	if err != nil {
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	for s.Scan() {
		if strings.HasSuffix(s.Text(), " security_inode_copy_up") {
			return true, nil
		}
	}

	return false, s.Err()
}

// configureKernelSecuritySupport configures and validates security support for the kernel
//
// When SELinux support is not requested it is explicitly disabled. When it is
// requested but not enabled on the host, or not supported by the overlay
// graph driver on this kernel, only a warning is logged.
func configureKernelSecuritySupport(config *config.Config, driverName string) error {
	if config.EnableSelinuxSupport {
		if !selinux.GetEnabled() {
			logrus.Warn("Docker could not enable SELinux on the host system")
			return nil
		}

		if driverName == "overlay" || driverName == "overlay2" {
			// If driver is overlay or overlay2, make sure kernel
			// supports selinux with overlay.
			supported, err := overlaySupportsSelinux()
			if err != nil {
				return err
			}

			if !supported {
				logrus.Warnf("SELinux is not supported with the %v graph driver on this kernel", driverName)
			}
		}
	} else {
		selinux.SetDisabled()
	}
	return nil
}

// initNetworkController initializes the libnetwork controller and configures
// network settings. If there's active sandboxes, configuration changes will not
// take effect.
func (daemon *Daemon) initNetworkController(activeSandboxes map[string]interface{}) error {
	netOptions, err := daemon.networkOptions(daemon.PluginStore, activeSandboxes)
	if err != nil {
		return err
	}

	daemon.netController, err = libnetwork.New(netOptions...)
	if err != nil {
		return fmt.Errorf("error obtaining controller instance: %v", err)
	}

	if len(activeSandboxes) > 0 {
		logrus.Info("there are running containers, updated network configuration will not take affect")
	} else if err := configureNetworking(daemon.netController, daemon.configStore); err != nil {
		return err
	}

	// Set HostGatewayIP to the default bridge's IP if it is empty
	setHostGatewayIP(daemon.netController, daemon.configStore)
	return nil
}

// configureNetworking creates the default "none" and "host" networks when they
// are absent, deletes any existing "bridge" network, and then either
// initializes the default bridge driver or, when conf.DisableBridge is set,
// removes the default bridge interface.
func configureNetworking(controller libnetwork.NetworkController, conf *config.Config) error {
	// Initialize default network on "null"
	if n, _ := controller.NetworkByName("none"); n == nil {
		if _, err := controller.NewNetwork("null", "none", "", libnetwork.NetworkOptionPersist(true)); err != nil {
			return errors.Wrap(err, `error creating default "null" network`)
		}
	}

	// Initialize default network on "host"
	if n, _ := controller.NetworkByName("host"); n == nil {
		if _, err := controller.NewNetwork("host", "host", "", libnetwork.NetworkOptionPersist(true)); err != nil {
			return errors.Wrap(err, `error creating default "host" network`)
		}
	}

	// Clear stale bridge network
	if n, err := controller.NetworkByName("bridge"); err == nil {
		if err = n.Delete(); err != nil {
			return errors.Wrap(err, `could not delete the default "bridge" network`)
		}
		if len(conf.NetworkConfig.DefaultAddressPools.Value()) > 0 && !conf.LiveRestoreEnabled {
			removeDefaultBridgeInterface()
		}
	}

	if !conf.DisableBridge {
		// Initialize default driver "bridge"
		if err := initBridgeDriver(controller, conf); err != nil {
			return err
		}
	} else {
		removeDefaultBridgeInterface()
	}

	return nil
}

// setHostGatewayIP sets cfg.HostGatewayIP to the default bridge's IP if it is empty.
//
// The first IPv4 gateway of the "bridge" network is preferred; the first IPv6
// gateway is used only when there is no IPv4 IPAM info. Nothing is changed
// when the "bridge" network cannot be found.
func setHostGatewayIP(controller libnetwork.NetworkController, config *config.Config) {
	if config.HostGatewayIP != nil {
		return
	}
	if n, err := controller.NetworkByName("bridge"); err == nil {
		v4Info, v6Info := n.Info().IpamInfo()
		var gateway net.IP
		if len(v4Info) > 0 {
			gateway = v4Info[0].Gateway.IP
		} else if len(v6Info) > 0 {
			gateway = v6Info[0].Gateway.IP
		}
		config.HostGatewayIP = gateway
	}
}

// driverOptions builds the libnetwork driver-config option for the "bridge"
// driver from the daemon's bridge configuration.
func driverOptions(config *config.Config) nwconfig.Option {
	return nwconfig.OptionDriverConfig("bridge", options.Generic{
		netlabel.GenericData: options.Generic{
			"EnableIPForwarding":  config.BridgeConfig.EnableIPForward,
			"EnableIPTables":      config.BridgeConfig.EnableIPTables,
			"EnableIP6Tables":     config.BridgeConfig.EnableIP6Tables,
			"EnableUserlandProxy": config.BridgeConfig.EnableUserlandProxy,
			"UserlandProxyPath":   config.BridgeConfig.UserlandProxyPath,
		},
	})
}

// initBridgeDriver creates the default "bridge" network on controller.
//
// The bridge name comes from config.BridgeConfig.Iface (falling back to the
// driver's default name). The IPv4 IPAM configuration is derived from the
// addresses elected for that interface and then overridden by the --bip,
// --fixed-cidr and default-gateway settings; the IPv6 IPAM configuration is
// only built when --fixed-cidr-v6 or an IPv6 default gateway is configured.
// Enabling IPv6 without --fixed-cidr-v6 is an invalid-parameter error.
func initBridgeDriver(controller libnetwork.NetworkController, config *config.Config) error {
	bridgeName := bridge.DefaultBridgeName
	if config.BridgeConfig.Iface != "" {
		bridgeName = config.BridgeConfig.Iface
	}
	netOption := map[string]string{
		bridge.BridgeName:         bridgeName,
		bridge.DefaultBridge:      strconv.FormatBool(true),
		netlabel.DriverMTU:        strconv.Itoa(config.Mtu),
		bridge.EnableIPMasquerade: strconv.FormatBool(config.BridgeConfig.EnableIPMasq),
		bridge.EnableICC:          strconv.FormatBool(config.BridgeConfig.InterContainerCommunication),
	}

	// --ip processing
	if config.BridgeConfig.DefaultIP != nil {
		netOption[bridge.DefaultBindingIP] = config.BridgeConfig.DefaultIP.String()
	}

	ipamV4Conf := &libnetwork.IpamConf{AuxAddresses: make(map[string]string)}

	nwList, nw6List, err := netutils.ElectInterfaceAddresses(bridgeName)
	if err != nil {
		return errors.Wrap(err, "list bridge addresses failed")
	}

	// Default to the first elected address; prefer one inside --fixed-cidr
	// when the bridge carries several addresses.
	nw := nwList[0]
	if len(nwList) > 1 && config.BridgeConfig.FixedCIDR != "" {
		_, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR)
		if err != nil {
			return errors.Wrap(err, "parse CIDR failed")
		}
		// Iterate through in case there are multiple addresses for the bridge
		for _, entry := range nwList {
			if fCIDR.Contains(entry.IP) {
				nw = entry
				break
			}
		}
	}

	ipamV4Conf.PreferredPool = lntypes.GetIPNetCanonical(nw).String()
	hip, _ := lntypes.GetHostPartIP(nw.IP, nw.Mask)
	if hip.IsGlobalUnicast() {
		ipamV4Conf.Gateway = nw.IP.String()
	}

	if config.BridgeConfig.IP != "" {
		ip, ipNet, err := net.ParseCIDR(config.BridgeConfig.IP)
		if err != nil {
			return err
		}
		ipamV4Conf.PreferredPool = ipNet.String()
		ipamV4Conf.Gateway = ip.String()
	} else if bridgeName == bridge.DefaultBridgeName && ipamV4Conf.PreferredPool != "" {
		logrus.Infof("Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address", bridgeName, ipamV4Conf.PreferredPool)
	}

	if config.BridgeConfig.FixedCIDR != "" {
		_, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR)
		if err != nil {
			return err
		}

		ipamV4Conf.SubPool = fCIDR.String()
	}

	if config.BridgeConfig.DefaultGatewayIPv4 != nil {
		ipamV4Conf.AuxAddresses["DefaultGatewayIPv4"] = config.BridgeConfig.DefaultGatewayIPv4.String()
	}

	var (
		deferIPv6Alloc bool
		ipamV6Conf     *libnetwork.IpamConf
	)

	if config.BridgeConfig.EnableIPv6 && config.BridgeConfig.FixedCIDRv6 == "" {
		return errdefs.InvalidParameter(errors.New("IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6"))
	} else if config.BridgeConfig.FixedCIDRv6 != "" {
		_, fCIDRv6, err := net.ParseCIDR(config.BridgeConfig.FixedCIDRv6)
		if err != nil {
			return err
		}

		// In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has
		// at least 48 host bits, we need to guarantee the current behavior where the containers'
		// IPv6 addresses will be constructed based on the containers' interface MAC address.
		// We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints
		// on this network until after the driver has created the endpoint and returned the
		// constructed address. Libnetwork will then reserve this address with the ipam driver.
		ones, _ := fCIDRv6.Mask.Size()
		deferIPv6Alloc = ones <= 80

		ipamV6Conf = &libnetwork.IpamConf{
			AuxAddresses:  make(map[string]string),
			PreferredPool: fCIDRv6.String(),
		}

		// In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6
		// address belongs to the same network, we need to inform libnetwork about it, so
		// that it can be reserved with IPAM and it will not be given away to somebody else
		for _, nw6 := range nw6List {
			if fCIDRv6.Contains(nw6.IP) {
				ipamV6Conf.Gateway = nw6.IP.String()
				break
			}
		}
	}

	if config.BridgeConfig.DefaultGatewayIPv6 != nil {
		if ipamV6Conf == nil {
			ipamV6Conf = &libnetwork.IpamConf{AuxAddresses: make(map[string]string)}
		}
		ipamV6Conf.AuxAddresses["DefaultGatewayIPv6"] = config.BridgeConfig.DefaultGatewayIPv6.String()
	}

	v4Conf := []*libnetwork.IpamConf{ipamV4Conf}
	v6Conf := []*libnetwork.IpamConf{}
	if ipamV6Conf != nil {
		v6Conf = append(v6Conf, ipamV6Conf)
	}
	// Initialize default network on "bridge" with the same name
	_, err = controller.NewNetwork("bridge", "bridge", "",
		libnetwork.NetworkOptionEnableIPv6(config.BridgeConfig.EnableIPv6),
		libnetwork.NetworkOptionDriverOpts(netOption),
		libnetwork.NetworkOptionIpam("default", "", v4Conf, v6Conf, nil),
		libnetwork.NetworkOptionDeferIPv6Alloc(deferIPv6Alloc))
	if err != nil {
		return fmt.Errorf("Error creating default \"bridge\" network: %v", err)
	}
	return nil
}

// Remove default bridge interface if present (--bridge=none use case)
//
// A failure to delete the link is only logged as a warning.
func removeDefaultBridgeInterface() {
	if lnk, err := netlink.LinkByName(bridge.DefaultBridgeName); err == nil {
		if err := netlink.LinkDel(lnk); err != nil {
			logrus.Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err)
		}
	}
}

// setupInitLayer returns a function that sets up the init layer at the given
// path using the root uid/gid pair of idMapping.
func setupInitLayer(idMapping idtools.IdentityMapping) func(containerfs.ContainerFS) error {
	return func(initPath containerfs.ContainerFS) error {
		return initlayer.Setup(initPath, idMapping.RootPair())
	}
}

// Parse the remapped root (user namespace) option, which can be one of:
//
//   - username - valid username from /etc/passwd
//   - username:groupname - valid username; valid groupname from /etc/group
//   - uid - numeric Linux UID value (parsed as a base-10 32-bit integer)
//   - uid:gid - uid value; numeric Linux GID value (parsed the same way)
//
// If no groupname is specified, and a username is specified, an attempt
// will be made to lookup a gid for that username as a groupname
//
// If names are used, they are verified to exist in passwd/group
//
// The special username "default" is mapped to the "dockremap" user, which is
// created when it does not exist yet. On success the resolved user name and
// group name are returned.
func parseRemappedRoot(usergrp string) (string, string, error) {
	var (
		userID, groupID     int
		username, groupname string
	)

	idparts := strings.Split(usergrp, ":")
	if len(idparts) > 2 {
		return "", "", fmt.Errorf("Invalid user/group specification in --userns-remap: %q", usergrp)
	}

	if uid, err := strconv.ParseInt(idparts[0], 10, 32); err == nil {
		// must be a uid; take it as valid
		userID = int(uid)
		luser, err := idtools.LookupUID(userID)
		if err != nil {
			return "", "", fmt.Errorf("Uid %d has no entry in /etc/passwd: %v", userID, err)
		}
		username = luser.Name
		if len(idparts) == 1 {
			// if the uid was numeric and no gid was specified, take the uid as the gid
			groupID = userID
			lgrp, err := idtools.LookupGID(groupID)
			if err != nil {
				return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err)
			}
			groupname = lgrp.Name
		}
	} else {
		lookupName := idparts[0]
		// special case: if the user specified "default", they want Docker to create or
		// use (after creation) the "dockremap" user/group for root remapping
		if lookupName == defaultIDSpecifier {
			lookupName = defaultRemappedID
		}
		luser, err := idtools.LookupUser(lookupName)
		if err != nil && idparts[0] != defaultIDSpecifier {
			// error if the name requested isn't the special "dockremap" ID
			return "", "", fmt.Errorf("Error during uid lookup for %q: %v", lookupName, err)
		} else if err != nil {
			// special case-- if the username == "default", then we have been asked
			// to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid}
			// ranges will be used for the user and group mappings in user namespaced containers
			_, _, err := idtools.AddNamespaceRangesUser(defaultRemappedID)
			if err == nil {
				return defaultRemappedID, defaultRemappedID, nil
			}
			return "", "", fmt.Errorf("Error during %q user creation: %v", defaultRemappedID, err)
		}
		username = luser.Name
		if len(idparts) == 1 {
			// we only have a string username, and no group specified; look up gid from username as group
			group, err := idtools.LookupGroup(lookupName)
			if err != nil {
				return "", "", fmt.Errorf("Error during gid lookup for %q: %v", lookupName, err)
			}
			groupname = group.Name
		}
	}

	if len(idparts) == 2 {
		// groupname or gid is separately specified and must be resolved
		// to a numeric gid or an existing group name
		if gid, err := strconv.ParseInt(idparts[1], 10, 32); err == nil {
			// must be a gid, take it as valid
			groupID = int(gid)
			lgrp, err := idtools.LookupGID(groupID)
			if err != nil {
				// Groups live in /etc/group, matching the message used for
				// the implicit-gid case above.
				return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err)
			}
			groupname = lgrp.Name
		} else {
			// not a number; attempt a lookup
			if _, err := idtools.LookupGroup(idparts[1]); err != nil {
				return "", "", fmt.Errorf("Error during groupname lookup for %q: %v", idparts[1], err)
			}
			groupname = idparts[1]
		}
	}
	return username, groupname, nil
}

// setupRemappedRoot resolves config.RemappedRoot into a user and group name,
// rewrites config.RemappedRoot as "user:group" and loads the identity mapping
// for that user. An empty mapping is returned when no remapping is configured
// or when the resolved user is root.
func setupRemappedRoot(config *config.Config) (idtools.IdentityMapping, error) {
	if runtime.GOOS != "linux" && config.RemappedRoot != "" {
		return idtools.IdentityMapping{}, fmt.Errorf("User namespaces are only supported on Linux")
	}

	// if the daemon was started with remapped root option, parse
	// the config option to the int uid,gid values
	if config.RemappedRoot != "" {
		username, groupname, err := parseRemappedRoot(config.RemappedRoot)
		if err != nil {
			return idtools.IdentityMapping{}, err
		}
		if username == "root" {
			// Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op
			// effectively
			logrus.Warn("User namespaces: root cannot be remapped with itself; user namespaces are OFF")
			return idtools.IdentityMapping{}, nil
		}
		logrus.Infof("User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s", username)
		// update remapped root setting now that we have resolved them to actual names
		config.RemappedRoot = fmt.Sprintf("%s:%s", username, groupname)

		mappings, err := idtools.LoadIdentityMapping(username)
		if err != nil {
			return idtools.IdentityMapping{}, errors.Wrap(err, "Can't create ID mappings")
		}
		return mappings, nil
	}
	return idtools.IdentityMapping{}, nil
}

// setupDaemonRoot creates (or fixes the permissions of) the daemon root
// directory, stores the effective root in config.Root and sets up mount
// propagation for it. With user namespaces enabled, the effective root becomes
// the "<uid>.<gid>" subdirectory of rootDir. Propagation failures are logged
// as warnings and do not fail the call.
func setupDaemonRoot(config *config.Config, rootDir string, remappedRoot idtools.Identity) error {
	config.Root = rootDir
	// the docker root metadata directory needs to have execute permissions for all users (g+x,o+x)
	// so that syscalls executing as non-root, operating on subdirectories of the graph root
	// (e.g. mounted layers of a container) can traverse this path.
	// The user namespace support will create subdirectories for the remapped root host uid:gid
	// pair owned by that same uid:gid pair for proper write access to those needed metadata and
	// layer content subtrees.
	if _, err := os.Stat(rootDir); err == nil {
		// root currently exists; verify the access bits are correct by setting them
		if err = os.Chmod(rootDir, 0711); err != nil {
			return err
		}
	} else if os.IsNotExist(err) {
		// no root exists yet, create it 0711 with root:root ownership
		if err := os.MkdirAll(rootDir, 0711); err != nil {
			return err
		}
	}

	id := idtools.Identity{UID: idtools.CurrentIdentity().UID, GID: remappedRoot.GID}
	// First make sure the current root dir has the correct perms.
	if err := idtools.MkdirAllAndChown(config.Root, 0710, id); err != nil {
		return errors.Wrapf(err, "could not create or set daemon root permissions: %s", config.Root)
	}

	// if user namespaces are enabled we will create a subtree underneath the specified root
	// with any/all specified remapped root uid/gid options on the daemon creating
	// a new subdirectory with ownership set to the remapped uid/gid (so as to allow
	// `chdir()` to work for containers namespaced to that uid/gid)
	if config.RemappedRoot != "" {
		config.Root = filepath.Join(rootDir, fmt.Sprintf("%d.%d", remappedRoot.UID, remappedRoot.GID))
		logrus.Debugf("Creating user namespaced daemon root: %s", config.Root)
		// Create the root directory if it doesn't exist
		if err := idtools.MkdirAllAndChown(config.Root, 0710, id); err != nil {
			return fmt.Errorf("Cannot create daemon root: %s: %v", config.Root, err)
		}
		// we also need to verify that any pre-existing directories in the path to
		// the graphroot won't block access to remapped root--if any pre-existing directory
		// has strict permissions that don't allow "x", container start will fail, so
		// better to warn and fail now
		dirPath := config.Root
		for {
			dirPath = filepath.Dir(dirPath)
			// filepath.Dir is a fixed point at "/" for absolute paths and at
			// "." for relative ones; stop at either so a relative root cannot
			// spin forever.
			if dirPath == "/" || dirPath == "." {
				break
			}
			if !idtools.CanAccess(dirPath, remappedRoot) {
				return fmt.Errorf("a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' permissions on existing directories", config.Root)
			}
		}
	}

	if err := setupDaemonRootPropagation(config); err != nil {
		logrus.WithError(err).WithField("dir", config.Root).Warn("Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior")
	}
	return nil
}

// setupDaemonRootPropagation makes cfg.Root a shared mount unless its parent
// mount already has shared or slave propagation. When this call had to create
// a new mount for the root, a marker file is written so the daemon knows to
// unmount it on shutdown; in every other successful case a stale marker file
// is removed.
func setupDaemonRootPropagation(cfg *config.Config) error {
	rootParentMount, mountOptions, err := getSourceMount(cfg.Root)
	if err != nil {
		return errors.Wrap(err, "error getting daemon root's parent mount")
	}

	var cleanupOldFile bool
	cleanupFile := getUnmountOnShutdownPath(cfg)
	defer func() {
		if !cleanupOldFile {
			return
		}
		if err := os.Remove(cleanupFile); err != nil && !os.IsNotExist(err) {
			logrus.WithError(err).WithField("file", cleanupFile).Warn("could not clean up old root propagation unmount file")
		}
	}()

	if hasMountInfoOption(mountOptions, sharedPropagationOption, slavePropagationOption) {
		cleanupOldFile = true
		return nil
	}

	if err := mount.MakeShared(cfg.Root); err != nil {
		return errors.Wrap(err, "could not setup daemon root propagation to shared")
	}

	// check the case where this may have already been a mount to itself.
	// If so then the daemon only performed a remount and should not try to unmount this later.
	if rootParentMount == cfg.Root {
		cleanupOldFile = true
		return nil
	}

	if err := os.MkdirAll(filepath.Dir(cleanupFile), 0700); err != nil {
		return errors.Wrap(err, "error creating dir to store mount cleanup file")
	}

	if err := os.WriteFile(cleanupFile, nil, 0600); err != nil {
		return errors.Wrap(err, "error writing file to signal mount cleanup on shutdown")
	}
	return nil
}

// getUnmountOnShutdownPath generates the path to use when writing the file that signals to the daemon that on shutdown
// the daemon root should be unmounted.
func getUnmountOnShutdownPath(config *config.Config) string {
	return filepath.Join(config.ExecRoot, "unmount-on-shutdown")
}

// registerLinks registers network links between container and other containers
// with the daemon using the specification in hostConfig.
//
// Nothing is done for a nil hostConfig or a user-defined network mode. A link
// target that shares another container's network is followed to that
// container; linking to a container in host network mode is rejected.
func (daemon *Daemon) registerLinks(container *container.Container, hostConfig *containertypes.HostConfig) error {
	if hostConfig == nil || hostConfig.NetworkMode.IsUserDefined() {
		return nil
	}

	for _, l := range hostConfig.Links {
		name, alias, err := opts.ParseLink(l)
		if err != nil {
			return err
		}
		child, err := daemon.GetContainer(name)
		if err != nil {
			if errdefs.IsNotFound(err) {
				// Trying to link to a non-existing container is not valid, and
				// should return an "invalid parameter" error. Returning a "not
				// found" error here would make the client report the container's
				// image could not be found (see moby/moby#39823)
				err = errdefs.InvalidParameter(err)
			}
			return errors.Wrapf(err, "could not get container for %s", name)
		}
		for child.HostConfig.NetworkMode.IsContainer() {
			parts := strings.SplitN(string(child.HostConfig.NetworkMode), ":", 2)
			child, err = daemon.GetContainer(parts[1])
			if err != nil {
				if errdefs.IsNotFound(err) {
					// Trying to link to a non-existing container is not valid, and
					// should return an "invalid parameter" error. Returning a "not
					// found" error here would make the client report the container's
					// image could not be found (see moby/moby#39823)
					err = errdefs.InvalidParameter(err)
				}
				return errors.Wrapf(err, "Could not get container for %s", parts[1])
			}
		}
		if child.HostConfig.NetworkMode.IsHost() {
			return runconfig.ErrConflictHostNetworkAndLinks
		}
		if err := daemon.registerLink(container, child, alias); err != nil {
			return err
		}
	}

	return nil
}

// conditionalMountOnStart is a platform specific helper function during the
// container start to call mount.
func (daemon *Daemon) conditionalMountOnStart(container *container.Container) error {
	return daemon.Mount(container)
}

// conditionalUnmountOnCleanup is a platform specific helper function called
// during the cleanup of a container to unmount.
1368 func (daemon *Daemon) conditionalUnmountOnCleanup(container *container.Container) error { 1369 return daemon.Unmount(container) 1370 } 1371 1372 func copyBlkioEntry(entries []*statsV1.BlkIOEntry) []types.BlkioStatEntry { 1373 out := make([]types.BlkioStatEntry, len(entries)) 1374 for i, re := range entries { 1375 out[i] = types.BlkioStatEntry{ 1376 Major: re.Major, 1377 Minor: re.Minor, 1378 Op: re.Op, 1379 Value: re.Value, 1380 } 1381 } 1382 return out 1383 } 1384 1385 func (daemon *Daemon) stats(c *container.Container) (*types.StatsJSON, error) { 1386 if !c.IsRunning() { 1387 return nil, errNotRunning(c.ID) 1388 } 1389 cs, err := daemon.containerd.Stats(context.Background(), c.ID) 1390 if err != nil { 1391 if strings.Contains(err.Error(), "container not found") { 1392 return nil, containerNotFound(c.ID) 1393 } 1394 return nil, err 1395 } 1396 s := &types.StatsJSON{} 1397 s.Read = cs.Read 1398 stats := cs.Metrics 1399 switch t := stats.(type) { 1400 case *statsV1.Metrics: 1401 return daemon.statsV1(s, t) 1402 case *statsV2.Metrics: 1403 return daemon.statsV2(s, t) 1404 default: 1405 return nil, errors.Errorf("unexpected type of metrics %+v", t) 1406 } 1407 } 1408 1409 func (daemon *Daemon) statsV1(s *types.StatsJSON, stats *statsV1.Metrics) (*types.StatsJSON, error) { 1410 if stats.Blkio != nil { 1411 s.BlkioStats = types.BlkioStats{ 1412 IoServiceBytesRecursive: copyBlkioEntry(stats.Blkio.IoServiceBytesRecursive), 1413 IoServicedRecursive: copyBlkioEntry(stats.Blkio.IoServicedRecursive), 1414 IoQueuedRecursive: copyBlkioEntry(stats.Blkio.IoQueuedRecursive), 1415 IoServiceTimeRecursive: copyBlkioEntry(stats.Blkio.IoServiceTimeRecursive), 1416 IoWaitTimeRecursive: copyBlkioEntry(stats.Blkio.IoWaitTimeRecursive), 1417 IoMergedRecursive: copyBlkioEntry(stats.Blkio.IoMergedRecursive), 1418 IoTimeRecursive: copyBlkioEntry(stats.Blkio.IoTimeRecursive), 1419 SectorsRecursive: copyBlkioEntry(stats.Blkio.SectorsRecursive), 1420 } 1421 } 1422 if stats.CPU != nil { 1423 
s.CPUStats = types.CPUStats{ 1424 CPUUsage: types.CPUUsage{ 1425 TotalUsage: stats.CPU.Usage.Total, 1426 PercpuUsage: stats.CPU.Usage.PerCPU, 1427 UsageInKernelmode: stats.CPU.Usage.Kernel, 1428 UsageInUsermode: stats.CPU.Usage.User, 1429 }, 1430 ThrottlingData: types.ThrottlingData{ 1431 Periods: stats.CPU.Throttling.Periods, 1432 ThrottledPeriods: stats.CPU.Throttling.ThrottledPeriods, 1433 ThrottledTime: stats.CPU.Throttling.ThrottledTime, 1434 }, 1435 } 1436 } 1437 1438 if stats.Memory != nil { 1439 raw := map[string]uint64{ 1440 "cache": stats.Memory.Cache, 1441 "rss": stats.Memory.RSS, 1442 "rss_huge": stats.Memory.RSSHuge, 1443 "mapped_file": stats.Memory.MappedFile, 1444 "dirty": stats.Memory.Dirty, 1445 "writeback": stats.Memory.Writeback, 1446 "pgpgin": stats.Memory.PgPgIn, 1447 "pgpgout": stats.Memory.PgPgOut, 1448 "pgfault": stats.Memory.PgFault, 1449 "pgmajfault": stats.Memory.PgMajFault, 1450 "inactive_anon": stats.Memory.InactiveAnon, 1451 "active_anon": stats.Memory.ActiveAnon, 1452 "inactive_file": stats.Memory.InactiveFile, 1453 "active_file": stats.Memory.ActiveFile, 1454 "unevictable": stats.Memory.Unevictable, 1455 "hierarchical_memory_limit": stats.Memory.HierarchicalMemoryLimit, 1456 "hierarchical_memsw_limit": stats.Memory.HierarchicalSwapLimit, 1457 "total_cache": stats.Memory.TotalCache, 1458 "total_rss": stats.Memory.TotalRSS, 1459 "total_rss_huge": stats.Memory.TotalRSSHuge, 1460 "total_mapped_file": stats.Memory.TotalMappedFile, 1461 "total_dirty": stats.Memory.TotalDirty, 1462 "total_writeback": stats.Memory.TotalWriteback, 1463 "total_pgpgin": stats.Memory.TotalPgPgIn, 1464 "total_pgpgout": stats.Memory.TotalPgPgOut, 1465 "total_pgfault": stats.Memory.TotalPgFault, 1466 "total_pgmajfault": stats.Memory.TotalPgMajFault, 1467 "total_inactive_anon": stats.Memory.TotalInactiveAnon, 1468 "total_active_anon": stats.Memory.TotalActiveAnon, 1469 "total_inactive_file": stats.Memory.TotalInactiveFile, 1470 "total_active_file": 
stats.Memory.TotalActiveFile, 1471 "total_unevictable": stats.Memory.TotalUnevictable, 1472 } 1473 if stats.Memory.Usage != nil { 1474 s.MemoryStats = types.MemoryStats{ 1475 Stats: raw, 1476 Usage: stats.Memory.Usage.Usage, 1477 MaxUsage: stats.Memory.Usage.Max, 1478 Limit: stats.Memory.Usage.Limit, 1479 Failcnt: stats.Memory.Usage.Failcnt, 1480 } 1481 } else { 1482 s.MemoryStats = types.MemoryStats{ 1483 Stats: raw, 1484 } 1485 } 1486 1487 // if the container does not set memory limit, use the machineMemory 1488 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1489 s.MemoryStats.Limit = daemon.machineMemory 1490 } 1491 } 1492 1493 if stats.Pids != nil { 1494 s.PidsStats = types.PidsStats{ 1495 Current: stats.Pids.Current, 1496 Limit: stats.Pids.Limit, 1497 } 1498 } 1499 1500 return s, nil 1501 } 1502 1503 func (daemon *Daemon) statsV2(s *types.StatsJSON, stats *statsV2.Metrics) (*types.StatsJSON, error) { 1504 if stats.Io != nil { 1505 var isbr []types.BlkioStatEntry 1506 for _, re := range stats.Io.Usage { 1507 isbr = append(isbr, 1508 types.BlkioStatEntry{ 1509 Major: re.Major, 1510 Minor: re.Minor, 1511 Op: "read", 1512 Value: re.Rbytes, 1513 }, 1514 types.BlkioStatEntry{ 1515 Major: re.Major, 1516 Minor: re.Minor, 1517 Op: "write", 1518 Value: re.Wbytes, 1519 }, 1520 ) 1521 } 1522 s.BlkioStats = types.BlkioStats{ 1523 IoServiceBytesRecursive: isbr, 1524 // Other fields are unsupported 1525 } 1526 } 1527 1528 if stats.CPU != nil { 1529 s.CPUStats = types.CPUStats{ 1530 CPUUsage: types.CPUUsage{ 1531 TotalUsage: stats.CPU.UsageUsec * 1000, 1532 // PercpuUsage is not supported 1533 UsageInKernelmode: stats.CPU.SystemUsec * 1000, 1534 UsageInUsermode: stats.CPU.UserUsec * 1000, 1535 }, 1536 ThrottlingData: types.ThrottlingData{ 1537 Periods: stats.CPU.NrPeriods, 1538 ThrottledPeriods: stats.CPU.NrThrottled, 1539 ThrottledTime: stats.CPU.ThrottledUsec * 1000, 1540 }, 1541 } 1542 } 1543 1544 if stats.Memory != nil { 1545 s.MemoryStats = 
types.MemoryStats{ 1546 // Stats is not compatible with v1 1547 Stats: map[string]uint64{ 1548 "anon": stats.Memory.Anon, 1549 "file": stats.Memory.File, 1550 "kernel_stack": stats.Memory.KernelStack, 1551 "slab": stats.Memory.Slab, 1552 "sock": stats.Memory.Sock, 1553 "shmem": stats.Memory.Shmem, 1554 "file_mapped": stats.Memory.FileMapped, 1555 "file_dirty": stats.Memory.FileDirty, 1556 "file_writeback": stats.Memory.FileWriteback, 1557 "anon_thp": stats.Memory.AnonThp, 1558 "inactive_anon": stats.Memory.InactiveAnon, 1559 "active_anon": stats.Memory.ActiveAnon, 1560 "inactive_file": stats.Memory.InactiveFile, 1561 "active_file": stats.Memory.ActiveFile, 1562 "unevictable": stats.Memory.Unevictable, 1563 "slab_reclaimable": stats.Memory.SlabReclaimable, 1564 "slab_unreclaimable": stats.Memory.SlabUnreclaimable, 1565 "pgfault": stats.Memory.Pgfault, 1566 "pgmajfault": stats.Memory.Pgmajfault, 1567 "workingset_refault": stats.Memory.WorkingsetRefault, 1568 "workingset_activate": stats.Memory.WorkingsetActivate, 1569 "workingset_nodereclaim": stats.Memory.WorkingsetNodereclaim, 1570 "pgrefill": stats.Memory.Pgrefill, 1571 "pgscan": stats.Memory.Pgscan, 1572 "pgsteal": stats.Memory.Pgsteal, 1573 "pgactivate": stats.Memory.Pgactivate, 1574 "pgdeactivate": stats.Memory.Pgdeactivate, 1575 "pglazyfree": stats.Memory.Pglazyfree, 1576 "pglazyfreed": stats.Memory.Pglazyfreed, 1577 "thp_fault_alloc": stats.Memory.ThpFaultAlloc, 1578 "thp_collapse_alloc": stats.Memory.ThpCollapseAlloc, 1579 }, 1580 Usage: stats.Memory.Usage, 1581 // MaxUsage is not supported 1582 Limit: stats.Memory.UsageLimit, 1583 } 1584 // if the container does not set memory limit, use the machineMemory 1585 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1586 s.MemoryStats.Limit = daemon.machineMemory 1587 } 1588 if stats.MemoryEvents != nil { 1589 // Failcnt is set to the "oom" field of the "memory.events" file. 
1590 // See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html 1591 s.MemoryStats.Failcnt = stats.MemoryEvents.Oom 1592 } 1593 } 1594 1595 if stats.Pids != nil { 1596 s.PidsStats = types.PidsStats{ 1597 Current: stats.Pids.Current, 1598 Limit: stats.Pids.Limit, 1599 } 1600 } 1601 1602 return s, nil 1603 } 1604 1605 // setDefaultIsolation determines the default isolation mode for the 1606 // daemon to run in. This is only applicable on Windows 1607 func (daemon *Daemon) setDefaultIsolation() error { 1608 return nil 1609 } 1610 1611 // setupDaemonProcess sets various settings for the daemon's process 1612 func setupDaemonProcess(config *config.Config) error { 1613 // setup the daemons oom_score_adj 1614 if err := setupOOMScoreAdj(config.OOMScoreAdjust); err != nil { 1615 return err 1616 } 1617 if err := setMayDetachMounts(); err != nil { 1618 logrus.WithError(err).Warn("Could not set may_detach_mounts kernel parameter") 1619 } 1620 return nil 1621 } 1622 1623 // This is used to allow removal of mountpoints that may be mounted in other 1624 // namespaces on RHEL based kernels starting from RHEL 7.4. 1625 // Without this setting, removals on these RHEL based kernels may fail with 1626 // "device or resource busy". 1627 // This setting is not available in upstream kernels as it is not configurable, 1628 // but has been in the upstream kernels since 3.15. 1629 func setMayDetachMounts() error { 1630 f, err := os.OpenFile("/proc/sys/fs/may_detach_mounts", os.O_WRONLY, 0) 1631 if err != nil { 1632 if os.IsNotExist(err) { 1633 return nil 1634 } 1635 return errors.Wrap(err, "error opening may_detach_mounts kernel config file") 1636 } 1637 defer f.Close() 1638 1639 _, err = f.WriteString("1") 1640 if os.IsPermission(err) { 1641 // Setting may_detach_mounts does not work in an 1642 // unprivileged container. Ignore the error, but log 1643 // it if we appear not to be in that situation. 
1644 if !userns.RunningInUserNS() { 1645 logrus.Debugf("Permission denied writing %q to /proc/sys/fs/may_detach_mounts", "1") 1646 } 1647 return nil 1648 } 1649 return err 1650 } 1651 1652 func setupOOMScoreAdj(score int) error { 1653 if score == 0 { 1654 return nil 1655 } 1656 f, err := os.OpenFile("/proc/self/oom_score_adj", os.O_WRONLY, 0) 1657 if err != nil { 1658 return err 1659 } 1660 defer f.Close() 1661 stringScore := strconv.Itoa(score) 1662 _, err = f.WriteString(stringScore) 1663 if os.IsPermission(err) { 1664 // Setting oom_score_adj does not work in an 1665 // unprivileged container. Ignore the error, but log 1666 // it if we appear not to be in that situation. 1667 if !userns.RunningInUserNS() { 1668 logrus.Debugf("Permission denied writing %q to /proc/self/oom_score_adj", stringScore) 1669 } 1670 return nil 1671 } 1672 1673 return err 1674 } 1675 1676 func (daemon *Daemon) initCPURtController(mnt, path string) error { 1677 if path == "/" || path == "." { 1678 return nil 1679 } 1680 1681 // Recursively create cgroup to ensure that the system and all parent cgroups have values set 1682 // for the period and runtime as this limits what the children can be set to. 
1683 if err := daemon.initCPURtController(mnt, filepath.Dir(path)); err != nil { 1684 return err 1685 } 1686 1687 path = filepath.Join(mnt, path) 1688 if err := os.MkdirAll(path, 0755); err != nil { 1689 return err 1690 } 1691 if err := maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil { 1692 return err 1693 } 1694 return maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimeRuntime, "cpu.rt_runtime_us", path) 1695 } 1696 1697 func maybeCreateCPURealTimeFile(configValue int64, file string, path string) error { 1698 if configValue == 0 { 1699 return nil 1700 } 1701 return os.WriteFile(filepath.Join(path, file), []byte(strconv.FormatInt(configValue, 10)), 0700) 1702 } 1703 1704 func (daemon *Daemon) setupSeccompProfile() error { 1705 switch profile := daemon.configStore.SeccompProfile; profile { 1706 case "", config.SeccompProfileDefault: 1707 daemon.seccompProfilePath = config.SeccompProfileDefault 1708 case config.SeccompProfileUnconfined: 1709 daemon.seccompProfilePath = config.SeccompProfileUnconfined 1710 default: 1711 daemon.seccompProfilePath = profile 1712 b, err := os.ReadFile(profile) 1713 if err != nil { 1714 return fmt.Errorf("opening seccomp profile (%s) failed: %v", profile, err) 1715 } 1716 daemon.seccompProfile = b 1717 } 1718 return nil 1719 } 1720 1721 func getSysInfo(daemon *Daemon) *sysinfo.SysInfo { 1722 var siOpts []sysinfo.Opt 1723 if daemon.getCgroupDriver() == cgroupSystemdDriver { 1724 if euid := os.Getenv("ROOTLESSKIT_PARENT_EUID"); euid != "" { 1725 siOpts = append(siOpts, sysinfo.WithCgroup2GroupPath("/user.slice/user-"+euid+".slice")) 1726 } 1727 } 1728 return sysinfo.New(siOpts...) 
1729 } 1730 1731 func (daemon *Daemon) initLibcontainerd(ctx context.Context) error { 1732 var err error 1733 daemon.containerd, err = remote.NewClient( 1734 ctx, 1735 daemon.containerdCli, 1736 filepath.Join(daemon.configStore.ExecRoot, "containerd"), 1737 daemon.configStore.ContainerdNamespace, 1738 daemon, 1739 ) 1740 return err 1741 } 1742 1743 func recursiveUnmount(target string) error { 1744 return mount.RecursiveUnmount(target) 1745 }