github.com/adityamillind98/moby@v23.0.0-rc.4+incompatible/daemon/daemon_unix.go (about)

     1  //go:build linux || freebsd
     2  // +build linux freebsd
     3  
     4  package daemon // import "github.com/docker/docker/daemon"
     5  
     6  import (
     7  	"bufio"
     8  	"context"
     9  	"fmt"
    10  	"net"
    11  	"os"
    12  	"path/filepath"
    13  	"runtime"
    14  	"runtime/debug"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"time"
    19  
    20  	"github.com/containerd/cgroups"
    21  	statsV1 "github.com/containerd/cgroups/stats/v1"
    22  	statsV2 "github.com/containerd/cgroups/v2/stats"
    23  	"github.com/containerd/containerd/pkg/userns"
    24  	"github.com/docker/docker/api/types"
    25  	"github.com/docker/docker/api/types/blkiodev"
    26  	pblkiodev "github.com/docker/docker/api/types/blkiodev"
    27  	containertypes "github.com/docker/docker/api/types/container"
    28  	"github.com/docker/docker/container"
    29  	"github.com/docker/docker/daemon/config"
    30  	"github.com/docker/docker/daemon/initlayer"
    31  	"github.com/docker/docker/errdefs"
    32  	"github.com/docker/docker/libcontainerd/remote"
    33  	"github.com/docker/docker/libnetwork"
    34  	nwconfig "github.com/docker/docker/libnetwork/config"
    35  	"github.com/docker/docker/libnetwork/drivers/bridge"
    36  	"github.com/docker/docker/libnetwork/netlabel"
    37  	"github.com/docker/docker/libnetwork/netutils"
    38  	"github.com/docker/docker/libnetwork/options"
    39  	lntypes "github.com/docker/docker/libnetwork/types"
    40  	"github.com/docker/docker/opts"
    41  	"github.com/docker/docker/pkg/containerfs"
    42  	"github.com/docker/docker/pkg/idtools"
    43  	"github.com/docker/docker/pkg/parsers"
    44  	"github.com/docker/docker/pkg/parsers/kernel"
    45  	"github.com/docker/docker/pkg/sysinfo"
    46  	"github.com/docker/docker/runconfig"
    47  	volumemounts "github.com/docker/docker/volume/mounts"
    48  	"github.com/moby/sys/mount"
    49  	specs "github.com/opencontainers/runtime-spec/specs-go"
    50  	"github.com/opencontainers/selinux/go-selinux"
    51  	"github.com/opencontainers/selinux/go-selinux/label"
    52  	"github.com/pkg/errors"
    53  	"github.com/sirupsen/logrus"
    54  	"github.com/vishvananda/netlink"
    55  	"golang.org/x/sys/unix"
    56  )
    57  
const (
	// isWindows is false here: this file is only built for linux and freebsd
	// (see the build constraints at the top of the file).
	isWindows = false

	// Bounds used to clamp a container's requested CPU shares.
	// See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269
	linuxMinCPUShares = 2
	linuxMaxCPUShares = 262144
	platformSupported = true
	// linuxMinMemory is the smallest memory limit accepted, in bytes (6 MiB).
	// This is not a kernel limit: it accounts for overhead during startup and
	// leaves room for a reasonably functional container.
	linuxMinMemory = 6291456
	// constants for remapped root settings
	defaultIDSpecifier = "default"
	defaultRemappedID  = "dockremap"

	// Names of the cgroup drivers. cgroupNoneDriver is reserved for internal
	// use (it is what getCgroupDriver reports for a rootless daemon that is
	// not using systemd) and is rejected by verifyCgroupDriver when set
	// through the native.cgroupdriver option.
	cgroupFsDriver      = "cgroupfs"
	cgroupSystemdDriver = "systemd"
	cgroupNoneDriver    = "none"
)
    76  
    77  type containerGetter interface {
    78  	GetContainer(string) (*container.Container, error)
    79  }
    80  
    81  func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory {
    82  	memory := specs.LinuxMemory{}
    83  
    84  	if config.Memory > 0 {
    85  		memory.Limit = &config.Memory
    86  	}
    87  
    88  	if config.MemoryReservation > 0 {
    89  		memory.Reservation = &config.MemoryReservation
    90  	}
    91  
    92  	if config.MemorySwap > 0 {
    93  		memory.Swap = &config.MemorySwap
    94  	}
    95  
    96  	if config.MemorySwappiness != nil {
    97  		swappiness := uint64(*config.MemorySwappiness)
    98  		memory.Swappiness = &swappiness
    99  	}
   100  
   101  	if config.OomKillDisable != nil {
   102  		memory.DisableOOMKiller = config.OomKillDisable
   103  	}
   104  
   105  	if config.KernelMemory != 0 {
   106  		memory.Kernel = &config.KernelMemory
   107  	}
   108  
   109  	if config.KernelMemoryTCP != 0 {
   110  		memory.KernelTCP = &config.KernelMemoryTCP
   111  	}
   112  
   113  	return &memory
   114  }
   115  
   116  func getPidsLimit(config containertypes.Resources) *specs.LinuxPids {
   117  	if config.PidsLimit == nil {
   118  		return nil
   119  	}
   120  	if *config.PidsLimit <= 0 {
   121  		// docker API allows 0 and negative values to unset this to be consistent
   122  		// with default values. When updating values, runc requires -1 to unset
   123  		// the previous limit.
   124  		return &specs.LinuxPids{Limit: -1}
   125  	}
   126  	return &specs.LinuxPids{Limit: *config.PidsLimit}
   127  }
   128  
   129  func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) {
   130  	cpu := specs.LinuxCPU{}
   131  
   132  	if config.CPUShares < 0 {
   133  		return nil, fmt.Errorf("shares: invalid argument")
   134  	}
   135  	if config.CPUShares >= 0 {
   136  		shares := uint64(config.CPUShares)
   137  		cpu.Shares = &shares
   138  	}
   139  
   140  	if config.CpusetCpus != "" {
   141  		cpu.Cpus = config.CpusetCpus
   142  	}
   143  
   144  	if config.CpusetMems != "" {
   145  		cpu.Mems = config.CpusetMems
   146  	}
   147  
   148  	if config.NanoCPUs > 0 {
   149  		// https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt
   150  		period := uint64(100 * time.Millisecond / time.Microsecond)
   151  		quota := config.NanoCPUs * int64(period) / 1e9
   152  		cpu.Period = &period
   153  		cpu.Quota = &quota
   154  	}
   155  
   156  	if config.CPUPeriod != 0 {
   157  		period := uint64(config.CPUPeriod)
   158  		cpu.Period = &period
   159  	}
   160  
   161  	if config.CPUQuota != 0 {
   162  		q := config.CPUQuota
   163  		cpu.Quota = &q
   164  	}
   165  
   166  	if config.CPURealtimePeriod != 0 {
   167  		period := uint64(config.CPURealtimePeriod)
   168  		cpu.RealtimePeriod = &period
   169  	}
   170  
   171  	if config.CPURealtimeRuntime != 0 {
   172  		c := config.CPURealtimeRuntime
   173  		cpu.RealtimeRuntime = &c
   174  	}
   175  
   176  	return &cpu, nil
   177  }
   178  
   179  func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) {
   180  	var stat unix.Stat_t
   181  	var blkioWeightDevices []specs.LinuxWeightDevice
   182  
   183  	for _, weightDevice := range config.BlkioWeightDevice {
   184  		if err := unix.Stat(weightDevice.Path, &stat); err != nil {
   185  			return nil, errors.WithStack(&os.PathError{Op: "stat", Path: weightDevice.Path, Err: err})
   186  		}
   187  		weight := weightDevice.Weight
   188  		d := specs.LinuxWeightDevice{Weight: &weight}
   189  		// The type is 32bit on mips.
   190  		d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert
   191  		d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert
   192  		blkioWeightDevices = append(blkioWeightDevices, d)
   193  	}
   194  
   195  	return blkioWeightDevices, nil
   196  }
   197  
// parseSecurityOpt seeds the container's NoNewPrivileges flag from the daemon
// configuration and then applies the security options found in hostConfig,
// which may override that flag.
func (daemon *Daemon) parseSecurityOpt(container *container.Container, hostConfig *containertypes.HostConfig) error {
	container.NoNewPrivileges = daemon.configStore.NoNewPrivileges
	return parseSecurityOpt(container, hostConfig)
}
   202  
   203  func parseSecurityOpt(container *container.Container, config *containertypes.HostConfig) error {
   204  	var (
   205  		labelOpts []string
   206  		err       error
   207  	)
   208  
   209  	for _, opt := range config.SecurityOpt {
   210  		if opt == "no-new-privileges" {
   211  			container.NoNewPrivileges = true
   212  			continue
   213  		}
   214  		if opt == "disable" {
   215  			labelOpts = append(labelOpts, "disable")
   216  			continue
   217  		}
   218  
   219  		var con []string
   220  		if strings.Contains(opt, "=") {
   221  			con = strings.SplitN(opt, "=", 2)
   222  		} else if strings.Contains(opt, ":") {
   223  			con = strings.SplitN(opt, ":", 2)
   224  			logrus.Warn("Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead.")
   225  		}
   226  		if len(con) != 2 {
   227  			return fmt.Errorf("invalid --security-opt 1: %q", opt)
   228  		}
   229  
   230  		switch con[0] {
   231  		case "label":
   232  			labelOpts = append(labelOpts, con[1])
   233  		case "apparmor":
   234  			container.AppArmorProfile = con[1]
   235  		case "seccomp":
   236  			container.SeccompProfile = con[1]
   237  		case "no-new-privileges":
   238  			noNewPrivileges, err := strconv.ParseBool(con[1])
   239  			if err != nil {
   240  				return fmt.Errorf("invalid --security-opt 2: %q", opt)
   241  			}
   242  			container.NoNewPrivileges = noNewPrivileges
   243  		default:
   244  			return fmt.Errorf("invalid --security-opt 2: %q", opt)
   245  		}
   246  	}
   247  
   248  	container.ProcessLabel, container.MountLabel, err = label.InitLabels(labelOpts)
   249  	return err
   250  }
   251  
   252  func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThrottleDevice, error) {
   253  	var throttleDevices []specs.LinuxThrottleDevice
   254  	var stat unix.Stat_t
   255  
   256  	for _, d := range devs {
   257  		if err := unix.Stat(d.Path, &stat); err != nil {
   258  			return nil, errors.WithStack(&os.PathError{Op: "stat", Path: d.Path, Err: err})
   259  		}
   260  		d := specs.LinuxThrottleDevice{Rate: d.Rate}
   261  		// the type is 32bit on mips
   262  		d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert
   263  		d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert
   264  		throttleDevices = append(throttleDevices, d)
   265  	}
   266  
   267  	return throttleDevices, nil
   268  }
   269  
   270  // adjustParallelLimit takes a number of objects and a proposed limit and
   271  // figures out if it's reasonable (and adjusts it accordingly). This is only
   272  // used for daemon startup, which does a lot of parallel loading of containers
   273  // (and if we exceed RLIMIT_NOFILE then we're in trouble).
   274  func adjustParallelLimit(n int, limit int) int {
   275  	// Rule-of-thumb overhead factor (how many files will each goroutine open
   276  	// simultaneously). Yes, this is ugly but to be frank this whole thing is
   277  	// ugly.
   278  	const overhead = 2
   279  
   280  	// On Linux, we need to ensure that parallelStartupJobs doesn't cause us to
   281  	// exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it
   282  	// and give a warning (since in theory the user should increase their
   283  	// ulimits to the largest possible value for dockerd).
   284  	var rlim unix.Rlimit
   285  	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil {
   286  		logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err)
   287  		return limit
   288  	}
   289  	softRlimit := int(rlim.Cur)
   290  
   291  	// Much fewer containers than RLIMIT_NOFILE. No need to adjust anything.
   292  	if softRlimit > overhead*n {
   293  		return limit
   294  	}
   295  
   296  	// RLIMIT_NOFILE big enough, no need to adjust anything.
   297  	if softRlimit > overhead*limit {
   298  		return limit
   299  	}
   300  
   301  	logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit)
   302  	return softRlimit / overhead
   303  }
   304  
   305  // adaptContainerSettings is called during container creation to modify any
   306  // settings necessary in the HostConfig structure.
   307  func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConfig, adjustCPUShares bool) error {
   308  	if adjustCPUShares && hostConfig.CPUShares > 0 {
   309  		// Handle unsupported CPUShares
   310  		if hostConfig.CPUShares < linuxMinCPUShares {
   311  			logrus.Warnf("Changing requested CPUShares of %d to minimum allowed of %d", hostConfig.CPUShares, linuxMinCPUShares)
   312  			hostConfig.CPUShares = linuxMinCPUShares
   313  		} else if hostConfig.CPUShares > linuxMaxCPUShares {
   314  			logrus.Warnf("Changing requested CPUShares of %d to maximum allowed of %d", hostConfig.CPUShares, linuxMaxCPUShares)
   315  			hostConfig.CPUShares = linuxMaxCPUShares
   316  		}
   317  	}
   318  	if hostConfig.Memory > 0 && hostConfig.MemorySwap == 0 {
   319  		// By default, MemorySwap is set to twice the size of Memory.
   320  		hostConfig.MemorySwap = hostConfig.Memory * 2
   321  	}
   322  	if hostConfig.ShmSize == 0 {
   323  		hostConfig.ShmSize = config.DefaultShmSize
   324  		if daemon.configStore != nil {
   325  			hostConfig.ShmSize = int64(daemon.configStore.ShmSize)
   326  		}
   327  	}
   328  	// Set default IPC mode, if unset for container
   329  	if hostConfig.IpcMode.IsEmpty() {
   330  		m := config.DefaultIpcMode
   331  		if daemon.configStore != nil {
   332  			m = containertypes.IpcMode(daemon.configStore.IpcMode)
   333  		}
   334  		hostConfig.IpcMode = m
   335  	}
   336  
   337  	// Set default cgroup namespace mode, if unset for container
   338  	if hostConfig.CgroupnsMode.IsEmpty() {
   339  		// for cgroup v2: unshare cgroupns even for privileged containers
   340  		// https://github.com/containers/libpod/pull/4374#issuecomment-549776387
   341  		if hostConfig.Privileged && cgroups.Mode() != cgroups.Unified {
   342  			hostConfig.CgroupnsMode = containertypes.CgroupnsModeHost
   343  		} else {
   344  			m := containertypes.CgroupnsModeHost
   345  			if cgroups.Mode() == cgroups.Unified {
   346  				m = containertypes.CgroupnsModePrivate
   347  			}
   348  			if daemon.configStore != nil {
   349  				m = containertypes.CgroupnsMode(daemon.configStore.CgroupNamespaceMode)
   350  			}
   351  			hostConfig.CgroupnsMode = m
   352  		}
   353  	}
   354  
   355  	adaptSharedNamespaceContainer(daemon, hostConfig)
   356  
   357  	var err error
   358  	secOpts, err := daemon.generateSecurityOpt(hostConfig)
   359  	if err != nil {
   360  		return err
   361  	}
   362  	hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, secOpts...)
   363  	if hostConfig.OomKillDisable == nil {
   364  		defaultOomKillDisable := false
   365  		hostConfig.OomKillDisable = &defaultOomKillDisable
   366  	}
   367  
   368  	return nil
   369  }
   370  
   371  // adaptSharedNamespaceContainer replaces container name with its ID in hostConfig.
   372  // To be more precisely, it modifies `container:name` to `container:ID` of PidMode, IpcMode
   373  // and NetworkMode.
   374  //
   375  // When a container shares its namespace with another container, use ID can keep the namespace
   376  // sharing connection between the two containers even the another container is renamed.
   377  func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) {
   378  	containerPrefix := "container:"
   379  	if hostConfig.PidMode.IsContainer() {
   380  		pidContainer := hostConfig.PidMode.Container()
   381  		// if there is any error returned here, we just ignore it and leave it to be
   382  		// handled in the following logic
   383  		if c, err := daemon.GetContainer(pidContainer); err == nil {
   384  			hostConfig.PidMode = containertypes.PidMode(containerPrefix + c.ID)
   385  		}
   386  	}
   387  	if hostConfig.IpcMode.IsContainer() {
   388  		ipcContainer := hostConfig.IpcMode.Container()
   389  		if c, err := daemon.GetContainer(ipcContainer); err == nil {
   390  			hostConfig.IpcMode = containertypes.IpcMode(containerPrefix + c.ID)
   391  		}
   392  	}
   393  	if hostConfig.NetworkMode.IsContainer() {
   394  		netContainer := hostConfig.NetworkMode.ConnectedContainer()
   395  		if c, err := daemon.GetContainer(netContainer); err == nil {
   396  			hostConfig.NetworkMode = containertypes.NetworkMode(containerPrefix + c.ID)
   397  		}
   398  	}
   399  }
   400  
// verifyPlatformContainerResources performs platform-specific validation of the
// container's resource-configuration.
//
// Checks run in a fixed order: memory, pids, cpu, cpuset and finally blkio.
// Most settings that sysInfo reports as unsupported are reset in place on
// resources and reported through a warning, while invalid or conflicting
// values abort validation with an error. Warnings collected before an error
// occurred are returned together with that error.
//
// update is only consulted by the "swap limit without memory limit" check,
// which is skipped when update is true.
func verifyPlatformContainerResources(resources *containertypes.Resources, sysInfo *sysinfo.SysInfo, update bool) (warnings []string, err error) {
	// NOTE(review): fixMemorySwappiness is defined elsewhere; presumably it
	// normalizes MemorySwappiness before the range check below - confirm.
	fixMemorySwappiness(resources)

	// memory subsystem checks and adjustments
	if resources.Memory != 0 && resources.Memory < linuxMinMemory {
		return warnings, fmt.Errorf("Minimum memory limit allowed is 6MB")
	}
	if resources.Memory > 0 && !sysInfo.MemoryLimit {
		warnings = append(warnings, "Your kernel does not support memory limit capabilities or the cgroup is not mounted. Limitation discarded.")
		resources.Memory = 0
		resources.MemorySwap = -1
	}
	if resources.Memory > 0 && resources.MemorySwap != -1 && !sysInfo.SwapLimit {
		warnings = append(warnings, "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.")
		resources.MemorySwap = -1
	}
	if resources.Memory > 0 && resources.MemorySwap > 0 && resources.MemorySwap < resources.Memory {
		return warnings, fmt.Errorf("Minimum memoryswap limit should be larger than memory limit, see usage")
	}
	// Skipped on update: a swap limit without a memory limit is only
	// rejected when update is false.
	if resources.Memory == 0 && resources.MemorySwap > 0 && !update {
		return warnings, fmt.Errorf("You should always set the Memory limit when using Memoryswap limit, see usage")
	}
	if resources.MemorySwappiness != nil && !sysInfo.MemorySwappiness {
		warnings = append(warnings, "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded.")
		resources.MemorySwappiness = nil
	}
	if resources.MemorySwappiness != nil {
		swappiness := *resources.MemorySwappiness
		if swappiness < 0 || swappiness > 100 {
			return warnings, fmt.Errorf("Invalid value: %v, valid memory swappiness range is 0-100", swappiness)
		}
	}
	if resources.MemoryReservation > 0 && !sysInfo.MemoryReservation {
		warnings = append(warnings, "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. Limitation discarded.")
		resources.MemoryReservation = 0
	}
	if resources.MemoryReservation > 0 && resources.MemoryReservation < linuxMinMemory {
		return warnings, fmt.Errorf("Minimum memory reservation allowed is 6MB")
	}
	if resources.Memory > 0 && resources.MemoryReservation > 0 && resources.Memory < resources.MemoryReservation {
		return warnings, fmt.Errorf("Minimum memory limit can not be less than memory reservation limit, see usage")
	}
	if resources.KernelMemory > 0 {
		// Kernel memory limit is not supported on cgroup v2.
		// Even on cgroup v1, kernel memory limit (`kmem.limit_in_bytes`) has been deprecated since kernel 5.4.
		// https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0b5adf44cae99b3ebcc7
		if !sysInfo.KernelMemory {
			warnings = append(warnings, "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded.")
			resources.KernelMemory = 0
		}
		if resources.KernelMemory > 0 && resources.KernelMemory < linuxMinMemory {
			return warnings, fmt.Errorf("Minimum kernel memory limit allowed is 6MB")
		}
		// NOTE(review): this warning is also emitted when the limit was just
		// discarded above, since the enclosing condition was evaluated before
		// KernelMemory was reset.
		if !kernel.CheckKernelVersion(4, 0, 0) {
			warnings = append(warnings, "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable.")
		}
	}
	if resources.OomKillDisable != nil && !sysInfo.OomKillDisable {
		// only produce warnings if the setting wasn't to *disable* the OOM Kill; no point
		// warning the caller if they already wanted the feature to be off
		if *resources.OomKillDisable {
			warnings = append(warnings, "Your kernel does not support OomKillDisable. OomKillDisable discarded.")
		}
		resources.OomKillDisable = nil
	}
	if resources.OomKillDisable != nil && *resources.OomKillDisable && resources.Memory == 0 {
		warnings = append(warnings, "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources.")
	}
	if resources.PidsLimit != nil && !sysInfo.PidsLimit {
		// Only warn when an actual (positive) limit is being dropped.
		if *resources.PidsLimit > 0 {
			warnings = append(warnings, "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded.")
		}
		resources.PidsLimit = nil
	}

	// cpu subsystem checks and adjustments
	if resources.NanoCPUs > 0 && resources.CPUPeriod > 0 {
		return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Period cannot both be set")
	}
	if resources.NanoCPUs > 0 && resources.CPUQuota > 0 {
		return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Quota cannot both be set")
	}
	// Unlike the other unsupported settings, NanoCPUs without CFS support is
	// an error rather than a discarded setting.
	if resources.NanoCPUs > 0 && !sysInfo.CPUCfs {
		return warnings, fmt.Errorf("NanoCPUs can not be set, as your kernel does not support CPU CFS scheduler or the cgroup is not mounted")
	}
	// The highest precision we could get on Linux is 0.001, by setting
	//   cpu.cfs_period_us=1000ms
	//   cpu.cfs_quota=1ms
	// See the following link for details:
	// https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt
	// Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error.
	// The error message is 0.01 so that this is consistent with Windows
	if resources.NanoCPUs < 0 || resources.NanoCPUs > int64(sysinfo.NumCPU())*1e9 {
		return warnings, fmt.Errorf("Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available", sysinfo.NumCPU(), sysinfo.NumCPU())
	}

	if resources.CPUShares > 0 && !sysInfo.CPUShares {
		warnings = append(warnings, "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded.")
		resources.CPUShares = 0
	}
	if (resources.CPUPeriod != 0 || resources.CPUQuota != 0) && !sysInfo.CPUCfs {
		warnings = append(warnings, "Your kernel does not support CPU CFS scheduler. CPU period/quota discarded.")
		resources.CPUPeriod = 0
		resources.CPUQuota = 0
	}
	if resources.CPUPeriod != 0 && (resources.CPUPeriod < 1000 || resources.CPUPeriod > 1000000) {
		return warnings, fmt.Errorf("CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)")
	}
	if resources.CPUQuota > 0 && resources.CPUQuota < 1000 {
		return warnings, fmt.Errorf("CPU cfs quota can not be less than 1ms (i.e. 1000)")
	}
	if resources.CPUPercent > 0 {
		warnings = append(warnings, fmt.Sprintf("%s does not support CPU percent. Percent discarded.", runtime.GOOS))
		resources.CPUPercent = 0
	}

	// cpuset subsystem checks and adjustments
	if (resources.CpusetCpus != "" || resources.CpusetMems != "") && !sysInfo.Cpuset {
		warnings = append(warnings, "Your kernel does not support cpuset or the cgroup is not mounted. Cpuset discarded.")
		resources.CpusetCpus = ""
		resources.CpusetMems = ""
	}
	// The availability checks below run unconditionally, i.e. also with the
	// empty strings left behind when the cpuset settings were discarded.
	cpusAvailable, err := sysInfo.IsCpusetCpusAvailable(resources.CpusetCpus)
	if err != nil {
		return warnings, errors.Wrapf(err, "Invalid value %s for cpuset cpus", resources.CpusetCpus)
	}
	if !cpusAvailable {
		return warnings, fmt.Errorf("Requested CPUs are not available - requested %s, available: %s", resources.CpusetCpus, sysInfo.Cpus)
	}
	memsAvailable, err := sysInfo.IsCpusetMemsAvailable(resources.CpusetMems)
	if err != nil {
		return warnings, errors.Wrapf(err, "Invalid value %s for cpuset mems", resources.CpusetMems)
	}
	if !memsAvailable {
		return warnings, fmt.Errorf("Requested memory nodes are not available - requested %s, available: %s", resources.CpusetMems, sysInfo.Mems)
	}

	// blkio subsystem checks and adjustments
	if resources.BlkioWeight > 0 && !sysInfo.BlkioWeight {
		warnings = append(warnings, "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded.")
		resources.BlkioWeight = 0
	}
	if resources.BlkioWeight > 0 && (resources.BlkioWeight < 10 || resources.BlkioWeight > 1000) {
		return warnings, fmt.Errorf("Range of blkio weight is from 10 to 1000")
	}
	// These two QoS settings are rejected outright on this platform.
	if resources.IOMaximumBandwidth != 0 || resources.IOMaximumIOps != 0 {
		return warnings, fmt.Errorf("Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps", runtime.GOOS)
	}
	// Unsupported per-device settings are replaced with empty (non-nil) slices.
	if len(resources.BlkioWeightDevice) > 0 && !sysInfo.BlkioWeightDevice {
		warnings = append(warnings, "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded.")
		resources.BlkioWeightDevice = []*pblkiodev.WeightDevice{}
	}
	if len(resources.BlkioDeviceReadBps) > 0 && !sysInfo.BlkioReadBpsDevice {
		warnings = append(warnings, "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. Block I/O BPS read limit discarded.")
		resources.BlkioDeviceReadBps = []*pblkiodev.ThrottleDevice{}
	}
	if len(resources.BlkioDeviceWriteBps) > 0 && !sysInfo.BlkioWriteBpsDevice {
		warnings = append(warnings, "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded.")
		resources.BlkioDeviceWriteBps = []*pblkiodev.ThrottleDevice{}
	}
	if len(resources.BlkioDeviceReadIOps) > 0 && !sysInfo.BlkioReadIOpsDevice {
		warnings = append(warnings, "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded.")
		resources.BlkioDeviceReadIOps = []*pblkiodev.ThrottleDevice{}
	}
	if len(resources.BlkioDeviceWriteIOps) > 0 && !sysInfo.BlkioWriteIOpsDevice {
		warnings = append(warnings, "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded.")
		resources.BlkioDeviceWriteIOps = []*pblkiodev.ThrottleDevice{}
	}

	return warnings, nil
}
   573  
   574  func (daemon *Daemon) getCgroupDriver() string {
   575  	if UsingSystemd(daemon.configStore) {
   576  		return cgroupSystemdDriver
   577  	}
   578  	if daemon.Rootless() {
   579  		return cgroupNoneDriver
   580  	}
   581  	return cgroupFsDriver
   582  }
   583  
   584  // getCD gets the raw value of the native.cgroupdriver option, if set.
   585  func getCD(config *config.Config) string {
   586  	for _, option := range config.ExecOptions {
   587  		key, val, err := parsers.ParseKeyValueOpt(option)
   588  		if err != nil || !strings.EqualFold(key, "native.cgroupdriver") {
   589  			continue
   590  		}
   591  		return val
   592  	}
   593  	return ""
   594  }
   595  
   596  // verifyCgroupDriver validates native.cgroupdriver
   597  func verifyCgroupDriver(config *config.Config) error {
   598  	cd := getCD(config)
   599  	if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver {
   600  		return nil
   601  	}
   602  	if cd == cgroupNoneDriver {
   603  		return fmt.Errorf("native.cgroupdriver option %s is internally used and cannot be specified manually", cd)
   604  	}
   605  	return fmt.Errorf("native.cgroupdriver option %s not supported", cd)
   606  }
   607  
   608  // UsingSystemd returns true if cli option includes native.cgroupdriver=systemd
   609  func UsingSystemd(config *config.Config) bool {
   610  	cd := getCD(config)
   611  
   612  	if cd == cgroupSystemdDriver {
   613  		return true
   614  	}
   615  	// On cgroup v2 hosts, default to systemd driver
   616  	if cd == "" && cgroups.Mode() == cgroups.Unified && isRunningSystemd() {
   617  		return true
   618  	}
   619  	return false
   620  }
   621  
   622  var (
   623  	runningSystemd bool
   624  	detectSystemd  sync.Once
   625  )
   626  
   627  // isRunningSystemd checks whether the host was booted with systemd as its init
   628  // system. This functions similarly to systemd's `sd_booted(3)`: internally, it
   629  // checks whether /run/systemd/system/ exists and is a directory.
   630  // http://www.freedesktop.org/software/systemd/man/sd_booted.html
   631  //
   632  // NOTE: This function comes from package github.com/coreos/go-systemd/util
   633  // It was borrowed here to avoid a dependency on cgo.
   634  func isRunningSystemd() bool {
   635  	detectSystemd.Do(func() {
   636  		fi, err := os.Lstat("/run/systemd/system")
   637  		if err != nil {
   638  			return
   639  		}
   640  		runningSystemd = fi.IsDir()
   641  	})
   642  	return runningSystemd
   643  }
   644  
   645  // verifyPlatformContainerSettings performs platform-specific validation of the
   646  // hostconfig and config structures.
   647  func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.HostConfig, update bool) (warnings []string, err error) {
   648  	if hostConfig == nil {
   649  		return nil, nil
   650  	}
   651  	sysInfo := daemon.RawSysInfo()
   652  
   653  	w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update)
   654  
   655  	// no matter err is nil or not, w could have data in itself.
   656  	warnings = append(warnings, w...)
   657  
   658  	if err != nil {
   659  		return warnings, err
   660  	}
   661  
   662  	if !hostConfig.IpcMode.Valid() {
   663  		return warnings, errors.Errorf("invalid IPC mode: %v", hostConfig.IpcMode)
   664  	}
   665  	if !hostConfig.PidMode.Valid() {
   666  		return warnings, errors.Errorf("invalid PID mode: %v", hostConfig.PidMode)
   667  	}
   668  	if hostConfig.ShmSize < 0 {
   669  		return warnings, fmt.Errorf("SHM size can not be less than 0")
   670  	}
   671  	if !hostConfig.UTSMode.Valid() {
   672  		return warnings, errors.Errorf("invalid UTS mode: %v", hostConfig.UTSMode)
   673  	}
   674  
   675  	if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
   676  		return warnings, fmt.Errorf("Invalid value %d, range for oom score adj is [-1000, 1000]", hostConfig.OomScoreAdj)
   677  	}
   678  
   679  	// ip-forwarding does not affect container with '--net=host' (or '--net=none')
   680  	if sysInfo.IPv4ForwardingDisabled && !(hostConfig.NetworkMode.IsHost() || hostConfig.NetworkMode.IsNone()) {
   681  		warnings = append(warnings, "IPv4 forwarding is disabled. Networking will not work.")
   682  	}
   683  	if hostConfig.NetworkMode.IsHost() && len(hostConfig.PortBindings) > 0 {
   684  		warnings = append(warnings, "Published ports are discarded when using host network mode")
   685  	}
   686  
   687  	// check for various conflicting options with user namespaces
   688  	if daemon.configStore.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() {
   689  		if hostConfig.Privileged {
   690  			return warnings, fmt.Errorf("privileged mode is incompatible with user namespaces.  You must run the container in the host namespace when running privileged mode")
   691  		}
   692  		if hostConfig.NetworkMode.IsHost() && !hostConfig.UsernsMode.IsHost() {
   693  			return warnings, fmt.Errorf("cannot share the host's network namespace when user namespaces are enabled")
   694  		}
   695  		if hostConfig.PidMode.IsHost() && !hostConfig.UsernsMode.IsHost() {
   696  			return warnings, fmt.Errorf("cannot share the host PID namespace when user namespaces are enabled")
   697  		}
   698  	}
   699  	if hostConfig.CgroupParent != "" && UsingSystemd(daemon.configStore) {
   700  		// CgroupParent for systemd cgroup should be named as "xxx.slice"
   701  		if len(hostConfig.CgroupParent) <= 6 || !strings.HasSuffix(hostConfig.CgroupParent, ".slice") {
   702  			return warnings, fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"")
   703  		}
   704  	}
   705  	if hostConfig.Runtime == "" {
   706  		hostConfig.Runtime = daemon.configStore.GetDefaultRuntimeName()
   707  	}
   708  
   709  	if _, err := daemon.getRuntime(hostConfig.Runtime); err != nil {
   710  		return warnings, err
   711  	}
   712  
   713  	parser := volumemounts.NewParser()
   714  	for dest := range hostConfig.Tmpfs {
   715  		if err := parser.ValidateTmpfsMountDestination(dest); err != nil {
   716  			return warnings, err
   717  		}
   718  	}
   719  
   720  	if !hostConfig.CgroupnsMode.Valid() {
   721  		return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode)
   722  	}
   723  	if hostConfig.CgroupnsMode.IsPrivate() {
   724  		if !sysInfo.CgroupNamespaces {
   725  			warnings = append(warnings, "Your kernel does not support cgroup namespaces.  Cgroup namespace setting discarded.")
   726  		}
   727  	}
   728  
   729  	return warnings, nil
   730  }
   731  
   732  // verifyDaemonSettings performs validation of daemon config struct
   733  func verifyDaemonSettings(conf *config.Config) error {
   734  	if conf.ContainerdNamespace == conf.ContainerdPluginNamespace {
   735  		return errors.New("containers namespace and plugins namespace cannot be the same")
   736  	}
   737  	// Check for mutually incompatible config options
   738  	if conf.BridgeConfig.Iface != "" && conf.BridgeConfig.IP != "" {
   739  		return fmt.Errorf("You specified -b & --bip, mutually exclusive options. Please specify only one")
   740  	}
   741  	if !conf.BridgeConfig.EnableIPTables && !conf.BridgeConfig.InterContainerCommunication {
   742  		return fmt.Errorf("You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true")
   743  	}
   744  	if conf.BridgeConfig.EnableIP6Tables && !conf.Experimental {
   745  		return fmt.Errorf("ip6tables rules are only available if experimental features are enabled")
   746  	}
   747  	if !conf.BridgeConfig.EnableIPTables && conf.BridgeConfig.EnableIPMasq {
   748  		conf.BridgeConfig.EnableIPMasq = false
   749  	}
   750  	if err := verifyCgroupDriver(conf); err != nil {
   751  		return err
   752  	}
   753  	if conf.CgroupParent != "" && UsingSystemd(conf) {
   754  		if len(conf.CgroupParent) <= 6 || !strings.HasSuffix(conf.CgroupParent, ".slice") {
   755  			return fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"")
   756  		}
   757  	}
   758  
   759  	if conf.Rootless && UsingSystemd(conf) && cgroups.Mode() != cgroups.Unified {
   760  		return fmt.Errorf("exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode")
   761  	}
   762  
   763  	configureRuntimes(conf)
   764  	if rtName := conf.GetDefaultRuntimeName(); rtName != "" {
   765  		if conf.GetRuntime(rtName) == nil {
   766  			if !config.IsPermissibleC8dRuntimeName(rtName) {
   767  				return fmt.Errorf("specified default runtime '%s' does not exist", rtName)
   768  			}
   769  		}
   770  	}
   771  	return nil
   772  }
   773  
// checkSystem validates platform-specific requirements.
//
// This implementation performs no checks and always returns nil.
func checkSystem() error {
	return nil
}
   778  
   779  // configureMaxThreads sets the Go runtime max threads threshold
   780  // which is 90% of the kernel setting from /proc/sys/kernel/threads-max
   781  func configureMaxThreads(config *config.Config) error {
   782  	mt, err := os.ReadFile("/proc/sys/kernel/threads-max")
   783  	if err != nil {
   784  		return err
   785  	}
   786  	mtint, err := strconv.Atoi(strings.TrimSpace(string(mt)))
   787  	if err != nil {
   788  		return err
   789  	}
   790  	maxThreads := (mtint / 100) * 90
   791  	debug.SetMaxThreads(maxThreads)
   792  	logrus.Debugf("Golang's threads limit set to %d", maxThreads)
   793  	return nil
   794  }
   795  
   796  func overlaySupportsSelinux() (bool, error) {
   797  	f, err := os.Open("/proc/kallsyms")
   798  	if err != nil {
   799  		if os.IsNotExist(err) {
   800  			return false, nil
   801  		}
   802  		return false, err
   803  	}
   804  	defer f.Close()
   805  
   806  	s := bufio.NewScanner(f)
   807  	for s.Scan() {
   808  		if strings.HasSuffix(s.Text(), " security_inode_copy_up") {
   809  			return true, nil
   810  		}
   811  	}
   812  
   813  	return false, s.Err()
   814  }
   815  
   816  // configureKernelSecuritySupport configures and validates security support for the kernel
   817  func configureKernelSecuritySupport(config *config.Config, driverName string) error {
   818  	if config.EnableSelinuxSupport {
   819  		if !selinux.GetEnabled() {
   820  			logrus.Warn("Docker could not enable SELinux on the host system")
   821  			return nil
   822  		}
   823  
   824  		if driverName == "overlay" || driverName == "overlay2" {
   825  			// If driver is overlay or overlay2, make sure kernel
   826  			// supports selinux with overlay.
   827  			supported, err := overlaySupportsSelinux()
   828  			if err != nil {
   829  				return err
   830  			}
   831  
   832  			if !supported {
   833  				logrus.Warnf("SELinux is not supported with the %v graph driver on this kernel", driverName)
   834  			}
   835  		}
   836  	} else {
   837  		selinux.SetDisabled()
   838  	}
   839  	return nil
   840  }
   841  
   842  // initNetworkController initializes the libnetwork controller and configures
   843  // network settings. If there's active sandboxes, configuration changes will not
   844  // take effect.
   845  func (daemon *Daemon) initNetworkController(activeSandboxes map[string]interface{}) error {
   846  	netOptions, err := daemon.networkOptions(daemon.PluginStore, activeSandboxes)
   847  	if err != nil {
   848  		return err
   849  	}
   850  
   851  	daemon.netController, err = libnetwork.New(netOptions...)
   852  	if err != nil {
   853  		return fmt.Errorf("error obtaining controller instance: %v", err)
   854  	}
   855  
   856  	if len(activeSandboxes) > 0 {
   857  		logrus.Info("there are running containers, updated network configuration will not take affect")
   858  	} else if err := configureNetworking(daemon.netController, daemon.configStore); err != nil {
   859  		return err
   860  	}
   861  
   862  	// Set HostGatewayIP to the default bridge's IP if it is empty
   863  	setHostGatewayIP(daemon.netController, daemon.configStore)
   864  	return nil
   865  }
   866  
   867  func configureNetworking(controller libnetwork.NetworkController, conf *config.Config) error {
   868  	// Initialize default network on "null"
   869  	if n, _ := controller.NetworkByName("none"); n == nil {
   870  		if _, err := controller.NewNetwork("null", "none", "", libnetwork.NetworkOptionPersist(true)); err != nil {
   871  			return errors.Wrap(err, `error creating default "null" network`)
   872  		}
   873  	}
   874  
   875  	// Initialize default network on "host"
   876  	if n, _ := controller.NetworkByName("host"); n == nil {
   877  		if _, err := controller.NewNetwork("host", "host", "", libnetwork.NetworkOptionPersist(true)); err != nil {
   878  			return errors.Wrap(err, `error creating default "host" network`)
   879  		}
   880  	}
   881  
   882  	// Clear stale bridge network
   883  	if n, err := controller.NetworkByName("bridge"); err == nil {
   884  		if err = n.Delete(); err != nil {
   885  			return errors.Wrap(err, `could not delete the default "bridge"" network`)
   886  		}
   887  		if len(conf.NetworkConfig.DefaultAddressPools.Value()) > 0 && !conf.LiveRestoreEnabled {
   888  			removeDefaultBridgeInterface()
   889  		}
   890  	}
   891  
   892  	if !conf.DisableBridge {
   893  		// Initialize default driver "bridge"
   894  		if err := initBridgeDriver(controller, conf); err != nil {
   895  			return err
   896  		}
   897  	} else {
   898  		removeDefaultBridgeInterface()
   899  	}
   900  
   901  	return nil
   902  }
   903  
   904  // setHostGatewayIP sets cfg.HostGatewayIP to the default bridge's IP if it is empty.
   905  func setHostGatewayIP(controller libnetwork.NetworkController, config *config.Config) {
   906  	if config.HostGatewayIP != nil {
   907  		return
   908  	}
   909  	if n, err := controller.NetworkByName("bridge"); err == nil {
   910  		v4Info, v6Info := n.Info().IpamInfo()
   911  		var gateway net.IP
   912  		if len(v4Info) > 0 {
   913  			gateway = v4Info[0].Gateway.IP
   914  		} else if len(v6Info) > 0 {
   915  			gateway = v6Info[0].Gateway.IP
   916  		}
   917  		config.HostGatewayIP = gateway
   918  	}
   919  }
   920  
   921  func driverOptions(config *config.Config) nwconfig.Option {
   922  	return nwconfig.OptionDriverConfig("bridge", options.Generic{
   923  		netlabel.GenericData: options.Generic{
   924  			"EnableIPForwarding":  config.BridgeConfig.EnableIPForward,
   925  			"EnableIPTables":      config.BridgeConfig.EnableIPTables,
   926  			"EnableIP6Tables":     config.BridgeConfig.EnableIP6Tables,
   927  			"EnableUserlandProxy": config.BridgeConfig.EnableUserlandProxy,
   928  			"UserlandProxyPath":   config.BridgeConfig.UserlandProxyPath,
   929  		},
   930  	})
   931  }
   932  
   933  func initBridgeDriver(controller libnetwork.NetworkController, config *config.Config) error {
   934  	bridgeName := bridge.DefaultBridgeName
   935  	if config.BridgeConfig.Iface != "" {
   936  		bridgeName = config.BridgeConfig.Iface
   937  	}
   938  	netOption := map[string]string{
   939  		bridge.BridgeName:         bridgeName,
   940  		bridge.DefaultBridge:      strconv.FormatBool(true),
   941  		netlabel.DriverMTU:        strconv.Itoa(config.Mtu),
   942  		bridge.EnableIPMasquerade: strconv.FormatBool(config.BridgeConfig.EnableIPMasq),
   943  		bridge.EnableICC:          strconv.FormatBool(config.BridgeConfig.InterContainerCommunication),
   944  	}
   945  
   946  	// --ip processing
   947  	if config.BridgeConfig.DefaultIP != nil {
   948  		netOption[bridge.DefaultBindingIP] = config.BridgeConfig.DefaultIP.String()
   949  	}
   950  
   951  	ipamV4Conf := &libnetwork.IpamConf{AuxAddresses: make(map[string]string)}
   952  
   953  	nwList, nw6List, err := netutils.ElectInterfaceAddresses(bridgeName)
   954  	if err != nil {
   955  		return errors.Wrap(err, "list bridge addresses failed")
   956  	}
   957  
   958  	nw := nwList[0]
   959  	if len(nwList) > 1 && config.BridgeConfig.FixedCIDR != "" {
   960  		_, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR)
   961  		if err != nil {
   962  			return errors.Wrap(err, "parse CIDR failed")
   963  		}
   964  		// Iterate through in case there are multiple addresses for the bridge
   965  		for _, entry := range nwList {
   966  			if fCIDR.Contains(entry.IP) {
   967  				nw = entry
   968  				break
   969  			}
   970  		}
   971  	}
   972  
   973  	ipamV4Conf.PreferredPool = lntypes.GetIPNetCanonical(nw).String()
   974  	hip, _ := lntypes.GetHostPartIP(nw.IP, nw.Mask)
   975  	if hip.IsGlobalUnicast() {
   976  		ipamV4Conf.Gateway = nw.IP.String()
   977  	}
   978  
   979  	if config.BridgeConfig.IP != "" {
   980  		ip, ipNet, err := net.ParseCIDR(config.BridgeConfig.IP)
   981  		if err != nil {
   982  			return err
   983  		}
   984  		ipamV4Conf.PreferredPool = ipNet.String()
   985  		ipamV4Conf.Gateway = ip.String()
   986  	} else if bridgeName == bridge.DefaultBridgeName && ipamV4Conf.PreferredPool != "" {
   987  		logrus.Infof("Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address", bridgeName, ipamV4Conf.PreferredPool)
   988  	}
   989  
   990  	if config.BridgeConfig.FixedCIDR != "" {
   991  		_, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR)
   992  		if err != nil {
   993  			return err
   994  		}
   995  
   996  		ipamV4Conf.SubPool = fCIDR.String()
   997  	}
   998  
   999  	if config.BridgeConfig.DefaultGatewayIPv4 != nil {
  1000  		ipamV4Conf.AuxAddresses["DefaultGatewayIPv4"] = config.BridgeConfig.DefaultGatewayIPv4.String()
  1001  	}
  1002  
  1003  	var (
  1004  		deferIPv6Alloc bool
  1005  		ipamV6Conf     *libnetwork.IpamConf
  1006  	)
  1007  
  1008  	if config.BridgeConfig.EnableIPv6 && config.BridgeConfig.FixedCIDRv6 == "" {
  1009  		return errdefs.InvalidParameter(errors.New("IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6"))
  1010  	} else if config.BridgeConfig.FixedCIDRv6 != "" {
  1011  		_, fCIDRv6, err := net.ParseCIDR(config.BridgeConfig.FixedCIDRv6)
  1012  		if err != nil {
  1013  			return err
  1014  		}
  1015  
  1016  		// In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has
  1017  		// at least 48 host bits, we need to guarantee the current behavior where the containers'
  1018  		// IPv6 addresses will be constructed based on the containers' interface MAC address.
  1019  		// We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints
  1020  		// on this network until after the driver has created the endpoint and returned the
  1021  		// constructed address. Libnetwork will then reserve this address with the ipam driver.
  1022  		ones, _ := fCIDRv6.Mask.Size()
  1023  		deferIPv6Alloc = ones <= 80
  1024  
  1025  		ipamV6Conf = &libnetwork.IpamConf{
  1026  			AuxAddresses:  make(map[string]string),
  1027  			PreferredPool: fCIDRv6.String(),
  1028  		}
  1029  
  1030  		// In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6
  1031  		// address belongs to the same network, we need to inform libnetwork about it, so
  1032  		// that it can be reserved with IPAM and it will not be given away to somebody else
  1033  		for _, nw6 := range nw6List {
  1034  			if fCIDRv6.Contains(nw6.IP) {
  1035  				ipamV6Conf.Gateway = nw6.IP.String()
  1036  				break
  1037  			}
  1038  		}
  1039  	}
  1040  
  1041  	if config.BridgeConfig.DefaultGatewayIPv6 != nil {
  1042  		if ipamV6Conf == nil {
  1043  			ipamV6Conf = &libnetwork.IpamConf{AuxAddresses: make(map[string]string)}
  1044  		}
  1045  		ipamV6Conf.AuxAddresses["DefaultGatewayIPv6"] = config.BridgeConfig.DefaultGatewayIPv6.String()
  1046  	}
  1047  
  1048  	v4Conf := []*libnetwork.IpamConf{ipamV4Conf}
  1049  	v6Conf := []*libnetwork.IpamConf{}
  1050  	if ipamV6Conf != nil {
  1051  		v6Conf = append(v6Conf, ipamV6Conf)
  1052  	}
  1053  	// Initialize default network on "bridge" with the same name
  1054  	_, err = controller.NewNetwork("bridge", "bridge", "",
  1055  		libnetwork.NetworkOptionEnableIPv6(config.BridgeConfig.EnableIPv6),
  1056  		libnetwork.NetworkOptionDriverOpts(netOption),
  1057  		libnetwork.NetworkOptionIpam("default", "", v4Conf, v6Conf, nil),
  1058  		libnetwork.NetworkOptionDeferIPv6Alloc(deferIPv6Alloc))
  1059  	if err != nil {
  1060  		return fmt.Errorf("Error creating default \"bridge\" network: %v", err)
  1061  	}
  1062  	return nil
  1063  }
  1064  
  1065  // Remove default bridge interface if present (--bridge=none use case)
  1066  func removeDefaultBridgeInterface() {
  1067  	if lnk, err := netlink.LinkByName(bridge.DefaultBridgeName); err == nil {
  1068  		if err := netlink.LinkDel(lnk); err != nil {
  1069  			logrus.Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err)
  1070  		}
  1071  	}
  1072  }
  1073  
  1074  func setupInitLayer(idMapping idtools.IdentityMapping) func(containerfs.ContainerFS) error {
  1075  	return func(initPath containerfs.ContainerFS) error {
  1076  		return initlayer.Setup(initPath, idMapping.RootPair())
  1077  	}
  1078  }
  1079  
  1080  // Parse the remapped root (user namespace) option, which can be one of:
  1081  //
  1082  // - username            - valid username from /etc/passwd
  1083  // - username:groupname  - valid username; valid groupname from /etc/group
  1084  // - uid                 - 32-bit unsigned int valid Linux UID value
  1085  // - uid:gid             - uid value; 32-bit unsigned int Linux GID value
  1086  //
  1087  // If no groupname is specified, and a username is specified, an attempt
  1088  // will be made to lookup a gid for that username as a groupname
  1089  //
  1090  // If names are used, they are verified to exist in passwd/group
  1091  func parseRemappedRoot(usergrp string) (string, string, error) {
  1092  	var (
  1093  		userID, groupID     int
  1094  		username, groupname string
  1095  	)
  1096  
  1097  	idparts := strings.Split(usergrp, ":")
  1098  	if len(idparts) > 2 {
  1099  		return "", "", fmt.Errorf("Invalid user/group specification in --userns-remap: %q", usergrp)
  1100  	}
  1101  
  1102  	if uid, err := strconv.ParseInt(idparts[0], 10, 32); err == nil {
  1103  		// must be a uid; take it as valid
  1104  		userID = int(uid)
  1105  		luser, err := idtools.LookupUID(userID)
  1106  		if err != nil {
  1107  			return "", "", fmt.Errorf("Uid %d has no entry in /etc/passwd: %v", userID, err)
  1108  		}
  1109  		username = luser.Name
  1110  		if len(idparts) == 1 {
  1111  			// if the uid was numeric and no gid was specified, take the uid as the gid
  1112  			groupID = userID
  1113  			lgrp, err := idtools.LookupGID(groupID)
  1114  			if err != nil {
  1115  				return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err)
  1116  			}
  1117  			groupname = lgrp.Name
  1118  		}
  1119  	} else {
  1120  		lookupName := idparts[0]
  1121  		// special case: if the user specified "default", they want Docker to create or
  1122  		// use (after creation) the "dockremap" user/group for root remapping
  1123  		if lookupName == defaultIDSpecifier {
  1124  			lookupName = defaultRemappedID
  1125  		}
  1126  		luser, err := idtools.LookupUser(lookupName)
  1127  		if err != nil && idparts[0] != defaultIDSpecifier {
  1128  			// error if the name requested isn't the special "dockremap" ID
  1129  			return "", "", fmt.Errorf("Error during uid lookup for %q: %v", lookupName, err)
  1130  		} else if err != nil {
  1131  			// special case-- if the username == "default", then we have been asked
  1132  			// to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid}
  1133  			// ranges will be used for the user and group mappings in user namespaced containers
  1134  			_, _, err := idtools.AddNamespaceRangesUser(defaultRemappedID)
  1135  			if err == nil {
  1136  				return defaultRemappedID, defaultRemappedID, nil
  1137  			}
  1138  			return "", "", fmt.Errorf("Error during %q user creation: %v", defaultRemappedID, err)
  1139  		}
  1140  		username = luser.Name
  1141  		if len(idparts) == 1 {
  1142  			// we only have a string username, and no group specified; look up gid from username as group
  1143  			group, err := idtools.LookupGroup(lookupName)
  1144  			if err != nil {
  1145  				return "", "", fmt.Errorf("Error during gid lookup for %q: %v", lookupName, err)
  1146  			}
  1147  			groupname = group.Name
  1148  		}
  1149  	}
  1150  
  1151  	if len(idparts) == 2 {
  1152  		// groupname or gid is separately specified and must be resolved
  1153  		// to an unsigned 32-bit gid
  1154  		if gid, err := strconv.ParseInt(idparts[1], 10, 32); err == nil {
  1155  			// must be a gid, take it as valid
  1156  			groupID = int(gid)
  1157  			lgrp, err := idtools.LookupGID(groupID)
  1158  			if err != nil {
  1159  				return "", "", fmt.Errorf("Gid %d has no entry in /etc/passwd: %v", groupID, err)
  1160  			}
  1161  			groupname = lgrp.Name
  1162  		} else {
  1163  			// not a number; attempt a lookup
  1164  			if _, err := idtools.LookupGroup(idparts[1]); err != nil {
  1165  				return "", "", fmt.Errorf("Error during groupname lookup for %q: %v", idparts[1], err)
  1166  			}
  1167  			groupname = idparts[1]
  1168  		}
  1169  	}
  1170  	return username, groupname, nil
  1171  }
  1172  
  1173  func setupRemappedRoot(config *config.Config) (idtools.IdentityMapping, error) {
  1174  	if runtime.GOOS != "linux" && config.RemappedRoot != "" {
  1175  		return idtools.IdentityMapping{}, fmt.Errorf("User namespaces are only supported on Linux")
  1176  	}
  1177  
  1178  	// if the daemon was started with remapped root option, parse
  1179  	// the config option to the int uid,gid values
  1180  	if config.RemappedRoot != "" {
  1181  		username, groupname, err := parseRemappedRoot(config.RemappedRoot)
  1182  		if err != nil {
  1183  			return idtools.IdentityMapping{}, err
  1184  		}
  1185  		if username == "root" {
  1186  			// Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op
  1187  			// effectively
  1188  			logrus.Warn("User namespaces: root cannot be remapped with itself; user namespaces are OFF")
  1189  			return idtools.IdentityMapping{}, nil
  1190  		}
  1191  		logrus.Infof("User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s", username)
  1192  		// update remapped root setting now that we have resolved them to actual names
  1193  		config.RemappedRoot = fmt.Sprintf("%s:%s", username, groupname)
  1194  
  1195  		mappings, err := idtools.LoadIdentityMapping(username)
  1196  		if err != nil {
  1197  			return idtools.IdentityMapping{}, errors.Wrap(err, "Can't create ID mappings")
  1198  		}
  1199  		return mappings, nil
  1200  	}
  1201  	return idtools.IdentityMapping{}, nil
  1202  }
  1203  
  1204  func setupDaemonRoot(config *config.Config, rootDir string, remappedRoot idtools.Identity) error {
  1205  	config.Root = rootDir
  1206  	// the docker root metadata directory needs to have execute permissions for all users (g+x,o+x)
  1207  	// so that syscalls executing as non-root, operating on subdirectories of the graph root
  1208  	// (e.g. mounted layers of a container) can traverse this path.
  1209  	// The user namespace support will create subdirectories for the remapped root host uid:gid
  1210  	// pair owned by that same uid:gid pair for proper write access to those needed metadata and
  1211  	// layer content subtrees.
  1212  	if _, err := os.Stat(rootDir); err == nil {
  1213  		// root current exists; verify the access bits are correct by setting them
  1214  		if err = os.Chmod(rootDir, 0711); err != nil {
  1215  			return err
  1216  		}
  1217  	} else if os.IsNotExist(err) {
  1218  		// no root exists yet, create it 0711 with root:root ownership
  1219  		if err := os.MkdirAll(rootDir, 0711); err != nil {
  1220  			return err
  1221  		}
  1222  	}
  1223  
  1224  	id := idtools.Identity{UID: idtools.CurrentIdentity().UID, GID: remappedRoot.GID}
  1225  	// First make sure the current root dir has the correct perms.
  1226  	if err := idtools.MkdirAllAndChown(config.Root, 0710, id); err != nil {
  1227  		return errors.Wrapf(err, "could not create or set daemon root permissions: %s", config.Root)
  1228  	}
  1229  
  1230  	// if user namespaces are enabled we will create a subtree underneath the specified root
  1231  	// with any/all specified remapped root uid/gid options on the daemon creating
  1232  	// a new subdirectory with ownership set to the remapped uid/gid (so as to allow
  1233  	// `chdir()` to work for containers namespaced to that uid/gid)
  1234  	if config.RemappedRoot != "" {
  1235  		config.Root = filepath.Join(rootDir, fmt.Sprintf("%d.%d", remappedRoot.UID, remappedRoot.GID))
  1236  		logrus.Debugf("Creating user namespaced daemon root: %s", config.Root)
  1237  		// Create the root directory if it doesn't exist
  1238  		if err := idtools.MkdirAllAndChown(config.Root, 0710, id); err != nil {
  1239  			return fmt.Errorf("Cannot create daemon root: %s: %v", config.Root, err)
  1240  		}
  1241  		// we also need to verify that any pre-existing directories in the path to
  1242  		// the graphroot won't block access to remapped root--if any pre-existing directory
  1243  		// has strict permissions that don't allow "x", container start will fail, so
  1244  		// better to warn and fail now
  1245  		dirPath := config.Root
  1246  		for {
  1247  			dirPath = filepath.Dir(dirPath)
  1248  			if dirPath == "/" {
  1249  				break
  1250  			}
  1251  			if !idtools.CanAccess(dirPath, remappedRoot) {
  1252  				return fmt.Errorf("a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' permissions on existing directories", config.Root)
  1253  			}
  1254  		}
  1255  	}
  1256  
  1257  	if err := setupDaemonRootPropagation(config); err != nil {
  1258  		logrus.WithError(err).WithField("dir", config.Root).Warn("Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior")
  1259  	}
  1260  	return nil
  1261  }
  1262  
  1263  func setupDaemonRootPropagation(cfg *config.Config) error {
  1264  	rootParentMount, mountOptions, err := getSourceMount(cfg.Root)
  1265  	if err != nil {
  1266  		return errors.Wrap(err, "error getting daemon root's parent mount")
  1267  	}
  1268  
  1269  	var cleanupOldFile bool
  1270  	cleanupFile := getUnmountOnShutdownPath(cfg)
  1271  	defer func() {
  1272  		if !cleanupOldFile {
  1273  			return
  1274  		}
  1275  		if err := os.Remove(cleanupFile); err != nil && !os.IsNotExist(err) {
  1276  			logrus.WithError(err).WithField("file", cleanupFile).Warn("could not clean up old root propagation unmount file")
  1277  		}
  1278  	}()
  1279  
  1280  	if hasMountInfoOption(mountOptions, sharedPropagationOption, slavePropagationOption) {
  1281  		cleanupOldFile = true
  1282  		return nil
  1283  	}
  1284  
  1285  	if err := mount.MakeShared(cfg.Root); err != nil {
  1286  		return errors.Wrap(err, "could not setup daemon root propagation to shared")
  1287  	}
  1288  
  1289  	// check the case where this may have already been a mount to itself.
  1290  	// If so then the daemon only performed a remount and should not try to unmount this later.
  1291  	if rootParentMount == cfg.Root {
  1292  		cleanupOldFile = true
  1293  		return nil
  1294  	}
  1295  
  1296  	if err := os.MkdirAll(filepath.Dir(cleanupFile), 0700); err != nil {
  1297  		return errors.Wrap(err, "error creating dir to store mount cleanup file")
  1298  	}
  1299  
  1300  	if err := os.WriteFile(cleanupFile, nil, 0600); err != nil {
  1301  		return errors.Wrap(err, "error writing file to signal mount cleanup on shutdown")
  1302  	}
  1303  	return nil
  1304  }
  1305  
  1306  // getUnmountOnShutdownPath generates the path to used when writing the file that signals to the daemon that on shutdown
  1307  // the daemon root should be unmounted.
  1308  func getUnmountOnShutdownPath(config *config.Config) string {
  1309  	return filepath.Join(config.ExecRoot, "unmount-on-shutdown")
  1310  }
  1311  
  1312  // registerLinks registers network links between container and other containers
  1313  // with the daemon using the specification in hostConfig.
  1314  func (daemon *Daemon) registerLinks(container *container.Container, hostConfig *containertypes.HostConfig) error {
  1315  	if hostConfig == nil || hostConfig.NetworkMode.IsUserDefined() {
  1316  		return nil
  1317  	}
  1318  
  1319  	for _, l := range hostConfig.Links {
  1320  		name, alias, err := opts.ParseLink(l)
  1321  		if err != nil {
  1322  			return err
  1323  		}
  1324  		child, err := daemon.GetContainer(name)
  1325  		if err != nil {
  1326  			if errdefs.IsNotFound(err) {
  1327  				// Trying to link to a non-existing container is not valid, and
  1328  				// should return an "invalid parameter" error. Returning a "not
  1329  				// found" error here would make the client report the container's
  1330  				// image could not be found (see moby/moby#39823)
  1331  				err = errdefs.InvalidParameter(err)
  1332  			}
  1333  			return errors.Wrapf(err, "could not get container for %s", name)
  1334  		}
  1335  		for child.HostConfig.NetworkMode.IsContainer() {
  1336  			parts := strings.SplitN(string(child.HostConfig.NetworkMode), ":", 2)
  1337  			child, err = daemon.GetContainer(parts[1])
  1338  			if err != nil {
  1339  				if errdefs.IsNotFound(err) {
  1340  					// Trying to link to a non-existing container is not valid, and
  1341  					// should return an "invalid parameter" error. Returning a "not
  1342  					// found" error here would make the client report the container's
  1343  					// image could not be found (see moby/moby#39823)
  1344  					err = errdefs.InvalidParameter(err)
  1345  				}
  1346  				return errors.Wrapf(err, "Could not get container for %s", parts[1])
  1347  			}
  1348  		}
  1349  		if child.HostConfig.NetworkMode.IsHost() {
  1350  			return runconfig.ErrConflictHostNetworkAndLinks
  1351  		}
  1352  		if err := daemon.registerLink(container, child, alias); err != nil {
  1353  			return err
  1354  		}
  1355  	}
  1356  
  1357  	return nil
  1358  }
  1359  
// conditionalMountOnStart is a platform specific helper function called during
// container start to mount the container.
//
// In this implementation there is no condition: the call is always forwarded
// to daemon.Mount and its error is returned unchanged.
func (daemon *Daemon) conditionalMountOnStart(container *container.Container) error {
	return daemon.Mount(container)
}
  1365  
// conditionalUnmountOnCleanup is a platform specific helper function called
// during the cleanup of a container to unmount it.
//
// In this implementation there is no condition: the call is always forwarded
// to daemon.Unmount and its error is returned unchanged.
func (daemon *Daemon) conditionalUnmountOnCleanup(container *container.Container) error {
	return daemon.Unmount(container)
}
  1371  
  1372  func copyBlkioEntry(entries []*statsV1.BlkIOEntry) []types.BlkioStatEntry {
  1373  	out := make([]types.BlkioStatEntry, len(entries))
  1374  	for i, re := range entries {
  1375  		out[i] = types.BlkioStatEntry{
  1376  			Major: re.Major,
  1377  			Minor: re.Minor,
  1378  			Op:    re.Op,
  1379  			Value: re.Value,
  1380  		}
  1381  	}
  1382  	return out
  1383  }
  1384  
  1385  func (daemon *Daemon) stats(c *container.Container) (*types.StatsJSON, error) {
  1386  	if !c.IsRunning() {
  1387  		return nil, errNotRunning(c.ID)
  1388  	}
  1389  	cs, err := daemon.containerd.Stats(context.Background(), c.ID)
  1390  	if err != nil {
  1391  		if strings.Contains(err.Error(), "container not found") {
  1392  			return nil, containerNotFound(c.ID)
  1393  		}
  1394  		return nil, err
  1395  	}
  1396  	s := &types.StatsJSON{}
  1397  	s.Read = cs.Read
  1398  	stats := cs.Metrics
  1399  	switch t := stats.(type) {
  1400  	case *statsV1.Metrics:
  1401  		return daemon.statsV1(s, t)
  1402  	case *statsV2.Metrics:
  1403  		return daemon.statsV2(s, t)
  1404  	default:
  1405  		return nil, errors.Errorf("unexpected type of metrics %+v", t)
  1406  	}
  1407  }
  1408  
  1409  func (daemon *Daemon) statsV1(s *types.StatsJSON, stats *statsV1.Metrics) (*types.StatsJSON, error) {
  1410  	if stats.Blkio != nil {
  1411  		s.BlkioStats = types.BlkioStats{
  1412  			IoServiceBytesRecursive: copyBlkioEntry(stats.Blkio.IoServiceBytesRecursive),
  1413  			IoServicedRecursive:     copyBlkioEntry(stats.Blkio.IoServicedRecursive),
  1414  			IoQueuedRecursive:       copyBlkioEntry(stats.Blkio.IoQueuedRecursive),
  1415  			IoServiceTimeRecursive:  copyBlkioEntry(stats.Blkio.IoServiceTimeRecursive),
  1416  			IoWaitTimeRecursive:     copyBlkioEntry(stats.Blkio.IoWaitTimeRecursive),
  1417  			IoMergedRecursive:       copyBlkioEntry(stats.Blkio.IoMergedRecursive),
  1418  			IoTimeRecursive:         copyBlkioEntry(stats.Blkio.IoTimeRecursive),
  1419  			SectorsRecursive:        copyBlkioEntry(stats.Blkio.SectorsRecursive),
  1420  		}
  1421  	}
  1422  	if stats.CPU != nil {
  1423  		s.CPUStats = types.CPUStats{
  1424  			CPUUsage: types.CPUUsage{
  1425  				TotalUsage:        stats.CPU.Usage.Total,
  1426  				PercpuUsage:       stats.CPU.Usage.PerCPU,
  1427  				UsageInKernelmode: stats.CPU.Usage.Kernel,
  1428  				UsageInUsermode:   stats.CPU.Usage.User,
  1429  			},
  1430  			ThrottlingData: types.ThrottlingData{
  1431  				Periods:          stats.CPU.Throttling.Periods,
  1432  				ThrottledPeriods: stats.CPU.Throttling.ThrottledPeriods,
  1433  				ThrottledTime:    stats.CPU.Throttling.ThrottledTime,
  1434  			},
  1435  		}
  1436  	}
  1437  
  1438  	if stats.Memory != nil {
  1439  		raw := map[string]uint64{
  1440  			"cache":                     stats.Memory.Cache,
  1441  			"rss":                       stats.Memory.RSS,
  1442  			"rss_huge":                  stats.Memory.RSSHuge,
  1443  			"mapped_file":               stats.Memory.MappedFile,
  1444  			"dirty":                     stats.Memory.Dirty,
  1445  			"writeback":                 stats.Memory.Writeback,
  1446  			"pgpgin":                    stats.Memory.PgPgIn,
  1447  			"pgpgout":                   stats.Memory.PgPgOut,
  1448  			"pgfault":                   stats.Memory.PgFault,
  1449  			"pgmajfault":                stats.Memory.PgMajFault,
  1450  			"inactive_anon":             stats.Memory.InactiveAnon,
  1451  			"active_anon":               stats.Memory.ActiveAnon,
  1452  			"inactive_file":             stats.Memory.InactiveFile,
  1453  			"active_file":               stats.Memory.ActiveFile,
  1454  			"unevictable":               stats.Memory.Unevictable,
  1455  			"hierarchical_memory_limit": stats.Memory.HierarchicalMemoryLimit,
  1456  			"hierarchical_memsw_limit":  stats.Memory.HierarchicalSwapLimit,
  1457  			"total_cache":               stats.Memory.TotalCache,
  1458  			"total_rss":                 stats.Memory.TotalRSS,
  1459  			"total_rss_huge":            stats.Memory.TotalRSSHuge,
  1460  			"total_mapped_file":         stats.Memory.TotalMappedFile,
  1461  			"total_dirty":               stats.Memory.TotalDirty,
  1462  			"total_writeback":           stats.Memory.TotalWriteback,
  1463  			"total_pgpgin":              stats.Memory.TotalPgPgIn,
  1464  			"total_pgpgout":             stats.Memory.TotalPgPgOut,
  1465  			"total_pgfault":             stats.Memory.TotalPgFault,
  1466  			"total_pgmajfault":          stats.Memory.TotalPgMajFault,
  1467  			"total_inactive_anon":       stats.Memory.TotalInactiveAnon,
  1468  			"total_active_anon":         stats.Memory.TotalActiveAnon,
  1469  			"total_inactive_file":       stats.Memory.TotalInactiveFile,
  1470  			"total_active_file":         stats.Memory.TotalActiveFile,
  1471  			"total_unevictable":         stats.Memory.TotalUnevictable,
  1472  		}
  1473  		if stats.Memory.Usage != nil {
  1474  			s.MemoryStats = types.MemoryStats{
  1475  				Stats:    raw,
  1476  				Usage:    stats.Memory.Usage.Usage,
  1477  				MaxUsage: stats.Memory.Usage.Max,
  1478  				Limit:    stats.Memory.Usage.Limit,
  1479  				Failcnt:  stats.Memory.Usage.Failcnt,
  1480  			}
  1481  		} else {
  1482  			s.MemoryStats = types.MemoryStats{
  1483  				Stats: raw,
  1484  			}
  1485  		}
  1486  
  1487  		// if the container does not set memory limit, use the machineMemory
  1488  		if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 {
  1489  			s.MemoryStats.Limit = daemon.machineMemory
  1490  		}
  1491  	}
  1492  
  1493  	if stats.Pids != nil {
  1494  		s.PidsStats = types.PidsStats{
  1495  			Current: stats.Pids.Current,
  1496  			Limit:   stats.Pids.Limit,
  1497  		}
  1498  	}
  1499  
  1500  	return s, nil
  1501  }
  1502  
  1503  func (daemon *Daemon) statsV2(s *types.StatsJSON, stats *statsV2.Metrics) (*types.StatsJSON, error) {
  1504  	if stats.Io != nil {
  1505  		var isbr []types.BlkioStatEntry
  1506  		for _, re := range stats.Io.Usage {
  1507  			isbr = append(isbr,
  1508  				types.BlkioStatEntry{
  1509  					Major: re.Major,
  1510  					Minor: re.Minor,
  1511  					Op:    "read",
  1512  					Value: re.Rbytes,
  1513  				},
  1514  				types.BlkioStatEntry{
  1515  					Major: re.Major,
  1516  					Minor: re.Minor,
  1517  					Op:    "write",
  1518  					Value: re.Wbytes,
  1519  				},
  1520  			)
  1521  		}
  1522  		s.BlkioStats = types.BlkioStats{
  1523  			IoServiceBytesRecursive: isbr,
  1524  			// Other fields are unsupported
  1525  		}
  1526  	}
  1527  
  1528  	if stats.CPU != nil {
  1529  		s.CPUStats = types.CPUStats{
  1530  			CPUUsage: types.CPUUsage{
  1531  				TotalUsage: stats.CPU.UsageUsec * 1000,
  1532  				// PercpuUsage is not supported
  1533  				UsageInKernelmode: stats.CPU.SystemUsec * 1000,
  1534  				UsageInUsermode:   stats.CPU.UserUsec * 1000,
  1535  			},
  1536  			ThrottlingData: types.ThrottlingData{
  1537  				Periods:          stats.CPU.NrPeriods,
  1538  				ThrottledPeriods: stats.CPU.NrThrottled,
  1539  				ThrottledTime:    stats.CPU.ThrottledUsec * 1000,
  1540  			},
  1541  		}
  1542  	}
  1543  
  1544  	if stats.Memory != nil {
  1545  		s.MemoryStats = types.MemoryStats{
  1546  			// Stats is not compatible with v1
  1547  			Stats: map[string]uint64{
  1548  				"anon":                   stats.Memory.Anon,
  1549  				"file":                   stats.Memory.File,
  1550  				"kernel_stack":           stats.Memory.KernelStack,
  1551  				"slab":                   stats.Memory.Slab,
  1552  				"sock":                   stats.Memory.Sock,
  1553  				"shmem":                  stats.Memory.Shmem,
  1554  				"file_mapped":            stats.Memory.FileMapped,
  1555  				"file_dirty":             stats.Memory.FileDirty,
  1556  				"file_writeback":         stats.Memory.FileWriteback,
  1557  				"anon_thp":               stats.Memory.AnonThp,
  1558  				"inactive_anon":          stats.Memory.InactiveAnon,
  1559  				"active_anon":            stats.Memory.ActiveAnon,
  1560  				"inactive_file":          stats.Memory.InactiveFile,
  1561  				"active_file":            stats.Memory.ActiveFile,
  1562  				"unevictable":            stats.Memory.Unevictable,
  1563  				"slab_reclaimable":       stats.Memory.SlabReclaimable,
  1564  				"slab_unreclaimable":     stats.Memory.SlabUnreclaimable,
  1565  				"pgfault":                stats.Memory.Pgfault,
  1566  				"pgmajfault":             stats.Memory.Pgmajfault,
  1567  				"workingset_refault":     stats.Memory.WorkingsetRefault,
  1568  				"workingset_activate":    stats.Memory.WorkingsetActivate,
  1569  				"workingset_nodereclaim": stats.Memory.WorkingsetNodereclaim,
  1570  				"pgrefill":               stats.Memory.Pgrefill,
  1571  				"pgscan":                 stats.Memory.Pgscan,
  1572  				"pgsteal":                stats.Memory.Pgsteal,
  1573  				"pgactivate":             stats.Memory.Pgactivate,
  1574  				"pgdeactivate":           stats.Memory.Pgdeactivate,
  1575  				"pglazyfree":             stats.Memory.Pglazyfree,
  1576  				"pglazyfreed":            stats.Memory.Pglazyfreed,
  1577  				"thp_fault_alloc":        stats.Memory.ThpFaultAlloc,
  1578  				"thp_collapse_alloc":     stats.Memory.ThpCollapseAlloc,
  1579  			},
  1580  			Usage: stats.Memory.Usage,
  1581  			// MaxUsage is not supported
  1582  			Limit: stats.Memory.UsageLimit,
  1583  		}
  1584  		// if the container does not set memory limit, use the machineMemory
  1585  		if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 {
  1586  			s.MemoryStats.Limit = daemon.machineMemory
  1587  		}
  1588  		if stats.MemoryEvents != nil {
  1589  			// Failcnt is set to the "oom" field of the "memory.events" file.
  1590  			// See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
  1591  			s.MemoryStats.Failcnt = stats.MemoryEvents.Oom
  1592  		}
  1593  	}
  1594  
  1595  	if stats.Pids != nil {
  1596  		s.PidsStats = types.PidsStats{
  1597  			Current: stats.Pids.Current,
  1598  			Limit:   stats.Pids.Limit,
  1599  		}
  1600  	}
  1601  
  1602  	return s, nil
  1603  }
  1604  
  1605  // setDefaultIsolation determines the default isolation mode for the
  1606  // daemon to run in. This is only applicable on Windows
  1607  func (daemon *Daemon) setDefaultIsolation() error {
  1608  	return nil
  1609  }
  1610  
  1611  // setupDaemonProcess sets various settings for the daemon's process
  1612  func setupDaemonProcess(config *config.Config) error {
  1613  	// setup the daemons oom_score_adj
  1614  	if err := setupOOMScoreAdj(config.OOMScoreAdjust); err != nil {
  1615  		return err
  1616  	}
  1617  	if err := setMayDetachMounts(); err != nil {
  1618  		logrus.WithError(err).Warn("Could not set may_detach_mounts kernel parameter")
  1619  	}
  1620  	return nil
  1621  }
  1622  
  1623  // This is used to allow removal of mountpoints that may be mounted in other
  1624  // namespaces on RHEL based kernels starting from RHEL 7.4.
  1625  // Without this setting, removals on these RHEL based kernels may fail with
  1626  // "device or resource busy".
  1627  // This setting is not available in upstream kernels as it is not configurable,
  1628  // but has been in the upstream kernels since 3.15.
  1629  func setMayDetachMounts() error {
  1630  	f, err := os.OpenFile("/proc/sys/fs/may_detach_mounts", os.O_WRONLY, 0)
  1631  	if err != nil {
  1632  		if os.IsNotExist(err) {
  1633  			return nil
  1634  		}
  1635  		return errors.Wrap(err, "error opening may_detach_mounts kernel config file")
  1636  	}
  1637  	defer f.Close()
  1638  
  1639  	_, err = f.WriteString("1")
  1640  	if os.IsPermission(err) {
  1641  		// Setting may_detach_mounts does not work in an
  1642  		// unprivileged container. Ignore the error, but log
  1643  		// it if we appear not to be in that situation.
  1644  		if !userns.RunningInUserNS() {
  1645  			logrus.Debugf("Permission denied writing %q to /proc/sys/fs/may_detach_mounts", "1")
  1646  		}
  1647  		return nil
  1648  	}
  1649  	return err
  1650  }
  1651  
  1652  func setupOOMScoreAdj(score int) error {
  1653  	if score == 0 {
  1654  		return nil
  1655  	}
  1656  	f, err := os.OpenFile("/proc/self/oom_score_adj", os.O_WRONLY, 0)
  1657  	if err != nil {
  1658  		return err
  1659  	}
  1660  	defer f.Close()
  1661  	stringScore := strconv.Itoa(score)
  1662  	_, err = f.WriteString(stringScore)
  1663  	if os.IsPermission(err) {
  1664  		// Setting oom_score_adj does not work in an
  1665  		// unprivileged container. Ignore the error, but log
  1666  		// it if we appear not to be in that situation.
  1667  		if !userns.RunningInUserNS() {
  1668  			logrus.Debugf("Permission denied writing %q to /proc/self/oom_score_adj", stringScore)
  1669  		}
  1670  		return nil
  1671  	}
  1672  
  1673  	return err
  1674  }
  1675  
  1676  func (daemon *Daemon) initCPURtController(mnt, path string) error {
  1677  	if path == "/" || path == "." {
  1678  		return nil
  1679  	}
  1680  
  1681  	// Recursively create cgroup to ensure that the system and all parent cgroups have values set
  1682  	// for the period and runtime as this limits what the children can be set to.
  1683  	if err := daemon.initCPURtController(mnt, filepath.Dir(path)); err != nil {
  1684  		return err
  1685  	}
  1686  
  1687  	path = filepath.Join(mnt, path)
  1688  	if err := os.MkdirAll(path, 0755); err != nil {
  1689  		return err
  1690  	}
  1691  	if err := maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil {
  1692  		return err
  1693  	}
  1694  	return maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimeRuntime, "cpu.rt_runtime_us", path)
  1695  }
  1696  
  1697  func maybeCreateCPURealTimeFile(configValue int64, file string, path string) error {
  1698  	if configValue == 0 {
  1699  		return nil
  1700  	}
  1701  	return os.WriteFile(filepath.Join(path, file), []byte(strconv.FormatInt(configValue, 10)), 0700)
  1702  }
  1703  
  1704  func (daemon *Daemon) setupSeccompProfile() error {
  1705  	switch profile := daemon.configStore.SeccompProfile; profile {
  1706  	case "", config.SeccompProfileDefault:
  1707  		daemon.seccompProfilePath = config.SeccompProfileDefault
  1708  	case config.SeccompProfileUnconfined:
  1709  		daemon.seccompProfilePath = config.SeccompProfileUnconfined
  1710  	default:
  1711  		daemon.seccompProfilePath = profile
  1712  		b, err := os.ReadFile(profile)
  1713  		if err != nil {
  1714  			return fmt.Errorf("opening seccomp profile (%s) failed: %v", profile, err)
  1715  		}
  1716  		daemon.seccompProfile = b
  1717  	}
  1718  	return nil
  1719  }
  1720  
  1721  func getSysInfo(daemon *Daemon) *sysinfo.SysInfo {
  1722  	var siOpts []sysinfo.Opt
  1723  	if daemon.getCgroupDriver() == cgroupSystemdDriver {
  1724  		if euid := os.Getenv("ROOTLESSKIT_PARENT_EUID"); euid != "" {
  1725  			siOpts = append(siOpts, sysinfo.WithCgroup2GroupPath("/user.slice/user-"+euid+".slice"))
  1726  		}
  1727  	}
  1728  	return sysinfo.New(siOpts...)
  1729  }
  1730  
  1731  func (daemon *Daemon) initLibcontainerd(ctx context.Context) error {
  1732  	var err error
  1733  	daemon.containerd, err = remote.NewClient(
  1734  		ctx,
  1735  		daemon.containerdCli,
  1736  		filepath.Join(daemon.configStore.ExecRoot, "containerd"),
  1737  		daemon.configStore.ContainerdNamespace,
  1738  		daemon,
  1739  	)
  1740  	return err
  1741  }
  1742  
  1743  func recursiveUnmount(target string) error {
  1744  	return mount.RecursiveUnmount(target)
  1745  }