github.com/containers/libpod@v1.9.4-0.20220419124438-4284fd425507/pkg/spec/spec.go (about)

     1  package createconfig
     2  
     3  import (
     4  	"strings"
     5  
     6  	"github.com/containers/common/pkg/capabilities"
     7  	cconfig "github.com/containers/common/pkg/config"
     8  	"github.com/containers/libpod/libpod"
     9  	"github.com/containers/libpod/pkg/cgroups"
    10  	"github.com/containers/libpod/pkg/env"
    11  	"github.com/containers/libpod/pkg/rootless"
    12  	"github.com/containers/libpod/pkg/sysinfo"
    13  	"github.com/containers/libpod/pkg/util"
    14  	"github.com/docker/go-units"
    15  	"github.com/opencontainers/runc/libcontainer/user"
    16  	spec "github.com/opencontainers/runtime-spec/specs-go"
    17  	"github.com/opencontainers/runtime-tools/generate"
    18  	"github.com/pkg/errors"
    19  )
    20  
const (
	// CpuPeriod is the CPU period handed to the spec generator when a CPU
	// count (config.Resources.CPUs) is requested; the matching quota is
	// computed as CPUs * CpuPeriod.
	CpuPeriod        = 100000
	// kernelMax is not referenced in the visible part of this file.
	// NOTE(review): presumably an upper bound used for default rlimit
	// values by helpers defined elsewhere — confirm against its users.
	kernelMax uint64 = 1048576
)
    25  
    26  func GetAvailableGids() (int64, error) {
    27  	idMap, err := user.ParseIDMapFile("/proc/self/gid_map")
    28  	if err != nil {
    29  		return 0, err
    30  	}
    31  	count := int64(0)
    32  	for _, r := range idMap {
    33  		count += r.Count
    34  	}
    35  	return count, nil
    36  }
    37  
    38  // CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec
    39  func (config *CreateConfig) createConfigToOCISpec(runtime *libpod.Runtime, userMounts []spec.Mount) (*spec.Spec, error) {
    40  	cgroupPerm := "ro"
    41  	g, err := generate.New("linux")
    42  	if err != nil {
    43  		return nil, err
    44  	}
    45  	// Remove the default /dev/shm mount to ensure we overwrite it
    46  	g.RemoveMount("/dev/shm")
    47  	g.HostSpecific = true
    48  	addCgroup := true
    49  	canMountSys := true
    50  
    51  	isRootless := rootless.IsRootless()
    52  	inUserNS := config.User.InNS(isRootless)
    53  
    54  	if inUserNS && config.Network.NetMode.IsHost() {
    55  		canMountSys = false
    56  	}
    57  
    58  	if config.Security.Privileged && canMountSys {
    59  		cgroupPerm = "rw"
    60  		g.RemoveMount("/sys")
    61  		sysMnt := spec.Mount{
    62  			Destination: "/sys",
    63  			Type:        "sysfs",
    64  			Source:      "sysfs",
    65  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", "rw"},
    66  		}
    67  		g.AddMount(sysMnt)
    68  	} else if !canMountSys {
    69  		addCgroup = false
    70  		g.RemoveMount("/sys")
    71  		r := "ro"
    72  		if config.Security.Privileged {
    73  			r = "rw"
    74  		}
    75  		sysMnt := spec.Mount{
    76  			Destination: "/sys",
    77  			Type:        TypeBind,
    78  			Source:      "/sys",
    79  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"},
    80  		}
    81  		g.AddMount(sysMnt)
    82  		if !config.Security.Privileged && isRootless {
    83  			g.AddLinuxMaskedPaths("/sys/kernel")
    84  		}
    85  	}
    86  	var runtimeConfig *cconfig.Config
    87  
    88  	if runtime != nil {
    89  		runtimeConfig, err = runtime.GetConfig()
    90  		if err != nil {
    91  			return nil, err
    92  		}
    93  		g.Config.Process.Capabilities.Bounding = runtimeConfig.Containers.DefaultCapabilities
    94  		sysctls, err := util.ValidateSysctls(runtimeConfig.Containers.DefaultSysctls)
    95  		if err != nil {
    96  			return nil, err
    97  		}
    98  
    99  		for name, val := range config.Security.Sysctl {
   100  			sysctls[name] = val
   101  		}
   102  		config.Security.Sysctl = sysctls
   103  		if !util.StringInSlice("host", config.Resources.Ulimit) {
   104  			config.Resources.Ulimit = append(runtimeConfig.Containers.DefaultUlimits, config.Resources.Ulimit...)
   105  		}
   106  		if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() {
   107  			config.Resources.PidsLimit = runtimeConfig.Containers.PidsLimit
   108  		}
   109  
   110  	} else {
   111  		g.Config.Process.Capabilities.Bounding = cconfig.DefaultCapabilities
   112  		if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() {
   113  			config.Resources.PidsLimit = cconfig.DefaultPidsLimit
   114  		}
   115  	}
   116  
   117  	gid5Available := true
   118  	if isRootless {
   119  		nGids, err := GetAvailableGids()
   120  		if err != nil {
   121  			return nil, err
   122  		}
   123  		gid5Available = nGids >= 5
   124  	}
   125  	// When using a different user namespace, check that the GID 5 is mapped inside
   126  	// the container.
   127  	if gid5Available && len(config.User.IDMappings.GIDMap) > 0 {
   128  		mappingFound := false
   129  		for _, r := range config.User.IDMappings.GIDMap {
   130  			if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size {
   131  				mappingFound = true
   132  				break
   133  			}
   134  		}
   135  		if !mappingFound {
   136  			gid5Available = false
   137  		}
   138  
   139  	}
   140  	if !gid5Available {
   141  		// If we have no GID mappings, the gid=5 default option would fail, so drop it.
   142  		g.RemoveMount("/dev/pts")
   143  		devPts := spec.Mount{
   144  			Destination: "/dev/pts",
   145  			Type:        "devpts",
   146  			Source:      "devpts",
   147  			Options:     []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"},
   148  		}
   149  		g.AddMount(devPts)
   150  	}
   151  
   152  	if inUserNS && config.Ipc.IpcMode.IsHost() {
   153  		g.RemoveMount("/dev/mqueue")
   154  		devMqueue := spec.Mount{
   155  			Destination: "/dev/mqueue",
   156  			Type:        TypeBind,
   157  			Source:      "/dev/mqueue",
   158  			Options:     []string{"bind", "nosuid", "noexec", "nodev"},
   159  		}
   160  		g.AddMount(devMqueue)
   161  	}
   162  	if inUserNS && config.Pid.PidMode.IsHost() {
   163  		g.RemoveMount("/proc")
   164  		procMount := spec.Mount{
   165  			Destination: "/proc",
   166  			Type:        TypeBind,
   167  			Source:      "/proc",
   168  			Options:     []string{"rbind", "nosuid", "noexec", "nodev"},
   169  		}
   170  		g.AddMount(procMount)
   171  	}
   172  
   173  	if addCgroup {
   174  		cgroupMnt := spec.Mount{
   175  			Destination: "/sys/fs/cgroup",
   176  			Type:        "cgroup",
   177  			Source:      "cgroup",
   178  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm},
   179  		}
   180  		g.AddMount(cgroupMnt)
   181  	}
   182  	g.SetProcessCwd(config.WorkDir)
   183  	g.SetProcessArgs(config.Command)
   184  	g.SetProcessTerminal(config.Tty)
   185  
   186  	for key, val := range config.Annotations {
   187  		g.AddAnnotation(key, val)
   188  	}
   189  
   190  	addedResources := false
   191  
   192  	// RESOURCES - MEMORY
   193  	if config.Resources.Memory != 0 {
   194  		g.SetLinuxResourcesMemoryLimit(config.Resources.Memory)
   195  		// If a swap limit is not explicitly set, also set a swap limit
   196  		// Default to double the memory limit
   197  		if config.Resources.MemorySwap == 0 {
   198  			g.SetLinuxResourcesMemorySwap(2 * config.Resources.Memory)
   199  		}
   200  		addedResources = true
   201  	}
   202  	if config.Resources.MemoryReservation != 0 {
   203  		g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation)
   204  		addedResources = true
   205  	}
   206  	if config.Resources.MemorySwap != 0 {
   207  		g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap)
   208  		addedResources = true
   209  	}
   210  	if config.Resources.KernelMemory != 0 {
   211  		g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory)
   212  		addedResources = true
   213  	}
   214  	if config.Resources.MemorySwappiness != -1 {
   215  		g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness))
   216  		addedResources = true
   217  	}
   218  	g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller)
   219  	g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj)
   220  
   221  	// RESOURCES - CPU
   222  	if config.Resources.CPUShares != 0 {
   223  		g.SetLinuxResourcesCPUShares(config.Resources.CPUShares)
   224  		addedResources = true
   225  	}
   226  	if config.Resources.CPUQuota != 0 {
   227  		g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota)
   228  		addedResources = true
   229  	}
   230  	if config.Resources.CPUPeriod != 0 {
   231  		g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod)
   232  		addedResources = true
   233  	}
   234  	if config.Resources.CPUs != 0 {
   235  		g.SetLinuxResourcesCPUPeriod(CpuPeriod)
   236  		g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * CpuPeriod))
   237  		addedResources = true
   238  	}
   239  	if config.Resources.CPURtRuntime != 0 {
   240  		g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime)
   241  		addedResources = true
   242  	}
   243  	if config.Resources.CPURtPeriod != 0 {
   244  		g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod)
   245  		addedResources = true
   246  	}
   247  	if config.Resources.CPUsetCPUs != "" {
   248  		g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs)
   249  		addedResources = true
   250  	}
   251  	if config.Resources.CPUsetMems != "" {
   252  		g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems)
   253  		addedResources = true
   254  	}
   255  
   256  	// Devices
   257  	if config.Security.Privileged {
   258  		// If privileged, we need to add all the host devices to the
   259  		// spec.  We do not add the user provided ones because we are
   260  		// already adding them all.
   261  		if err := AddPrivilegedDevices(&g); err != nil {
   262  			return nil, err
   263  		}
   264  	} else {
   265  		for _, devicePath := range config.Devices {
   266  			if err := DevicesFromPath(&g, devicePath); err != nil {
   267  				return nil, err
   268  			}
   269  		}
   270  		if len(config.Resources.DeviceCgroupRules) != 0 {
   271  			if err := deviceCgroupRules(&g, config.Resources.DeviceCgroupRules); err != nil {
   272  				return nil, err
   273  			}
   274  			addedResources = true
   275  		}
   276  	}
   277  
   278  	g.SetProcessNoNewPrivileges(config.Security.NoNewPrivs)
   279  
   280  	if !config.Security.Privileged {
   281  		g.SetProcessApparmorProfile(config.Security.ApparmorProfile)
   282  	}
   283  
   284  	// Unless already set via the CLI, check if we need to disable process
   285  	// labels or set the defaults.
   286  	if len(config.Security.LabelOpts) == 0 && runtimeConfig != nil {
   287  		if !runtimeConfig.Containers.EnableLabeling {
   288  			// Disabled in the config.
   289  			config.Security.LabelOpts = append(config.Security.LabelOpts, "disable")
   290  		} else if err := config.Security.SetLabelOpts(runtime, &config.Pid, &config.Ipc); err != nil {
   291  			// Defaults!
   292  			return nil, err
   293  		}
   294  	}
   295  
   296  	BlockAccessToKernelFilesystems(config.Security.Privileged, config.Pid.PidMode.IsHost(), &g)
   297  
   298  	// RESOURCES - PIDS
   299  	if config.Resources.PidsLimit > 0 {
   300  		// if running on rootless on a cgroupv1 machine or using the cgroupfs manager, pids
   301  		// limit is not supported.  If the value is still the default
   302  		// then ignore the settings.  If the caller asked for a
   303  		// non-default, then try to use it.
   304  		setPidLimit := true
   305  		if rootless.IsRootless() {
   306  			cgroup2, err := cgroups.IsCgroup2UnifiedMode()
   307  			if err != nil {
   308  				return nil, err
   309  			}
   310  			if (!cgroup2 || (runtimeConfig != nil && runtimeConfig.Engine.CgroupManager != cconfig.SystemdCgroupsManager)) && config.Resources.PidsLimit == sysinfo.GetDefaultPidsLimit() {
   311  				setPidLimit = false
   312  			}
   313  		}
   314  		if setPidLimit {
   315  			g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit)
   316  			addedResources = true
   317  		}
   318  	}
   319  
   320  	// Make sure to always set the default variables unless overridden in the
   321  	// config.
   322  	var defaultEnv map[string]string
   323  	if runtimeConfig == nil {
   324  		defaultEnv = env.DefaultEnvVariables
   325  	} else {
   326  		defaultEnv, err = env.ParseSlice(runtimeConfig.Containers.Env)
   327  		if err != nil {
   328  			return nil, errors.Wrap(err, "Env fields in containers.conf failed ot parse")
   329  		}
   330  		defaultEnv = env.Join(env.DefaultEnvVariables, defaultEnv)
   331  	}
   332  
   333  	if err := addRlimits(config, &g); err != nil {
   334  		return nil, err
   335  	}
   336  
   337  	// NAMESPACES
   338  
   339  	if err := config.Pid.ConfigureGenerator(&g); err != nil {
   340  		return nil, err
   341  	}
   342  
   343  	if err := config.User.ConfigureGenerator(&g); err != nil {
   344  		return nil, err
   345  	}
   346  
   347  	if err := config.Network.ConfigureGenerator(&g); err != nil {
   348  		return nil, err
   349  	}
   350  
   351  	if err := config.Uts.ConfigureGenerator(&g, &config.Network, runtime); err != nil {
   352  		return nil, err
   353  	}
   354  
   355  	if err := config.Ipc.ConfigureGenerator(&g); err != nil {
   356  		return nil, err
   357  	}
   358  
   359  	if err := config.Cgroup.ConfigureGenerator(&g); err != nil {
   360  		return nil, err
   361  	}
   362  
   363  	config.Env = env.Join(defaultEnv, config.Env)
   364  	for name, val := range config.Env {
   365  		g.AddProcessEnv(name, val)
   366  	}
   367  	configSpec := g.Config
   368  
   369  	// If the container image specifies an label with a
   370  	// capabilities.ContainerImageLabel then split the comma separated list
   371  	// of capabilities and record them.  This list indicates the only
   372  	// capabilities, required to run the container.
   373  	var capRequired []string
   374  	for key, val := range config.Labels {
   375  		if util.StringInSlice(key, capabilities.ContainerImageLabels) {
   376  			capRequired = strings.Split(val, ",")
   377  		}
   378  	}
   379  	config.Security.CapRequired = capRequired
   380  
   381  	if err := config.Security.ConfigureGenerator(&g, &config.User); err != nil {
   382  		return nil, err
   383  	}
   384  
   385  	// BIND MOUNTS
   386  	configSpec.Mounts = SupercedeUserMounts(userMounts, configSpec.Mounts)
   387  	// Process mounts to ensure correct options
   388  	if err := InitFSMounts(configSpec.Mounts); err != nil {
   389  		return nil, err
   390  	}
   391  
   392  	// BLOCK IO
   393  	blkio, err := config.CreateBlockIO()
   394  	if err != nil {
   395  		return nil, errors.Wrapf(err, "error creating block io")
   396  	}
   397  	if blkio != nil {
   398  		configSpec.Linux.Resources.BlockIO = blkio
   399  		addedResources = true
   400  	}
   401  
   402  	if rootless.IsRootless() {
   403  		cgroup2, err := cgroups.IsCgroup2UnifiedMode()
   404  		if err != nil {
   405  			return nil, err
   406  		}
   407  		if !addedResources {
   408  			configSpec.Linux.Resources = &spec.LinuxResources{}
   409  		}
   410  
   411  		canUseResources := cgroup2 && runtimeConfig != nil && (runtimeConfig.Engine.CgroupManager == cconfig.SystemdCgroupsManager)
   412  
   413  		if addedResources && !canUseResources {
   414  			return nil, errors.New("invalid configuration, cannot specify resource limits without cgroups v2 and --cgroup-manager=systemd")
   415  		}
   416  		if !canUseResources {
   417  			// Force the resources block to be empty instead of having default values.
   418  			configSpec.Linux.Resources = &spec.LinuxResources{}
   419  		}
   420  	}
   421  
   422  	switch config.Cgroup.Cgroups {
   423  	case "disabled":
   424  		if addedResources {
   425  			return nil, errors.New("cannot specify resource limits when cgroups are disabled is specified")
   426  		}
   427  		configSpec.Linux.Resources = &spec.LinuxResources{}
   428  	case "enabled", "no-conmon", "":
   429  		// Do nothing
   430  	default:
   431  		return nil, errors.New("unrecognized option for cgroups; supported are 'default', 'disabled', 'no-conmon'")
   432  	}
   433  
   434  	// Add annotations
   435  	if configSpec.Annotations == nil {
   436  		configSpec.Annotations = make(map[string]string)
   437  	}
   438  
   439  	if config.CidFile != "" {
   440  		configSpec.Annotations[libpod.InspectAnnotationCIDFile] = config.CidFile
   441  	}
   442  
   443  	if config.Rm {
   444  		configSpec.Annotations[libpod.InspectAnnotationAutoremove] = libpod.InspectResponseTrue
   445  	} else {
   446  		configSpec.Annotations[libpod.InspectAnnotationAutoremove] = libpod.InspectResponseFalse
   447  	}
   448  
   449  	if len(config.VolumesFrom) > 0 {
   450  		configSpec.Annotations[libpod.InspectAnnotationVolumesFrom] = strings.Join(config.VolumesFrom, ",")
   451  	}
   452  
   453  	if config.Security.Privileged {
   454  		configSpec.Annotations[libpod.InspectAnnotationPrivileged] = libpod.InspectResponseTrue
   455  	} else {
   456  		configSpec.Annotations[libpod.InspectAnnotationPrivileged] = libpod.InspectResponseFalse
   457  	}
   458  
   459  	if config.Init {
   460  		configSpec.Annotations[libpod.InspectAnnotationInit] = libpod.InspectResponseTrue
   461  	} else {
   462  		configSpec.Annotations[libpod.InspectAnnotationInit] = libpod.InspectResponseFalse
   463  	}
   464  
   465  	return configSpec, nil
   466  }
   467  
   468  func (config *CreateConfig) cgroupDisabled() bool {
   469  	return config.Cgroup.Cgroups == "disabled"
   470  }
   471  
   472  func BlockAccessToKernelFilesystems(privileged, pidModeIsHost bool, g *generate.Generator) {
   473  	if !privileged {
   474  		for _, mp := range []string{
   475  			"/proc/acpi",
   476  			"/proc/kcore",
   477  			"/proc/keys",
   478  			"/proc/latency_stats",
   479  			"/proc/timer_list",
   480  			"/proc/timer_stats",
   481  			"/proc/sched_debug",
   482  			"/proc/scsi",
   483  			"/sys/firmware",
   484  			"/sys/fs/selinux",
   485  		} {
   486  			g.AddLinuxMaskedPaths(mp)
   487  		}
   488  
   489  		if pidModeIsHost && rootless.IsRootless() {
   490  			return
   491  		}
   492  
   493  		for _, rp := range []string{
   494  			"/proc/asound",
   495  			"/proc/bus",
   496  			"/proc/fs",
   497  			"/proc/irq",
   498  			"/proc/sys",
   499  			"/proc/sysrq-trigger",
   500  		} {
   501  			g.AddLinuxReadonlyPaths(rp)
   502  		}
   503  	}
   504  }
   505  
   506  func addRlimits(config *CreateConfig, g *generate.Generator) error {
   507  	var (
   508  		nofileSet = false
   509  		nprocSet  = false
   510  	)
   511  
   512  	for _, u := range config.Resources.Ulimit {
   513  		if u == "host" {
   514  			if len(config.Resources.Ulimit) != 1 {
   515  				return errors.New("ulimit can use host only once")
   516  			}
   517  			g.Config.Process.Rlimits = nil
   518  			break
   519  		}
   520  
   521  		ul, err := units.ParseUlimit(u)
   522  		if err != nil {
   523  			return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u)
   524  		}
   525  
   526  		if ul.Name == "nofile" {
   527  			nofileSet = true
   528  		} else if ul.Name == "nproc" {
   529  			nprocSet = true
   530  		}
   531  
   532  		g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft))
   533  	}
   534  
   535  	// If not explicitly overridden by the user, default number of open
   536  	// files and number of processes to the maximum they can be set to
   537  	// (without overriding a sysctl)
   538  	if !nofileSet {
   539  		current, max := getNOFILESettings()
   540  		g.AddProcessRlimits("RLIMIT_NOFILE", current, max)
   541  	}
   542  	if !nprocSet {
   543  		current, max := getNPROCSettings()
   544  		g.AddProcessRlimits("RLIMIT_NPROC", current, max)
   545  	}
   546  
   547  	return nil
   548  }