github.com/containers/podman/v2@v2.2.2-0.20210501105131-c1e07d070c4c/pkg/spec/spec.go (about)

     1  package createconfig
     2  
     3  import (
     4  	"strings"
     5  
     6  	"github.com/containers/common/pkg/capabilities"
     7  	cconfig "github.com/containers/common/pkg/config"
     8  	"github.com/containers/common/pkg/sysinfo"
     9  	"github.com/containers/podman/v2/libpod"
    10  	"github.com/containers/podman/v2/libpod/define"
    11  	"github.com/containers/podman/v2/pkg/cgroups"
    12  	"github.com/containers/podman/v2/pkg/env"
    13  	"github.com/containers/podman/v2/pkg/rootless"
    14  	"github.com/containers/podman/v2/pkg/util"
    15  	"github.com/docker/go-units"
    16  	"github.com/opencontainers/runc/libcontainer/user"
    17  	spec "github.com/opencontainers/runtime-spec/specs-go"
    18  	"github.com/opencontainers/runtime-tools/generate"
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  	"golang.org/x/sys/unix"
    22  )
    23  
// CpuPeriod is the CPU period applied when a container is limited through the
// CPUs resource setting; the matching quota is computed as CPUs * CpuPeriod.
// NOTE(review): presumably expressed in microseconds (i.e. 100ms) — confirm
// against the OCI runtime spec.
const CpuPeriod = 100000
    25  
    26  func GetAvailableGids() (int64, error) {
    27  	idMap, err := user.ParseIDMapFile("/proc/self/gid_map")
    28  	if err != nil {
    29  		return 0, err
    30  	}
    31  	count := int64(0)
    32  	for _, r := range idMap {
    33  		count += r.Count
    34  	}
    35  	return count, nil
    36  }
    37  
    38  // CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec
    39  func (config *CreateConfig) createConfigToOCISpec(runtime *libpod.Runtime, userMounts []spec.Mount) (*spec.Spec, error) {
    40  	cgroupPerm := "ro"
    41  	g, err := generate.New("linux")
    42  	if err != nil {
    43  		return nil, err
    44  	}
    45  	// Remove the default /dev/shm mount to ensure we overwrite it
    46  	g.RemoveMount("/dev/shm")
    47  	g.HostSpecific = true
    48  	addCgroup := true
    49  	canMountSys := true
    50  
    51  	isRootless := rootless.IsRootless()
    52  	inUserNS := config.User.InNS(isRootless)
    53  
    54  	if inUserNS && config.Network.NetMode.IsHost() {
    55  		canMountSys = false
    56  	}
    57  
    58  	if config.Security.Privileged && canMountSys {
    59  		cgroupPerm = "rw"
    60  		g.RemoveMount("/sys")
    61  		sysMnt := spec.Mount{
    62  			Destination: "/sys",
    63  			Type:        "sysfs",
    64  			Source:      "sysfs",
    65  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", "rw"},
    66  		}
    67  		g.AddMount(sysMnt)
    68  	} else if !canMountSys {
    69  		addCgroup = false
    70  		g.RemoveMount("/sys")
    71  		r := "ro"
    72  		if config.Security.Privileged {
    73  			r = "rw"
    74  		}
    75  		sysMnt := spec.Mount{
    76  			Destination: "/sys",
    77  			Type:        TypeBind,
    78  			Source:      "/sys",
    79  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"},
    80  		}
    81  		g.AddMount(sysMnt)
    82  		if !config.Security.Privileged && isRootless {
    83  			g.AddLinuxMaskedPaths("/sys/kernel")
    84  		}
    85  	}
    86  	var runtimeConfig *cconfig.Config
    87  
    88  	if runtime != nil {
    89  		runtimeConfig, err = runtime.GetConfig()
    90  		if err != nil {
    91  			return nil, err
    92  		}
    93  		g.Config.Process.Capabilities.Bounding = runtimeConfig.Containers.DefaultCapabilities
    94  		sysctls, err := util.ValidateSysctls(runtimeConfig.Containers.DefaultSysctls)
    95  		if err != nil {
    96  			return nil, err
    97  		}
    98  
    99  		for name, val := range config.Security.Sysctl {
   100  			sysctls[name] = val
   101  		}
   102  		config.Security.Sysctl = sysctls
   103  		if !util.StringInSlice("host", config.Resources.Ulimit) {
   104  			config.Resources.Ulimit = append(runtimeConfig.Containers.DefaultUlimits, config.Resources.Ulimit...)
   105  		}
   106  		if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() {
   107  			config.Resources.PidsLimit = runtimeConfig.Containers.PidsLimit
   108  		}
   109  
   110  	} else {
   111  		g.Config.Process.Capabilities.Bounding = cconfig.DefaultCapabilities
   112  		if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() {
   113  			config.Resources.PidsLimit = cconfig.DefaultPidsLimit
   114  		}
   115  	}
   116  
   117  	gid5Available := true
   118  	if isRootless {
   119  		nGids, err := GetAvailableGids()
   120  		if err != nil {
   121  			return nil, err
   122  		}
   123  		gid5Available = nGids >= 5
   124  	}
   125  	// When using a different user namespace, check that the GID 5 is mapped inside
   126  	// the container.
   127  	if gid5Available && len(config.User.IDMappings.GIDMap) > 0 {
   128  		mappingFound := false
   129  		for _, r := range config.User.IDMappings.GIDMap {
   130  			if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size {
   131  				mappingFound = true
   132  				break
   133  			}
   134  		}
   135  		if !mappingFound {
   136  			gid5Available = false
   137  		}
   138  
   139  	}
   140  	if !gid5Available {
   141  		// If we have no GID mappings, the gid=5 default option would fail, so drop it.
   142  		g.RemoveMount("/dev/pts")
   143  		devPts := spec.Mount{
   144  			Destination: "/dev/pts",
   145  			Type:        "devpts",
   146  			Source:      "devpts",
   147  			Options:     []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"},
   148  		}
   149  		g.AddMount(devPts)
   150  	}
   151  
   152  	if inUserNS && config.Ipc.IpcMode.IsHost() {
   153  		g.RemoveMount("/dev/mqueue")
   154  		devMqueue := spec.Mount{
   155  			Destination: "/dev/mqueue",
   156  			Type:        TypeBind,
   157  			Source:      "/dev/mqueue",
   158  			Options:     []string{"bind", "nosuid", "noexec", "nodev"},
   159  		}
   160  		g.AddMount(devMqueue)
   161  	}
   162  	if inUserNS && config.Pid.PidMode.IsHost() {
   163  		g.RemoveMount("/proc")
   164  		procMount := spec.Mount{
   165  			Destination: "/proc",
   166  			Type:        TypeBind,
   167  			Source:      "/proc",
   168  			Options:     []string{"rbind", "nosuid", "noexec", "nodev"},
   169  		}
   170  		g.AddMount(procMount)
   171  	}
   172  
   173  	if addCgroup {
   174  		cgroupMnt := spec.Mount{
   175  			Destination: "/sys/fs/cgroup",
   176  			Type:        "cgroup",
   177  			Source:      "cgroup",
   178  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm},
   179  		}
   180  		g.AddMount(cgroupMnt)
   181  	}
   182  	g.SetProcessCwd(config.WorkDir)
   183  
   184  	ProcessArgs := make([]string, 0)
   185  	// We need to iterate the input for entrypoint because it is a []string
   186  	// but "" is a legit json input, which translates into a []string with an
   187  	// empty position.  This messes up the eventual command being executed
   188  	// in the container
   189  	for _, a := range config.Entrypoint {
   190  		if len(a) > 0 {
   191  			ProcessArgs = append(ProcessArgs, a)
   192  		}
   193  	}
   194  	// Same issue as explained above for config.Entrypoint.
   195  	for _, a := range config.Command {
   196  		if len(a) > 0 {
   197  			ProcessArgs = append(ProcessArgs, a)
   198  		}
   199  	}
   200  
   201  	g.SetProcessArgs(ProcessArgs)
   202  	g.SetProcessTerminal(config.Tty)
   203  
   204  	for key, val := range config.Annotations {
   205  		g.AddAnnotation(key, val)
   206  	}
   207  
   208  	addedResources := false
   209  
   210  	// RESOURCES - MEMORY
   211  	if config.Resources.Memory != 0 {
   212  		g.SetLinuxResourcesMemoryLimit(config.Resources.Memory)
   213  		// If a swap limit is not explicitly set, also set a swap limit
   214  		// Default to double the memory limit
   215  		if config.Resources.MemorySwap == 0 {
   216  			g.SetLinuxResourcesMemorySwap(2 * config.Resources.Memory)
   217  		}
   218  		addedResources = true
   219  	}
   220  	if config.Resources.MemoryReservation != 0 {
   221  		g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation)
   222  		addedResources = true
   223  	}
   224  	if config.Resources.MemorySwap != 0 {
   225  		g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap)
   226  		addedResources = true
   227  	}
   228  	if config.Resources.KernelMemory != 0 {
   229  		g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory)
   230  		addedResources = true
   231  	}
   232  	if config.Resources.MemorySwappiness != -1 {
   233  		g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness))
   234  		addedResources = true
   235  	}
   236  	g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller)
   237  	g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj)
   238  
   239  	// RESOURCES - CPU
   240  	if config.Resources.CPUShares != 0 {
   241  		g.SetLinuxResourcesCPUShares(config.Resources.CPUShares)
   242  		addedResources = true
   243  	}
   244  	if config.Resources.CPUQuota != 0 {
   245  		g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota)
   246  		addedResources = true
   247  	}
   248  	if config.Resources.CPUPeriod != 0 {
   249  		g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod)
   250  		addedResources = true
   251  	}
   252  	if config.Resources.CPUs != 0 {
   253  		g.SetLinuxResourcesCPUPeriod(CpuPeriod)
   254  		g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * CpuPeriod))
   255  		addedResources = true
   256  	}
   257  	if config.Resources.CPURtRuntime != 0 {
   258  		g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime)
   259  		addedResources = true
   260  	}
   261  	if config.Resources.CPURtPeriod != 0 {
   262  		g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod)
   263  		addedResources = true
   264  	}
   265  	if config.Resources.CPUsetCPUs != "" {
   266  		g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs)
   267  		addedResources = true
   268  	}
   269  	if config.Resources.CPUsetMems != "" {
   270  		g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems)
   271  		addedResources = true
   272  	}
   273  
   274  	// Devices
   275  	if config.Security.Privileged {
   276  		// If privileged, we need to add all the host devices to the
   277  		// spec.  We do not add the user provided ones because we are
   278  		// already adding them all.
   279  		if err := AddPrivilegedDevices(&g); err != nil {
   280  			return nil, err
   281  		}
   282  	} else {
   283  		for _, devicePath := range config.Devices {
   284  			if err := DevicesFromPath(&g, devicePath); err != nil {
   285  				return nil, err
   286  			}
   287  		}
   288  		if len(config.Resources.DeviceCgroupRules) != 0 {
   289  			if err := deviceCgroupRules(&g, config.Resources.DeviceCgroupRules); err != nil {
   290  				return nil, err
   291  			}
   292  			addedResources = true
   293  		}
   294  	}
   295  
   296  	g.SetProcessNoNewPrivileges(config.Security.NoNewPrivs)
   297  
   298  	if !config.Security.Privileged {
   299  		g.SetProcessApparmorProfile(config.Security.ApparmorProfile)
   300  	}
   301  
   302  	// Unless already set via the CLI, check if we need to disable process
   303  	// labels or set the defaults.
   304  	if len(config.Security.LabelOpts) == 0 && runtimeConfig != nil {
   305  		if !runtimeConfig.Containers.EnableLabeling {
   306  			// Disabled in the config.
   307  			config.Security.LabelOpts = append(config.Security.LabelOpts, "disable")
   308  		} else if err := config.Security.SetLabelOpts(runtime, &config.Pid, &config.Ipc); err != nil {
   309  			// Defaults!
   310  			return nil, err
   311  		}
   312  	}
   313  
   314  	BlockAccessToKernelFilesystems(config.Security.Privileged, config.Pid.PidMode.IsHost(), &g)
   315  
   316  	// RESOURCES - PIDS
   317  	if config.Resources.PidsLimit > 0 {
   318  		// if running on rootless on a cgroupv1 machine or using the cgroupfs manager, pids
   319  		// limit is not supported.  If the value is still the default
   320  		// then ignore the settings.  If the caller asked for a
   321  		// non-default, then try to use it.
   322  		setPidLimit := true
   323  		if rootless.IsRootless() {
   324  			cgroup2, err := cgroups.IsCgroup2UnifiedMode()
   325  			if err != nil {
   326  				return nil, err
   327  			}
   328  			if (!cgroup2 || (runtimeConfig != nil && runtimeConfig.Engine.CgroupManager != cconfig.SystemdCgroupsManager)) && config.Resources.PidsLimit == sysinfo.GetDefaultPidsLimit() {
   329  				setPidLimit = false
   330  			}
   331  		}
   332  		if setPidLimit {
   333  			g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit)
   334  			addedResources = true
   335  		}
   336  	}
   337  
   338  	// Make sure to always set the default variables unless overridden in the
   339  	// config.
   340  	var defaultEnv map[string]string
   341  	if runtimeConfig == nil {
   342  		defaultEnv = env.DefaultEnvVariables()
   343  	} else {
   344  		defaultEnv, err = env.ParseSlice(runtimeConfig.Containers.Env)
   345  		if err != nil {
   346  			return nil, errors.Wrap(err, "Env fields in containers.conf failed to parse")
   347  		}
   348  		defaultEnv = env.Join(env.DefaultEnvVariables(), defaultEnv)
   349  	}
   350  
   351  	if err := addRlimits(config, &g); err != nil {
   352  		return nil, err
   353  	}
   354  
   355  	// NAMESPACES
   356  
   357  	if err := config.Pid.ConfigureGenerator(&g); err != nil {
   358  		return nil, err
   359  	}
   360  
   361  	if err := config.User.ConfigureGenerator(&g); err != nil {
   362  		return nil, err
   363  	}
   364  
   365  	if err := config.Network.ConfigureGenerator(&g); err != nil {
   366  		return nil, err
   367  	}
   368  
   369  	if err := config.Uts.ConfigureGenerator(&g, &config.Network, runtime); err != nil {
   370  		return nil, err
   371  	}
   372  
   373  	if err := config.Ipc.ConfigureGenerator(&g); err != nil {
   374  		return nil, err
   375  	}
   376  
   377  	if err := config.Cgroup.ConfigureGenerator(&g); err != nil {
   378  		return nil, err
   379  	}
   380  
   381  	config.Env = env.Join(defaultEnv, config.Env)
   382  	for name, val := range config.Env {
   383  		g.AddProcessEnv(name, val)
   384  	}
   385  	configSpec := g.Config
   386  
   387  	// If the container image specifies an label with a
   388  	// capabilities.ContainerImageLabel then split the comma separated list
   389  	// of capabilities and record them.  This list indicates the only
   390  	// capabilities, required to run the container.
   391  	var capRequired []string
   392  	for key, val := range config.Labels {
   393  		if util.StringInSlice(key, capabilities.ContainerImageLabels) {
   394  			capRequired = strings.Split(val, ",")
   395  		}
   396  	}
   397  	config.Security.CapRequired = capRequired
   398  
   399  	if err := config.Security.ConfigureGenerator(&g, &config.User); err != nil {
   400  		return nil, err
   401  	}
   402  
   403  	// BIND MOUNTS
   404  	configSpec.Mounts = SupercedeUserMounts(userMounts, configSpec.Mounts)
   405  	// Process mounts to ensure correct options
   406  	if err := InitFSMounts(configSpec.Mounts); err != nil {
   407  		return nil, err
   408  	}
   409  
   410  	// BLOCK IO
   411  	blkio, err := config.CreateBlockIO()
   412  	if err != nil {
   413  		return nil, errors.Wrapf(err, "error creating block io")
   414  	}
   415  	if blkio != nil {
   416  		configSpec.Linux.Resources.BlockIO = blkio
   417  		addedResources = true
   418  	}
   419  
   420  	if rootless.IsRootless() {
   421  		cgroup2, err := cgroups.IsCgroup2UnifiedMode()
   422  		if err != nil {
   423  			return nil, err
   424  		}
   425  		if !addedResources {
   426  			configSpec.Linux.Resources = &spec.LinuxResources{}
   427  		}
   428  
   429  		canUseResources := cgroup2 && runtimeConfig != nil && (runtimeConfig.Engine.CgroupManager == cconfig.SystemdCgroupsManager)
   430  
   431  		if addedResources && !canUseResources {
   432  			return nil, errors.New("invalid configuration, cannot specify resource limits without cgroups v2 and --cgroup-manager=systemd")
   433  		}
   434  		if !canUseResources {
   435  			// Force the resources block to be empty instead of having default values.
   436  			configSpec.Linux.Resources = &spec.LinuxResources{}
   437  		}
   438  	}
   439  
   440  	switch config.Cgroup.Cgroups {
   441  	case "disabled":
   442  		if addedResources {
   443  			return nil, errors.New("cannot specify resource limits when cgroups are disabled is specified")
   444  		}
   445  		configSpec.Linux.Resources = &spec.LinuxResources{}
   446  	case "enabled", "no-conmon", "":
   447  		// Do nothing
   448  	default:
   449  		return nil, errors.New("unrecognized option for cgroups; supported are 'default', 'disabled', 'no-conmon'")
   450  	}
   451  
   452  	// Add annotations
   453  	if configSpec.Annotations == nil {
   454  		configSpec.Annotations = make(map[string]string)
   455  	}
   456  
   457  	if config.CidFile != "" {
   458  		configSpec.Annotations[define.InspectAnnotationCIDFile] = config.CidFile
   459  	}
   460  
   461  	if config.Rm {
   462  		configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseTrue
   463  	} else {
   464  		configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseFalse
   465  	}
   466  
   467  	if len(config.VolumesFrom) > 0 {
   468  		configSpec.Annotations[define.InspectAnnotationVolumesFrom] = strings.Join(config.VolumesFrom, ",")
   469  	}
   470  
   471  	if config.Security.Privileged {
   472  		configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseTrue
   473  	} else {
   474  		configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseFalse
   475  	}
   476  
   477  	if config.Init {
   478  		configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue
   479  	} else {
   480  		configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseFalse
   481  	}
   482  
   483  	return configSpec, nil
   484  }
   485  
   486  func (config *CreateConfig) cgroupDisabled() bool {
   487  	return config.Cgroup.Cgroups == "disabled"
   488  }
   489  
   490  func BlockAccessToKernelFilesystems(privileged, pidModeIsHost bool, g *generate.Generator) {
   491  	if !privileged {
   492  		for _, mp := range []string{
   493  			"/proc/acpi",
   494  			"/proc/kcore",
   495  			"/proc/keys",
   496  			"/proc/latency_stats",
   497  			"/proc/timer_list",
   498  			"/proc/timer_stats",
   499  			"/proc/sched_debug",
   500  			"/proc/scsi",
   501  			"/sys/firmware",
   502  			"/sys/fs/selinux",
   503  		} {
   504  			g.AddLinuxMaskedPaths(mp)
   505  		}
   506  
   507  		if pidModeIsHost && rootless.IsRootless() {
   508  			return
   509  		}
   510  
   511  		for _, rp := range []string{
   512  			"/proc/asound",
   513  			"/proc/bus",
   514  			"/proc/fs",
   515  			"/proc/irq",
   516  			"/proc/sys",
   517  			"/proc/sysrq-trigger",
   518  		} {
   519  			g.AddLinuxReadonlyPaths(rp)
   520  		}
   521  	}
   522  }
   523  
   524  func addRlimits(config *CreateConfig, g *generate.Generator) error {
   525  	var (
   526  		isRootless = rootless.IsRootless()
   527  		nofileSet  = false
   528  		nprocSet   = false
   529  	)
   530  
   531  	for _, u := range config.Resources.Ulimit {
   532  		if u == "host" {
   533  			if len(config.Resources.Ulimit) != 1 {
   534  				return errors.New("ulimit can use host only once")
   535  			}
   536  			g.Config.Process.Rlimits = nil
   537  			break
   538  		}
   539  
   540  		ul, err := units.ParseUlimit(u)
   541  		if err != nil {
   542  			return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u)
   543  		}
   544  
   545  		if ul.Name == "nofile" {
   546  			nofileSet = true
   547  		} else if ul.Name == "nproc" {
   548  			nprocSet = true
   549  		}
   550  
   551  		g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft))
   552  	}
   553  
   554  	// If not explicitly overridden by the user, default number of open
   555  	// files and number of processes to the maximum they can be set to
   556  	// (without overriding a sysctl)
   557  	if !nofileSet {
   558  		max := define.RLimitDefaultValue
   559  		current := define.RLimitDefaultValue
   560  		if isRootless {
   561  			var rlimit unix.Rlimit
   562  			if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlimit); err != nil {
   563  				logrus.Warnf("failed to return RLIMIT_NOFILE ulimit %q", err)
   564  			}
   565  			if rlimit.Cur < current {
   566  				current = rlimit.Cur
   567  			}
   568  			if rlimit.Max < max {
   569  				max = rlimit.Max
   570  			}
   571  		}
   572  		g.AddProcessRlimits("RLIMIT_NOFILE", max, current)
   573  	}
   574  	if !nprocSet {
   575  		max := define.RLimitDefaultValue
   576  		current := define.RLimitDefaultValue
   577  		if isRootless {
   578  			var rlimit unix.Rlimit
   579  			if err := unix.Getrlimit(unix.RLIMIT_NPROC, &rlimit); err != nil {
   580  				logrus.Warnf("failed to return RLIMIT_NPROC ulimit %q", err)
   581  			}
   582  			if rlimit.Cur < current {
   583  				current = rlimit.Cur
   584  			}
   585  			if rlimit.Max < max {
   586  				max = rlimit.Max
   587  			}
   588  		}
   589  		g.AddProcessRlimits("RLIMIT_NPROC", max, current)
   590  	}
   591  
   592  	return nil
   593  }