github.com/containers/libpod@v1.9.4-0.20220419124438-4284fd425507/pkg/spec/spec.go (about) 1 package createconfig 2 3 import ( 4 "strings" 5 6 "github.com/containers/common/pkg/capabilities" 7 cconfig "github.com/containers/common/pkg/config" 8 "github.com/containers/libpod/libpod" 9 "github.com/containers/libpod/pkg/cgroups" 10 "github.com/containers/libpod/pkg/env" 11 "github.com/containers/libpod/pkg/rootless" 12 "github.com/containers/libpod/pkg/sysinfo" 13 "github.com/containers/libpod/pkg/util" 14 "github.com/docker/go-units" 15 "github.com/opencontainers/runc/libcontainer/user" 16 spec "github.com/opencontainers/runtime-spec/specs-go" 17 "github.com/opencontainers/runtime-tools/generate" 18 "github.com/pkg/errors" 19 ) 20 21 const ( 22 CpuPeriod = 100000 23 kernelMax uint64 = 1048576 24 ) 25 26 func GetAvailableGids() (int64, error) { 27 idMap, err := user.ParseIDMapFile("/proc/self/gid_map") 28 if err != nil { 29 return 0, err 30 } 31 count := int64(0) 32 for _, r := range idMap { 33 count += r.Count 34 } 35 return count, nil 36 } 37 38 // CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec 39 func (config *CreateConfig) createConfigToOCISpec(runtime *libpod.Runtime, userMounts []spec.Mount) (*spec.Spec, error) { 40 cgroupPerm := "ro" 41 g, err := generate.New("linux") 42 if err != nil { 43 return nil, err 44 } 45 // Remove the default /dev/shm mount to ensure we overwrite it 46 g.RemoveMount("/dev/shm") 47 g.HostSpecific = true 48 addCgroup := true 49 canMountSys := true 50 51 isRootless := rootless.IsRootless() 52 inUserNS := config.User.InNS(isRootless) 53 54 if inUserNS && config.Network.NetMode.IsHost() { 55 canMountSys = false 56 } 57 58 if config.Security.Privileged && canMountSys { 59 cgroupPerm = "rw" 60 g.RemoveMount("/sys") 61 sysMnt := spec.Mount{ 62 Destination: "/sys", 63 Type: "sysfs", 64 Source: "sysfs", 65 Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"}, 66 } 67 g.AddMount(sysMnt) 68 } else if !canMountSys { 69 addCgroup = false 70 g.RemoveMount("/sys") 71 r := "ro" 72 if config.Security.Privileged { 73 r = "rw" 74 } 75 sysMnt := spec.Mount{ 76 Destination: "/sys", 77 Type: TypeBind, 78 Source: "/sys", 79 Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"}, 80 } 81 g.AddMount(sysMnt) 82 if !config.Security.Privileged && isRootless { 83 g.AddLinuxMaskedPaths("/sys/kernel") 84 } 85 } 86 var runtimeConfig *cconfig.Config 87 88 if runtime != nil { 89 runtimeConfig, err = runtime.GetConfig() 90 if err != nil { 91 return nil, err 92 } 93 g.Config.Process.Capabilities.Bounding = runtimeConfig.Containers.DefaultCapabilities 94 sysctls, err := util.ValidateSysctls(runtimeConfig.Containers.DefaultSysctls) 95 if err != nil { 96 return nil, err 97 } 98 99 for name, val := range config.Security.Sysctl { 100 sysctls[name] = val 101 } 102 config.Security.Sysctl = sysctls 103 if !util.StringInSlice("host", config.Resources.Ulimit) { 104 config.Resources.Ulimit = append(runtimeConfig.Containers.DefaultUlimits, config.Resources.Ulimit...) 105 } 106 if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() { 107 config.Resources.PidsLimit = runtimeConfig.Containers.PidsLimit 108 } 109 110 } else { 111 g.Config.Process.Capabilities.Bounding = cconfig.DefaultCapabilities 112 if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() { 113 config.Resources.PidsLimit = cconfig.DefaultPidsLimit 114 } 115 } 116 117 gid5Available := true 118 if isRootless { 119 nGids, err := GetAvailableGids() 120 if err != nil { 121 return nil, err 122 } 123 gid5Available = nGids >= 5 124 } 125 // When using a different user namespace, check that the GID 5 is mapped inside 126 // the container. 127 if gid5Available && len(config.User.IDMappings.GIDMap) > 0 { 128 mappingFound := false 129 for _, r := range config.User.IDMappings.GIDMap { 130 if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size { 131 mappingFound = true 132 break 133 } 134 } 135 if !mappingFound { 136 gid5Available = false 137 } 138 139 } 140 if !gid5Available { 141 // If we have no GID mappings, the gid=5 default option would fail, so drop it. 142 g.RemoveMount("/dev/pts") 143 devPts := spec.Mount{ 144 Destination: "/dev/pts", 145 Type: "devpts", 146 Source: "devpts", 147 Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"}, 148 } 149 g.AddMount(devPts) 150 } 151 152 if inUserNS && config.Ipc.IpcMode.IsHost() { 153 g.RemoveMount("/dev/mqueue") 154 devMqueue := spec.Mount{ 155 Destination: "/dev/mqueue", 156 Type: TypeBind, 157 Source: "/dev/mqueue", 158 Options: []string{"bind", "nosuid", "noexec", "nodev"}, 159 } 160 g.AddMount(devMqueue) 161 } 162 if inUserNS && config.Pid.PidMode.IsHost() { 163 g.RemoveMount("/proc") 164 procMount := spec.Mount{ 165 Destination: "/proc", 166 Type: TypeBind, 167 Source: "/proc", 168 Options: []string{"rbind", "nosuid", "noexec", "nodev"}, 169 } 170 g.AddMount(procMount) 171 } 172 173 if addCgroup { 174 cgroupMnt := spec.Mount{ 175 Destination: "/sys/fs/cgroup", 176 Type: "cgroup", 177 Source: "cgroup", 178 Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm}, 179 } 180 g.AddMount(cgroupMnt) 181 } 182 g.SetProcessCwd(config.WorkDir) 183 g.SetProcessArgs(config.Command) 184 g.SetProcessTerminal(config.Tty) 185 186 for key, val := range config.Annotations { 187 g.AddAnnotation(key, val) 188 } 189 190 addedResources := false 191 192 // RESOURCES - MEMORY 193 if config.Resources.Memory != 0 { 194 g.SetLinuxResourcesMemoryLimit(config.Resources.Memory) 195 // If a swap limit is not explicitly set, also set a swap limit 196 // Default to double the memory limit 197 if config.Resources.MemorySwap == 0 { 198 g.SetLinuxResourcesMemorySwap(2 * config.Resources.Memory) 199 } 200 addedResources = true 201 } 202 if config.Resources.MemoryReservation != 0 { 203 g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation) 204 addedResources = true 205 } 206 if config.Resources.MemorySwap != 0 { 207 g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap) 208 addedResources = true 209 } 210 if config.Resources.KernelMemory != 0 { 211 g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory) 212 addedResources = true 213 } 214 if config.Resources.MemorySwappiness != -1 { 215 g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness)) 216 addedResources = true 217 } 218 g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller) 219 g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj) 220 221 // RESOURCES - CPU 222 if config.Resources.CPUShares != 0 { 223 g.SetLinuxResourcesCPUShares(config.Resources.CPUShares) 224 addedResources = true 225 } 226 if config.Resources.CPUQuota != 0 { 227 g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota) 228 addedResources = true 229 } 230 if config.Resources.CPUPeriod != 0 { 231 g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod) 232 addedResources = true 233 } 234 if config.Resources.CPUs != 0 { 235 g.SetLinuxResourcesCPUPeriod(CpuPeriod) 236 g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * CpuPeriod)) 237 addedResources = true 238 } 239 if config.Resources.CPURtRuntime != 0 { 240 g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime) 241 addedResources = true 242 } 243 if config.Resources.CPURtPeriod != 0 { 244 g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod) 245 addedResources = true 246 } 247 if config.Resources.CPUsetCPUs != "" { 248 g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs) 249 addedResources = true 250 } 251 if config.Resources.CPUsetMems != "" { 252 g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems) 253 addedResources = true 254 } 255 256 // Devices 257 if config.Security.Privileged { 258 // If privileged, we need to add all the host devices to the 259 // spec. We do not add the user provided ones because we are 260 // already adding them all. 261 if err := AddPrivilegedDevices(&g); err != nil { 262 return nil, err 263 } 264 } else { 265 for _, devicePath := range config.Devices { 266 if err := DevicesFromPath(&g, devicePath); err != nil { 267 return nil, err 268 } 269 } 270 if len(config.Resources.DeviceCgroupRules) != 0 { 271 if err := deviceCgroupRules(&g, config.Resources.DeviceCgroupRules); err != nil { 272 return nil, err 273 } 274 addedResources = true 275 } 276 } 277 278 g.SetProcessNoNewPrivileges(config.Security.NoNewPrivs) 279 280 if !config.Security.Privileged { 281 g.SetProcessApparmorProfile(config.Security.ApparmorProfile) 282 } 283 284 // Unless already set via the CLI, check if we need to disable process 285 // labels or set the defaults. 286 if len(config.Security.LabelOpts) == 0 && runtimeConfig != nil { 287 if !runtimeConfig.Containers.EnableLabeling { 288 // Disabled in the config. 289 config.Security.LabelOpts = append(config.Security.LabelOpts, "disable") 290 } else if err := config.Security.SetLabelOpts(runtime, &config.Pid, &config.Ipc); err != nil { 291 // Defaults! 292 return nil, err 293 } 294 } 295 296 BlockAccessToKernelFilesystems(config.Security.Privileged, config.Pid.PidMode.IsHost(), &g) 297 298 // RESOURCES - PIDS 299 if config.Resources.PidsLimit > 0 { 300 // if running on rootless on a cgroupv1 machine or using the cgroupfs manager, pids 301 // limit is not supported. If the value is still the default 302 // then ignore the settings. If the caller asked for a 303 // non-default, then try to use it. 304 setPidLimit := true 305 if rootless.IsRootless() { 306 cgroup2, err := cgroups.IsCgroup2UnifiedMode() 307 if err != nil { 308 return nil, err 309 } 310 if (!cgroup2 || (runtimeConfig != nil && runtimeConfig.Engine.CgroupManager != cconfig.SystemdCgroupsManager)) && config.Resources.PidsLimit == sysinfo.GetDefaultPidsLimit() { 311 setPidLimit = false 312 } 313 } 314 if setPidLimit { 315 g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit) 316 addedResources = true 317 } 318 } 319 320 // Make sure to always set the default variables unless overridden in the 321 // config. 322 var defaultEnv map[string]string 323 if runtimeConfig == nil { 324 defaultEnv = env.DefaultEnvVariables 325 } else { 326 defaultEnv, err = env.ParseSlice(runtimeConfig.Containers.Env) 327 if err != nil { 328 return nil, errors.Wrap(err, "Env fields in containers.conf failed ot parse") 329 } 330 defaultEnv = env.Join(env.DefaultEnvVariables, defaultEnv) 331 } 332 333 if err := addRlimits(config, &g); err != nil { 334 return nil, err 335 } 336 337 // NAMESPACES 338 339 if err := config.Pid.ConfigureGenerator(&g); err != nil { 340 return nil, err 341 } 342 343 if err := config.User.ConfigureGenerator(&g); err != nil { 344 return nil, err 345 } 346 347 if err := config.Network.ConfigureGenerator(&g); err != nil { 348 return nil, err 349 } 350 351 if err := config.Uts.ConfigureGenerator(&g, &config.Network, runtime); err != nil { 352 return nil, err 353 } 354 355 if err := config.Ipc.ConfigureGenerator(&g); err != nil { 356 return nil, err 357 } 358 359 if err := config.Cgroup.ConfigureGenerator(&g); err != nil { 360 return nil, err 361 } 362 363 config.Env = env.Join(defaultEnv, config.Env) 364 for name, val := range config.Env { 365 g.AddProcessEnv(name, val) 366 } 367 configSpec := g.Config 368 369 // If the container image specifies an label with a 370 // capabilities.ContainerImageLabel then split the comma separated list 371 // of capabilities and record them. This list indicates the only 372 // capabilities, required to run the container. 373 var capRequired []string 374 for key, val := range config.Labels { 375 if util.StringInSlice(key, capabilities.ContainerImageLabels) { 376 capRequired = strings.Split(val, ",") 377 } 378 } 379 config.Security.CapRequired = capRequired 380 381 if err := config.Security.ConfigureGenerator(&g, &config.User); err != nil { 382 return nil, err 383 } 384 385 // BIND MOUNTS 386 configSpec.Mounts = SupercedeUserMounts(userMounts, configSpec.Mounts) 387 // Process mounts to ensure correct options 388 if err := InitFSMounts(configSpec.Mounts); err != nil { 389 return nil, err 390 } 391 392 // BLOCK IO 393 blkio, err := config.CreateBlockIO() 394 if err != nil { 395 return nil, errors.Wrapf(err, "error creating block io") 396 } 397 if blkio != nil { 398 configSpec.Linux.Resources.BlockIO = blkio 399 addedResources = true 400 } 401 402 if rootless.IsRootless() { 403 cgroup2, err := cgroups.IsCgroup2UnifiedMode() 404 if err != nil { 405 return nil, err 406 } 407 if !addedResources { 408 configSpec.Linux.Resources = &spec.LinuxResources{} 409 } 410 411 canUseResources := cgroup2 && runtimeConfig != nil && (runtimeConfig.Engine.CgroupManager == cconfig.SystemdCgroupsManager) 412 413 if addedResources && !canUseResources { 414 return nil, errors.New("invalid configuration, cannot specify resource limits without cgroups v2 and --cgroup-manager=systemd") 415 } 416 if !canUseResources { 417 // Force the resources block to be empty instead of having default values. 418 configSpec.Linux.Resources = &spec.LinuxResources{} 419 } 420 } 421 422 switch config.Cgroup.Cgroups { 423 case "disabled": 424 if addedResources { 425 return nil, errors.New("cannot specify resource limits when cgroups are disabled is specified") 426 } 427 configSpec.Linux.Resources = &spec.LinuxResources{} 428 case "enabled", "no-conmon", "": 429 // Do nothing 430 default: 431 return nil, errors.New("unrecognized option for cgroups; supported are 'default', 'disabled', 'no-conmon'") 432 } 433 434 // Add annotations 435 if configSpec.Annotations == nil { 436 configSpec.Annotations = make(map[string]string) 437 } 438 439 if config.CidFile != "" { 440 configSpec.Annotations[libpod.InspectAnnotationCIDFile] = config.CidFile 441 } 442 443 if config.Rm { 444 configSpec.Annotations[libpod.InspectAnnotationAutoremove] = libpod.InspectResponseTrue 445 } else { 446 configSpec.Annotations[libpod.InspectAnnotationAutoremove] = libpod.InspectResponseFalse 447 } 448 449 if len(config.VolumesFrom) > 0 { 450 configSpec.Annotations[libpod.InspectAnnotationVolumesFrom] = strings.Join(config.VolumesFrom, ",") 451 } 452 453 if config.Security.Privileged { 454 configSpec.Annotations[libpod.InspectAnnotationPrivileged] = libpod.InspectResponseTrue 455 } else { 456 configSpec.Annotations[libpod.InspectAnnotationPrivileged] = libpod.InspectResponseFalse 457 } 458 459 if config.Init { 460 configSpec.Annotations[libpod.InspectAnnotationInit] = libpod.InspectResponseTrue 461 } else { 462 configSpec.Annotations[libpod.InspectAnnotationInit] = libpod.InspectResponseFalse 463 } 464 465 return configSpec, nil 466 } 467 468 func (config *CreateConfig) cgroupDisabled() bool { 469 return config.Cgroup.Cgroups == "disabled" 470 } 471 472 func BlockAccessToKernelFilesystems(privileged, pidModeIsHost bool, g *generate.Generator) { 473 if !privileged { 474 for _, mp := range []string{ 475 "/proc/acpi", 476 "/proc/kcore", 477 "/proc/keys", 478 "/proc/latency_stats", 479 "/proc/timer_list", 480 "/proc/timer_stats", 481 "/proc/sched_debug", 482 "/proc/scsi", 483 "/sys/firmware", 484 "/sys/fs/selinux", 485 } { 486 g.AddLinuxMaskedPaths(mp) 487 } 488 489 if pidModeIsHost && rootless.IsRootless() { 490 return 491 } 492 493 for _, rp := range []string{ 494 "/proc/asound", 495 "/proc/bus", 496 "/proc/fs", 497 "/proc/irq", 498 "/proc/sys", 499 "/proc/sysrq-trigger", 500 } { 501 g.AddLinuxReadonlyPaths(rp) 502 } 503 } 504 } 505 506 func addRlimits(config *CreateConfig, g *generate.Generator) error { 507 var ( 508 nofileSet = false 509 nprocSet = false 510 ) 511 512 for _, u := range config.Resources.Ulimit { 513 if u == "host" { 514 if len(config.Resources.Ulimit) != 1 { 515 return errors.New("ulimit can use host only once") 516 } 517 g.Config.Process.Rlimits = nil 518 break 519 } 520 521 ul, err := units.ParseUlimit(u) 522 if err != nil { 523 return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u) 524 } 525 526 if ul.Name == "nofile" { 527 nofileSet = true 528 } else if ul.Name == "nproc" { 529 nprocSet = true 530 } 531 532 g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft)) 533 } 534 535 // If not explicitly overridden by the user, default number of open 536 // files and number of processes to the maximum they can be set to 537 // (without overriding a sysctl) 538 if !nofileSet { 539 current, max := getNOFILESettings() 540 g.AddProcessRlimits("RLIMIT_NOFILE", current, max) 541 } 542 if !nprocSet { 543 current, max := getNPROCSettings() 544 g.AddProcessRlimits("RLIMIT_NPROC", current, max) 545 } 546 547 return nil 548 }