github.com/containers/podman/v2@v2.2.2-0.20210501105131-c1e07d070c4c/pkg/spec/spec.go (about) 1 package createconfig 2 3 import ( 4 "strings" 5 6 "github.com/containers/common/pkg/capabilities" 7 cconfig "github.com/containers/common/pkg/config" 8 "github.com/containers/common/pkg/sysinfo" 9 "github.com/containers/podman/v2/libpod" 10 "github.com/containers/podman/v2/libpod/define" 11 "github.com/containers/podman/v2/pkg/cgroups" 12 "github.com/containers/podman/v2/pkg/env" 13 "github.com/containers/podman/v2/pkg/rootless" 14 "github.com/containers/podman/v2/pkg/util" 15 "github.com/docker/go-units" 16 "github.com/opencontainers/runc/libcontainer/user" 17 spec "github.com/opencontainers/runtime-spec/specs-go" 18 "github.com/opencontainers/runtime-tools/generate" 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 "golang.org/x/sys/unix" 22 ) 23 24 const CpuPeriod = 100000 25 26 func GetAvailableGids() (int64, error) { 27 idMap, err := user.ParseIDMapFile("/proc/self/gid_map") 28 if err != nil { 29 return 0, err 30 } 31 count := int64(0) 32 for _, r := range idMap { 33 count += r.Count 34 } 35 return count, nil 36 } 37 38 // CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec 39 func (config *CreateConfig) createConfigToOCISpec(runtime *libpod.Runtime, userMounts []spec.Mount) (*spec.Spec, error) { 40 cgroupPerm := "ro" 41 g, err := generate.New("linux") 42 if err != nil { 43 return nil, err 44 } 45 // Remove the default /dev/shm mount to ensure we overwrite it 46 g.RemoveMount("/dev/shm") 47 g.HostSpecific = true 48 addCgroup := true 49 canMountSys := true 50 51 isRootless := rootless.IsRootless() 52 inUserNS := config.User.InNS(isRootless) 53 54 if inUserNS && config.Network.NetMode.IsHost() { 55 canMountSys = false 56 } 57 58 if config.Security.Privileged && canMountSys { 59 cgroupPerm = "rw" 60 g.RemoveMount("/sys") 61 sysMnt := spec.Mount{ 62 Destination: "/sys", 63 Type: "sysfs", 64 Source: "sysfs", 65 Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"}, 66 } 67 g.AddMount(sysMnt) 68 } else if !canMountSys { 69 addCgroup = false 70 g.RemoveMount("/sys") 71 r := "ro" 72 if config.Security.Privileged { 73 r = "rw" 74 } 75 sysMnt := spec.Mount{ 76 Destination: "/sys", 77 Type: TypeBind, 78 Source: "/sys", 79 Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"}, 80 } 81 g.AddMount(sysMnt) 82 if !config.Security.Privileged && isRootless { 83 g.AddLinuxMaskedPaths("/sys/kernel") 84 } 85 } 86 var runtimeConfig *cconfig.Config 87 88 if runtime != nil { 89 runtimeConfig, err = runtime.GetConfig() 90 if err != nil { 91 return nil, err 92 } 93 g.Config.Process.Capabilities.Bounding = runtimeConfig.Containers.DefaultCapabilities 94 sysctls, err := util.ValidateSysctls(runtimeConfig.Containers.DefaultSysctls) 95 if err != nil { 96 return nil, err 97 } 98 99 for name, val := range config.Security.Sysctl { 100 sysctls[name] = val 101 } 102 config.Security.Sysctl = sysctls 103 if !util.StringInSlice("host", config.Resources.Ulimit) { 104 config.Resources.Ulimit = append(runtimeConfig.Containers.DefaultUlimits, config.Resources.Ulimit...) 105 } 106 if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() { 107 config.Resources.PidsLimit = runtimeConfig.Containers.PidsLimit 108 } 109 110 } else { 111 g.Config.Process.Capabilities.Bounding = cconfig.DefaultCapabilities 112 if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() { 113 config.Resources.PidsLimit = cconfig.DefaultPidsLimit 114 } 115 } 116 117 gid5Available := true 118 if isRootless { 119 nGids, err := GetAvailableGids() 120 if err != nil { 121 return nil, err 122 } 123 gid5Available = nGids >= 5 124 } 125 // When using a different user namespace, check that the GID 5 is mapped inside 126 // the container. 127 if gid5Available && len(config.User.IDMappings.GIDMap) > 0 { 128 mappingFound := false 129 for _, r := range config.User.IDMappings.GIDMap { 130 if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size { 131 mappingFound = true 132 break 133 } 134 } 135 if !mappingFound { 136 gid5Available = false 137 } 138 139 } 140 if !gid5Available { 141 // If we have no GID mappings, the gid=5 default option would fail, so drop it. 142 g.RemoveMount("/dev/pts") 143 devPts := spec.Mount{ 144 Destination: "/dev/pts", 145 Type: "devpts", 146 Source: "devpts", 147 Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"}, 148 } 149 g.AddMount(devPts) 150 } 151 152 if inUserNS && config.Ipc.IpcMode.IsHost() { 153 g.RemoveMount("/dev/mqueue") 154 devMqueue := spec.Mount{ 155 Destination: "/dev/mqueue", 156 Type: TypeBind, 157 Source: "/dev/mqueue", 158 Options: []string{"bind", "nosuid", "noexec", "nodev"}, 159 } 160 g.AddMount(devMqueue) 161 } 162 if inUserNS && config.Pid.PidMode.IsHost() { 163 g.RemoveMount("/proc") 164 procMount := spec.Mount{ 165 Destination: "/proc", 166 Type: TypeBind, 167 Source: "/proc", 168 Options: []string{"rbind", "nosuid", "noexec", "nodev"}, 169 } 170 g.AddMount(procMount) 171 } 172 173 if addCgroup { 174 cgroupMnt := spec.Mount{ 175 Destination: "/sys/fs/cgroup", 176 Type: "cgroup", 177 Source: "cgroup", 178 Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm}, 179 } 180 g.AddMount(cgroupMnt) 181 } 182 g.SetProcessCwd(config.WorkDir) 183 184 ProcessArgs := make([]string, 0) 185 // We need to iterate the input for entrypoint because it is a []string 186 // but "" is a legit json input, which translates into a []string with an 187 // empty position. This messes up the eventual command being executed 188 // in the container 189 for _, a := range config.Entrypoint { 190 if len(a) > 0 { 191 ProcessArgs = append(ProcessArgs, a) 192 } 193 } 194 // Same issue as explained above for config.Entrypoint. 195 for _, a := range config.Command { 196 if len(a) > 0 { 197 ProcessArgs = append(ProcessArgs, a) 198 } 199 } 200 201 g.SetProcessArgs(ProcessArgs) 202 g.SetProcessTerminal(config.Tty) 203 204 for key, val := range config.Annotations { 205 g.AddAnnotation(key, val) 206 } 207 208 addedResources := false 209 210 // RESOURCES - MEMORY 211 if config.Resources.Memory != 0 { 212 g.SetLinuxResourcesMemoryLimit(config.Resources.Memory) 213 // If a swap limit is not explicitly set, also set a swap limit 214 // Default to double the memory limit 215 if config.Resources.MemorySwap == 0 { 216 g.SetLinuxResourcesMemorySwap(2 * config.Resources.Memory) 217 } 218 addedResources = true 219 } 220 if config.Resources.MemoryReservation != 0 { 221 g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation) 222 addedResources = true 223 } 224 if config.Resources.MemorySwap != 0 { 225 g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap) 226 addedResources = true 227 } 228 if config.Resources.KernelMemory != 0 { 229 g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory) 230 addedResources = true 231 } 232 if config.Resources.MemorySwappiness != -1 { 233 g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness)) 234 addedResources = true 235 } 236 g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller) 237 g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj) 238 239 // RESOURCES - CPU 240 if config.Resources.CPUShares != 0 { 241 g.SetLinuxResourcesCPUShares(config.Resources.CPUShares) 242 addedResources = true 243 } 244 if config.Resources.CPUQuota != 0 { 245 g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota) 246 addedResources = true 247 } 248 if config.Resources.CPUPeriod != 0 { 249 g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod) 250 addedResources = true 251 } 252 if config.Resources.CPUs != 0 { 253 g.SetLinuxResourcesCPUPeriod(CpuPeriod) 254 g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * CpuPeriod)) 255 addedResources = true 256 } 257 if config.Resources.CPURtRuntime != 0 { 258 g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime) 259 addedResources = true 260 } 261 if config.Resources.CPURtPeriod != 0 { 262 g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod) 263 addedResources = true 264 } 265 if config.Resources.CPUsetCPUs != "" { 266 g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs) 267 addedResources = true 268 } 269 if config.Resources.CPUsetMems != "" { 270 g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems) 271 addedResources = true 272 } 273 274 // Devices 275 if config.Security.Privileged { 276 // If privileged, we need to add all the host devices to the 277 // spec. We do not add the user provided ones because we are 278 // already adding them all. 279 if err := AddPrivilegedDevices(&g); err != nil { 280 return nil, err 281 } 282 } else { 283 for _, devicePath := range config.Devices { 284 if err := DevicesFromPath(&g, devicePath); err != nil { 285 return nil, err 286 } 287 } 288 if len(config.Resources.DeviceCgroupRules) != 0 { 289 if err := deviceCgroupRules(&g, config.Resources.DeviceCgroupRules); err != nil { 290 return nil, err 291 } 292 addedResources = true 293 } 294 } 295 296 g.SetProcessNoNewPrivileges(config.Security.NoNewPrivs) 297 298 if !config.Security.Privileged { 299 g.SetProcessApparmorProfile(config.Security.ApparmorProfile) 300 } 301 302 // Unless already set via the CLI, check if we need to disable process 303 // labels or set the defaults. 304 if len(config.Security.LabelOpts) == 0 && runtimeConfig != nil { 305 if !runtimeConfig.Containers.EnableLabeling { 306 // Disabled in the config. 307 config.Security.LabelOpts = append(config.Security.LabelOpts, "disable") 308 } else if err := config.Security.SetLabelOpts(runtime, &config.Pid, &config.Ipc); err != nil { 309 // Defaults! 310 return nil, err 311 } 312 } 313 314 BlockAccessToKernelFilesystems(config.Security.Privileged, config.Pid.PidMode.IsHost(), &g) 315 316 // RESOURCES - PIDS 317 if config.Resources.PidsLimit > 0 { 318 // if running on rootless on a cgroupv1 machine or using the cgroupfs manager, pids 319 // limit is not supported. If the value is still the default 320 // then ignore the settings. If the caller asked for a 321 // non-default, then try to use it. 322 setPidLimit := true 323 if rootless.IsRootless() { 324 cgroup2, err := cgroups.IsCgroup2UnifiedMode() 325 if err != nil { 326 return nil, err 327 } 328 if (!cgroup2 || (runtimeConfig != nil && runtimeConfig.Engine.CgroupManager != cconfig.SystemdCgroupsManager)) && config.Resources.PidsLimit == sysinfo.GetDefaultPidsLimit() { 329 setPidLimit = false 330 } 331 } 332 if setPidLimit { 333 g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit) 334 addedResources = true 335 } 336 } 337 338 // Make sure to always set the default variables unless overridden in the 339 // config. 340 var defaultEnv map[string]string 341 if runtimeConfig == nil { 342 defaultEnv = env.DefaultEnvVariables() 343 } else { 344 defaultEnv, err = env.ParseSlice(runtimeConfig.Containers.Env) 345 if err != nil { 346 return nil, errors.Wrap(err, "Env fields in containers.conf failed to parse") 347 } 348 defaultEnv = env.Join(env.DefaultEnvVariables(), defaultEnv) 349 } 350 351 if err := addRlimits(config, &g); err != nil { 352 return nil, err 353 } 354 355 // NAMESPACES 356 357 if err := config.Pid.ConfigureGenerator(&g); err != nil { 358 return nil, err 359 } 360 361 if err := config.User.ConfigureGenerator(&g); err != nil { 362 return nil, err 363 } 364 365 if err := config.Network.ConfigureGenerator(&g); err != nil { 366 return nil, err 367 } 368 369 if err := config.Uts.ConfigureGenerator(&g, &config.Network, runtime); err != nil { 370 return nil, err 371 } 372 373 if err := config.Ipc.ConfigureGenerator(&g); err != nil { 374 return nil, err 375 } 376 377 if err := config.Cgroup.ConfigureGenerator(&g); err != nil { 378 return nil, err 379 } 380 381 config.Env = env.Join(defaultEnv, config.Env) 382 for name, val := range config.Env { 383 g.AddProcessEnv(name, val) 384 } 385 configSpec := g.Config 386 387 // If the container image specifies an label with a 388 // capabilities.ContainerImageLabel then split the comma separated list 389 // of capabilities and record them. This list indicates the only 390 // capabilities, required to run the container. 391 var capRequired []string 392 for key, val := range config.Labels { 393 if util.StringInSlice(key, capabilities.ContainerImageLabels) { 394 capRequired = strings.Split(val, ",") 395 } 396 } 397 config.Security.CapRequired = capRequired 398 399 if err := config.Security.ConfigureGenerator(&g, &config.User); err != nil { 400 return nil, err 401 } 402 403 // BIND MOUNTS 404 configSpec.Mounts = SupercedeUserMounts(userMounts, configSpec.Mounts) 405 // Process mounts to ensure correct options 406 if err := InitFSMounts(configSpec.Mounts); err != nil { 407 return nil, err 408 } 409 410 // BLOCK IO 411 blkio, err := config.CreateBlockIO() 412 if err != nil { 413 return nil, errors.Wrapf(err, "error creating block io") 414 } 415 if blkio != nil { 416 configSpec.Linux.Resources.BlockIO = blkio 417 addedResources = true 418 } 419 420 if rootless.IsRootless() { 421 cgroup2, err := cgroups.IsCgroup2UnifiedMode() 422 if err != nil { 423 return nil, err 424 } 425 if !addedResources { 426 configSpec.Linux.Resources = &spec.LinuxResources{} 427 } 428 429 canUseResources := cgroup2 && runtimeConfig != nil && (runtimeConfig.Engine.CgroupManager == cconfig.SystemdCgroupsManager) 430 431 if addedResources && !canUseResources { 432 return nil, errors.New("invalid configuration, cannot specify resource limits without cgroups v2 and --cgroup-manager=systemd") 433 } 434 if !canUseResources { 435 // Force the resources block to be empty instead of having default values. 436 configSpec.Linux.Resources = &spec.LinuxResources{} 437 } 438 } 439 440 switch config.Cgroup.Cgroups { 441 case "disabled": 442 if addedResources { 443 return nil, errors.New("cannot specify resource limits when cgroups are disabled is specified") 444 } 445 configSpec.Linux.Resources = &spec.LinuxResources{} 446 case "enabled", "no-conmon", "": 447 // Do nothing 448 default: 449 return nil, errors.New("unrecognized option for cgroups; supported are 'default', 'disabled', 'no-conmon'") 450 } 451 452 // Add annotations 453 if configSpec.Annotations == nil { 454 configSpec.Annotations = make(map[string]string) 455 } 456 457 if config.CidFile != "" { 458 configSpec.Annotations[define.InspectAnnotationCIDFile] = config.CidFile 459 } 460 461 if config.Rm { 462 configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseTrue 463 } else { 464 configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseFalse 465 } 466 467 if len(config.VolumesFrom) > 0 { 468 configSpec.Annotations[define.InspectAnnotationVolumesFrom] = strings.Join(config.VolumesFrom, ",") 469 } 470 471 if config.Security.Privileged { 472 configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseTrue 473 } else { 474 configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseFalse 475 } 476 477 if config.Init { 478 configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue 479 } else { 480 configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseFalse 481 } 482 483 return configSpec, nil 484 } 485 486 func (config *CreateConfig) cgroupDisabled() bool { 487 return config.Cgroup.Cgroups == "disabled" 488 } 489 490 func BlockAccessToKernelFilesystems(privileged, pidModeIsHost bool, g *generate.Generator) { 491 if !privileged { 492 for _, mp := range []string{ 493 "/proc/acpi", 494 "/proc/kcore", 495 "/proc/keys", 496 "/proc/latency_stats", 497 "/proc/timer_list", 498 "/proc/timer_stats", 499 "/proc/sched_debug", 500 "/proc/scsi", 501 "/sys/firmware", 502 "/sys/fs/selinux", 503 } { 504 g.AddLinuxMaskedPaths(mp) 505 } 506 507 if pidModeIsHost && rootless.IsRootless() { 508 return 509 } 510 511 for _, rp := range []string{ 512 "/proc/asound", 513 "/proc/bus", 514 "/proc/fs", 515 "/proc/irq", 516 "/proc/sys", 517 "/proc/sysrq-trigger", 518 } { 519 g.AddLinuxReadonlyPaths(rp) 520 } 521 } 522 } 523 524 func addRlimits(config *CreateConfig, g *generate.Generator) error { 525 var ( 526 isRootless = rootless.IsRootless() 527 nofileSet = false 528 nprocSet = false 529 ) 530 531 for _, u := range config.Resources.Ulimit { 532 if u == "host" { 533 if len(config.Resources.Ulimit) != 1 { 534 return errors.New("ulimit can use host only once") 535 } 536 g.Config.Process.Rlimits = nil 537 break 538 } 539 540 ul, err := units.ParseUlimit(u) 541 if err != nil { 542 return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u) 543 } 544 545 if ul.Name == "nofile" { 546 nofileSet = true 547 } else if ul.Name == "nproc" { 548 nprocSet = true 549 } 550 551 g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft)) 552 } 553 554 // If not explicitly overridden by the user, default number of open 555 // files and number of processes to the maximum they can be set to 556 // (without overriding a sysctl) 557 if !nofileSet { 558 max := define.RLimitDefaultValue 559 current := define.RLimitDefaultValue 560 if isRootless { 561 var rlimit unix.Rlimit 562 if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlimit); err != nil { 563 logrus.Warnf("failed to return RLIMIT_NOFILE ulimit %q", err) 564 } 565 if rlimit.Cur < current { 566 current = rlimit.Cur 567 } 568 if rlimit.Max < max { 569 max = rlimit.Max 570 } 571 } 572 g.AddProcessRlimits("RLIMIT_NOFILE", max, current) 573 } 574 if !nprocSet { 575 max := define.RLimitDefaultValue 576 current := define.RLimitDefaultValue 577 if isRootless { 578 var rlimit unix.Rlimit 579 if err := unix.Getrlimit(unix.RLIMIT_NPROC, &rlimit); err != nil { 580 logrus.Warnf("failed to return RLIMIT_NPROC ulimit %q", err) 581 } 582 if rlimit.Cur < current { 583 current = rlimit.Cur 584 } 585 if rlimit.Max < max { 586 max = rlimit.Max 587 } 588 } 589 g.AddProcessRlimits("RLIMIT_NPROC", max, current) 590 } 591 592 return nil 593 }