github.com/containers/podman/v4@v4.9.4/pkg/specgen/generate/oci_linux.go (about) 1 //go:build !remote 2 // +build !remote 3 4 package generate 5 6 import ( 7 "context" 8 "encoding/json" 9 "fmt" 10 "path" 11 "strings" 12 13 "github.com/containers/common/libimage" 14 "github.com/containers/common/pkg/cgroups" 15 "github.com/containers/common/pkg/config" 16 "github.com/containers/podman/v4/libpod" 17 "github.com/containers/podman/v4/libpod/define" 18 "github.com/containers/podman/v4/pkg/rootless" 19 "github.com/containers/podman/v4/pkg/specgen" 20 "github.com/docker/go-units" 21 spec "github.com/opencontainers/runtime-spec/specs-go" 22 "github.com/opencontainers/runtime-tools/generate" 23 "github.com/sirupsen/logrus" 24 "golang.org/x/sys/unix" 25 ) 26 27 func setProcOpts(s *specgen.SpecGenerator, g *generate.Generator) { 28 if s.ProcOpts == nil { 29 return 30 } 31 for i := range g.Config.Mounts { 32 if g.Config.Mounts[i].Destination == "/proc" { 33 g.Config.Mounts[i].Options = s.ProcOpts 34 return 35 } 36 } 37 } 38 39 func setDevOptsReadOnly(g *generate.Generator) { 40 for i := range g.Config.Mounts { 41 if g.Config.Mounts[i].Destination == "/dev" { 42 g.Config.Mounts[i].Options = append(g.Config.Mounts[i].Options, "ro") 43 return 44 } 45 } 46 } 47 48 // canMountSys is a best-effort heuristic to detect whether mounting a new sysfs is permitted in the container 49 func canMountSys(isRootless, isNewUserns bool, s *specgen.SpecGenerator) bool { 50 if s.NetNS.IsHost() && (isRootless || isNewUserns) { 51 return false 52 } 53 if isNewUserns { 54 switch s.NetNS.NSMode { 55 case specgen.Slirp, specgen.Pasta, specgen.Private, specgen.NoNetwork, specgen.Bridge: 56 return true 57 default: 58 return false 59 } 60 } 61 return true 62 } 63 64 func getCgroupPermissions(unmask []string) string { 65 ro := "ro" 66 rw := "rw" 67 cgroup := "/sys/fs/cgroup" 68 69 cgroupv2, _ := cgroups.IsCgroup2UnifiedMode() 70 if !cgroupv2 { 71 return ro 72 } 73 74 if len(unmask) != 0 && unmask[0] == "ALL" { 75 return rw 76 } 77 78 for _, p := range unmask { 79 if path.Clean(p) == cgroup { 80 return rw 81 } 82 } 83 return ro 84 } 85 86 // SpecGenToOCI returns the base configuration for the container. 87 func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runtime, rtc *config.Config, newImage *libimage.Image, mounts []spec.Mount, pod *libpod.Pod, finalCmd []string, compatibleOptions *libpod.InfraInherit) (*spec.Spec, error) { 88 cgroupPerm := getCgroupPermissions(s.Unmask) 89 90 g, err := generate.New("linux") 91 if err != nil { 92 return nil, err 93 } 94 // Remove the default /dev/shm mount to ensure we overwrite it 95 g.RemoveMount("/dev/shm") 96 g.HostSpecific = true 97 addCgroup := true 98 99 isRootless := rootless.IsRootless() 100 isNewUserns := s.UserNS.IsContainer() || s.UserNS.IsPath() || s.UserNS.IsPrivate() || s.UserNS.IsPod() || s.UserNS.IsAuto() 101 102 canMountSys := canMountSys(isRootless, isNewUserns, s) 103 104 if s.Privileged && canMountSys { 105 cgroupPerm = "rw" 106 g.RemoveMount("/sys") 107 sysMnt := spec.Mount{ 108 Destination: "/sys", 109 Type: "sysfs", 110 Source: "sysfs", 111 Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"}, 112 } 113 g.AddMount(sysMnt) 114 } 115 if !canMountSys { 116 addCgroup = false 117 g.RemoveMount("/sys") 118 r := "ro" 119 if s.Privileged { 120 r = "rw" 121 } 122 sysMnt := spec.Mount{ 123 Destination: "/sys", 124 Type: define.TypeBind, 125 Source: "/sys", 126 Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"}, 127 } 128 g.AddMount(sysMnt) 129 g.RemoveMount("/sys/fs/cgroup") 130 131 sysFsCgroupMnt := spec.Mount{ 132 Destination: "/sys/fs/cgroup", 133 Type: "cgroup", 134 Source: "/sys/fs/cgroup", 135 Options: []string{"rprivate", "nosuid", "noexec", "nodev", r}, 136 } 137 g.AddMount(sysFsCgroupMnt) 138 if !s.Privileged && isRootless { 139 g.AddLinuxMaskedPaths("/sys/kernel") 140 } 141 } 142 gid5Available := true 143 if isRootless { 144 nGids, err := rootless.GetAvailableGids() 145 if err != nil { 146 return nil, err 147 } 148 gid5Available = nGids >= 5 149 } 150 // When using a different user namespace, check that the GID 5 is mapped inside 151 // the container. 152 if gid5Available && (s.IDMappings != nil && len(s.IDMappings.GIDMap) > 0) { 153 mappingFound := false 154 for _, r := range s.IDMappings.GIDMap { 155 if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size { 156 mappingFound = true 157 break 158 } 159 } 160 if !mappingFound { 161 gid5Available = false 162 } 163 } 164 if !gid5Available { 165 // If we have no GID mappings, the gid=5 default option would fail, so drop it. 166 g.RemoveMount("/dev/pts") 167 devPts := spec.Mount{ 168 Destination: "/dev/pts", 169 Type: define.TypeDevpts, 170 Source: define.TypeDevpts, 171 Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"}, 172 } 173 g.AddMount(devPts) 174 } 175 176 inUserNS := isRootless || isNewUserns 177 178 if inUserNS && s.IpcNS.IsHost() { 179 g.RemoveMount("/dev/mqueue") 180 devMqueue := spec.Mount{ 181 Destination: "/dev/mqueue", 182 Type: define.TypeBind, // constant ? 183 Source: "/dev/mqueue", 184 Options: []string{define.TypeBind, "nosuid", "noexec", "nodev"}, 185 } 186 g.AddMount(devMqueue) 187 } 188 if inUserNS && s.PidNS.IsHost() { 189 g.RemoveMount("/proc") 190 procMount := spec.Mount{ 191 Destination: "/proc", 192 Type: define.TypeBind, 193 Source: "/proc", 194 Options: []string{"rbind", "nosuid", "noexec", "nodev"}, 195 } 196 g.AddMount(procMount) 197 } 198 199 if addCgroup { 200 cgroupMnt := spec.Mount{ 201 Destination: "/sys/fs/cgroup", 202 Type: "cgroup", 203 Source: "cgroup", 204 Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm}, 205 } 206 g.AddMount(cgroupMnt) 207 } 208 209 g.Config.Linux.Personality = s.Personality 210 211 g.SetProcessCwd(s.WorkDir) 212 213 g.SetProcessArgs(finalCmd) 214 215 g.SetProcessTerminal(s.Terminal) 216 217 for key, val := range s.Annotations { 218 g.AddAnnotation(key, val) 219 } 220 221 if s.IntelRdt != nil { 222 if s.IntelRdt.ClosID != "" { 223 g.SetLinuxIntelRdtClosID(s.IntelRdt.ClosID) 224 } 225 } 226 227 if s.ResourceLimits != nil { 228 out, err := json.Marshal(s.ResourceLimits) 229 if err != nil { 230 return nil, err 231 } 232 err = json.Unmarshal(out, g.Config.Linux.Resources) 233 if err != nil { 234 return nil, err 235 } 236 g.Config.Linux.Resources = s.ResourceLimits 237 } 238 239 weightDevices, err := WeightDevices(s.WeightDevice) 240 if err != nil { 241 return nil, err 242 } 243 if len(weightDevices) > 0 { 244 for _, dev := range weightDevices { 245 g.AddLinuxResourcesBlockIOWeightDevice(dev.Major, dev.Minor, *dev.Weight) 246 } 247 } 248 249 // Devices 250 // set the default rule at the beginning of device configuration 251 if !inUserNS && !s.Privileged { 252 g.AddLinuxResourcesDevice(false, "", nil, nil, "rwm") 253 } 254 255 var userDevices []spec.LinuxDevice 256 257 if !s.Privileged { 258 // add default devices from containers.conf 259 for _, device := range rtc.Containers.Devices.Get() { 260 if err = DevicesFromPath(&g, device); err != nil { 261 return nil, err 262 } 263 } 264 if len(compatibleOptions.HostDeviceList) > 0 && len(s.Devices) == 0 { 265 userDevices = compatibleOptions.HostDeviceList 266 } else { 267 userDevices = s.Devices 268 } 269 // add default devices specified by caller 270 for _, device := range userDevices { 271 if err = DevicesFromPath(&g, device.Path); err != nil { 272 return nil, err 273 } 274 } 275 } 276 s.HostDeviceList = userDevices 277 278 // set the devices cgroup when not running in a user namespace 279 if isRootless && len(s.DeviceCgroupRule) > 0 { 280 return nil, fmt.Errorf("device cgroup rules are not supported in rootless mode or in a user namespace") 281 } 282 if !isRootless && !s.Privileged { 283 for _, dev := range s.DeviceCgroupRule { 284 g.AddLinuxResourcesDevice(true, dev.Type, dev.Major, dev.Minor, dev.Access) 285 } 286 } 287 288 BlockAccessToKernelFilesystems(s.Privileged, s.PidNS.IsHost(), s.Mask, s.Unmask, &g) 289 290 g.ClearProcessEnv() 291 for name, val := range s.Env { 292 g.AddProcessEnv(name, val) 293 } 294 295 addRlimits(s, &g) 296 297 // NAMESPACES 298 if err := specConfigureNamespaces(s, &g, rt, pod); err != nil { 299 return nil, err 300 } 301 configSpec := g.Config 302 303 if err := securityConfigureGenerator(s, &g, newImage, rtc); err != nil { 304 return nil, err 305 } 306 307 // BIND MOUNTS 308 configSpec.Mounts = SupersedeUserMounts(mounts, configSpec.Mounts) 309 // Process mounts to ensure correct options 310 if err := InitFSMounts(configSpec.Mounts); err != nil { 311 return nil, err 312 } 313 314 // Add annotations 315 if configSpec.Annotations == nil { 316 configSpec.Annotations = make(map[string]string) 317 } 318 319 if s.Remove { 320 configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseTrue 321 } 322 323 if len(s.VolumesFrom) > 0 { 324 configSpec.Annotations[define.InspectAnnotationVolumesFrom] = strings.Join(s.VolumesFrom, ",") 325 } 326 327 if s.Privileged { 328 configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseTrue 329 } 330 331 if s.Init { 332 configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue 333 } 334 335 if s.OOMScoreAdj != nil { 336 g.SetProcessOOMScoreAdj(*s.OOMScoreAdj) 337 } 338 339 setProcOpts(s, &g) 340 if s.ReadOnlyFilesystem && !s.ReadWriteTmpfs { 341 setDevOptsReadOnly(&g) 342 } 343 344 return configSpec, nil 345 } 346 347 func WeightDevices(wtDevices map[string]spec.LinuxWeightDevice) ([]spec.LinuxWeightDevice, error) { 348 devs := []spec.LinuxWeightDevice{} 349 for k, v := range wtDevices { 350 statT := unix.Stat_t{} 351 if err := unix.Stat(k, &statT); err != nil { 352 return nil, fmt.Errorf("failed to inspect '%s' in --blkio-weight-device: %w", k, err) 353 } 354 dev := new(spec.LinuxWeightDevice) 355 dev.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert 356 dev.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert 357 dev.Weight = v.Weight 358 devs = append(devs, *dev) 359 } 360 return devs, nil 361 } 362 363 // subNegativeOne translates Hard or soft limits of -1 to the current 364 // processes Max limit 365 func subNegativeOne(u spec.POSIXRlimit) spec.POSIXRlimit { 366 if !rootless.IsRootless() || 367 (int64(u.Hard) != -1 && int64(u.Soft) != -1) { 368 return u 369 } 370 371 ul, err := units.ParseUlimit(fmt.Sprintf("%s=%d:%d", u.Type, int64(u.Soft), int64(u.Hard))) 372 if err != nil { 373 logrus.Warnf("Failed to check %s ulimit %q", u.Type, err) 374 return u 375 } 376 rl, err := ul.GetRlimit() 377 if err != nil { 378 logrus.Warnf("Failed to check %s ulimit %q", u.Type, err) 379 return u 380 } 381 382 var rlimit unix.Rlimit 383 384 if err := unix.Getrlimit(rl.Type, &rlimit); err != nil { 385 logrus.Warnf("Failed to return RLIMIT_NOFILE ulimit %q", err) 386 return u 387 } 388 if int64(u.Hard) == -1 { 389 u.Hard = rlimit.Max 390 } 391 if int64(u.Soft) == -1 { 392 u.Soft = rlimit.Max 393 } 394 return u 395 }