github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/systemd/v2.go (about) 1 package systemd 2 3 import ( 4 "bufio" 5 "errors" 6 "fmt" 7 "math" 8 "os" 9 "path/filepath" 10 "strconv" 11 "strings" 12 "sync" 13 14 systemdDbus "github.com/coreos/go-systemd/v22/dbus" 15 securejoin "github.com/cyphar/filepath-securejoin" 16 "github.com/sirupsen/logrus" 17 18 "github.com/opencontainers/runc/libcontainer/cgroups" 19 "github.com/opencontainers/runc/libcontainer/cgroups/fs2" 20 "github.com/opencontainers/runc/libcontainer/configs" 21 ) 22 23 const ( 24 cpuIdleSupportedVersion = 252 25 ) 26 27 type UnifiedManager struct { 28 mu sync.Mutex 29 cgroups *configs.Cgroup 30 // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" 31 path string 32 dbus *dbusConnManager 33 fsMgr cgroups.Manager 34 } 35 36 func NewUnifiedManager(config *configs.Cgroup, path string) (*UnifiedManager, error) { 37 m := &UnifiedManager{ 38 cgroups: config, 39 path: path, 40 dbus: newDbusConnManager(config.Rootless), 41 } 42 if err := m.initPath(); err != nil { 43 return nil, err 44 } 45 46 fsMgr, err := fs2.NewManager(config, m.path) 47 if err != nil { 48 return nil, err 49 } 50 m.fsMgr = fsMgr 51 52 return m, nil 53 } 54 55 func shouldSetCPUIdle(cm *dbusConnManager, v string) bool { 56 // The only valid values for cpu.idle are 0 and 1. As it is 57 // not possible to directly set cpu.idle to 0 via systemd, 58 // ignore 0. Ignore other values as we'll error out later 59 // in Set() while calling fsMgr.Set(). 60 return v == "1" && systemdVersion(cm) >= cpuIdleSupportedVersion 61 } 62 63 // unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified 64 // key/value map (where key is cgroupfs file name) to systemd unit properties. 65 // This is on a best-effort basis, so the properties that are not known 66 // (to this function and/or systemd) are ignored (but logged with "debug" 67 // log level). 68 // 69 // For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt 70 // 71 // For the list of systemd unit properties, see systemd.resource-control(5). 72 func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) { 73 var err error 74 75 for k, v := range res { 76 if strings.Contains(k, "/") { 77 return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) 78 } 79 if strings.IndexByte(k, '.') <= 0 { 80 return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) 81 } 82 // Kernel is quite forgiving to extra whitespace 83 // around the value, and so should we. 84 v = strings.TrimSpace(v) 85 // Please keep cases in alphabetical order. 86 switch k { 87 case "cpu.idle": 88 if shouldSetCPUIdle(cm, v) { 89 // Setting CPUWeight to 0 tells systemd 90 // to set cpu.idle to 1. 91 props = append(props, 92 newProp("CPUWeight", uint64(0))) 93 } 94 95 case "cpu.max": 96 // value: quota [period] 97 quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set 98 period := defCPUQuotaPeriod 99 sv := strings.Fields(v) 100 if len(sv) < 1 || len(sv) > 2 { 101 return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) 102 } 103 // quota 104 if sv[0] != "max" { 105 quota, err = strconv.ParseInt(sv[0], 10, 64) 106 if err != nil { 107 return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) 108 } 109 } 110 // period 111 if len(sv) == 2 { 112 period, err = strconv.ParseUint(sv[1], 10, 64) 113 if err != nil { 114 return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) 115 } 116 } 117 addCpuQuota(cm, &props, quota, period) 118 119 case "cpu.weight": 120 if shouldSetCPUIdle(cm, strings.TrimSpace(res["cpu.idle"])) { 121 // Do not add duplicate CPUWeight property 122 // (see case "cpu.idle" above). 123 logrus.Warn("unable to apply both cpu.weight and cpu.idle to systemd, ignoring cpu.weight") 124 continue 125 } 126 num, err := strconv.ParseUint(v, 10, 64) 127 if err != nil { 128 return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) 129 } 130 props = append(props, 131 newProp("CPUWeight", num)) 132 133 case "cpuset.cpus", "cpuset.mems": 134 bits, err := RangeToBits(v) 135 if err != nil { 136 return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) 137 } 138 m := map[string]string{ 139 "cpuset.cpus": "AllowedCPUs", 140 "cpuset.mems": "AllowedMemoryNodes", 141 } 142 // systemd only supports these properties since v244 143 sdVer := systemdVersion(cm) 144 if sdVer >= 244 { 145 props = append(props, 146 newProp(m[k], bits)) 147 } else { 148 logrus.Debugf("systemd v%d is too old to support %s"+ 149 " (setting will still be applied to cgroupfs)", 150 sdVer, m[k]) 151 } 152 153 case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": 154 num := uint64(math.MaxUint64) 155 if v != "max" { 156 num, err = strconv.ParseUint(v, 10, 64) 157 if err != nil { 158 return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) 159 } 160 } 161 m := map[string]string{ 162 "memory.high": "MemoryHigh", 163 "memory.low": "MemoryLow", 164 "memory.min": "MemoryMin", 165 "memory.max": "MemoryMax", 166 "memory.swap.max": "MemorySwapMax", 167 } 168 props = append(props, 169 newProp(m[k], num)) 170 171 case "pids.max": 172 num := uint64(math.MaxUint64) 173 if v != "max" { 174 var err error 175 num, err = strconv.ParseUint(v, 10, 64) 176 if err != nil { 177 return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) 178 } 179 } 180 props = append(props, 181 newProp("TasksMax", num)) 182 183 case "memory.oom.group": 184 // Setting this to 1 is roughly equivalent to OOMPolicy=kill 185 // (as per systemd.service(5) and 186 // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), 187 // but it's not clear what to do if it is unset or set 188 // to 0 in runc update, as there are two other possible 189 // values for OOMPolicy (continue/stop). 190 fallthrough 191 192 default: 193 // Ignore the unknown resource here -- will still be 194 // applied in Set which calls fs2.Set. 195 logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) 196 } 197 } 198 199 return props, nil 200 } 201 202 func genV2ResourcesProperties(dirPath string, r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { 203 // We need this check before setting systemd properties, otherwise 204 // the container is OOM-killed and the systemd unit is removed 205 // before we get to fsMgr.Set(). 206 if err := fs2.CheckMemoryUsage(dirPath, r); err != nil { 207 return nil, err 208 } 209 210 var properties []systemdDbus.Property 211 212 // NOTE: This is of questionable correctness because we insert our own 213 // devices eBPF program later. Two programs with identical rules 214 // aren't the end of the world, but it is a bit concerning. However 215 // it's unclear if systemd removes all eBPF programs attached when 216 // doing SetUnitProperties... 217 deviceProperties, err := generateDeviceProperties(r, cm) 218 if err != nil { 219 return nil, err 220 } 221 properties = append(properties, deviceProperties...) 222 223 if r.Memory != 0 { 224 properties = append(properties, 225 newProp("MemoryMax", uint64(r.Memory))) 226 } 227 if r.MemoryReservation != 0 { 228 properties = append(properties, 229 newProp("MemoryLow", uint64(r.MemoryReservation))) 230 } 231 232 swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) 233 if err != nil { 234 return nil, err 235 } 236 if swap != 0 { 237 properties = append(properties, 238 newProp("MemorySwapMax", uint64(swap))) 239 } 240 241 idleSet := false 242 // The logic here is the same as in shouldSetCPUIdle. 243 if r.CPUIdle != nil && *r.CPUIdle == 1 && systemdVersion(cm) >= cpuIdleSupportedVersion { 244 properties = append(properties, 245 newProp("CPUWeight", uint64(0))) 246 idleSet = true 247 } 248 if r.CpuWeight != 0 { 249 if idleSet { 250 // Ignore CpuWeight if CPUIdle is already set. 251 logrus.Warn("unable to apply both CPUWeight and CpuIdle to systemd, ignoring CPUWeight") 252 } else { 253 properties = append(properties, 254 newProp("CPUWeight", r.CpuWeight)) 255 } 256 } 257 258 addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) 259 260 if r.PidsLimit > 0 || r.PidsLimit == -1 { 261 properties = append(properties, 262 newProp("TasksMax", uint64(r.PidsLimit))) 263 } 264 265 err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) 266 if err != nil { 267 return nil, err 268 } 269 270 // ignore r.KernelMemory 271 272 // convert Resources.Unified map to systemd properties 273 if r.Unified != nil { 274 unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified) 275 if err != nil { 276 return nil, err 277 } 278 properties = append(properties, unifiedProps...) 279 } 280 281 return properties, nil 282 } 283 284 func (m *UnifiedManager) Apply(pid int) error { 285 var ( 286 c = m.cgroups 287 unitName = getUnitName(c) 288 properties []systemdDbus.Property 289 ) 290 291 slice := "system.slice" 292 if m.cgroups.Rootless { 293 slice = "user.slice" 294 } 295 if c.Parent != "" { 296 slice = c.Parent 297 } 298 299 properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) 300 301 if strings.HasSuffix(unitName, ".slice") { 302 // If we create a slice, the parent is defined via a Wants=. 303 properties = append(properties, systemdDbus.PropWants(slice)) 304 } else { 305 // Otherwise it's a scope, which we put into a Slice=. 306 properties = append(properties, systemdDbus.PropSlice(slice)) 307 // Assume scopes always support delegation (supported since systemd v218). 308 properties = append(properties, newProp("Delegate", true)) 309 } 310 311 // only add pid if its valid, -1 is used w/ general slice creation. 312 if pid != -1 { 313 properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) 314 } 315 316 // Always enable accounting, this gets us the same behaviour as the fs implementation, 317 // plus the kernel has some problems with joining the memory cgroup at a later time. 318 properties = append(properties, 319 newProp("MemoryAccounting", true), 320 newProp("CPUAccounting", true), 321 newProp("IOAccounting", true), 322 newProp("TasksAccounting", true), 323 ) 324 325 // Assume DefaultDependencies= will always work (the check for it was previously broken.) 326 properties = append(properties, 327 newProp("DefaultDependencies", false)) 328 329 properties = append(properties, c.SystemdProps...) 330 331 if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { 332 return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) 333 } 334 335 if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { 336 return err 337 } 338 339 if c.OwnerUID != nil { 340 // The directory itself must be chowned. 341 err := os.Chown(m.path, *c.OwnerUID, -1) 342 if err != nil { 343 return err 344 } 345 346 filesToChown, err := cgroupFilesToChown() 347 if err != nil { 348 return err 349 } 350 351 for _, v := range filesToChown { 352 err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) 353 // Some files might not be present. 354 if err != nil && !errors.Is(err, os.ErrNotExist) { 355 return err 356 } 357 } 358 } 359 360 return nil 361 } 362 363 // The kernel exposes a list of files that should be chowned to the delegate 364 // uid in /sys/kernel/cgroup/delegate. If the file is not present 365 // (Linux < 4.15), use the initial values mentioned in cgroups(7). 366 func cgroupFilesToChown() ([]string, error) { 367 const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" 368 369 f, err := os.Open(cgroupDelegateFile) 370 if err != nil { 371 return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil 372 } 373 defer f.Close() 374 375 filesToChown := []string{} 376 scanner := bufio.NewScanner(f) 377 for scanner.Scan() { 378 filesToChown = append(filesToChown, scanner.Text()) 379 } 380 if err := scanner.Err(); err != nil { 381 return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) 382 } 383 384 return filesToChown, nil 385 } 386 387 func (m *UnifiedManager) Destroy() error { 388 m.mu.Lock() 389 defer m.mu.Unlock() 390 391 unitName := getUnitName(m.cgroups) 392 if err := stopUnit(m.dbus, unitName); err != nil { 393 return err 394 } 395 396 // systemd 239 do not remove sub-cgroups. 397 err := m.fsMgr.Destroy() 398 // fsMgr.Destroy has handled ErrNotExist 399 if err != nil { 400 return err 401 } 402 403 return nil 404 } 405 406 func (m *UnifiedManager) Path(_ string) string { 407 return m.path 408 } 409 410 // getSliceFull value is used in initPath. 411 // The value is incompatible with systemdDbus.PropSlice. 412 func (m *UnifiedManager) getSliceFull() (string, error) { 413 c := m.cgroups 414 slice := "system.slice" 415 if c.Rootless { 416 slice = "user.slice" 417 } 418 if c.Parent != "" { 419 var err error 420 slice, err = ExpandSlice(c.Parent) 421 if err != nil { 422 return "", err 423 } 424 } 425 426 if c.Rootless { 427 // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service". 428 managerCG, err := getManagerProperty(m.dbus, "ControlGroup") 429 if err != nil { 430 return "", err 431 } 432 slice = filepath.Join(managerCG, slice) 433 } 434 435 // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" 436 // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. 437 return slice, nil 438 } 439 440 func (m *UnifiedManager) initPath() error { 441 if m.path != "" { 442 return nil 443 } 444 445 sliceFull, err := m.getSliceFull() 446 if err != nil { 447 return err 448 } 449 450 c := m.cgroups 451 path := filepath.Join(sliceFull, getUnitName(c)) 452 path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) 453 if err != nil { 454 return err 455 } 456 457 // an example of the final path in rootless: 458 // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" 459 m.path = path 460 461 return nil 462 } 463 464 func (m *UnifiedManager) Freeze(state configs.FreezerState) error { 465 return m.fsMgr.Freeze(state) 466 } 467 468 func (m *UnifiedManager) GetPids() ([]int, error) { 469 return cgroups.GetPids(m.path) 470 } 471 472 func (m *UnifiedManager) GetAllPids() ([]int, error) { 473 return cgroups.GetAllPids(m.path) 474 } 475 476 func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { 477 return m.fsMgr.GetStats() 478 } 479 480 func (m *UnifiedManager) Set(r *configs.Resources) error { 481 if r == nil { 482 return nil 483 } 484 properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus) 485 if err != nil { 486 return err 487 } 488 489 if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil { 490 return fmt.Errorf("unable to set unit properties: %w", err) 491 } 492 493 return m.fsMgr.Set(r) 494 } 495 496 func (m *UnifiedManager) GetPaths() map[string]string { 497 paths := make(map[string]string, 1) 498 paths[""] = m.path 499 return paths 500 } 501 502 func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) { 503 return m.cgroups, nil 504 } 505 506 func (m *UnifiedManager) GetFreezerState() (configs.FreezerState, error) { 507 return m.fsMgr.GetFreezerState() 508 } 509 510 func (m *UnifiedManager) Exists() bool { 511 return cgroups.PathExists(m.path) 512 } 513 514 func (m *UnifiedManager) OOMKillCount() (uint64, error) { 515 return m.fsMgr.OOMKillCount() 516 } 517 518 func (m *UnifiedManager) GetEffectiveCPUs() string { 519 return m.fsMgr.GetEffectiveCPUs() 520 }