github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/utils.go (about) 1 package cgroups 2 3 import ( 4 "bufio" 5 "errors" 6 "fmt" 7 "io" 8 "os" 9 "path/filepath" 10 "strconv" 11 "strings" 12 "sync" 13 "time" 14 15 "github.com/opencontainers/runc/libcontainer/userns" 16 "github.com/sirupsen/logrus" 17 "golang.org/x/sys/unix" 18 ) 19 20 const ( 21 CgroupProcesses = "cgroup.procs" 22 unifiedMountpoint = "/sys/fs/cgroup" 23 hybridMountpoint = "/sys/fs/cgroup/unified" 24 ) 25 26 var ( 27 isUnifiedOnce sync.Once 28 isUnified bool 29 isHybridOnce sync.Once 30 isHybrid bool 31 ) 32 33 // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. 34 func IsCgroup2UnifiedMode() bool { 35 isUnifiedOnce.Do(func() { 36 var st unix.Statfs_t 37 err := unix.Statfs(unifiedMountpoint, &st) 38 if err != nil { 39 level := logrus.WarnLevel 40 if os.IsNotExist(err) && userns.RunningInUserNS() { 41 // For rootless containers, sweep it under the rug. 42 level = logrus.DebugLevel 43 } 44 logrus.StandardLogger().Logf(level, 45 "statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err) 46 } 47 isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC 48 }) 49 return isUnified 50 } 51 52 // IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode. 53 func IsCgroup2HybridMode() bool { 54 isHybridOnce.Do(func() { 55 var st unix.Statfs_t 56 err := unix.Statfs(hybridMountpoint, &st) 57 if err != nil { 58 isHybrid = false 59 if !os.IsNotExist(err) { 60 // Report unexpected errors. 61 logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint) 62 } 63 return 64 } 65 isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC 66 }) 67 return isHybrid 68 } 69 70 type Mount struct { 71 Mountpoint string 72 Root string 73 Subsystems []string 74 } 75 76 // GetCgroupMounts returns the mounts for the cgroup subsystems. 77 // all indicates whether to return just the first instance or all the mounts. 78 // This function should not be used from cgroupv2 code, as in this case 79 // all the controllers are available under the constant unifiedMountpoint. 80 func GetCgroupMounts(all bool) ([]Mount, error) { 81 if IsCgroup2UnifiedMode() { 82 // TODO: remove cgroupv2 case once all external users are converted 83 availableControllers, err := GetAllSubsystems() 84 if err != nil { 85 return nil, err 86 } 87 m := Mount{ 88 Mountpoint: unifiedMountpoint, 89 Root: unifiedMountpoint, 90 Subsystems: availableControllers, 91 } 92 return []Mount{m}, nil 93 } 94 95 return getCgroupMountsV1(all) 96 } 97 98 // GetAllSubsystems returns all the cgroup subsystems supported by the kernel 99 func GetAllSubsystems() ([]string, error) { 100 // /proc/cgroups is meaningless for v2 101 // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features 102 if IsCgroup2UnifiedMode() { 103 // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. 104 // - devices: implemented in kernel 4.15 105 // - freezer: implemented in kernel 5.2 106 // We assume these are always available, as it is hard to detect availability. 107 pseudo := []string{"devices", "freezer"} 108 data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") 109 if err != nil { 110 return nil, err 111 } 112 subsystems := append(pseudo, strings.Fields(data)...) 113 return subsystems, nil 114 } 115 f, err := os.Open("/proc/cgroups") 116 if err != nil { 117 return nil, err 118 } 119 defer f.Close() 120 121 subsystems := []string{} 122 123 s := bufio.NewScanner(f) 124 for s.Scan() { 125 text := s.Text() 126 if text[0] != '#' { 127 parts := strings.Fields(text) 128 if len(parts) >= 4 && parts[3] != "0" { 129 subsystems = append(subsystems, parts[0]) 130 } 131 } 132 } 133 if err := s.Err(); err != nil { 134 return nil, err 135 } 136 return subsystems, nil 137 } 138 139 func readProcsFile(dir string) ([]int, error) { 140 f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY) 141 if err != nil { 142 return nil, err 143 } 144 defer f.Close() 145 146 var ( 147 s = bufio.NewScanner(f) 148 out = []int{} 149 ) 150 151 for s.Scan() { 152 if t := s.Text(); t != "" { 153 pid, err := strconv.Atoi(t) 154 if err != nil { 155 return nil, err 156 } 157 out = append(out, pid) 158 } 159 } 160 return out, s.Err() 161 } 162 163 // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup 164 // or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g. 165 // 166 // "cpu": "/user.slice/user-1000.slice" 167 // "pids": "/user.slice/user-1000.slice" 168 // 169 // etc. 170 // 171 // Note that for cgroup v2 unified hierarchy, there are no per-controller 172 // cgroup paths, so the resulting map will have a single element where the key 173 // is empty string ("") and the value is the cgroup path the <pid> is in. 174 func ParseCgroupFile(path string) (map[string]string, error) { 175 f, err := os.Open(path) 176 if err != nil { 177 return nil, err 178 } 179 defer f.Close() 180 181 return parseCgroupFromReader(f) 182 } 183 184 // helper function for ParseCgroupFile to make testing easier 185 func parseCgroupFromReader(r io.Reader) (map[string]string, error) { 186 s := bufio.NewScanner(r) 187 cgroups := make(map[string]string) 188 189 for s.Scan() { 190 text := s.Text() 191 // from cgroups(7): 192 // /proc/[pid]/cgroup 193 // ... 194 // For each cgroup hierarchy ... there is one entry 195 // containing three colon-separated fields of the form: 196 // hierarchy-ID:subsystem-list:cgroup-path 197 parts := strings.SplitN(text, ":", 3) 198 if len(parts) < 3 { 199 return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) 200 } 201 202 for _, subs := range strings.Split(parts[1], ",") { 203 cgroups[subs] = parts[2] 204 } 205 } 206 if err := s.Err(); err != nil { 207 return nil, err 208 } 209 210 return cgroups, nil 211 } 212 213 func PathExists(path string) bool { 214 if _, err := os.Stat(path); err != nil { 215 return false 216 } 217 return true 218 } 219 220 // rmdir tries to remove a directory, optionally retrying on EBUSY. 221 func rmdir(path string, retry bool) error { 222 delay := time.Millisecond 223 tries := 10 224 225 again: 226 err := unix.Rmdir(path) 227 switch err { // nolint:errorlint // unix errors are bare 228 case nil, unix.ENOENT: 229 return nil 230 case unix.EINTR: 231 goto again 232 case unix.EBUSY: 233 if retry && tries > 0 { 234 time.Sleep(delay) 235 delay *= 2 236 tries-- 237 goto again 238 239 } 240 } 241 return &os.PathError{Op: "rmdir", Path: path, Err: err} 242 } 243 244 // RemovePath aims to remove cgroup path. It does so recursively, 245 // by removing any subdirectories (sub-cgroups) first. 246 func RemovePath(path string) error { 247 // Try the fast path first. 248 if err := rmdir(path, false); err == nil { 249 return nil 250 } 251 252 infos, err := os.ReadDir(path) 253 if err != nil && !os.IsNotExist(err) { 254 return err 255 } 256 for _, info := range infos { 257 if info.IsDir() { 258 // We should remove subcgroup first. 259 if err = RemovePath(filepath.Join(path, info.Name())); err != nil { 260 break 261 } 262 } 263 } 264 if err == nil { 265 err = rmdir(path, true) 266 } 267 return err 268 } 269 270 // RemovePaths iterates over the provided paths removing them. 271 func RemovePaths(paths map[string]string) (err error) { 272 for s, p := range paths { 273 if err := RemovePath(p); err == nil { 274 delete(paths, s) 275 } 276 } 277 if len(paths) == 0 { 278 //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 279 // TODO: switch to clear once Go < 1.21 is not supported. 280 paths = make(map[string]string) 281 return nil 282 } 283 return fmt.Errorf("Failed to remove paths: %v", paths) 284 } 285 286 var ( 287 hugePageSizes []string 288 initHPSOnce sync.Once 289 ) 290 291 func HugePageSizes() []string { 292 initHPSOnce.Do(func() { 293 dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) 294 if err != nil { 295 return 296 } 297 files, err := dir.Readdirnames(0) 298 dir.Close() 299 if err != nil { 300 return 301 } 302 303 hugePageSizes, err = getHugePageSizeFromFilenames(files) 304 if err != nil { 305 logrus.Warn("HugePageSizes: ", err) 306 } 307 }) 308 309 return hugePageSizes 310 } 311 312 func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { 313 pageSizes := make([]string, 0, len(fileNames)) 314 var warn error 315 316 for _, file := range fileNames { 317 // example: hugepages-1048576kB 318 val := strings.TrimPrefix(file, "hugepages-") 319 if len(val) == len(file) { 320 // Unexpected file name: no prefix found, ignore it. 321 continue 322 } 323 // The suffix is always "kB" (as of Linux 5.13). If we find 324 // something else, produce an error but keep going. 325 eLen := len(val) - 2 326 val = strings.TrimSuffix(val, "kB") 327 if len(val) != eLen { 328 // Highly unlikely. 329 if warn == nil { 330 warn = errors.New(file + `: invalid suffix (expected "kB")`) 331 } 332 continue 333 } 334 size, err := strconv.Atoi(val) 335 if err != nil { 336 // Highly unlikely. 337 if warn == nil { 338 warn = fmt.Errorf("%s: %w", file, err) 339 } 340 continue 341 } 342 // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 343 // but in our case the size is in KB already. 344 if size >= (1 << 20) { 345 val = strconv.Itoa(size>>20) + "GB" 346 } else if size >= (1 << 10) { 347 val = strconv.Itoa(size>>10) + "MB" 348 } else { 349 val += "KB" 350 } 351 pageSizes = append(pageSizes, val) 352 } 353 354 return pageSizes, warn 355 } 356 357 // GetPids returns all pids, that were added to cgroup at path. 358 func GetPids(dir string) ([]int, error) { 359 return readProcsFile(dir) 360 } 361 362 // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file 363 func WriteCgroupProc(dir string, pid int) error { 364 // Normally dir should not be empty, one case is that cgroup subsystem 365 // is not mounted, we will get empty dir, and we want it fail here. 366 if dir == "" { 367 return fmt.Errorf("no such directory for %s", CgroupProcesses) 368 } 369 370 // Dont attach any pid to the cgroup if -1 is specified as a pid 371 if pid == -1 { 372 return nil 373 } 374 375 file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) 376 if err != nil { 377 return fmt.Errorf("failed to write %v: %w", pid, err) 378 } 379 defer file.Close() 380 381 for i := 0; i < 5; i++ { 382 _, err = file.WriteString(strconv.Itoa(pid)) 383 if err == nil { 384 return nil 385 } 386 387 // EINVAL might mean that the task being added to cgroup.procs is in state 388 // TASK_NEW. We should attempt to do so again. 389 if errors.Is(err, unix.EINVAL) { 390 time.Sleep(30 * time.Millisecond) 391 continue 392 } 393 394 return fmt.Errorf("failed to write %v: %w", pid, err) 395 } 396 return err 397 } 398 399 // Since the OCI spec is designed for cgroup v1, in some cases 400 // there is need to convert from the cgroup v1 configuration to cgroup v2 401 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) 402 // convert from [2-262144] to [1-10000] 403 // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" 404 func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { 405 if cpuShares == 0 { 406 return 0 407 } 408 return (1 + ((cpuShares-2)*9999)/262142) 409 } 410 411 // ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec 412 // for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap 413 // is defined as memory+swap combined, while in cgroup v2 swap is a separate value. 414 func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { 415 // for compatibility with cgroup1 controller, set swap to unlimited in 416 // case the memory is set to unlimited, and swap is not explicitly set, 417 // treating the request as "set both memory and swap to unlimited". 418 if memory == -1 && memorySwap == 0 { 419 return -1, nil 420 } 421 if memorySwap == -1 || memorySwap == 0 { 422 // -1 is "max", 0 is "unset", so treat as is 423 return memorySwap, nil 424 } 425 // sanity checks 426 if memory == 0 || memory == -1 { 427 return 0, errors.New("unable to set swap limit without memory limit") 428 } 429 if memory < 0 { 430 return 0, fmt.Errorf("invalid memory value: %d", memory) 431 } 432 if memorySwap < memory { 433 return 0, errors.New("memory+swap limit should be >= memory limit") 434 } 435 436 return memorySwap - memory, nil 437 } 438 439 // Since the OCI spec is designed for cgroup v1, in some cases 440 // there is need to convert from the cgroup v1 configuration to cgroup v2 441 // the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) 442 // convert linearly from [10-1000] to [1-10000] 443 func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { 444 if blkIoWeight == 0 { 445 return 0 446 } 447 return 1 + (uint64(blkIoWeight)-10)*9999/990 448 }