// Source: gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/cgroup/cgroup_v2.go

// Copyright The runc Authors.
// Copyright The containerd Authors.
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroup

import (
	"bufio"
	"bytes"
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"math"
	"math/big"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/cenkalti/backoff"
	"github.com/coreos/go-systemd/v22/dbus"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/log"
)

const (
	// subtreeControl is the file controllers are written to in order to
	// enable them for child cgroups.
	subtreeControl = "cgroup.subtree_control"
	// controllersFile is the file listing the controllers of a cgroup.
	controllersFile = "cgroup.controllers"
	// cgroup2Key is the key used to look up the unified hierarchy path in the
	// map returned by loadPaths.
	cgroup2Key = "cgroup2"
	// memoryLimitCgroup is the file holding the memory limit.
	memoryLimitCgroup = "memory.max"
	// cpuLimitCgroup is the file holding the CPU quota and period.
	cpuLimitCgroup = "cpu.max"
	// maxLimitStr is the value that stands for "no limit" in limit files.
	maxLimitStr = "max"

	// defaultPeriod is the CPU period used when the spec does not set one.
	// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	defaultPeriod = 100000
)

var (
	// ErrInvalidFormat is the error for a cgroup file that cannot be parsed.
	ErrInvalidFormat = errors.New("cgroup: parsing file with invalid format failed")
	// ErrInvalidGroupPath is the error for an invalid cgroup path.
	ErrInvalidGroupPath = errors.New("cgroup: invalid group path")

	// controllers2 is the group of all supported cgroupv2 controllers,
	// keyed by controller name.
	controllers2 = map[string]controllerv2{
		"cpu":     &cpu2{},
		"cpuset":  &cpuset2{},
		"io":      &io2{},
		"memory":  &memory2{},
		"pids":    &pid2{},
		"hugetlb": &hugeTLB2{},
	}
)

// cgroupV2 represents a cgroup
inside supported all cgroupV2 controllers 70 type cgroupV2 struct { 71 // Mountpoint is the unified mount point of cgroupV2 72 Mountpoint string `json:"mountpoint"` 73 // Path is the relative path to the unified mountpoint 74 Path string `json:"path"` 75 // Controllers is the list of supported controllers 76 Controllers []string `json:"controllers"` 77 // Own is the list of owned path created when install this cgroup 78 Own []string `json:"own"` 79 } 80 81 func newCgroupV2(mountpoint, group string, useSystemd bool) (Cgroup, error) { 82 data, err := ioutil.ReadFile(filepath.Join(mountpoint, "cgroup.controllers")) 83 if err != nil { 84 return nil, err 85 } 86 cg := &cgroupV2{ 87 Mountpoint: mountpoint, 88 Path: group, 89 Controllers: strings.Fields(string(data)), 90 } 91 if useSystemd { 92 return newCgroupV2Systemd(cg) 93 } 94 return cg, err 95 } 96 97 func (c *cgroupV2) createCgroupPaths() (bool, error) { 98 // setup all known controllers for the current subtree 99 // For example, given path /foo/bar and mount /sys/fs/cgroup, we need to write 100 // the controllers to: 101 // * /sys/fs/cgroup/cgroup.subtree_control 102 // * /sys/fs/cgroup/foo/cgroup.subtree_control 103 val := "+" + strings.Join(c.Controllers, " +") 104 elements := strings.Split(c.Path, "/") 105 current := c.Mountpoint 106 created := false 107 108 for i, e := range elements { 109 current = filepath.Join(current, e) 110 if i > 0 { 111 if err := os.Mkdir(current, 0o755); err != nil { 112 if !os.IsExist(err) { 113 return false, err 114 } 115 } else { 116 created = true 117 c.Own = append(c.Own, current) 118 } 119 } 120 // enable all known controllers for subtree 121 if i < len(elements)-1 { 122 if err := writeFile(filepath.Join(current, subtreeControl), []byte(val), 0700); err != nil { 123 return false, err 124 } 125 } 126 } 127 return created, nil 128 } 129 130 // Install creates and configures cgroups. 
131 func (c *cgroupV2) Install(res *specs.LinuxResources) error { 132 log.Debugf("Installing cgroup path %q", c.MakePath("")) 133 // Clean up partially created cgroups on error. Errors during cleanup itself 134 // are ignored. 135 clean := cleanup.Make(func() { _ = c.Uninstall() }) 136 defer clean.Clean() 137 138 created, err := c.createCgroupPaths() 139 if err != nil { 140 return err 141 } 142 if created { 143 // If we created our final cgroup path then we can set the resources. 144 for controllerName, ctrlr := range controllers2 { 145 // First check if our controller is found in the system. 146 found := false 147 for _, knownController := range c.Controllers { 148 if controllerName == knownController { 149 found = true 150 } 151 } 152 153 // In case we don't have the controller. 154 if found { 155 if err := ctrlr.set(res, c.MakePath("")); err != nil { 156 return err 157 } 158 continue 159 } 160 if ctrlr.optional() { 161 if err := ctrlr.skip(res); err != nil { 162 return err 163 } 164 } else { 165 return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.MakePath("")) 166 } 167 } 168 } 169 170 clean.Release() 171 return nil 172 } 173 174 // Uninstall removes the settings done in Install(). If cgroup path already 175 // existed when Install() was called, Uninstall is a noop. 176 func (c *cgroupV2) Uninstall() error { 177 log.Debugf("Deleting cgroup %q", c.MakePath("")) 178 179 // If we try to remove the cgroup too soon after killing the sandbox we 180 // might get EBUSY, so we retry for a few seconds until it succeeds. 181 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 182 defer cancel() 183 b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) 184 185 // Deletion must occur reverse order, because they may contain ancestors. 
186 for i := len(c.Own) - 1; i >= 0; i-- { 187 current := c.Own[i] 188 log.Debugf("Removing cgroup for path=%q", current) 189 190 fn := func() error { 191 err := unix.Rmdir(current) 192 if os.IsNotExist(err) { 193 return nil 194 } 195 return err 196 } 197 if err := backoff.Retry(fn, b); err != nil { 198 return fmt.Errorf("removing cgroup path %q: %w", current, err) 199 } 200 } 201 202 return nil 203 } 204 205 // Join adds the current process to the all controllers. Returns function that 206 // restores cgroup to the original state. 207 func (c *cgroupV2) Join() (func(), error) { 208 // First save the current state so it can be restored. 209 paths, err := loadPaths("self") 210 if err != nil { 211 return nil, err 212 } 213 // Since this is unified, get the first path of current process's cgroup is 214 // enough. 215 undoPath := filepath.Join(c.Mountpoint, paths[cgroup2Key]) 216 217 cu := cleanup.Make(func() { 218 log.Debugf("Restoring cgroup %q", undoPath) 219 // Writing the value 0 to a cgroup.procs file causes 220 // the writing process to be moved to the corresponding 221 // cgroup. - cgroups(7). 222 if err := setValue(undoPath, "cgroup.procs", "0"); err != nil { 223 log.Warningf("Error restoring cgroup %q: %v", undoPath, err) 224 } 225 }) 226 defer cu.Clean() 227 228 // now join the cgroup 229 if err := setValue(c.MakePath(""), "cgroup.procs", "0"); err != nil { 230 return nil, err 231 } 232 233 return cu.Release(), nil 234 } 235 236 func getCPUQuota(path string) (float64, error) { 237 cpuMax, err := getValue(path, cpuLimitCgroup) 238 if err != nil { 239 return -1, err 240 } 241 return parseCPUQuota(cpuMax) 242 } 243 244 // CPUQuota returns the CFS CPU quota. 245 func (c *cgroupV2) CPUQuota() (float64, error) { 246 cpuQuota, err := getCPUQuota(c.MakePath("")) 247 if err != nil { 248 return -1, err 249 } 250 // In cgroupv2+systemd, limits are set in the parent slice rather 251 // than the leaf node. Check the parent to see if this is the case. 
252 if cpuQuota == -1 { 253 cpuQuota, err = getCPUQuota(filepath.Dir(c.MakePath(""))) 254 if err != nil && errors.Is(err, os.ErrNotExist) { 255 err = nil 256 } 257 } 258 return cpuQuota, nil 259 } 260 261 func parseCPUQuota(cpuMax string) (float64, error) { 262 data := strings.SplitN(strings.TrimSpace(cpuMax), " ", 2) 263 if len(data) != 2 { 264 return -1, fmt.Errorf("invalid cpu.max data %q", cpuMax) 265 } 266 267 // no cpu limit if quota is max 268 if data[0] == maxLimitStr { 269 return -1, nil 270 } 271 272 quota, err := strconv.ParseInt(data[0], 10, 64) 273 if err != nil { 274 return -1, err 275 } 276 277 period, err := strconv.ParseInt(data[1], 10, 64) 278 if err != nil { 279 return -1, err 280 } 281 282 if quota <= 0 || period <= 0 { 283 return -1, err 284 } 285 return float64(quota) / float64(period), nil 286 287 } 288 289 // CPUUsage returns the total CPU usage of the cgroup in nanoseconds. 290 func (c *cgroupV2) CPUUsage() (uint64, error) { 291 cpuStat, err := getValue(c.MakePath(""), "cpu.stat") 292 if err != nil { 293 return 0, err 294 } 295 296 sc := bufio.NewScanner(strings.NewReader(cpuStat)) 297 for sc.Scan() { 298 key, value, err := parseKeyValue(sc.Text()) 299 if err != nil { 300 return 0, err 301 } 302 if key == "usage_usec" { 303 return value * 1000, nil 304 } 305 } 306 307 return 0, nil 308 } 309 310 // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. 311 func (c *cgroupV2) NumCPU() (int, error) { 312 cpuset, err := getValue(c.MakePath(""), "cpuset.cpus.effective") 313 if err != nil { 314 return 0, err 315 } 316 return countCpuset(strings.TrimSpace(cpuset)) 317 } 318 319 func getMemoryLimit(path string) (string, error) { 320 limStr, err := getValue(path, memoryLimitCgroup) 321 if err != nil { 322 return "", err 323 } 324 return strings.TrimSpace(limStr), nil 325 } 326 327 // MemoryLimit returns the memory limit. 
328 func (c *cgroupV2) MemoryLimit() (uint64, error) { 329 limStr, err := getMemoryLimit(c.MakePath("")) 330 if err != nil { 331 return 0, err 332 } 333 // In cgroupv2+systemd, limits are set in the parent slice rather 334 // than the leaf node. Check the parent to see if this is the case. 335 if limStr == maxLimitStr { 336 parentLimStr, err := getMemoryLimit(filepath.Dir(c.MakePath(""))) 337 if err != nil && !errors.Is(err, os.ErrNotExist) { 338 return 0, err 339 } 340 if parentLimStr != "" { 341 limStr = parentLimStr 342 } 343 if limStr == maxLimitStr { 344 return math.MaxUint64, nil 345 } 346 } 347 return strconv.ParseUint(limStr, 10, 64) 348 } 349 350 // MakePath builds a path to the given controller. 351 func (c *cgroupV2) MakePath(string) string { 352 return filepath.Join(c.Mountpoint, c.Path) 353 } 354 355 type controllerv2 interface { 356 controller 357 generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) 358 } 359 360 type cpu2 struct { 361 mandatory 362 } 363 364 func (*cpu2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 365 props := []dbus.Property{} 366 if spec == nil || spec.CPU == nil { 367 return props, nil 368 } 369 cpu := spec.CPU 370 if cpu.Shares != nil { 371 weight := convertCPUSharesToCgroupV2Value(*cpu.Shares) 372 if weight != 0 { 373 props = append(props, newProp("CPUWeight", weight)) 374 } 375 } 376 var ( 377 period uint64 378 quota int64 379 ) 380 if cpu.Period != nil { 381 period = *cpu.Period 382 } 383 if cpu.Quota != nil { 384 quota = *cpu.Quota 385 } 386 if period != 0 { 387 props = append(props, newProp("CPUQuotaPeriodUSec", period)) 388 } 389 if quota != 0 || period != 0 { 390 // Corresponds to USEC_INFINITY in systemd. 391 cpuQuotaPerSecUSec := uint64(math.MaxUint64) 392 if quota > 0 { 393 if period == 0 { 394 // Assume the default. 
395 period = defaultPeriod 396 } 397 // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to 398 // CPUQuota (integer percentage of CPU) internally. This means that if a 399 // fractional percent of CPU is indicated by spec.CPU.Quota, we need to 400 // round up to the nearest 10ms (1% of a second) such that child cgroups 401 // can set the cpu.cfs_quota_us they expect. 402 cpuQuotaPerSecUSec = uint64(quota*1000000) / period 403 if cpuQuotaPerSecUSec%10000 != 0 { 404 cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 405 } 406 } 407 props = append(props, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) 408 } 409 return props, nil 410 } 411 412 func (*cpu2) set(spec *specs.LinuxResources, path string) error { 413 if spec == nil || spec.CPU == nil { 414 return nil 415 } 416 417 if spec.CPU.Shares != nil { 418 weight := convertCPUSharesToCgroupV2Value(*spec.CPU.Shares) 419 if weight != 0 { 420 if err := setValue(path, "cpu.weight", strconv.FormatUint(weight, 10)); err != nil { 421 return err 422 } 423 } 424 } 425 426 if spec.CPU.Period != nil || spec.CPU.Quota != nil { 427 v := maxLimitStr 428 if spec.CPU.Quota != nil && *spec.CPU.Quota > 0 { 429 v = strconv.FormatInt(*spec.CPU.Quota, 10) 430 } 431 432 var period uint64 433 if spec.CPU.Period != nil && *spec.CPU.Period != 0 { 434 period = *spec.CPU.Period 435 } else { 436 period = defaultPeriod 437 } 438 439 v += " " + strconv.FormatUint(period, 10) 440 if err := setValue(path, "cpu.max", v); err != nil { 441 return err 442 } 443 } 444 445 return nil 446 } 447 448 type cpuset2 struct { 449 mandatory 450 } 451 452 func (*cpuset2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 453 props := []dbus.Property{} 454 if spec == nil || spec.CPU == nil { 455 return props, nil 456 } 457 cpu := spec.CPU 458 if cpu.Cpus == "" && cpu.Mems == "" { 459 return props, nil 460 } 461 cpus := cpu.Cpus 462 mems := cpu.Mems 463 if cpus != "" { 464 bits, err := RangeToBits(cpus) 465 if 
err != nil { 466 return nil, fmt.Errorf("%w: cpus=%q conversion error: %v", ErrBadResourceSpec, cpus, err) 467 } 468 props = append(props, newProp("AllowedCPUs", bits)) 469 } 470 if mems != "" { 471 bits, err := RangeToBits(mems) 472 if err != nil { 473 return nil, fmt.Errorf("%w: mems=%q conversion error: %v", ErrBadResourceSpec, mems, err) 474 } 475 props = append(props, newProp("AllowedMemoryNodes", bits)) 476 } 477 return props, nil 478 } 479 480 func (*cpuset2) set(spec *specs.LinuxResources, path string) error { 481 if spec == nil || spec.CPU == nil { 482 return nil 483 } 484 485 if spec.CPU.Cpus != "" { 486 if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { 487 return err 488 } 489 } 490 491 if spec.CPU.Mems != "" { 492 if err := setValue(path, "cpuset.mems", spec.CPU.Mems); err != nil { 493 return err 494 } 495 } 496 497 return nil 498 } 499 500 type memory2 struct { 501 mandatory 502 } 503 504 func (*memory2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 505 props := []dbus.Property{} 506 if spec == nil || spec.Memory == nil { 507 return props, nil 508 } 509 mem := spec.Memory 510 if mem.Swap != nil { 511 if mem.Limit == nil { 512 return nil, ErrBadResourceSpec 513 } 514 swap, err := convertMemorySwapToCgroupV2Value(*mem.Swap, *mem.Limit) 515 if err != nil { 516 return nil, err 517 } 518 props = append(props, newProp("MemorySwapMax", uint64(swap))) 519 } 520 if mem.Limit != nil { 521 props = append(props, newProp("MemoryMax", uint64(*mem.Limit))) 522 } 523 if mem.Reservation != nil { 524 props = append(props, newProp("MemoryLow", uint64(*mem.Reservation))) 525 } 526 return props, nil 527 } 528 529 func (*memory2) set(spec *specs.LinuxResources, path string) error { 530 if spec == nil || spec.Memory == nil { 531 return nil 532 } 533 534 if spec.Memory.Swap != nil { 535 // in cgroup v2, we set memory and swap separately, but the spec specifies 536 // Swap field as memory+swap, so we need memory limit here to be 
set in 537 // order to get the correct swap value. 538 if spec.Memory.Limit == nil { 539 return errors.New("cgroup: Memory.Swap is set without Memory.Limit") 540 } 541 542 swap, err := convertMemorySwapToCgroupV2Value(*spec.Memory.Swap, *spec.Memory.Limit) 543 if err != nil { 544 return nil 545 } 546 swapStr := numToStr(swap) 547 // memory and memorySwap set to the same value -- disable swap 548 if swapStr == "" && swap == 0 && *spec.Memory.Swap > 0 { 549 swapStr = "0" 550 } 551 // never write empty string to `memory.swap.max`, it means set to 0. 552 if swapStr != "" { 553 if err := setValue(path, "memory.swap.max", swapStr); err != nil { 554 return err 555 } 556 } 557 } 558 559 if spec.Memory.Limit != nil { 560 if val := numToStr(*spec.Memory.Limit); val != "" { 561 if err := setValue(path, "memory.max", val); err != nil { 562 return err 563 } 564 } 565 } 566 567 if spec.Memory.Reservation != nil { 568 if val := numToStr(*spec.Memory.Reservation); val != "" { 569 if err := setValue(path, "memory.low", val); err != nil { 570 return err 571 } 572 } 573 } 574 575 return nil 576 } 577 578 type pid2 struct { 579 mandatory 580 } 581 582 func (*pid2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 583 if spec != nil && spec.Pids != nil { 584 return []dbus.Property{newProp("TasksMax", uint64(spec.Pids.Limit))}, nil 585 } 586 return []dbus.Property{}, nil 587 } 588 589 func (*pid2) set(spec *specs.LinuxResources, path string) error { 590 if spec == nil || spec.Pids == nil { 591 return nil 592 } 593 594 if val := numToStr(spec.Pids.Limit); val != "" { 595 return setValue(path, "pids.max", val) 596 } 597 598 return nil 599 } 600 601 type io2 struct { 602 mandatory 603 } 604 605 func (*io2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 606 props := []dbus.Property{} 607 if spec == nil || spec.BlockIO == nil { 608 return props, nil 609 } 610 io := spec.BlockIO 611 if io != nil { 612 if io.Weight != nil && *io.Weight != 0 { 
613 ioWeight := convertBlkIOToIOWeightValue(*io.Weight) 614 props = append(props, newProp("IOWeight", ioWeight)) 615 } 616 for _, dev := range io.WeightDevice { 617 val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight) 618 props = append(props, newProp("IODeviceWeight", val)) 619 } 620 props = addIOProps(props, "IOReadBandwidthMax", io.ThrottleReadBpsDevice) 621 props = addIOProps(props, "IOWriteBandwidthMax", io.ThrottleWriteBpsDevice) 622 props = addIOProps(props, "IOReadIOPSMax", io.ThrottleReadIOPSDevice) 623 props = addIOProps(props, "IOWriteIOPSMax", io.ThrottleWriteIOPSDevice) 624 } 625 return props, nil 626 } 627 628 func (*io2) set(spec *specs.LinuxResources, path string) error { 629 if spec == nil || spec.BlockIO == nil { 630 return nil 631 } 632 blkio := spec.BlockIO 633 634 var ( 635 err error 636 bfq *os.File 637 ) 638 639 // If BFQ IO scheduler is available, use it. 640 if blkio.Weight != nil || len(blkio.WeightDevice) > 0 { 641 bfq, err = os.Open(filepath.Join(path, "io.bfq.weight")) 642 if err == nil { 643 defer bfq.Close() 644 } else if !os.IsNotExist(err) { 645 return err 646 } 647 648 } 649 650 if blkio.Weight != nil && *blkio.Weight != 0 { 651 if bfq != nil { 652 if _, err := bfq.WriteString(strconv.FormatUint(uint64(*blkio.Weight), 10)); err != nil { 653 return err 654 } 655 } else { 656 // bfq io scheduler is not available, fallback to io.weight with 657 // a conversion scheme 658 ioWeight := convertBlkIOToIOWeightValue(*blkio.Weight) 659 if err = setValue(path, "io.weight", strconv.FormatUint(ioWeight, 10)); err != nil { 660 return err 661 } 662 } 663 } 664 665 if bfqDeviceWeightSupported(bfq) { 666 // ignore leaf weight, does not apply to cgroupv2 667 for _, dev := range blkio.WeightDevice { 668 if dev.Weight != nil { 669 val := fmt.Sprintf("%d:%d %d\n", dev.Major, dev.Minor, *dev.Weight) 670 if _, err := bfq.WriteString(val); err != nil { 671 return fmt.Errorf("failed to set device weight %q: %w", val, err) 672 } 673 } 674 } 675 
} 676 677 if err := setThrottle2(path, "rbps", blkio.ThrottleReadBpsDevice); err != nil { 678 return err 679 } 680 681 if err := setThrottle2(path, "wbps", blkio.ThrottleWriteBpsDevice); err != nil { 682 return err 683 } 684 685 if err := setThrottle2(path, "riops", blkio.ThrottleReadIOPSDevice); err != nil { 686 return err 687 } 688 689 if err := setThrottle2(path, "wiops", blkio.ThrottleWriteIOPSDevice); err != nil { 690 return err 691 } 692 693 return nil 694 } 695 696 func setThrottle2(path, name string, devs []specs.LinuxThrottleDevice) error { 697 for _, dev := range devs { 698 val := fmt.Sprintf("%d:%d %s=%d", dev.Major, dev.Minor, name, dev.Rate) 699 if err := setValue(path, "io.max", val); err != nil { 700 return err 701 } 702 } 703 return nil 704 } 705 706 type hugeTLB2 struct { 707 } 708 709 func (*hugeTLB2) optional() bool { 710 return true 711 } 712 713 func (*hugeTLB2) skip(spec *specs.LinuxResources) error { 714 if spec != nil && len(spec.HugepageLimits) > 0 { 715 return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found") 716 } 717 return nil 718 } 719 720 func (*hugeTLB2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 721 return nil, nil 722 } 723 724 func (*hugeTLB2) set(spec *specs.LinuxResources, path string) error { 725 if spec == nil { 726 return nil 727 } 728 for _, limit := range spec.HugepageLimits { 729 name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize) 730 val := strconv.FormatUint(limit.Limit, 10) 731 if err := setValue(path, name, val); err != nil { 732 return err 733 } 734 } 735 return nil 736 } 737 738 // Since the OCI spec is designed for cgroup v1, in some cases 739 // there is need to convert from the cgroup v1 configuration to cgroup v2 740 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) 741 // convert from [2-262144] to [1-10000] 742 // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" 743 func 
convertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { 744 if cpuShares == 0 { 745 return 0 746 } 747 return (1 + ((cpuShares-2)*9999)/262142) 748 } 749 750 // convertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec 751 // for use by cgroup v2 drivers. A conversion is needed since 752 // Resources.MemorySwap is defined as memory+swap combined, while in cgroup v2 753 // swap is a separate value. 754 func convertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { 755 // for compatibility with cgroup1 controller, set swap to unlimited in 756 // case the memory is set to unlimited, and swap is not explicitly set, 757 // treating the request as "set both memory and swap to unlimited". 758 if memory == -1 && memorySwap == 0 { 759 return -1, nil 760 } 761 if memorySwap == -1 || memorySwap == 0 { 762 // -1 is "max", 0 is "unset", so treat as is. 763 return memorySwap, nil 764 } 765 // sanity checks 766 if memory == 0 || memory == -1 { 767 return 0, errors.New("unable to set swap limit without memory limit") 768 } 769 if memory < 0 { 770 return 0, fmt.Errorf("invalid memory value: %d", memory) 771 } 772 if memorySwap < memory { 773 return 0, errors.New("memory+swap limit should be >= memory limit") 774 } 775 776 return memorySwap - memory, nil 777 } 778 779 // Since the OCI spec is designed for cgroup v1, in some cases 780 // there is need to convert from the cgroup v1 configuration to cgroup v2 781 // the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) 782 // convert linearly from [10-1000] to [1-10000] 783 func convertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { 784 if blkIoWeight == 0 { 785 return 0 786 } 787 return 1 + (uint64(blkIoWeight)-10)*9999/990 788 } 789 790 // numToStr converts an int64 value to a string for writing to a 791 // cgroupv2 files with .min, .max, .low, or .high suffix. 
792 // The value of -1 is converted to "max" for cgroupv1 compatibility 793 // (which used to write -1 to remove the limit). 794 func numToStr(value int64) (ret string) { 795 switch { 796 case value == 0: 797 ret = "" 798 case value == -1: 799 ret = maxLimitStr 800 default: 801 ret = strconv.FormatInt(value, 10) 802 } 803 return ret 804 } 805 806 // bfqDeviceWeightSupported checks for per-device BFQ weight support (added 807 // in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight". 808 func bfqDeviceWeightSupported(bfq *os.File) bool { 809 if bfq == nil { 810 return false 811 } 812 813 if _, err := bfq.Seek(0, 0); err != nil { 814 return false 815 } 816 817 buf := make([]byte, 32) 818 if _, err := bfq.Read(buf); err != nil { 819 return false 820 } 821 // If only a single number (default weight) if read back, we have older 822 // kernel. 823 _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64) 824 return err != nil 825 } 826 827 // parseKeyValue parses a space-separated "name value" kind of cgroup 828 // parameter and returns its key as a string, and its value as uint64 829 // (ParseUint is used to convert the value). For example, 830 // "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. 831 func parseKeyValue(t string) (string, uint64, error) { 832 parts := strings.SplitN(t, " ", 3) 833 if len(parts) != 2 { 834 return "", 0, fmt.Errorf("line %q is not in key value format", t) 835 } 836 837 value, err := parseUint(parts[1], 10, 64) 838 if err != nil { 839 return "", 0, err 840 } 841 842 return parts[0], value, nil 843 } 844 845 // parseUint converts a string to an uint64 integer. 846 // Negative values are returned at zero as, due to kernel bugs, 847 // some of the memory cgroup stats can be negative. 848 func parseUint(s string, base, bitSize int) (uint64, error) { 849 value, err := strconv.ParseUint(s, base, bitSize) 850 if err != nil { 851 intValue, intErr := strconv.ParseInt(s, base, bitSize) 852 // 1. 
Handle negative values greater than MinInt64 (and) 853 // 2. Handle negative values lesser than MinInt64 854 if intErr == nil && intValue < 0 { 855 return 0, nil 856 } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { 857 return 0, nil 858 } 859 860 return value, err 861 } 862 863 return value, nil 864 } 865 866 // RangeToBits converts a text representation of a CPU mask (as written to 867 // or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes 868 // with the corresponding bits set (as consumed by systemd over dbus as 869 // AllowedCPUs/AllowedMemoryNodes unit property value). 870 // Copied from runc. 871 func RangeToBits(str string) ([]byte, error) { 872 bits := &big.Int{} 873 874 for _, r := range strings.Split(str, ",") { 875 // allow extra spaces around 876 r = strings.TrimSpace(r) 877 // allow empty elements (extra commas) 878 if r == "" { 879 continue 880 } 881 ranges := strings.SplitN(r, "-", 2) 882 if len(ranges) > 1 { 883 start, err := strconv.ParseUint(ranges[0], 10, 32) 884 if err != nil { 885 return nil, err 886 } 887 end, err := strconv.ParseUint(ranges[1], 10, 32) 888 if err != nil { 889 return nil, err 890 } 891 if start > end { 892 return nil, errors.New("invalid range: " + r) 893 } 894 for i := start; i <= end; i++ { 895 bits.SetBit(bits, int(i), 1) 896 } 897 } else { 898 val, err := strconv.ParseUint(ranges[0], 10, 32) 899 if err != nil { 900 return nil, err 901 } 902 bits.SetBit(bits, int(val), 1) 903 } 904 } 905 906 ret := bits.Bytes() 907 if len(ret) == 0 { 908 // do not allow empty values 909 return nil, errors.New("empty value") 910 } 911 return ret, nil 912 }