github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/cgroup/cgroup_v2.go (about) 1 // Copyright The runc Authors. 2 // Copyright The containerd Authors. 3 // Copyright 2021 The gVisor Authors. 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // https://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package cgroup 18 19 import ( 20 "bufio" 21 "bytes" 22 "context" 23 "errors" 24 "fmt" 25 "io/ioutil" 26 "math" 27 "math/big" 28 "os" 29 "path/filepath" 30 "strconv" 31 "strings" 32 "time" 33 34 "github.com/MerlinKodo/gvisor/pkg/cleanup" 35 "github.com/MerlinKodo/gvisor/pkg/log" 36 "github.com/cenkalti/backoff" 37 "github.com/coreos/go-systemd/v22/dbus" 38 specs "github.com/opencontainers/runtime-spec/specs-go" 39 "golang.org/x/sys/unix" 40 ) 41 42 const ( 43 subtreeControl = "cgroup.subtree_control" 44 controllersFile = "cgroup.controllers" 45 cgroup2Key = "cgroup2" 46 47 // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html 48 defaultPeriod = 100000 49 ) 50 51 var ( 52 ErrInvalidFormat = errors.New("cgroup: parsing file with invalid format failed") 53 ErrInvalidGroupPath = errors.New("cgroup: invalid group path") 54 55 // controllers2 is the group of all supported cgroupv2 controllers 56 controllers2 = map[string]controllerv2{ 57 "cpu": &cpu2{}, 58 "cpuset": &cpuset2{}, 59 "io": &io2{}, 60 "memory": &memory2{}, 61 "pids": &pid2{}, 62 "hugetlb": &hugeTLB2{}, 63 } 64 ) 65 66 // cgroupV2 represents a cgroup inside supported all cgroupV2 controllers 67 type cgroupV2 
struct { 68 // Mountpoint is the unified mount point of cgroupV2 69 Mountpoint string `json:"mountpoint"` 70 // Path is the relative path to the unified mountpoint 71 Path string `json:"path"` 72 // Controllers is the list of supported controllers 73 Controllers []string `json:"controllers"` 74 // Own is the list of owned path created when install this cgroup 75 Own []string `json:"own"` 76 } 77 78 func newCgroupV2(mountpoint, group string, useSystemd bool) (Cgroup, error) { 79 data, err := ioutil.ReadFile(filepath.Join(mountpoint, "cgroup.controllers")) 80 if err != nil { 81 return nil, err 82 } 83 cg := &cgroupV2{ 84 Mountpoint: mountpoint, 85 Path: group, 86 Controllers: strings.Fields(string(data)), 87 } 88 if useSystemd { 89 return newCgroupV2Systemd(cg) 90 } 91 return cg, err 92 } 93 94 func (c *cgroupV2) createCgroupPaths() (bool, error) { 95 // setup all known controllers for the current subtree 96 // For example, given path /foo/bar and mount /sys/fs/cgroup, we need to write 97 // the controllers to: 98 // * /sys/fs/cgroup/cgroup.subtree_control 99 // * /sys/fs/cgroup/foo/cgroup.subtree_control 100 val := "+" + strings.Join(c.Controllers, " +") 101 elements := strings.Split(c.Path, "/") 102 current := c.Mountpoint 103 created := false 104 105 for i, e := range elements { 106 current = filepath.Join(current, e) 107 if i > 0 { 108 if err := os.Mkdir(current, 0o755); err != nil { 109 if !os.IsExist(err) { 110 return false, err 111 } 112 } else { 113 created = true 114 c.Own = append(c.Own, current) 115 } 116 } 117 // enable all known controllers for subtree 118 if i < len(elements)-1 { 119 if err := writeFile(filepath.Join(current, subtreeControl), []byte(val), 0700); err != nil { 120 return false, err 121 } 122 } 123 } 124 return created, nil 125 } 126 127 // Install creates and configures cgroups. 
128 func (c *cgroupV2) Install(res *specs.LinuxResources) error { 129 log.Debugf("Installing cgroup path %q", c.MakePath("")) 130 // Clean up partially created cgroups on error. Errors during cleanup itself 131 // are ignored. 132 clean := cleanup.Make(func() { _ = c.Uninstall() }) 133 defer clean.Clean() 134 135 created, err := c.createCgroupPaths() 136 if err != nil { 137 return err 138 } 139 if created { 140 // If we created our final cgroup path then we can set the resources. 141 for controllerName, ctrlr := range controllers2 { 142 // First check if our controller is found in the system. 143 found := false 144 for _, knownController := range c.Controllers { 145 if controllerName == knownController { 146 found = true 147 } 148 } 149 150 // In case we don't have the controller. 151 if found { 152 if err := ctrlr.set(res, c.MakePath("")); err != nil { 153 return err 154 } 155 continue 156 } 157 if ctrlr.optional() { 158 if err := ctrlr.skip(res); err != nil { 159 return err 160 } 161 } else { 162 return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.MakePath("")) 163 } 164 } 165 } 166 167 clean.Release() 168 return nil 169 } 170 171 // Uninstall removes the settings done in Install(). If cgroup path already 172 // existed when Install() was called, Uninstall is a noop. 173 func (c *cgroupV2) Uninstall() error { 174 log.Debugf("Deleting cgroup %q", c.MakePath("")) 175 176 // If we try to remove the cgroup too soon after killing the sandbox we 177 // might get EBUSY, so we retry for a few seconds until it succeeds. 178 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 179 defer cancel() 180 b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) 181 182 // Deletion must occur reverse order, because they may contain ancestors. 
183 for i := len(c.Own) - 1; i >= 0; i-- { 184 current := c.Own[i] 185 log.Debugf("Removing cgroup for path=%q", current) 186 187 fn := func() error { 188 err := unix.Rmdir(current) 189 if os.IsNotExist(err) { 190 return nil 191 } 192 return err 193 } 194 if err := backoff.Retry(fn, b); err != nil { 195 return fmt.Errorf("removing cgroup path %q: %w", current, err) 196 } 197 } 198 199 return nil 200 } 201 202 // Join adds the current process to the all controllers. Returns function that 203 // restores cgroup to the original state. 204 func (c *cgroupV2) Join() (func(), error) { 205 // First save the current state so it can be restored. 206 paths, err := loadPaths("self") 207 if err != nil { 208 return nil, err 209 } 210 // Since this is unified, get the first path of current process's cgroup is 211 // enough. 212 undoPath := filepath.Join(c.Mountpoint, paths[cgroup2Key]) 213 214 cu := cleanup.Make(func() { 215 log.Debugf("Restoring cgroup %q", undoPath) 216 // Writing the value 0 to a cgroup.procs file causes 217 // the writing process to be moved to the corresponding 218 // cgroup. - cgroups(7). 219 if err := setValue(undoPath, "cgroup.procs", "0"); err != nil { 220 log.Warningf("Error restoring cgroup %q: %v", undoPath, err) 221 } 222 }) 223 defer cu.Clean() 224 225 // now join the cgroup 226 if err := setValue(c.MakePath(""), "cgroup.procs", "0"); err != nil { 227 return nil, err 228 } 229 230 return cu.Release(), nil 231 } 232 233 // CPUQuota returns the CFS CPU quota. 
234 func (c *cgroupV2) CPUQuota() (float64, error) { 235 cpuMax, err := getValue(c.MakePath(""), "cpu.max") 236 if err != nil { 237 return -1, err 238 } 239 240 return parseCPUQuota(cpuMax) 241 } 242 243 func parseCPUQuota(cpuMax string) (float64, error) { 244 data := strings.SplitN(strings.TrimSpace(cpuMax), " ", 2) 245 if len(data) != 2 { 246 return -1, fmt.Errorf("invalid cpu.max data %q", cpuMax) 247 } 248 249 // no cpu limit if quota is max 250 if data[0] == "max" { 251 return -1, nil 252 } 253 254 quota, err := strconv.ParseInt(data[0], 10, 64) 255 if err != nil { 256 return -1, err 257 } 258 259 period, err := strconv.ParseInt(data[1], 10, 64) 260 if err != nil { 261 return -1, err 262 } 263 264 if quota <= 0 || period <= 0 { 265 return -1, err 266 } 267 return float64(quota) / float64(period), nil 268 269 } 270 271 // CPUUsage returns the total CPU usage of the cgroup. 272 func (c *cgroupV2) CPUUsage() (uint64, error) { 273 cpuStat, err := getValue(c.MakePath(""), "cpu.stat") 274 if err != nil { 275 return 0, err 276 } 277 278 sc := bufio.NewScanner(strings.NewReader(cpuStat)) 279 for sc.Scan() { 280 key, value, err := parseKeyValue(sc.Text()) 281 if err != nil { 282 return 0, err 283 } 284 if key == "usage_usec" { 285 return value, nil 286 } 287 } 288 289 return 0, nil 290 } 291 292 // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. 293 func (c *cgroupV2) NumCPU() (int, error) { 294 cpuset, err := getValue(c.MakePath(""), "cpuset.cpus.effective") 295 if err != nil { 296 return 0, err 297 } 298 return countCpuset(strings.TrimSpace(cpuset)) 299 } 300 301 // MemoryLimit returns the memory limit. 
302 func (c *cgroupV2) MemoryLimit() (uint64, error) { 303 limStr, err := getValue(c.MakePath(""), "memory.max") 304 if err != nil { 305 return 0, err 306 } 307 limStr = strings.TrimSpace(limStr) 308 if limStr == "max" { 309 return math.MaxUint64, nil 310 } 311 return strconv.ParseUint(limStr, 10, 64) 312 } 313 314 // MakePath builds a path to the given controller. 315 func (c *cgroupV2) MakePath(controllerName string) string { 316 return filepath.Join(c.Mountpoint, c.Path) 317 } 318 319 type controllerv2 interface { 320 controller 321 generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) 322 } 323 324 type cpu2 struct { 325 mandatory 326 } 327 328 func (*cpu2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 329 props := []dbus.Property{} 330 if spec == nil || spec.CPU == nil { 331 return props, nil 332 } 333 cpu := spec.CPU 334 if cpu.Shares != nil { 335 weight := convertCPUSharesToCgroupV2Value(*cpu.Shares) 336 if weight != 0 { 337 props = append(props, newProp("CPUWeight", weight)) 338 } 339 } 340 var ( 341 period uint64 342 quota int64 343 ) 344 if cpu.Period != nil { 345 period = *cpu.Period 346 } 347 if cpu.Quota != nil { 348 quota = *cpu.Quota 349 } 350 if period != 0 { 351 props = append(props, newProp("CPUQuotaPeriodUSec", period)) 352 } 353 if quota != 0 || period != 0 { 354 // Corresponds to USEC_INFINITY in systemd. 355 cpuQuotaPerSecUSec := uint64(math.MaxUint64) 356 if quota > 0 { 357 if period == 0 { 358 // Assume the default. 359 period = defaultPeriod 360 } 361 // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to 362 // CPUQuota (integer percentage of CPU) internally. This means that if a 363 // fractional percent of CPU is indicated by spec.CPU.Quota, we need to 364 // round up to the nearest 10ms (1% of a second) such that child cgroups 365 // can set the cpu.cfs_quota_us they expect. 
366 cpuQuotaPerSecUSec = uint64(quota*1000000) / period 367 if cpuQuotaPerSecUSec%10000 != 0 { 368 cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 369 } 370 } 371 props = append(props, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) 372 } 373 return props, nil 374 } 375 376 func (*cpu2) set(spec *specs.LinuxResources, path string) error { 377 if spec == nil || spec.CPU == nil { 378 return nil 379 } 380 381 if spec.CPU.Shares != nil { 382 weight := convertCPUSharesToCgroupV2Value(*spec.CPU.Shares) 383 if weight != 0 { 384 if err := setValue(path, "cpu.weight", strconv.FormatUint(weight, 10)); err != nil { 385 return err 386 } 387 } 388 } 389 390 if spec.CPU.Period != nil || spec.CPU.Quota != nil { 391 v := "max" 392 if spec.CPU.Quota != nil && *spec.CPU.Quota > 0 { 393 v = strconv.FormatInt(*spec.CPU.Quota, 10) 394 } 395 396 var period uint64 397 if spec.CPU.Period != nil && *spec.CPU.Period != 0 { 398 period = *spec.CPU.Period 399 } else { 400 period = defaultPeriod 401 } 402 403 v += " " + strconv.FormatUint(period, 10) 404 if err := setValue(path, "cpu.max", v); err != nil { 405 return err 406 } 407 } 408 409 return nil 410 } 411 412 type cpuset2 struct { 413 mandatory 414 } 415 416 func (*cpuset2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 417 props := []dbus.Property{} 418 if spec == nil || spec.CPU == nil { 419 return props, nil 420 } 421 cpu := spec.CPU 422 if cpu.Cpus == "" && cpu.Mems == "" { 423 return props, nil 424 } 425 cpus := cpu.Cpus 426 mems := cpu.Mems 427 if cpus != "" { 428 bits, err := RangeToBits(cpus) 429 if err != nil { 430 return nil, fmt.Errorf("%w: cpus=%q conversion error: %v", ErrBadResourceSpec, cpus, err) 431 } 432 props = append(props, newProp("AllowedCPUs", bits)) 433 } 434 if mems != "" { 435 bits, err := RangeToBits(mems) 436 if err != nil { 437 return nil, fmt.Errorf("%w: mems=%q conversion error: %v", ErrBadResourceSpec, mems, err) 438 } 439 props = append(props, 
newProp("AllowedMemoryNodes", bits)) 440 } 441 return props, nil 442 } 443 444 func (*cpuset2) set(spec *specs.LinuxResources, path string) error { 445 if spec == nil || spec.CPU == nil { 446 return nil 447 } 448 449 if spec.CPU.Cpus != "" { 450 if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { 451 return err 452 } 453 } 454 455 if spec.CPU.Mems != "" { 456 if err := setValue(path, "cpuset.mems", spec.CPU.Mems); err != nil { 457 return err 458 } 459 } 460 461 return nil 462 } 463 464 type memory2 struct { 465 mandatory 466 } 467 468 func (*memory2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 469 props := []dbus.Property{} 470 if spec == nil || spec.Memory == nil { 471 return props, nil 472 } 473 mem := spec.Memory 474 if mem.Swap != nil { 475 if mem.Limit == nil { 476 return nil, ErrBadResourceSpec 477 } 478 swap, err := convertMemorySwapToCgroupV2Value(*mem.Swap, *mem.Limit) 479 if err != nil { 480 return nil, err 481 } 482 props = append(props, newProp("MemorySwapMax", uint64(swap))) 483 } 484 if mem.Limit != nil { 485 props = append(props, newProp("MemoryMax", uint64(*mem.Limit))) 486 } 487 if mem.Reservation != nil { 488 props = append(props, newProp("MemoryLow", uint64(*mem.Reservation))) 489 } 490 return props, nil 491 } 492 493 func (*memory2) set(spec *specs.LinuxResources, path string) error { 494 if spec == nil || spec.Memory == nil { 495 return nil 496 } 497 498 if spec.Memory.Swap != nil { 499 // in cgroup v2, we set memory and swap separately, but the spec specifies 500 // Swap field as memory+swap, so we need memory limit here to be set in 501 // order to get the correct swap value. 
502 if spec.Memory.Limit == nil { 503 return errors.New("cgroup: Memory.Swap is set without Memory.Limit") 504 } 505 506 swap, err := convertMemorySwapToCgroupV2Value(*spec.Memory.Swap, *spec.Memory.Limit) 507 if err != nil { 508 return nil 509 } 510 swapStr := numToStr(swap) 511 // memory and memorySwap set to the same value -- disable swap 512 if swapStr == "" && swap == 0 && *spec.Memory.Swap > 0 { 513 swapStr = "0" 514 } 515 // never write empty string to `memory.swap.max`, it means set to 0. 516 if swapStr != "" { 517 if err := setValue(path, "memory.swap.max", swapStr); err != nil { 518 return err 519 } 520 } 521 } 522 523 if spec.Memory.Limit != nil { 524 if val := numToStr(*spec.Memory.Limit); val != "" { 525 if err := setValue(path, "memory.max", val); err != nil { 526 return err 527 } 528 } 529 } 530 531 if spec.Memory.Reservation != nil { 532 if val := numToStr(*spec.Memory.Reservation); val != "" { 533 if err := setValue(path, "memory.low", val); err != nil { 534 return err 535 } 536 } 537 } 538 539 return nil 540 } 541 542 type pid2 struct { 543 mandatory 544 } 545 546 func (*pid2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 547 if spec != nil && spec.Pids != nil { 548 return []dbus.Property{newProp("TasksMax", uint64(spec.Pids.Limit))}, nil 549 } 550 return []dbus.Property{}, nil 551 } 552 553 func (*pid2) set(spec *specs.LinuxResources, path string) error { 554 if spec == nil || spec.Pids == nil { 555 return nil 556 } 557 558 if val := numToStr(spec.Pids.Limit); val != "" { 559 return setValue(path, "pids.max", val) 560 } 561 562 return nil 563 } 564 565 type io2 struct { 566 mandatory 567 } 568 569 func (*io2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 570 props := []dbus.Property{} 571 if spec == nil || spec.BlockIO == nil { 572 return props, nil 573 } 574 io := spec.BlockIO 575 if io != nil { 576 if io.Weight != nil && *io.Weight != 0 { 577 ioWeight := 
convertBlkIOToIOWeightValue(*io.Weight) 578 props = append(props, newProp("IOWeight", ioWeight)) 579 } 580 for _, dev := range io.WeightDevice { 581 val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight) 582 props = append(props, newProp("IODeviceWeight", val)) 583 } 584 props = addIOProps(props, "IOReadBandwidthMax", io.ThrottleReadBpsDevice) 585 props = addIOProps(props, "IOWriteBandwidthMax", io.ThrottleWriteBpsDevice) 586 props = addIOProps(props, "IOReadIOPSMax", io.ThrottleReadIOPSDevice) 587 props = addIOProps(props, "IOWriteIOPSMax", io.ThrottleWriteIOPSDevice) 588 } 589 return props, nil 590 } 591 592 func (*io2) set(spec *specs.LinuxResources, path string) error { 593 if spec == nil || spec.BlockIO == nil { 594 return nil 595 } 596 blkio := spec.BlockIO 597 598 var ( 599 err error 600 bfq *os.File 601 ) 602 603 // If BFQ IO scheduler is available, use it. 604 if blkio.Weight != nil || len(blkio.WeightDevice) > 0 { 605 bfq, err = os.Open(filepath.Join(path, "io.bfq.weight")) 606 if err == nil { 607 defer bfq.Close() 608 } else if !os.IsNotExist(err) { 609 return err 610 } 611 612 } 613 614 if blkio.Weight != nil && *blkio.Weight != 0 { 615 if bfq != nil { 616 if _, err := bfq.WriteString(strconv.FormatUint(uint64(*blkio.Weight), 10)); err != nil { 617 return err 618 } 619 } else { 620 // bfq io scheduler is not available, fallback to io.weight with 621 // a conversion scheme 622 ioWeight := convertBlkIOToIOWeightValue(*blkio.Weight) 623 if err = setValue(path, "io.weight", strconv.FormatUint(ioWeight, 10)); err != nil { 624 return err 625 } 626 } 627 } 628 629 if bfqDeviceWeightSupported(bfq) { 630 // ignore leaf weight, does not apply to cgroupv2 631 for _, dev := range blkio.WeightDevice { 632 if dev.Weight != nil { 633 val := fmt.Sprintf("%d:%d %d\n", dev.Major, dev.Minor, *dev.Weight) 634 if _, err := bfq.WriteString(val); err != nil { 635 return fmt.Errorf("failed to set device weight %q: %w", val, err) 636 } 637 } 638 } 639 } 640 641 if 
err := setThrottle2(path, "rbps", blkio.ThrottleReadBpsDevice); err != nil { 642 return err 643 } 644 645 if err := setThrottle2(path, "wbps", blkio.ThrottleWriteBpsDevice); err != nil { 646 return err 647 } 648 649 if err := setThrottle2(path, "riops", blkio.ThrottleReadIOPSDevice); err != nil { 650 return err 651 } 652 653 if err := setThrottle2(path, "wiops", blkio.ThrottleWriteIOPSDevice); err != nil { 654 return err 655 } 656 657 return nil 658 } 659 660 func setThrottle2(path, name string, devs []specs.LinuxThrottleDevice) error { 661 for _, dev := range devs { 662 val := fmt.Sprintf("%d:%d %s=%d", dev.Major, dev.Minor, name, dev.Rate) 663 if err := setValue(path, "io.max", val); err != nil { 664 return err 665 } 666 } 667 return nil 668 } 669 670 type hugeTLB2 struct { 671 } 672 673 func (*hugeTLB2) optional() bool { 674 return true 675 } 676 677 func (*hugeTLB2) skip(spec *specs.LinuxResources) error { 678 if spec != nil && len(spec.HugepageLimits) > 0 { 679 return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found") 680 } 681 return nil 682 } 683 684 func (*hugeTLB2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { 685 return nil, nil 686 } 687 688 func (*hugeTLB2) set(spec *specs.LinuxResources, path string) error { 689 if spec == nil { 690 return nil 691 } 692 for _, limit := range spec.HugepageLimits { 693 name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize) 694 val := strconv.FormatUint(limit.Limit, 10) 695 if err := setValue(path, name, val); err != nil { 696 return err 697 } 698 } 699 return nil 700 } 701 702 // Since the OCI spec is designed for cgroup v1, in some cases 703 // there is need to convert from the cgroup v1 configuration to cgroup v2 704 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) 705 // convert from [2-262144] to [1-10000] 706 // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" 707 func convertCPUSharesToCgroupV2Value(cpuShares 
uint64) uint64 { 708 if cpuShares == 0 { 709 return 0 710 } 711 return (1 + ((cpuShares-2)*9999)/262142) 712 } 713 714 // convertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec 715 // for use by cgroup v2 drivers. A conversion is needed since 716 // Resources.MemorySwap is defined as memory+swap combined, while in cgroup v2 717 // swap is a separate value. 718 func convertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { 719 // for compatibility with cgroup1 controller, set swap to unlimited in 720 // case the memory is set to unlimited, and swap is not explicitly set, 721 // treating the request as "set both memory and swap to unlimited". 722 if memory == -1 && memorySwap == 0 { 723 return -1, nil 724 } 725 if memorySwap == -1 || memorySwap == 0 { 726 // -1 is "max", 0 is "unset", so treat as is. 727 return memorySwap, nil 728 } 729 // sanity checks 730 if memory == 0 || memory == -1 { 731 return 0, errors.New("unable to set swap limit without memory limit") 732 } 733 if memory < 0 { 734 return 0, fmt.Errorf("invalid memory value: %d", memory) 735 } 736 if memorySwap < memory { 737 return 0, errors.New("memory+swap limit should be >= memory limit") 738 } 739 740 return memorySwap - memory, nil 741 } 742 743 // Since the OCI spec is designed for cgroup v1, in some cases 744 // there is need to convert from the cgroup v1 configuration to cgroup v2 745 // the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) 746 // convert linearly from [10-1000] to [1-10000] 747 func convertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { 748 if blkIoWeight == 0 { 749 return 0 750 } 751 return 1 + (uint64(blkIoWeight)-10)*9999/990 752 } 753 754 // numToStr converts an int64 value to a string for writing to a 755 // cgroupv2 files with .min, .max, .low, or .high suffix. 756 // The value of -1 is converted to "max" for cgroupv1 compatibility 757 // (which used to write -1 to remove the limit). 
// A value of 0 means "unset" and yields the empty string.
func numToStr(value int64) (ret string) {
	switch {
	case value == 0:
		ret = ""
	case value == -1:
		ret = "max"
	default:
		ret = strconv.FormatInt(value, 10)
	}
	return ret
}

// bfqDeviceWeightSupported checks for per-device BFQ weight support (added
// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
// It returns false when bfq is nil or cannot be read.
func bfqDeviceWeightSupported(bfq *os.File) bool {
	if bfq == nil {
		return false
	}

	if _, err := bfq.Seek(0, 0); err != nil {
		return false
	}

	buf := make([]byte, 32)
	n, err := bfq.Read(buf)
	if err != nil {
		return false
	}
	// If only a single number (default weight) is read back, we have an older
	// kernel. Only the bytes actually read are parsed: the unused tail of buf
	// is NUL padding, which TrimSpace does not strip and which would make
	// every parse fail.
	_, err = strconv.ParseInt(string(bytes.TrimSpace(buf[:n])), 10, 64)
	return err != nil
}

// parseKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its key as a string, and its value as uint64
// (parseUint is used to convert the value). For example,
// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
// Lines that do not consist of exactly two fields are rejected.
func parseKeyValue(t string) (string, uint64, error) {
	parts := strings.SplitN(t, " ", 3)
	if len(parts) != 2 {
		return "", 0, fmt.Errorf("line %q is not in key value format", t)
	}

	value, err := parseUint(parts[1], 10, 64)
	if err != nil {
		return "", 0, err
	}

	return parts[0], value, nil
}

// parseUint converts a string to an uint64 integer.
// Negative values are returned as zero because, due to kernel bugs,
// some of the memory cgroup stats can be negative.
func parseUint(s string, base, bitSize int) (uint64, error) {
	value, err := strconv.ParseUint(s, base, bitSize)
	if err == nil {
		return value, nil
	}

	// The input may still be a negative number: either one that fits into an
	// int64, or one below MinInt64, for which ParseInt reports ErrRange and
	// returns the (negative) minimum.
	intValue, intErr := strconv.ParseInt(s, base, bitSize)
	if intValue < 0 && (intErr == nil || errors.Is(intErr, strconv.ErrRange)) {
		return 0, nil
	}

	return value, err
}

// RangeToBits converts a text representation of a CPU mask (as written to
// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
// with the corresponding bits set (as consumed by systemd over dbus as
// AllowedCPUs/AllowedMemoryNodes unit property value).
// Empty elements and spaces around elements are tolerated; an input without
// any number is an error.
// Copied from runc.
func RangeToBits(str string) ([]byte, error) {
	bits := &big.Int{}

	for _, r := range strings.Split(str, ",") {
		// allow extra spaces around
		r = strings.TrimSpace(r)
		// allow empty elements (extra commas)
		if r == "" {
			continue
		}
		ranges := strings.SplitN(r, "-", 2)
		if len(ranges) > 1 {
			start, err := strconv.ParseUint(ranges[0], 10, 32)
			if err != nil {
				return nil, err
			}
			end, err := strconv.ParseUint(ranges[1], 10, 32)
			if err != nil {
				return nil, err
			}
			if start > end {
				return nil, errors.New("invalid range: " + r)
			}
			for i := start; i <= end; i++ {
				bits.SetBit(bits, int(i), 1)
			}
		} else {
			val, err := strconv.ParseUint(ranges[0], 10, 32)
			if err != nil {
				return nil, err
			}
			bits.SetBit(bits, int(val), 1)
		}
	}

	ret := bits.Bytes()
	if len(ret) == 0 {
		// do not allow empty values
		return nil, errors.New("empty value")
	}
	return ret, nil
}