github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/cgroup/cgroup_v2.go (about)

     1  // Copyright The runc Authors.
     2  // Copyright The containerd Authors.
     3  // Copyright 2021 The gVisor Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //     https://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package cgroup
    18  
    19  import (
    20  	"bufio"
    21  	"bytes"
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"io/ioutil"
    26  	"math"
    27  	"math/big"
    28  	"os"
    29  	"path/filepath"
    30  	"strconv"
    31  	"strings"
    32  	"time"
    33  
    34  	"github.com/cenkalti/backoff"
    35  	"github.com/coreos/go-systemd/v22/dbus"
    36  	specs "github.com/opencontainers/runtime-spec/specs-go"
    37  	"golang.org/x/sys/unix"
    38  	"github.com/metacubex/gvisor/pkg/cleanup"
    39  	"github.com/metacubex/gvisor/pkg/log"
    40  )
    41  
const (
	// subtreeControl is the per-cgroup file that controllers are written to
	// in order to enable them for the cgroup's subtree.
	subtreeControl = "cgroup.subtree_control"
	// controllersFile is the name of the file listing available controllers.
	controllersFile = "cgroup.controllers"
	// cgroup2Key is the key under which the unified (v2) hierarchy path of a
	// process is looked up in the map returned by loadPaths.
	cgroup2Key = "cgroup2"
	// memoryLimitCgroup is the file holding the memory limit.
	memoryLimitCgroup = "memory.max"
	// cpuLimitCgroup is the file holding the CPU quota and period.
	cpuLimitCgroup = "cpu.max"
	// maxLimitStr is the value cgroup v2 limit files use for "no limit".
	maxLimitStr = "max"

	// defaultPeriod is the CPU period used when the spec does not provide
	// one. See
	// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	defaultPeriod = 100000
)
    53  
var (
	// ErrInvalidFormat reports that a cgroup file could not be parsed because
	// its content is not in the expected format.
	ErrInvalidFormat = errors.New("cgroup: parsing file with invalid format failed")
	// ErrInvalidGroupPath reports that a cgroup group path is not valid.
	ErrInvalidGroupPath = errors.New("cgroup: invalid group path")

	// controllers2 is the set of all supported cgroupv2 controllers, keyed by
	// the controller name as it appears in cgroup.controllers.
	controllers2 = map[string]controllerv2{
		"cpu":     &cpu2{},
		"cpuset":  &cpuset2{},
		"io":      &io2{},
		"memory":  &memory2{},
		"pids":    &pid2{},
		"hugetlb": &hugeTLB2{},
	}
)
    68  
// cgroupV2 represents a cgroup in the unified (cgroup v2) hierarchy together
// with the controllers available at its mount point.
type cgroupV2 struct {
	// Mountpoint is the unified mount point of cgroupV2.
	Mountpoint string `json:"mountpoint"`
	// Path is the path of the cgroup relative to the unified mount point.
	Path string `json:"path"`
	// Controllers is the list of controllers read from cgroup.controllers at
	// the mount point.
	Controllers []string `json:"controllers"`
	// Own is the list of directories that were created while installing this
	// cgroup; these are the ones removed again by Uninstall.
	Own []string `json:"own"`
}
    80  
    81  func newCgroupV2(mountpoint, group string, useSystemd bool) (Cgroup, error) {
    82  	data, err := ioutil.ReadFile(filepath.Join(mountpoint, "cgroup.controllers"))
    83  	if err != nil {
    84  		return nil, err
    85  	}
    86  	cg := &cgroupV2{
    87  		Mountpoint:  mountpoint,
    88  		Path:        group,
    89  		Controllers: strings.Fields(string(data)),
    90  	}
    91  	if useSystemd {
    92  		return newCgroupV2Systemd(cg)
    93  	}
    94  	return cg, err
    95  }
    96  
// createCgroupPaths creates every missing directory along c.Path below the
// mount point and enables all controllers listed in c.Controllers on each
// ancestor of the final cgroup. Directories created here are appended to
// c.Own. It returns true if at least one directory was created.
func (c *cgroupV2) createCgroupPaths() (bool, error) {
	// setup all known controllers for the current subtree
	// For example, given path /foo/bar and mount /sys/fs/cgroup, we need to write
	// the controllers to:
	//	* /sys/fs/cgroup/cgroup.subtree_control
	//	* /sys/fs/cgroup/foo/cgroup.subtree_control
	val := "+" + strings.Join(c.Controllers, " +")
	elements := strings.Split(c.Path, "/")
	current := c.Mountpoint
	created := false

	for i, e := range elements {
		current = filepath.Join(current, e)
		// The first element is never created; with a path such as /foo/bar
		// it is the empty string, so current is still the mount point.
		if i > 0 {
			if err := os.Mkdir(current, 0o755); err != nil {
				// A directory that already exists is not owned by us and is
				// left out of c.Own.
				if !os.IsExist(err) {
					return false, err
				}
			} else {
				created = true
				c.Own = append(c.Own, current)
			}
		}
		// enable all known controllers for subtree; the last element is the
		// cgroup itself and is skipped.
		if i < len(elements)-1 {
			if err := writeFile(filepath.Join(current, subtreeControl), []byte(val), 0700); err != nil {
				return false, err
			}
		}
	}
	return created, nil
}
   129  
   130  // Install creates and configures cgroups.
   131  func (c *cgroupV2) Install(res *specs.LinuxResources) error {
   132  	log.Debugf("Installing cgroup path %q", c.MakePath(""))
   133  	// Clean up partially created cgroups on error. Errors during cleanup itself
   134  	// are ignored.
   135  	clean := cleanup.Make(func() { _ = c.Uninstall() })
   136  	defer clean.Clean()
   137  
   138  	created, err := c.createCgroupPaths()
   139  	if err != nil {
   140  		return err
   141  	}
   142  	if created {
   143  		// If we created our final cgroup path then we can set the resources.
   144  		for controllerName, ctrlr := range controllers2 {
   145  			// First check if our controller is found in the system.
   146  			found := false
   147  			for _, knownController := range c.Controllers {
   148  				if controllerName == knownController {
   149  					found = true
   150  				}
   151  			}
   152  
   153  			// In case we don't have the controller.
   154  			if found {
   155  				if err := ctrlr.set(res, c.MakePath("")); err != nil {
   156  					return err
   157  				}
   158  				continue
   159  			}
   160  			if ctrlr.optional() {
   161  				if err := ctrlr.skip(res); err != nil {
   162  					return err
   163  				}
   164  			} else {
   165  				return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.MakePath(""))
   166  			}
   167  		}
   168  	}
   169  
   170  	clean.Release()
   171  	return nil
   172  }
   173  
// Uninstall removes the directories recorded in c.Own, i.e. the ones created
// by Install(). If the cgroup path already existed when Install() was called,
// c.Own is empty and Uninstall is a noop.
func (c *cgroupV2) Uninstall() error {
	log.Debugf("Deleting cgroup %q", c.MakePath(""))

	// If we try to remove the cgroup too soon after killing the sandbox we
	// might get EBUSY, so we retry for a few seconds until it succeeds.
	// The context, and therefore the 5 second budget, is shared by all the
	// removals below.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)

	// Deletion must occur in reverse order of creation, because earlier
	// entries may be ancestors of later ones.
	for i := len(c.Own) - 1; i >= 0; i-- {
		current := c.Own[i]
		log.Debugf("Removing cgroup for path=%q", current)

		fn := func() error {
			err := unix.Rmdir(current)
			// A directory that is already gone counts as removed.
			if os.IsNotExist(err) {
				return nil
			}
			return err
		}
		if err := backoff.Retry(fn, b); err != nil {
			return fmt.Errorf("removing cgroup path %q: %w", current, err)
		}
	}

	return nil
}
   204  
// Join moves the current process into this cgroup. On success it returns a
// function that moves the process back to the cgroup it was in before the
// call.
func (c *cgroupV2) Join() (func(), error) {
	// First save the current state so it can be restored.
	paths, err := loadPaths("self")
	if err != nil {
		return nil, err
	}
	// Since the hierarchy is unified, the single cgroup2 path of the current
	// process is enough to restore it.
	undoPath := filepath.Join(c.Mountpoint, paths[cgroup2Key])

	cu := cleanup.Make(func() {
		log.Debugf("Restoring cgroup %q", undoPath)
		// Writing the value 0 to a cgroup.procs file causes
		// the writing process to be moved to the corresponding
		// cgroup. - cgroups(7).
		if err := setValue(undoPath, "cgroup.procs", "0"); err != nil {
			log.Warningf("Error restoring cgroup %q: %v", undoPath, err)
		}
	})
	// Runs the restore if we return early with an error; on success the
	// cleanup is released below and handed to the caller instead.
	defer cu.Clean()

	// now join the cgroup
	if err := setValue(c.MakePath(""), "cgroup.procs", "0"); err != nil {
		return nil, err
	}

	return cu.Release(), nil
}
   235  
   236  func getCPUQuota(path string) (float64, error) {
   237  	cpuMax, err := getValue(path, cpuLimitCgroup)
   238  	if err != nil {
   239  		return -1, err
   240  	}
   241  	return parseCPUQuota(cpuMax)
   242  }
   243  
   244  // CPUQuota returns the CFS CPU quota.
   245  func (c *cgroupV2) CPUQuota() (float64, error) {
   246  	cpuQuota, err := getCPUQuota(c.MakePath(""))
   247  	if err != nil {
   248  		return -1, err
   249  	}
   250  	// In cgroupv2+systemd, limits are set in the parent slice rather
   251  	// than the leaf node. Check the parent to see if this is the case.
   252  	if cpuQuota == -1 {
   253  		cpuQuota, err = getCPUQuota(filepath.Dir(c.MakePath("")))
   254  		if err != nil && errors.Is(err, os.ErrNotExist) {
   255  			err = nil
   256  		}
   257  	}
   258  	return cpuQuota, nil
   259  }
   260  
   261  func parseCPUQuota(cpuMax string) (float64, error) {
   262  	data := strings.SplitN(strings.TrimSpace(cpuMax), " ", 2)
   263  	if len(data) != 2 {
   264  		return -1, fmt.Errorf("invalid cpu.max data %q", cpuMax)
   265  	}
   266  
   267  	// no cpu limit if quota is max
   268  	if data[0] == maxLimitStr {
   269  		return -1, nil
   270  	}
   271  
   272  	quota, err := strconv.ParseInt(data[0], 10, 64)
   273  	if err != nil {
   274  		return -1, err
   275  	}
   276  
   277  	period, err := strconv.ParseInt(data[1], 10, 64)
   278  	if err != nil {
   279  		return -1, err
   280  	}
   281  
   282  	if quota <= 0 || period <= 0 {
   283  		return -1, err
   284  	}
   285  	return float64(quota) / float64(period), nil
   286  
   287  }
   288  
   289  // CPUUsage returns the total CPU usage of the cgroup.
   290  func (c *cgroupV2) CPUUsage() (uint64, error) {
   291  	cpuStat, err := getValue(c.MakePath(""), "cpu.stat")
   292  	if err != nil {
   293  		return 0, err
   294  	}
   295  
   296  	sc := bufio.NewScanner(strings.NewReader(cpuStat))
   297  	for sc.Scan() {
   298  		key, value, err := parseKeyValue(sc.Text())
   299  		if err != nil {
   300  			return 0, err
   301  		}
   302  		if key == "usage_usec" {
   303  			return value, nil
   304  		}
   305  	}
   306  
   307  	return 0, nil
   308  }
   309  
   310  // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
   311  func (c *cgroupV2) NumCPU() (int, error) {
   312  	cpuset, err := getValue(c.MakePath(""), "cpuset.cpus.effective")
   313  	if err != nil {
   314  		return 0, err
   315  	}
   316  	return countCpuset(strings.TrimSpace(cpuset))
   317  }
   318  
   319  func getMemoryLimit(path string) (string, error) {
   320  	limStr, err := getValue(path, memoryLimitCgroup)
   321  	if err != nil {
   322  		return "", err
   323  	}
   324  	return strings.TrimSpace(limStr), nil
   325  }
   326  
   327  // MemoryLimit returns the memory limit.
   328  func (c *cgroupV2) MemoryLimit() (uint64, error) {
   329  	limStr, err := getMemoryLimit(c.MakePath(""))
   330  	if err != nil {
   331  		return 0, err
   332  	}
   333  	// In cgroupv2+systemd, limits are set in the parent slice rather
   334  	// than the leaf node. Check the parent to see if this is the case.
   335  	if limStr == maxLimitStr {
   336  		parentLimStr, err := getMemoryLimit(filepath.Dir(c.MakePath("")))
   337  		if err != nil && !errors.Is(err, os.ErrNotExist) {
   338  			return 0, err
   339  		}
   340  		if parentLimStr != "" {
   341  			limStr = parentLimStr
   342  		}
   343  		if limStr == maxLimitStr {
   344  			return math.MaxUint64, nil
   345  		}
   346  	}
   347  	return strconv.ParseUint(limStr, 10, 64)
   348  }
   349  
// MakePath returns the absolute path of the cgroup, i.e. Path joined onto
// Mountpoint. The controller argument is ignored: in the unified hierarchy
// the result is the same for every controller.
func (c *cgroupV2) MakePath(string) string {
	return filepath.Join(c.Mountpoint, c.Path)
}
   354  
// controllerv2 is a cgroup v2 controller. In addition to the methods of
// controller, it can translate a resource spec into the dbus properties used
// when the cgroup is managed through systemd.
type controllerv2 interface {
	controller
	// generateProperties returns the dbus properties that correspond to the
	// parts of spec handled by this controller.
	generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error)
}
   359  
// cpu2 is the controllerv2 registered for the "cpu" controller.
type cpu2 struct {
	// NOTE(review): mandatory is declared elsewhere; presumably it marks the
	// controller as required by supplying optional/skip — confirm.
	mandatory
}
   363  
   364  func (*cpu2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   365  	props := []dbus.Property{}
   366  	if spec == nil || spec.CPU == nil {
   367  		return props, nil
   368  	}
   369  	cpu := spec.CPU
   370  	if cpu.Shares != nil {
   371  		weight := convertCPUSharesToCgroupV2Value(*cpu.Shares)
   372  		if weight != 0 {
   373  			props = append(props, newProp("CPUWeight", weight))
   374  		}
   375  	}
   376  	var (
   377  		period uint64
   378  		quota  int64
   379  	)
   380  	if cpu.Period != nil {
   381  		period = *cpu.Period
   382  	}
   383  	if cpu.Quota != nil {
   384  		quota = *cpu.Quota
   385  	}
   386  	if period != 0 {
   387  		props = append(props, newProp("CPUQuotaPeriodUSec", period))
   388  	}
   389  	if quota != 0 || period != 0 {
   390  		// Corresponds to USEC_INFINITY in systemd.
   391  		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
   392  		if quota > 0 {
   393  			if period == 0 {
   394  				// Assume the default.
   395  				period = defaultPeriod
   396  			}
   397  			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to
   398  			// CPUQuota (integer percentage of CPU) internally. This means that if a
   399  			// fractional percent of CPU is indicated by spec.CPU.Quota, we need to
   400  			// round up to the nearest 10ms (1% of a second) such that child cgroups
   401  			// can set the cpu.cfs_quota_us they expect.
   402  			cpuQuotaPerSecUSec = uint64(quota*1000000) / period
   403  			if cpuQuotaPerSecUSec%10000 != 0 {
   404  				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
   405  			}
   406  		}
   407  		props = append(props, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
   408  	}
   409  	return props, nil
   410  }
   411  
   412  func (*cpu2) set(spec *specs.LinuxResources, path string) error {
   413  	if spec == nil || spec.CPU == nil {
   414  		return nil
   415  	}
   416  
   417  	if spec.CPU.Shares != nil {
   418  		weight := convertCPUSharesToCgroupV2Value(*spec.CPU.Shares)
   419  		if weight != 0 {
   420  			if err := setValue(path, "cpu.weight", strconv.FormatUint(weight, 10)); err != nil {
   421  				return err
   422  			}
   423  		}
   424  	}
   425  
   426  	if spec.CPU.Period != nil || spec.CPU.Quota != nil {
   427  		v := maxLimitStr
   428  		if spec.CPU.Quota != nil && *spec.CPU.Quota > 0 {
   429  			v = strconv.FormatInt(*spec.CPU.Quota, 10)
   430  		}
   431  
   432  		var period uint64
   433  		if spec.CPU.Period != nil && *spec.CPU.Period != 0 {
   434  			period = *spec.CPU.Period
   435  		} else {
   436  			period = defaultPeriod
   437  		}
   438  
   439  		v += " " + strconv.FormatUint(period, 10)
   440  		if err := setValue(path, "cpu.max", v); err != nil {
   441  			return err
   442  		}
   443  	}
   444  
   445  	return nil
   446  }
   447  
// cpuset2 is the controllerv2 registered for the "cpuset" controller.
type cpuset2 struct {
	// NOTE(review): mandatory is declared elsewhere; presumably it marks the
	// controller as required by supplying optional/skip — confirm.
	mandatory
}
   451  
   452  func (*cpuset2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   453  	props := []dbus.Property{}
   454  	if spec == nil || spec.CPU == nil {
   455  		return props, nil
   456  	}
   457  	cpu := spec.CPU
   458  	if cpu.Cpus == "" && cpu.Mems == "" {
   459  		return props, nil
   460  	}
   461  	cpus := cpu.Cpus
   462  	mems := cpu.Mems
   463  	if cpus != "" {
   464  		bits, err := RangeToBits(cpus)
   465  		if err != nil {
   466  			return nil, fmt.Errorf("%w: cpus=%q conversion error: %v", ErrBadResourceSpec, cpus, err)
   467  		}
   468  		props = append(props, newProp("AllowedCPUs", bits))
   469  	}
   470  	if mems != "" {
   471  		bits, err := RangeToBits(mems)
   472  		if err != nil {
   473  			return nil, fmt.Errorf("%w: mems=%q conversion error: %v", ErrBadResourceSpec, mems, err)
   474  		}
   475  		props = append(props, newProp("AllowedMemoryNodes", bits))
   476  	}
   477  	return props, nil
   478  }
   479  
   480  func (*cpuset2) set(spec *specs.LinuxResources, path string) error {
   481  	if spec == nil || spec.CPU == nil {
   482  		return nil
   483  	}
   484  
   485  	if spec.CPU.Cpus != "" {
   486  		if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil {
   487  			return err
   488  		}
   489  	}
   490  
   491  	if spec.CPU.Mems != "" {
   492  		if err := setValue(path, "cpuset.mems", spec.CPU.Mems); err != nil {
   493  			return err
   494  		}
   495  	}
   496  
   497  	return nil
   498  }
   499  
// memory2 is the controllerv2 registered for the "memory" controller.
type memory2 struct {
	// NOTE(review): mandatory is declared elsewhere; presumably it marks the
	// controller as required by supplying optional/skip — confirm.
	mandatory
}
   503  
   504  func (*memory2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   505  	props := []dbus.Property{}
   506  	if spec == nil || spec.Memory == nil {
   507  		return props, nil
   508  	}
   509  	mem := spec.Memory
   510  	if mem.Swap != nil {
   511  		if mem.Limit == nil {
   512  			return nil, ErrBadResourceSpec
   513  		}
   514  		swap, err := convertMemorySwapToCgroupV2Value(*mem.Swap, *mem.Limit)
   515  		if err != nil {
   516  			return nil, err
   517  		}
   518  		props = append(props, newProp("MemorySwapMax", uint64(swap)))
   519  	}
   520  	if mem.Limit != nil {
   521  		props = append(props, newProp("MemoryMax", uint64(*mem.Limit)))
   522  	}
   523  	if mem.Reservation != nil {
   524  		props = append(props, newProp("MemoryLow", uint64(*mem.Reservation)))
   525  	}
   526  	return props, nil
   527  }
   528  
   529  func (*memory2) set(spec *specs.LinuxResources, path string) error {
   530  	if spec == nil || spec.Memory == nil {
   531  		return nil
   532  	}
   533  
   534  	if spec.Memory.Swap != nil {
   535  		// in cgroup v2, we set memory and swap separately, but the spec specifies
   536  		// Swap field as memory+swap, so we need memory limit here to be set in
   537  		// order to get the correct swap value.
   538  		if spec.Memory.Limit == nil {
   539  			return errors.New("cgroup: Memory.Swap is set without Memory.Limit")
   540  		}
   541  
   542  		swap, err := convertMemorySwapToCgroupV2Value(*spec.Memory.Swap, *spec.Memory.Limit)
   543  		if err != nil {
   544  			return nil
   545  		}
   546  		swapStr := numToStr(swap)
   547  		// memory and memorySwap set to the same value -- disable swap
   548  		if swapStr == "" && swap == 0 && *spec.Memory.Swap > 0 {
   549  			swapStr = "0"
   550  		}
   551  		// never write empty string to `memory.swap.max`, it means set to 0.
   552  		if swapStr != "" {
   553  			if err := setValue(path, "memory.swap.max", swapStr); err != nil {
   554  				return err
   555  			}
   556  		}
   557  	}
   558  
   559  	if spec.Memory.Limit != nil {
   560  		if val := numToStr(*spec.Memory.Limit); val != "" {
   561  			if err := setValue(path, "memory.max", val); err != nil {
   562  				return err
   563  			}
   564  		}
   565  	}
   566  
   567  	if spec.Memory.Reservation != nil {
   568  		if val := numToStr(*spec.Memory.Reservation); val != "" {
   569  			if err := setValue(path, "memory.low", val); err != nil {
   570  				return err
   571  			}
   572  		}
   573  	}
   574  
   575  	return nil
   576  }
   577  
// pid2 is the controllerv2 registered for the "pids" controller.
type pid2 struct {
	// NOTE(review): mandatory is declared elsewhere; presumably it marks the
	// controller as required by supplying optional/skip — confirm.
	mandatory
}
   581  
   582  func (*pid2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   583  	if spec != nil && spec.Pids != nil {
   584  		return []dbus.Property{newProp("TasksMax", uint64(spec.Pids.Limit))}, nil
   585  	}
   586  	return []dbus.Property{}, nil
   587  }
   588  
   589  func (*pid2) set(spec *specs.LinuxResources, path string) error {
   590  	if spec == nil || spec.Pids == nil {
   591  		return nil
   592  	}
   593  
   594  	if val := numToStr(spec.Pids.Limit); val != "" {
   595  		return setValue(path, "pids.max", val)
   596  	}
   597  
   598  	return nil
   599  }
   600  
// io2 is the controllerv2 registered for the "io" controller.
type io2 struct {
	// NOTE(review): mandatory is declared elsewhere; presumably it marks the
	// controller as required by supplying optional/skip — confirm.
	mandatory
}
   604  
   605  func (*io2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   606  	props := []dbus.Property{}
   607  	if spec == nil || spec.BlockIO == nil {
   608  		return props, nil
   609  	}
   610  	io := spec.BlockIO
   611  	if io != nil {
   612  		if io.Weight != nil && *io.Weight != 0 {
   613  			ioWeight := convertBlkIOToIOWeightValue(*io.Weight)
   614  			props = append(props, newProp("IOWeight", ioWeight))
   615  		}
   616  		for _, dev := range io.WeightDevice {
   617  			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
   618  			props = append(props, newProp("IODeviceWeight", val))
   619  		}
   620  		props = addIOProps(props, "IOReadBandwidthMax", io.ThrottleReadBpsDevice)
   621  		props = addIOProps(props, "IOWriteBandwidthMax", io.ThrottleWriteBpsDevice)
   622  		props = addIOProps(props, "IOReadIOPSMax", io.ThrottleReadIOPSDevice)
   623  		props = addIOProps(props, "IOWriteIOPSMax", io.ThrottleWriteIOPSDevice)
   624  	}
   625  	return props, nil
   626  }
   627  
   628  func (*io2) set(spec *specs.LinuxResources, path string) error {
   629  	if spec == nil || spec.BlockIO == nil {
   630  		return nil
   631  	}
   632  	blkio := spec.BlockIO
   633  
   634  	var (
   635  		err error
   636  		bfq *os.File
   637  	)
   638  
   639  	// If BFQ IO scheduler is available, use it.
   640  	if blkio.Weight != nil || len(blkio.WeightDevice) > 0 {
   641  		bfq, err = os.Open(filepath.Join(path, "io.bfq.weight"))
   642  		if err == nil {
   643  			defer bfq.Close()
   644  		} else if !os.IsNotExist(err) {
   645  			return err
   646  		}
   647  
   648  	}
   649  
   650  	if blkio.Weight != nil && *blkio.Weight != 0 {
   651  		if bfq != nil {
   652  			if _, err := bfq.WriteString(strconv.FormatUint(uint64(*blkio.Weight), 10)); err != nil {
   653  				return err
   654  			}
   655  		} else {
   656  			// bfq io scheduler is not available, fallback to io.weight with
   657  			// a conversion scheme
   658  			ioWeight := convertBlkIOToIOWeightValue(*blkio.Weight)
   659  			if err = setValue(path, "io.weight", strconv.FormatUint(ioWeight, 10)); err != nil {
   660  				return err
   661  			}
   662  		}
   663  	}
   664  
   665  	if bfqDeviceWeightSupported(bfq) {
   666  		// ignore leaf weight, does not apply to cgroupv2
   667  		for _, dev := range blkio.WeightDevice {
   668  			if dev.Weight != nil {
   669  				val := fmt.Sprintf("%d:%d %d\n", dev.Major, dev.Minor, *dev.Weight)
   670  				if _, err := bfq.WriteString(val); err != nil {
   671  					return fmt.Errorf("failed to set device weight %q: %w", val, err)
   672  				}
   673  			}
   674  		}
   675  	}
   676  
   677  	if err := setThrottle2(path, "rbps", blkio.ThrottleReadBpsDevice); err != nil {
   678  		return err
   679  	}
   680  
   681  	if err := setThrottle2(path, "wbps", blkio.ThrottleWriteBpsDevice); err != nil {
   682  		return err
   683  	}
   684  
   685  	if err := setThrottle2(path, "riops", blkio.ThrottleReadIOPSDevice); err != nil {
   686  		return err
   687  	}
   688  
   689  	if err := setThrottle2(path, "wiops", blkio.ThrottleWriteIOPSDevice); err != nil {
   690  		return err
   691  	}
   692  
   693  	return nil
   694  }
   695  
   696  func setThrottle2(path, name string, devs []specs.LinuxThrottleDevice) error {
   697  	for _, dev := range devs {
   698  		val := fmt.Sprintf("%d:%d %s=%d", dev.Major, dev.Minor, name, dev.Rate)
   699  		if err := setValue(path, "io.max", val); err != nil {
   700  			return err
   701  		}
   702  	}
   703  	return nil
   704  }
   705  
// hugeTLB2 is the controllerv2 registered for the "hugetlb" controller. It
// provides its own optional and skip methods below.
type hugeTLB2 struct {
}
   708  
// optional reports that the hugetlb controller may be missing from the
// system; Install then calls skip instead of failing.
func (*hugeTLB2) optional() bool {
	return true
}
   712  
   713  func (*hugeTLB2) skip(spec *specs.LinuxResources) error {
   714  	if spec != nil && len(spec.HugepageLimits) > 0 {
   715  		return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found")
   716  	}
   717  	return nil
   718  }
   719  
// generateProperties returns no properties: hugepage limits are not
// translated into dbus properties.
func (*hugeTLB2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
	return nil, nil
}
   723  
   724  func (*hugeTLB2) set(spec *specs.LinuxResources, path string) error {
   725  	if spec == nil {
   726  		return nil
   727  	}
   728  	for _, limit := range spec.HugepageLimits {
   729  		name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize)
   730  		val := strconv.FormatUint(limit.Limit, 10)
   731  		if err := setValue(path, name, val); err != nil {
   732  			return err
   733  		}
   734  	}
   735  	return nil
   736  }
   737  
   738  // Since the OCI spec is designed for cgroup v1, in some cases
   739  // there is need to convert from the cgroup v1 configuration to cgroup v2
   740  // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
   741  // convert from [2-262144] to [1-10000]
   742  // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)"
   743  func convertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
   744  	if cpuShares == 0 {
   745  		return 0
   746  	}
   747  	return (1 + ((cpuShares-2)*9999)/262142)
   748  }
   749  
   750  // convertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
   751  // for use by cgroup v2 drivers. A conversion is needed since
   752  // Resources.MemorySwap is defined as memory+swap combined, while in cgroup v2
   753  // swap is a separate value.
   754  func convertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
   755  	// for compatibility with cgroup1 controller, set swap to unlimited in
   756  	// case the memory is set to unlimited, and swap is not explicitly set,
   757  	// treating the request as "set both memory and swap to unlimited".
   758  	if memory == -1 && memorySwap == 0 {
   759  		return -1, nil
   760  	}
   761  	if memorySwap == -1 || memorySwap == 0 {
   762  		// -1 is "max", 0 is "unset", so treat as is.
   763  		return memorySwap, nil
   764  	}
   765  	// sanity checks
   766  	if memory == 0 || memory == -1 {
   767  		return 0, errors.New("unable to set swap limit without memory limit")
   768  	}
   769  	if memory < 0 {
   770  		return 0, fmt.Errorf("invalid memory value: %d", memory)
   771  	}
   772  	if memorySwap < memory {
   773  		return 0, errors.New("memory+swap limit should be >= memory limit")
   774  	}
   775  
   776  	return memorySwap - memory, nil
   777  }
   778  
   779  // Since the OCI spec is designed for cgroup v1, in some cases
   780  // there is need to convert from the cgroup v1 configuration to cgroup v2
   781  // the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
   782  // convert linearly from [10-1000] to [1-10000]
   783  func convertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
   784  	if blkIoWeight == 0 {
   785  		return 0
   786  	}
   787  	return 1 + (uint64(blkIoWeight)-10)*9999/990
   788  }
   789  
   790  // numToStr converts an int64 value to a string for writing to a
   791  // cgroupv2 files with .min, .max, .low, or .high suffix.
   792  // The value of -1 is converted to "max" for cgroupv1 compatibility
   793  // (which used to write -1 to remove the limit).
   794  func numToStr(value int64) (ret string) {
   795  	switch {
   796  	case value == 0:
   797  		ret = ""
   798  	case value == -1:
   799  		ret = maxLimitStr
   800  	default:
   801  		ret = strconv.FormatInt(value, 10)
   802  	}
   803  	return ret
   804  }
   805  
   806  // bfqDeviceWeightSupported checks for per-device BFQ weight support (added
   807  // in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
   808  func bfqDeviceWeightSupported(bfq *os.File) bool {
   809  	if bfq == nil {
   810  		return false
   811  	}
   812  
   813  	if _, err := bfq.Seek(0, 0); err != nil {
   814  		return false
   815  	}
   816  
   817  	buf := make([]byte, 32)
   818  	if _, err := bfq.Read(buf); err != nil {
   819  		return false
   820  	}
   821  	// If only a single number (default weight) if read back, we have older
   822  	// kernel.
   823  	_, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
   824  	return err != nil
   825  }
   826  
   827  // parseKeyValue parses a space-separated "name value" kind of cgroup
   828  // parameter and returns its key as a string, and its value as uint64
   829  // (ParseUint is used to convert the value). For example,
   830  // "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
   831  func parseKeyValue(t string) (string, uint64, error) {
   832  	parts := strings.SplitN(t, " ", 3)
   833  	if len(parts) != 2 {
   834  		return "", 0, fmt.Errorf("line %q is not in key value format", t)
   835  	}
   836  
   837  	value, err := parseUint(parts[1], 10, 64)
   838  	if err != nil {
   839  		return "", 0, err
   840  	}
   841  
   842  	return parts[0], value, nil
   843  }
   844  
   845  // parseUint converts a string to an uint64 integer.
   846  // Negative values are returned at zero as, due to kernel bugs,
   847  // some of the memory cgroup stats can be negative.
   848  func parseUint(s string, base, bitSize int) (uint64, error) {
   849  	value, err := strconv.ParseUint(s, base, bitSize)
   850  	if err != nil {
   851  		intValue, intErr := strconv.ParseInt(s, base, bitSize)
   852  		// 1. Handle negative values greater than MinInt64 (and)
   853  		// 2. Handle negative values lesser than MinInt64
   854  		if intErr == nil && intValue < 0 {
   855  			return 0, nil
   856  		} else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 {
   857  			return 0, nil
   858  		}
   859  
   860  		return value, err
   861  	}
   862  
   863  	return value, nil
   864  }
   865  
   866  // RangeToBits converts a text representation of a CPU mask (as written to
   867  // or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
   868  // with the corresponding bits set (as consumed by systemd over dbus as
   869  // AllowedCPUs/AllowedMemoryNodes unit property value).
   870  // Copied from runc.
   871  func RangeToBits(str string) ([]byte, error) {
   872  	bits := &big.Int{}
   873  
   874  	for _, r := range strings.Split(str, ",") {
   875  		// allow extra spaces around
   876  		r = strings.TrimSpace(r)
   877  		// allow empty elements (extra commas)
   878  		if r == "" {
   879  			continue
   880  		}
   881  		ranges := strings.SplitN(r, "-", 2)
   882  		if len(ranges) > 1 {
   883  			start, err := strconv.ParseUint(ranges[0], 10, 32)
   884  			if err != nil {
   885  				return nil, err
   886  			}
   887  			end, err := strconv.ParseUint(ranges[1], 10, 32)
   888  			if err != nil {
   889  				return nil, err
   890  			}
   891  			if start > end {
   892  				return nil, errors.New("invalid range: " + r)
   893  			}
   894  			for i := start; i <= end; i++ {
   895  				bits.SetBit(bits, int(i), 1)
   896  			}
   897  		} else {
   898  			val, err := strconv.ParseUint(ranges[0], 10, 32)
   899  			if err != nil {
   900  				return nil, err
   901  			}
   902  			bits.SetBit(bits, int(val), 1)
   903  		}
   904  	}
   905  
   906  	ret := bits.Bytes()
   907  	if len(ret) == 0 {
   908  		// do not allow empty values
   909  		return nil, errors.New("empty value")
   910  	}
   911  	return ret, nil
   912  }