github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/runsc/cgroup/cgroup_v2.go (about)

     1  // Copyright The runc Authors.
     2  // Copyright The containerd Authors.
     3  // Copyright 2021 The gVisor Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //     https://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package cgroup
    18  
    19  import (
    20  	"bufio"
    21  	"bytes"
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"io/ioutil"
    26  	"math"
    27  	"math/big"
    28  	"os"
    29  	"path/filepath"
    30  	"strconv"
    31  	"strings"
    32  	"time"
    33  
    34  	"github.com/cenkalti/backoff"
    35  	"github.com/coreos/go-systemd/v22/dbus"
    36  	specs "github.com/opencontainers/runtime-spec/specs-go"
    37  	"golang.org/x/sys/unix"
    38  	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
    39  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    40  )
    41  
// File names and keys used when talking to the cgroup v2 unified hierarchy.
//
// subtreeControl is the file written by createCgroupPaths to enable
// controllers for child cgroups, controllersFile is the file listing the
// controllers available in a cgroup, and cgroup2Key is the key used to look
// up the current process's cgroup v2 path in the map returned by loadPaths.
const (
	subtreeControl  = "cgroup.subtree_control"
	controllersFile = "cgroup.controllers"
	cgroup2Key      = "cgroup2"

	// defaultPeriod is the CPU period (in microseconds) used when the spec
	// does not provide one. See
	// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	defaultPeriod = 100000
)
    50  
var (
	// ErrInvalidFormat is the error value for a cgroup file whose contents
	// could not be parsed.
	ErrInvalidFormat    = errors.New("cgroup: parsing file with invalid format failed")
	// ErrInvalidGroupPath is the error value for an invalid cgroup path.
	ErrInvalidGroupPath = errors.New("cgroup: invalid group path")

	// controllers2 maps the name of every supported cgroup v2 controller to
	// its implementation. Install iterates over this map to apply resources.
	controllers2 = map[string]controllerv2{
		"cpu":     &cpu2{},
		"cpuset":  &cpuset2{},
		"io":      &io2{},
		"memory":  &memory2{},
		"pids":    &pid2{},
		"hugetlb": &hugeTLB2{},
	}
)
    65  
// cgroupV2 represents a single cgroup in the cgroup v2 unified hierarchy,
// together with the controllers available at its mount point.
type cgroupV2 struct {
	// Mountpoint is the unified mount point of cgroup v2.
	Mountpoint string `json:"mountpoint"`
	// Path is the cgroup path relative to the unified mount point.
	Path string `json:"path"`
	// Controllers is the list of controllers read from the mount point's
	// cgroup.controllers file.
	Controllers []string `json:"controllers"`
	// Own is the list of directories that were created while installing
	// this cgroup; these are the ones removed by Uninstall.
	Own []string `json:"own"`
}
    77  
    78  func newCgroupV2(mountpoint, group string, useSystemd bool) (Cgroup, error) {
    79  	data, err := ioutil.ReadFile(filepath.Join(mountpoint, "cgroup.controllers"))
    80  	if err != nil {
    81  		return nil, err
    82  	}
    83  	cg := &cgroupV2{
    84  		Mountpoint:  mountpoint,
    85  		Path:        group,
    86  		Controllers: strings.Fields(string(data)),
    87  	}
    88  	if useSystemd {
    89  		return newCgroupV2Systemd(cg)
    90  	}
    91  	return cg, err
    92  }
    93  
    94  func (c *cgroupV2) createCgroupPaths() (bool, error) {
    95  	// setup all known controllers for the current subtree
    96  	// For example, given path /foo/bar and mount /sys/fs/cgroup, we need to write
    97  	// the controllers to:
    98  	//	* /sys/fs/cgroup/cgroup.subtree_control
    99  	//	* /sys/fs/cgroup/foo/cgroup.subtree_control
   100  	val := "+" + strings.Join(c.Controllers, " +")
   101  	elements := strings.Split(c.Path, "/")
   102  	current := c.Mountpoint
   103  	created := false
   104  
   105  	for i, e := range elements {
   106  		current = filepath.Join(current, e)
   107  		if i > 0 {
   108  			if err := os.Mkdir(current, 0o755); err != nil {
   109  				if !os.IsExist(err) {
   110  					return false, err
   111  				}
   112  			} else {
   113  				created = true
   114  				c.Own = append(c.Own, current)
   115  			}
   116  		}
   117  		// enable all known controllers for subtree
   118  		if i < len(elements)-1 {
   119  			if err := writeFile(filepath.Join(current, subtreeControl), []byte(val), 0700); err != nil {
   120  				return false, err
   121  			}
   122  		}
   123  	}
   124  	return created, nil
   125  }
   126  
   127  // Install creates and configures cgroups.
   128  func (c *cgroupV2) Install(res *specs.LinuxResources) error {
   129  	log.Debugf("Installing cgroup path %q", c.MakePath(""))
   130  	// Clean up partially created cgroups on error. Errors during cleanup itself
   131  	// are ignored.
   132  	clean := cleanup.Make(func() { _ = c.Uninstall() })
   133  	defer clean.Clean()
   134  
   135  	created, err := c.createCgroupPaths()
   136  	if err != nil {
   137  		return err
   138  	}
   139  	if created {
   140  		// If we created our final cgroup path then we can set the resources.
   141  		for controllerName, ctrlr := range controllers2 {
   142  			// First check if our controller is found in the system.
   143  			found := false
   144  			for _, knownController := range c.Controllers {
   145  				if controllerName == knownController {
   146  					found = true
   147  				}
   148  			}
   149  
   150  			// In case we don't have the controller.
   151  			if found {
   152  				if err := ctrlr.set(res, c.MakePath("")); err != nil {
   153  					return err
   154  				}
   155  				continue
   156  			}
   157  			if ctrlr.optional() {
   158  				if err := ctrlr.skip(res); err != nil {
   159  					return err
   160  				}
   161  			} else {
   162  				return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.MakePath(""))
   163  			}
   164  		}
   165  	}
   166  
   167  	clean.Release()
   168  	return nil
   169  }
   170  
   171  // Uninstall removes the settings done in Install(). If cgroup path already
   172  // existed when Install() was called, Uninstall is a noop.
   173  func (c *cgroupV2) Uninstall() error {
   174  	log.Debugf("Deleting cgroup %q", c.MakePath(""))
   175  
   176  	// If we try to remove the cgroup too soon after killing the sandbox we
   177  	// might get EBUSY, so we retry for a few seconds until it succeeds.
   178  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   179  	defer cancel()
   180  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
   181  
   182  	// Deletion must occur reverse order, because they may contain ancestors.
   183  	for i := len(c.Own) - 1; i >= 0; i-- {
   184  		current := c.Own[i]
   185  		log.Debugf("Removing cgroup for path=%q", current)
   186  
   187  		fn := func() error {
   188  			err := unix.Rmdir(current)
   189  			if os.IsNotExist(err) {
   190  				return nil
   191  			}
   192  			return err
   193  		}
   194  		if err := backoff.Retry(fn, b); err != nil {
   195  			return fmt.Errorf("removing cgroup path %q: %w", current, err)
   196  		}
   197  	}
   198  
   199  	return nil
   200  }
   201  
   202  // Join adds the current process to the all controllers. Returns function that
   203  // restores cgroup to the original state.
   204  func (c *cgroupV2) Join() (func(), error) {
   205  	// First save the current state so it can be restored.
   206  	paths, err := loadPaths("self")
   207  	if err != nil {
   208  		return nil, err
   209  	}
   210  	// Since this is unified, get the first path of current process's cgroup is
   211  	// enough.
   212  	undoPath := filepath.Join(c.Mountpoint, paths[cgroup2Key])
   213  
   214  	cu := cleanup.Make(func() {
   215  		log.Debugf("Restoring cgroup %q", undoPath)
   216  		// Writing the value 0 to a cgroup.procs file causes
   217  		// the writing process to be moved to the corresponding
   218  		// cgroup. - cgroups(7).
   219  		if err := setValue(undoPath, "cgroup.procs", "0"); err != nil {
   220  			log.Warningf("Error restoring cgroup %q: %v", undoPath, err)
   221  		}
   222  	})
   223  	defer cu.Clean()
   224  
   225  	// now join the cgroup
   226  	if err := setValue(c.MakePath(""), "cgroup.procs", "0"); err != nil {
   227  		return nil, err
   228  	}
   229  
   230  	return cu.Release(), nil
   231  }
   232  
   233  // CPUQuota returns the CFS CPU quota.
   234  func (c *cgroupV2) CPUQuota() (float64, error) {
   235  	cpuMax, err := getValue(c.MakePath(""), "cpu.max")
   236  	if err != nil {
   237  		return -1, err
   238  	}
   239  
   240  	return parseCPUQuota(cpuMax)
   241  }
   242  
   243  func parseCPUQuota(cpuMax string) (float64, error) {
   244  	data := strings.SplitN(strings.TrimSpace(cpuMax), " ", 2)
   245  	if len(data) != 2 {
   246  		return -1, fmt.Errorf("invalid cpu.max data %q", cpuMax)
   247  	}
   248  
   249  	// no cpu limit if quota is max
   250  	if data[0] == "max" {
   251  		return -1, nil
   252  	}
   253  
   254  	quota, err := strconv.ParseInt(data[0], 10, 64)
   255  	if err != nil {
   256  		return -1, err
   257  	}
   258  
   259  	period, err := strconv.ParseInt(data[1], 10, 64)
   260  	if err != nil {
   261  		return -1, err
   262  	}
   263  
   264  	if quota <= 0 || period <= 0 {
   265  		return -1, err
   266  	}
   267  	return float64(quota) / float64(period), nil
   268  
   269  }
   270  
   271  // CPUUsage returns the total CPU usage of the cgroup.
   272  func (c *cgroupV2) CPUUsage() (uint64, error) {
   273  	cpuStat, err := getValue(c.MakePath(""), "cpu.stat")
   274  	if err != nil {
   275  		return 0, err
   276  	}
   277  
   278  	sc := bufio.NewScanner(strings.NewReader(cpuStat))
   279  	for sc.Scan() {
   280  		key, value, err := parseKeyValue(sc.Text())
   281  		if err != nil {
   282  			return 0, err
   283  		}
   284  		if key == "usage_usec" {
   285  			return value, nil
   286  		}
   287  	}
   288  
   289  	return 0, nil
   290  }
   291  
   292  // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
   293  func (c *cgroupV2) NumCPU() (int, error) {
   294  	cpuset, err := getValue(c.MakePath(""), "cpuset.cpus.effective")
   295  	if err != nil {
   296  		return 0, err
   297  	}
   298  	return countCpuset(strings.TrimSpace(cpuset))
   299  }
   300  
   301  // MemoryLimit returns the memory limit.
   302  func (c *cgroupV2) MemoryLimit() (uint64, error) {
   303  	limStr, err := getValue(c.MakePath(""), "memory.max")
   304  	if err != nil {
   305  		return 0, err
   306  	}
   307  	limStr = strings.TrimSpace(limStr)
   308  	if limStr == "max" {
   309  		return math.MaxUint64, nil
   310  	}
   311  	return strconv.ParseUint(limStr, 10, 64)
   312  }
   313  
   314  // MakePath builds a path to the given controller.
   315  func (c *cgroupV2) MakePath(controllerName string) string {
   316  	return filepath.Join(c.Mountpoint, c.Path)
   317  }
   318  
// controllerv2 is the interface implemented by every cgroup v2 controller in
// controllers2. It extends controller (declared elsewhere in this package)
// with generateProperties, which translates the resources in spec into a
// list of dbus properties.
type controllerv2 interface {
	controller
	generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error)
}
   323  
// cpu2 is the controllerv2 registered under "cpu" in controllers2.
// NOTE(review): mandatory is declared elsewhere; presumably it marks the
// controller as non-optional — confirm.
type cpu2 struct {
	mandatory
}
   327  
   328  func (*cpu2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   329  	props := []dbus.Property{}
   330  	if spec == nil || spec.CPU == nil {
   331  		return props, nil
   332  	}
   333  	cpu := spec.CPU
   334  	if cpu.Shares != nil {
   335  		weight := convertCPUSharesToCgroupV2Value(*cpu.Shares)
   336  		if weight != 0 {
   337  			props = append(props, newProp("CPUWeight", weight))
   338  		}
   339  	}
   340  	var (
   341  		period uint64
   342  		quota  int64
   343  	)
   344  	if cpu.Period != nil {
   345  		period = *cpu.Period
   346  	}
   347  	if cpu.Quota != nil {
   348  		quota = *cpu.Quota
   349  	}
   350  	if period != 0 {
   351  		props = append(props, newProp("CPUQuotaPeriodUSec", period))
   352  	}
   353  	if quota != 0 || period != 0 {
   354  		// Corresponds to USEC_INFINITY in systemd.
   355  		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
   356  		if quota > 0 {
   357  			if period == 0 {
   358  				// Assume the default.
   359  				period = defaultPeriod
   360  			}
   361  			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to
   362  			// CPUQuota (integer percentage of CPU) internally. This means that if a
   363  			// fractional percent of CPU is indicated by spec.CPU.Quota, we need to
   364  			// round up to the nearest 10ms (1% of a second) such that child cgroups
   365  			// can set the cpu.cfs_quota_us they expect.
   366  			cpuQuotaPerSecUSec = uint64(quota*1000000) / period
   367  			if cpuQuotaPerSecUSec%10000 != 0 {
   368  				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
   369  			}
   370  		}
   371  		props = append(props, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
   372  	}
   373  	return props, nil
   374  }
   375  
   376  func (*cpu2) set(spec *specs.LinuxResources, path string) error {
   377  	if spec == nil || spec.CPU == nil {
   378  		return nil
   379  	}
   380  
   381  	if spec.CPU.Shares != nil {
   382  		weight := convertCPUSharesToCgroupV2Value(*spec.CPU.Shares)
   383  		if weight != 0 {
   384  			if err := setValue(path, "cpu.weight", strconv.FormatUint(weight, 10)); err != nil {
   385  				return err
   386  			}
   387  		}
   388  	}
   389  
   390  	if spec.CPU.Period != nil || spec.CPU.Quota != nil {
   391  		v := "max"
   392  		if spec.CPU.Quota != nil && *spec.CPU.Quota > 0 {
   393  			v = strconv.FormatInt(*spec.CPU.Quota, 10)
   394  		}
   395  
   396  		var period uint64
   397  		if spec.CPU.Period != nil && *spec.CPU.Period != 0 {
   398  			period = *spec.CPU.Period
   399  		} else {
   400  			period = defaultPeriod
   401  		}
   402  
   403  		v += " " + strconv.FormatUint(period, 10)
   404  		if err := setValue(path, "cpu.max", v); err != nil {
   405  			return err
   406  		}
   407  	}
   408  
   409  	return nil
   410  }
   411  
// cpuset2 is the controllerv2 registered under "cpuset" in controllers2.
// NOTE(review): mandatory is declared elsewhere; presumably it marks the
// controller as non-optional — confirm.
type cpuset2 struct {
	mandatory
}
   415  
   416  func (*cpuset2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   417  	props := []dbus.Property{}
   418  	if spec == nil || spec.CPU == nil {
   419  		return props, nil
   420  	}
   421  	cpu := spec.CPU
   422  	if cpu.Cpus == "" && cpu.Mems == "" {
   423  		return props, nil
   424  	}
   425  	cpus := cpu.Cpus
   426  	mems := cpu.Mems
   427  	if cpus != "" {
   428  		bits, err := RangeToBits(cpus)
   429  		if err != nil {
   430  			return nil, fmt.Errorf("%w: cpus=%q conversion error: %v", ErrBadResourceSpec, cpus, err)
   431  		}
   432  		props = append(props, newProp("AllowedCPUs", bits))
   433  	}
   434  	if mems != "" {
   435  		bits, err := RangeToBits(mems)
   436  		if err != nil {
   437  			return nil, fmt.Errorf("%w: mems=%q conversion error: %v", ErrBadResourceSpec, mems, err)
   438  		}
   439  		props = append(props, newProp("AllowedMemoryNodes", bits))
   440  	}
   441  	return props, nil
   442  }
   443  
   444  func (*cpuset2) set(spec *specs.LinuxResources, path string) error {
   445  	if spec == nil || spec.CPU == nil {
   446  		return nil
   447  	}
   448  
   449  	if spec.CPU.Cpus != "" {
   450  		if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil {
   451  			return err
   452  		}
   453  	}
   454  
   455  	if spec.CPU.Mems != "" {
   456  		if err := setValue(path, "cpuset.mems", spec.CPU.Mems); err != nil {
   457  			return err
   458  		}
   459  	}
   460  
   461  	return nil
   462  }
   463  
// memory2 is the controllerv2 registered under "memory" in controllers2.
// NOTE(review): mandatory is declared elsewhere; presumably it marks the
// controller as non-optional — confirm.
type memory2 struct {
	mandatory
}
   467  
   468  func (*memory2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   469  	props := []dbus.Property{}
   470  	if spec == nil || spec.Memory == nil {
   471  		return props, nil
   472  	}
   473  	mem := spec.Memory
   474  	if mem.Swap != nil {
   475  		if mem.Limit == nil {
   476  			return nil, ErrBadResourceSpec
   477  		}
   478  		swap, err := convertMemorySwapToCgroupV2Value(*mem.Swap, *mem.Limit)
   479  		if err != nil {
   480  			return nil, err
   481  		}
   482  		props = append(props, newProp("MemorySwapMax", uint64(swap)))
   483  	}
   484  	if mem.Limit != nil {
   485  		props = append(props, newProp("MemoryMax", uint64(*mem.Limit)))
   486  	}
   487  	if mem.Reservation != nil {
   488  		props = append(props, newProp("MemoryLow", uint64(*mem.Reservation)))
   489  	}
   490  	return props, nil
   491  }
   492  
   493  func (*memory2) set(spec *specs.LinuxResources, path string) error {
   494  	if spec == nil || spec.Memory == nil {
   495  		return nil
   496  	}
   497  
   498  	if spec.Memory.Swap != nil {
   499  		// in cgroup v2, we set memory and swap separately, but the spec specifies
   500  		// Swap field as memory+swap, so we need memory limit here to be set in
   501  		// order to get the correct swap value.
   502  		if spec.Memory.Limit == nil {
   503  			return errors.New("cgroup: Memory.Swap is set without Memory.Limit")
   504  		}
   505  
   506  		swap, err := convertMemorySwapToCgroupV2Value(*spec.Memory.Swap, *spec.Memory.Limit)
   507  		if err != nil {
   508  			return nil
   509  		}
   510  		swapStr := numToStr(swap)
   511  		// memory and memorySwap set to the same value -- disable swap
   512  		if swapStr == "" && swap == 0 && *spec.Memory.Swap > 0 {
   513  			swapStr = "0"
   514  		}
   515  		// never write empty string to `memory.swap.max`, it means set to 0.
   516  		if swapStr != "" {
   517  			if err := setValue(path, "memory.swap.max", swapStr); err != nil {
   518  				return err
   519  			}
   520  		}
   521  	}
   522  
   523  	if spec.Memory.Limit != nil {
   524  		if val := numToStr(*spec.Memory.Limit); val != "" {
   525  			if err := setValue(path, "memory.max", val); err != nil {
   526  				return err
   527  			}
   528  		}
   529  	}
   530  
   531  	if spec.Memory.Reservation != nil {
   532  		if val := numToStr(*spec.Memory.Reservation); val != "" {
   533  			if err := setValue(path, "memory.low", val); err != nil {
   534  				return err
   535  			}
   536  		}
   537  	}
   538  
   539  	return nil
   540  }
   541  
// pid2 is the controllerv2 registered under "pids" in controllers2.
// NOTE(review): mandatory is declared elsewhere; presumably it marks the
// controller as non-optional — confirm.
type pid2 struct {
	mandatory
}
   545  
   546  func (*pid2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   547  	if spec != nil && spec.Pids != nil {
   548  		return []dbus.Property{newProp("TasksMax", uint64(spec.Pids.Limit))}, nil
   549  	}
   550  	return []dbus.Property{}, nil
   551  }
   552  
   553  func (*pid2) set(spec *specs.LinuxResources, path string) error {
   554  	if spec == nil || spec.Pids == nil {
   555  		return nil
   556  	}
   557  
   558  	if val := numToStr(spec.Pids.Limit); val != "" {
   559  		return setValue(path, "pids.max", val)
   560  	}
   561  
   562  	return nil
   563  }
   564  
// io2 is the controllerv2 registered under "io" in controllers2.
// NOTE(review): mandatory is declared elsewhere; presumably it marks the
// controller as non-optional — confirm.
type io2 struct {
	mandatory
}
   568  
   569  func (*io2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
   570  	props := []dbus.Property{}
   571  	if spec == nil || spec.BlockIO == nil {
   572  		return props, nil
   573  	}
   574  	io := spec.BlockIO
   575  	if io != nil {
   576  		if io.Weight != nil && *io.Weight != 0 {
   577  			ioWeight := convertBlkIOToIOWeightValue(*io.Weight)
   578  			props = append(props, newProp("IOWeight", ioWeight))
   579  		}
   580  		for _, dev := range io.WeightDevice {
   581  			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
   582  			props = append(props, newProp("IODeviceWeight", val))
   583  		}
   584  		props = addIOProps(props, "IOReadBandwidthMax", io.ThrottleReadBpsDevice)
   585  		props = addIOProps(props, "IOWriteBandwidthMax", io.ThrottleWriteBpsDevice)
   586  		props = addIOProps(props, "IOReadIOPSMax", io.ThrottleReadIOPSDevice)
   587  		props = addIOProps(props, "IOWriteIOPSMax", io.ThrottleWriteIOPSDevice)
   588  	}
   589  	return props, nil
   590  }
   591  
   592  func (*io2) set(spec *specs.LinuxResources, path string) error {
   593  	if spec == nil || spec.BlockIO == nil {
   594  		return nil
   595  	}
   596  	blkio := spec.BlockIO
   597  
   598  	var (
   599  		err error
   600  		bfq *os.File
   601  	)
   602  
   603  	// If BFQ IO scheduler is available, use it.
   604  	if blkio.Weight != nil || len(blkio.WeightDevice) > 0 {
   605  		bfq, err = os.Open(filepath.Join(path, "io.bfq.weight"))
   606  		if err == nil {
   607  			defer bfq.Close()
   608  		} else if !os.IsNotExist(err) {
   609  			return err
   610  		}
   611  
   612  	}
   613  
   614  	if blkio.Weight != nil && *blkio.Weight != 0 {
   615  		if bfq != nil {
   616  			if _, err := bfq.WriteString(strconv.FormatUint(uint64(*blkio.Weight), 10)); err != nil {
   617  				return err
   618  			}
   619  		} else {
   620  			// bfq io scheduler is not available, fallback to io.weight with
   621  			// a conversion scheme
   622  			ioWeight := convertBlkIOToIOWeightValue(*blkio.Weight)
   623  			if err = setValue(path, "io.weight", strconv.FormatUint(ioWeight, 10)); err != nil {
   624  				return err
   625  			}
   626  		}
   627  	}
   628  
   629  	if bfqDeviceWeightSupported(bfq) {
   630  		// ignore leaf weight, does not apply to cgroupv2
   631  		for _, dev := range blkio.WeightDevice {
   632  			if dev.Weight != nil {
   633  				val := fmt.Sprintf("%d:%d %d\n", dev.Major, dev.Minor, *dev.Weight)
   634  				if _, err := bfq.WriteString(val); err != nil {
   635  					return fmt.Errorf("failed to set device weight %q: %w", val, err)
   636  				}
   637  			}
   638  		}
   639  	}
   640  
   641  	if err := setThrottle2(path, "rbps", blkio.ThrottleReadBpsDevice); err != nil {
   642  		return err
   643  	}
   644  
   645  	if err := setThrottle2(path, "wbps", blkio.ThrottleWriteBpsDevice); err != nil {
   646  		return err
   647  	}
   648  
   649  	if err := setThrottle2(path, "riops", blkio.ThrottleReadIOPSDevice); err != nil {
   650  		return err
   651  	}
   652  
   653  	if err := setThrottle2(path, "wiops", blkio.ThrottleWriteIOPSDevice); err != nil {
   654  		return err
   655  	}
   656  
   657  	return nil
   658  }
   659  
   660  func setThrottle2(path, name string, devs []specs.LinuxThrottleDevice) error {
   661  	for _, dev := range devs {
   662  		val := fmt.Sprintf("%d:%d %s=%d", dev.Major, dev.Minor, name, dev.Rate)
   663  		if err := setValue(path, "io.max", val); err != nil {
   664  			return err
   665  		}
   666  	}
   667  	return nil
   668  }
   669  
// hugeTLB2 is the controllerv2 registered under "hugetlb" in controllers2.
// Unlike the other controllers it does not embed mandatory; it provides its
// own optional and skip methods.
type hugeTLB2 struct {
}
   672  
// optional reports that the hugetlb controller may be missing from the
// system; Install then calls skip instead of failing.
func (*hugeTLB2) optional() bool {
	return true
}
   676  
   677  func (*hugeTLB2) skip(spec *specs.LinuxResources) error {
   678  	if spec != nil && len(spec.HugepageLimits) > 0 {
   679  		return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found")
   680  	}
   681  	return nil
   682  }
   683  
// generateProperties returns no properties and no error, whatever the spec.
// NOTE(review): presumably hugepage limits are not expressed as dbus
// properties — confirm.
func (*hugeTLB2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) {
	return nil, nil
}
   687  
   688  func (*hugeTLB2) set(spec *specs.LinuxResources, path string) error {
   689  	if spec == nil {
   690  		return nil
   691  	}
   692  	for _, limit := range spec.HugepageLimits {
   693  		name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize)
   694  		val := strconv.FormatUint(limit.Limit, 10)
   695  		if err := setValue(path, name, val); err != nil {
   696  			return err
   697  		}
   698  	}
   699  	return nil
   700  }
   701  
   702  // Since the OCI spec is designed for cgroup v1, in some cases
   703  // there is need to convert from the cgroup v1 configuration to cgroup v2
   704  // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
   705  // convert from [2-262144] to [1-10000]
   706  // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)"
   707  func convertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
   708  	if cpuShares == 0 {
   709  		return 0
   710  	}
   711  	return (1 + ((cpuShares-2)*9999)/262142)
   712  }
   713  
   714  // convertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
   715  // for use by cgroup v2 drivers. A conversion is needed since
   716  // Resources.MemorySwap is defined as memory+swap combined, while in cgroup v2
   717  // swap is a separate value.
   718  func convertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
   719  	// for compatibility with cgroup1 controller, set swap to unlimited in
   720  	// case the memory is set to unlimited, and swap is not explicitly set,
   721  	// treating the request as "set both memory and swap to unlimited".
   722  	if memory == -1 && memorySwap == 0 {
   723  		return -1, nil
   724  	}
   725  	if memorySwap == -1 || memorySwap == 0 {
   726  		// -1 is "max", 0 is "unset", so treat as is.
   727  		return memorySwap, nil
   728  	}
   729  	// sanity checks
   730  	if memory == 0 || memory == -1 {
   731  		return 0, errors.New("unable to set swap limit without memory limit")
   732  	}
   733  	if memory < 0 {
   734  		return 0, fmt.Errorf("invalid memory value: %d", memory)
   735  	}
   736  	if memorySwap < memory {
   737  		return 0, errors.New("memory+swap limit should be >= memory limit")
   738  	}
   739  
   740  	return memorySwap - memory, nil
   741  }
   742  
   743  // Since the OCI spec is designed for cgroup v1, in some cases
   744  // there is need to convert from the cgroup v1 configuration to cgroup v2
   745  // the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
   746  // convert linearly from [10-1000] to [1-10000]
   747  func convertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
   748  	if blkIoWeight == 0 {
   749  		return 0
   750  	}
   751  	return 1 + (uint64(blkIoWeight)-10)*9999/990
   752  }
   753  
   754  // numToStr converts an int64 value to a string for writing to a
   755  // cgroupv2 files with .min, .max, .low, or .high suffix.
   756  // The value of -1 is converted to "max" for cgroupv1 compatibility
   757  // (which used to write -1 to remove the limit).
   758  func numToStr(value int64) (ret string) {
   759  	switch {
   760  	case value == 0:
   761  		ret = ""
   762  	case value == -1:
   763  		ret = "max"
   764  	default:
   765  		ret = strconv.FormatInt(value, 10)
   766  	}
   767  	return ret
   768  }
   769  
   770  // bfqDeviceWeightSupported checks for per-device BFQ weight support (added
   771  // in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
   772  func bfqDeviceWeightSupported(bfq *os.File) bool {
   773  	if bfq == nil {
   774  		return false
   775  	}
   776  
   777  	if _, err := bfq.Seek(0, 0); err != nil {
   778  		return false
   779  	}
   780  
   781  	buf := make([]byte, 32)
   782  	if _, err := bfq.Read(buf); err != nil {
   783  		return false
   784  	}
   785  	// If only a single number (default weight) if read back, we have older
   786  	// kernel.
   787  	_, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
   788  	return err != nil
   789  }
   790  
   791  // parseKeyValue parses a space-separated "name value" kind of cgroup
   792  // parameter and returns its key as a string, and its value as uint64
   793  // (ParseUint is used to convert the value). For example,
   794  // "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
   795  func parseKeyValue(t string) (string, uint64, error) {
   796  	parts := strings.SplitN(t, " ", 3)
   797  	if len(parts) != 2 {
   798  		return "", 0, fmt.Errorf("line %q is not in key value format", t)
   799  	}
   800  
   801  	value, err := parseUint(parts[1], 10, 64)
   802  	if err != nil {
   803  		return "", 0, err
   804  	}
   805  
   806  	return parts[0], value, nil
   807  }
   808  
   809  // parseUint converts a string to an uint64 integer.
   810  // Negative values are returned at zero as, due to kernel bugs,
   811  // some of the memory cgroup stats can be negative.
   812  func parseUint(s string, base, bitSize int) (uint64, error) {
   813  	value, err := strconv.ParseUint(s, base, bitSize)
   814  	if err != nil {
   815  		intValue, intErr := strconv.ParseInt(s, base, bitSize)
   816  		// 1. Handle negative values greater than MinInt64 (and)
   817  		// 2. Handle negative values lesser than MinInt64
   818  		if intErr == nil && intValue < 0 {
   819  			return 0, nil
   820  		} else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 {
   821  			return 0, nil
   822  		}
   823  
   824  		return value, err
   825  	}
   826  
   827  	return value, nil
   828  }
   829  
   830  // RangeToBits converts a text representation of a CPU mask (as written to
   831  // or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
   832  // with the corresponding bits set (as consumed by systemd over dbus as
   833  // AllowedCPUs/AllowedMemoryNodes unit property value).
   834  // Copied from runc.
   835  func RangeToBits(str string) ([]byte, error) {
   836  	bits := &big.Int{}
   837  
   838  	for _, r := range strings.Split(str, ",") {
   839  		// allow extra spaces around
   840  		r = strings.TrimSpace(r)
   841  		// allow empty elements (extra commas)
   842  		if r == "" {
   843  			continue
   844  		}
   845  		ranges := strings.SplitN(r, "-", 2)
   846  		if len(ranges) > 1 {
   847  			start, err := strconv.ParseUint(ranges[0], 10, 32)
   848  			if err != nil {
   849  				return nil, err
   850  			}
   851  			end, err := strconv.ParseUint(ranges[1], 10, 32)
   852  			if err != nil {
   853  				return nil, err
   854  			}
   855  			if start > end {
   856  				return nil, errors.New("invalid range: " + r)
   857  			}
   858  			for i := start; i <= end; i++ {
   859  				bits.SetBit(bits, int(i), 1)
   860  			}
   861  		} else {
   862  			val, err := strconv.ParseUint(ranges[0], 10, 32)
   863  			if err != nil {
   864  				return nil, err
   865  			}
   866  			bits.SetBit(bits, int(val), 1)
   867  		}
   868  	}
   869  
   870  	ret := bits.Bytes()
   871  	if len(ret) == 0 {
   872  		// do not allow empty values
   873  		return nil, errors.New("empty value")
   874  	}
   875  	return ret, nil
   876  }