gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/cgroup/cgroup.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package cgroup provides an interface to read and write configuration to
    16  // cgroup.
    17  package cgroup
    18  
    19  import (
    20  	"bufio"
    21  	"context"
    22  	"encoding/json"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"io/ioutil"
    27  	"os"
    28  	"path/filepath"
    29  	"strconv"
    30  	"strings"
    31  	"time"
    32  
    33  	"github.com/cenkalti/backoff"
    34  	specs "github.com/opencontainers/runtime-spec/specs-go"
    35  	"golang.org/x/sync/errgroup"
    36  	"golang.org/x/sys/unix"
    37  	"gvisor.dev/gvisor/pkg/cleanup"
    38  	"gvisor.dev/gvisor/pkg/log"
    39  )
    40  
const (
	// cgroupv1FsName and cgroupv2FsName are the filesystem type names that
	// are matched against the fs-type field of mountinfo entries.
	cgroupv1FsName = "cgroup"
	cgroupv2FsName = "cgroup2"

	// procRoot is the procfs root this module uses.
	procRoot = "/proc"

	// cgroupRoot is the cgroupfs root this module uses.
	cgroupRoot = "/sys/fs/cgroup"
)
    51  
// controllers maps a cgroup controller name to the handler used to configure
// it. Controller names that are not in this map are discarded when cgroup
// paths are loaded.
var controllers = map[string]controller{
	"blkio":    &blockIO{},
	"cpu":      &cpu{},
	"cpuset":   &cpuSet{},
	"hugetlb":  &hugeTLB{},
	"memory":   &memory{},
	"net_cls":  &networkClass{},
	"net_prio": &networkPrio{},
	"pids":     &pids{},

	// These controllers either don't have anything in the OCI spec or are
	// irrelevant for a sandbox.
	"cpuacct":    &noop{},
	"devices":    &noop{},
	"freezer":    &noop{},
	"perf_event": &noop{},
	"rdma":       &noop{},
	"systemd":    &noop{},
}
    71  
    72  // IsOnlyV2 checks whether cgroups V2 is enabled and V1 is not.
    73  func IsOnlyV2() bool {
    74  	var stat unix.Statfs_t
    75  	if err := unix.Statfs(cgroupRoot, &stat); err != nil {
    76  		// It's not used for anything important, assume not V2 on failure.
    77  		return false
    78  	}
    79  	return stat.Type == unix.CGROUP2_SUPER_MAGIC
    80  }
    81  
    82  func setOptionalValueInt(path, name string, val *int64) error {
    83  	if val == nil || *val == 0 {
    84  		return nil
    85  	}
    86  	str := strconv.FormatInt(*val, 10)
    87  	return setValue(path, name, str)
    88  }
    89  
    90  func setOptionalValueUint(path, name string, val *uint64) error {
    91  	if val == nil || *val == 0 {
    92  		return nil
    93  	}
    94  	str := strconv.FormatUint(*val, 10)
    95  	return setValue(path, name, str)
    96  }
    97  
    98  func setOptionalValueUint32(path, name string, val *uint32) error {
    99  	if val == nil || *val == 0 {
   100  		return nil
   101  	}
   102  	str := strconv.FormatUint(uint64(*val), 10)
   103  	return setValue(path, name, str)
   104  }
   105  
   106  func setOptionalValueUint16(path, name string, val *uint16) error {
   107  	if val == nil || *val == 0 {
   108  		return nil
   109  	}
   110  	str := strconv.FormatUint(uint64(*val), 10)
   111  	return setValue(path, name, str)
   112  }
   113  
   114  func setValue(path, name, data string) error {
   115  	fullpath := filepath.Join(path, name)
   116  	log.Debugf("Setting %q to %q", fullpath, data)
   117  	return writeFile(fullpath, []byte(data), 0700)
   118  }
   119  
   120  // writeFile is similar to ioutil.WriteFile() but doesn't create the file if it
   121  // doesn't exist.
   122  func writeFile(path string, data []byte, perm os.FileMode) error {
   123  	f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, perm)
   124  	if err != nil {
   125  		return err
   126  	}
   127  	defer f.Close()
   128  
   129  	_, err = f.Write(data)
   130  	return err
   131  }
   132  
   133  func getValue(path, name string) (string, error) {
   134  	fullpath := filepath.Join(path, name)
   135  	out, err := ioutil.ReadFile(fullpath)
   136  	if err != nil {
   137  		return "", err
   138  	}
   139  	return string(out), nil
   140  }
   141  
   142  func getInt(path, name string) (int, error) {
   143  	s, err := getValue(path, name)
   144  	if err != nil {
   145  		return 0, err
   146  	}
   147  	return strconv.Atoi(strings.TrimSpace(s))
   148  }
   149  
   150  // fillFromAncestor sets the value of a cgroup file from the first ancestor
   151  // that has content. It does nothing if the file in 'path' has already been set.
   152  func fillFromAncestor(path string) (string, error) {
   153  	out, err := ioutil.ReadFile(path)
   154  	if err != nil {
   155  		return "", err
   156  	}
   157  	val := strings.TrimSpace(string(out))
   158  	if val != "" {
   159  		// File is set, stop here.
   160  		return val, nil
   161  	}
   162  
   163  	// File is not set, recurse to parent and then set here.
   164  	name := filepath.Base(path)
   165  	parent := filepath.Dir(filepath.Dir(path))
   166  	val, err = fillFromAncestor(filepath.Join(parent, name))
   167  	if err != nil {
   168  		return "", err
   169  	}
   170  
   171  	if err := writeFile(path, []byte(val), 0700); err != nil {
   172  		return "", nil
   173  	}
   174  	return val, nil
   175  }
   176  
   177  // countCpuset returns the number of CPU in a string formatted like:
   178  //
   179  //	"0-2,7,12-14  # bits 0, 1, 2, 7, 12, 13, and 14 set" - man 7 cpuset
   180  func countCpuset(cpuset string) (int, error) {
   181  	var count int
   182  	for _, p := range strings.Split(cpuset, ",") {
   183  		interval := strings.Split(p, "-")
   184  		switch len(interval) {
   185  		case 1:
   186  			if _, err := strconv.Atoi(interval[0]); err != nil {
   187  				return 0, err
   188  			}
   189  			count++
   190  
   191  		case 2:
   192  			start, err := strconv.Atoi(interval[0])
   193  			if err != nil {
   194  				return 0, err
   195  			}
   196  			end, err := strconv.Atoi(interval[1])
   197  			if err != nil {
   198  				return 0, err
   199  			}
   200  			if start < 0 || end < 0 || start > end {
   201  				return 0, fmt.Errorf("invalid cpuset: %q", p)
   202  			}
   203  			count += end - start + 1
   204  
   205  		default:
   206  			return 0, fmt.Errorf("invalid cpuset: %q", p)
   207  		}
   208  	}
   209  	return count, nil
   210  }
   211  
   212  // loadPaths loads cgroup paths for given 'pid', may be set to 'self'.
   213  func loadPaths(pid string) (map[string]string, error) {
   214  	procCgroup, err := os.Open(filepath.Join(procRoot, pid, "cgroup"))
   215  	if err != nil {
   216  		return nil, err
   217  	}
   218  	defer procCgroup.Close()
   219  
   220  	// Load mountinfo for the current process, because it's where cgroups is
   221  	// being accessed from.
   222  	mountinfo, err := os.Open(filepath.Join(procRoot, "self/mountinfo"))
   223  	if err != nil {
   224  		return nil, err
   225  	}
   226  	defer mountinfo.Close()
   227  
   228  	return loadPathsHelper(procCgroup, mountinfo, IsOnlyV2())
   229  }
   230  
   231  func loadPathsHelper(cgroup, mountinfo io.Reader, unified bool) (map[string]string, error) {
   232  	paths := make(map[string]string)
   233  
   234  	scanner := bufio.NewScanner(cgroup)
   235  	for scanner.Scan() {
   236  		// Format: ID:[name=]controller1,controller2:path
   237  		// Example: 2:cpu,cpuacct:/user.slice
   238  		tokens := strings.Split(scanner.Text(), ":")
   239  		if len(tokens) != 3 {
   240  			return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text())
   241  		}
   242  		if len(tokens[1]) == 0 && unified {
   243  			paths[cgroup2Key] = tokens[2]
   244  			continue
   245  		}
   246  		if len(tokens[1]) == 0 {
   247  			continue
   248  		}
   249  		for _, ctrlr := range strings.Split(tokens[1], ",") {
   250  			// Remove prefix for cgroups with no controller, eg. systemd.
   251  			ctrlr = strings.TrimPrefix(ctrlr, "name=")
   252  			// Discard unknown controllers.
   253  			if _, ok := controllers[ctrlr]; ok {
   254  				paths[ctrlr] = tokens[2]
   255  			}
   256  		}
   257  	}
   258  	if err := scanner.Err(); err != nil {
   259  		return nil, err
   260  	}
   261  
   262  	// For nested containers, in /proc/[pid]/cgroup we see paths from host,
   263  	// which don't exist in container, so recover the container paths here by
   264  	// double-checking with /proc/[pid]/mountinfo
   265  	mountScanner := bufio.NewScanner(mountinfo)
   266  	haveCg2Path := false
   267  	for mountScanner.Scan() {
   268  		// Format: ID parent major:minor root mount-point options opt-fields - fs-type source super-options
   269  		// Example: 39 32 0:34 / /sys/fs/cgroup/devices rw,noexec shared:18 - cgroup cgroup rw,devices
   270  		fields := strings.Fields(mountScanner.Text())
   271  		if len(fields) < 9 {
   272  			// Skip mounts that are not cgroup mounts.
   273  			continue
   274  		}
   275  		switch fields[len(fields)-3] {
   276  		case cgroupv1FsName:
   277  			// Cgroup controller type is in the super-options field.
   278  			superOptions := strings.Split(fields[len(fields)-1], ",")
   279  			for _, opt := range superOptions {
   280  				// Remove prefix for cgroups with no controller, eg. systemd.
   281  				opt = strings.TrimPrefix(opt, "name=")
   282  
   283  				// Only considers cgroup controllers that are registered, and skip other
   284  				// irrelevant options, e.g. rw.
   285  				if cgroupPath, ok := paths[opt]; ok {
   286  					rootDir := fields[3]
   287  					if rootDir != "/" {
   288  						// When cgroup is in submount, remove repeated path components from
   289  						// cgroup path to avoid duplicating them.
   290  						relCgroupPath, err := filepath.Rel(rootDir, cgroupPath)
   291  						if err != nil {
   292  							return nil, err
   293  						}
   294  						paths[opt] = relCgroupPath
   295  					}
   296  				}
   297  			}
   298  		case cgroupv2FsName:
   299  			if cgroupPath, ok := paths[cgroup2Key]; !haveCg2Path && ok {
   300  				root := fields[3]
   301  				relCgroupPath, err := filepath.Rel(root, cgroupPath)
   302  				if err != nil {
   303  					return nil, err
   304  				}
   305  				haveCg2Path = true
   306  				paths[cgroup2Key] = relCgroupPath
   307  			}
   308  		}
   309  	}
   310  	if err := mountScanner.Err(); err != nil {
   311  		return nil, err
   312  	}
   313  
   314  	return paths, nil
   315  }
   316  
// Cgroup represents a cgroup configuration.
type Cgroup interface {
	// Install creates and configures the cgroup according to 'res'.
	Install(res *specs.LinuxResources) error
	// Uninstall removes the settings done in Install().
	Uninstall() error
	// Join adds the current process to the cgroup. It returns a function that
	// restores the process to the cgroup it was in before.
	Join() (func(), error)
	// CPUQuota returns the CFS CPU quota.
	CPUQuota() (float64, error)
	// CPUUsage returns the total CPU usage of the cgroup.
	CPUUsage() (uint64, error)
	// NumCPU returns the number of CPUs configured for the cgroup.
	NumCPU() (int, error)
	// MemoryLimit returns the memory limit.
	MemoryLimit() (uint64, error)
	// MakePath builds a path to the given controller.
	MakePath(controllerName string) string
}
   328  
// cgroupV1 represents a group inside all controllers. For example:
//
//	Name='/foo/bar' maps to /sys/fs/cgroup/<controller>/foo/bar on
//	all controllers.
//
// If Name is relative, it uses the parent cgroup path to determine the
// location. For example:
//
//	Name='foo/bar' and Parents[ctrl]="/user.slice", then it will map to
//	/sys/fs/cgroup/<ctrl>/user.slice/foo/bar
type cgroupV1 struct {
	// Name is the cgroup path, either absolute or relative to Parents.
	Name string `json:"name"`
	// Parents maps a controller name to the cgroup path that Name is joined
	// to for that controller.
	Parents map[string]string `json:"parents"`
	// Own records the controllers whose directory was created by Install();
	// only those are removed by Uninstall().
	Own map[string]bool `json:"own"`
}
   344  
   345  // NewFromSpec creates a new Cgroup instance if the spec includes a cgroup path.
   346  // Returns nil otherwise. Cgroup paths are loaded based on the current process.
   347  // If useSystemd is true, the Cgroup will be created and managed with
   348  // systemd. This requires systemd (>=v244) to be running on the host and the
   349  // cgroup path to be in the form `slice:prefix:name`.
   350  func NewFromSpec(spec *specs.Spec, useSystemd bool) (Cgroup, error) {
   351  	if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
   352  		return nil, nil
   353  	}
   354  	return NewFromPath(spec.Linux.CgroupsPath, useSystemd)
   355  }
   356  
// NewFromPath creates a new Cgroup instance from the specified path. When the
// path is relative, it is resolved against the cgroup paths of the current
// process.
// If useSystemd is true, the Cgroup will be created and managed with
// systemd. This requires systemd (>=v244) to be running on the host and the
// cgroup path to be in the form `slice:prefix:name`.
func NewFromPath(cgroupsPath string, useSystemd bool) (Cgroup, error) {
	return new("self", cgroupsPath, useSystemd)
}
   365  
// NewFromPid loads cgroup for the given process. The cgroup path is left
// empty, so the result refers to the cgroup paths loaded for 'pid'.
// If useSystemd is true, the Cgroup will be created and managed with
// systemd. This requires systemd (>=v244) to be running on the host and the
// cgroup path to be in the form `slice:prefix:name`.
func NewFromPid(pid int, useSystemd bool) (Cgroup, error) {
	return new(strconv.Itoa(pid), "", useSystemd)
}
   373  
   374  func new(pid, cgroupsPath string, useSystemd bool) (Cgroup, error) {
   375  	var (
   376  		parents map[string]string
   377  		err     error
   378  		cg      Cgroup
   379  	)
   380  
   381  	// If path is relative, load cgroup paths for the process to build the
   382  	// relative paths.
   383  	if !filepath.IsAbs(cgroupsPath) {
   384  		parents, err = loadPaths(pid)
   385  		if err != nil {
   386  			return nil, fmt.Errorf("finding current cgroups: %w", err)
   387  		}
   388  	}
   389  
   390  	if IsOnlyV2() {
   391  		// The cgroupsPath is in a special `slice:prefix:name` format for systemd
   392  		// that should not be modified.
   393  		if p, ok := parents[cgroup2Key]; ok && !useSystemd {
   394  			// The cgroup of current pid will have tasks in it and we can't use
   395  			// that, instead, use the its parent which should not have tasks in it.
   396  			cgroupsPath = filepath.Join(filepath.Dir(p), cgroupsPath)
   397  		}
   398  		// Assume that for v2, cgroup is always mounted at cgroupRoot.
   399  		cg, err = newCgroupV2(cgroupRoot, cgroupsPath, useSystemd)
   400  		if err != nil {
   401  			return nil, err
   402  		}
   403  	} else {
   404  		cg = &cgroupV1{
   405  			Name:    cgroupsPath,
   406  			Parents: parents,
   407  			Own:     make(map[string]bool),
   408  		}
   409  	}
   410  	log.Debugf("New cgroup for pid: %s, %T: %+v", pid, cg, cg)
   411  	return cg, nil
   412  }
   413  
// CgroupJSON is a wrapper for Cgroup that can be encoded to JSON. The
// concrete type of Cgroup is recorded as the key of the encoded object, which
// is how UnmarshalJSON knows which implementation to decode into.
type CgroupJSON struct {
	Cgroup Cgroup
}
   418  
// cgroupJSONv1 is the JSON envelope used for a *cgroupV1.
type cgroupJSONv1 struct {
	Cgroup *cgroupV1 `json:"cgroupv1"`
}

// cgroupJSONv2 is the JSON envelope used for a *cgroupV2.
type cgroupJSONv2 struct {
	Cgroup *cgroupV2 `json:"cgroupv2"`
}

// cgroupJSONSystemd is the JSON envelope used for a *cgroupSystemd.
type cgroupJSONSystemd struct {
	Cgroup *cgroupSystemd `json:"cgroupsystemd"`
}

// cgroupJSONUnknown is the JSON envelope used when there is no cgroup to
// encode.
type cgroupJSONUnknown struct {
	Cgroup any `json:"cgroupunknown"`
}
   434  
   435  // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON
   436  func (c *CgroupJSON) UnmarshalJSON(data []byte) error {
   437  	m := map[string]json.RawMessage{}
   438  	if err := json.Unmarshal(data, &m); err != nil {
   439  		return err
   440  	}
   441  
   442  	var cg Cgroup
   443  	if rm, ok := m["cgroupv1"]; ok {
   444  		cg = &cgroupV1{}
   445  		if err := json.Unmarshal(rm, cg); err != nil {
   446  			return err
   447  		}
   448  	} else if rm, ok := m["cgroupv2"]; ok {
   449  		cg = &cgroupV2{}
   450  		if err := json.Unmarshal(rm, cg); err != nil {
   451  			return err
   452  		}
   453  	} else if rm, ok := m["cgroupsystemd"]; ok {
   454  		cg = &cgroupSystemd{}
   455  		if err := json.Unmarshal(rm, cg); err != nil {
   456  			return err
   457  		}
   458  	}
   459  	c.Cgroup = cg
   460  	return nil
   461  }
   462  
   463  // MarshalJSON implements json.Marshaler.MarshalJSON
   464  func (c *CgroupJSON) MarshalJSON() ([]byte, error) {
   465  	if c.Cgroup == nil {
   466  		return json.Marshal(cgroupJSONUnknown{})
   467  	}
   468  	switch c.Cgroup.(type) {
   469  	case *cgroupV1:
   470  		return json.Marshal(cgroupJSONv1{Cgroup: c.Cgroup.(*cgroupV1)})
   471  	case *cgroupV2:
   472  		return json.Marshal(cgroupJSONv2{Cgroup: c.Cgroup.(*cgroupV2)})
   473  	case *cgroupSystemd:
   474  		return json.Marshal(cgroupJSONSystemd{Cgroup: c.Cgroup.(*cgroupSystemd)})
   475  	}
   476  	return nil, nil
   477  }
   478  
   479  // Install creates and configures cgroups according to 'res'. If cgroup path
   480  // already exists, it means that the caller has already provided a
   481  // pre-configured cgroups, and 'res' is ignored.
   482  func (c *cgroupV1) Install(res *specs.LinuxResources) error {
   483  	log.Debugf("Installing cgroup path %q", c.Name)
   484  
   485  	// Clean up partially created cgroups on error. Errors during cleanup itself
   486  	// are ignored.
   487  	clean := cleanup.Make(func() { _ = c.Uninstall() })
   488  	defer clean.Clean()
   489  
   490  	// Controllers can be symlinks to a group of controllers (e.g. cpu,cpuacct).
   491  	// So first check what directories need to be created. Otherwise, when
   492  	// the directory for one of the controllers in a group is created, it will
   493  	// make it seem like the directory already existed and it's not owned by the
   494  	// other controllers in the group.
   495  	var missing []string
   496  	for key := range controllers {
   497  		path := c.MakePath(key)
   498  		if _, err := os.Stat(path); err != nil {
   499  			missing = append(missing, key)
   500  		} else {
   501  			log.Debugf("Using pre-created cgroup %q: %q", key, path)
   502  		}
   503  	}
   504  	for _, key := range missing {
   505  		ctrlr := controllers[key]
   506  
   507  		if skip, err := createController(c, key); skip && ctrlr.optional() {
   508  			if err := ctrlr.skip(res); err != nil {
   509  				return err
   510  			}
   511  			log.Infof("Skipping cgroup %q, err: %v", key, err)
   512  			continue
   513  		} else if err != nil {
   514  			return err
   515  		}
   516  
   517  		// Only set controllers that were created by me.
   518  		c.Own[key] = true
   519  		path := c.MakePath(key)
   520  		if err := ctrlr.set(res, path); err != nil {
   521  			return err
   522  		}
   523  	}
   524  	clean.Release()
   525  	return nil
   526  }
   527  
   528  // createController creates the controller directory, checking that the
   529  // controller is enabled in the system. It returns a boolean indicating whether
   530  // the controller should be skipped (e.g. controller is disabled). In case it
   531  // should be skipped, it also returns the error it got.
   532  func createController(c Cgroup, name string) (bool, error) {
   533  	ctrlrPath := filepath.Join(cgroupRoot, name)
   534  	if _, err := os.Stat(ctrlrPath); err != nil {
   535  		return os.IsNotExist(err), err
   536  	}
   537  
   538  	path := c.MakePath(name)
   539  	log.Debugf("Creating cgroup %q: %q", name, path)
   540  	if err := os.MkdirAll(path, 0755); err != nil {
   541  		return errors.Is(err, unix.EROFS), err
   542  	}
   543  	return false, nil
   544  }
   545  
   546  // Uninstall removes the settings done in Install(). If cgroup path already
   547  // existed when Install() was called, Uninstall is a noop.
   548  func (c *cgroupV1) Uninstall() error {
   549  	log.Debugf("Deleting cgroup %q", c.Name)
   550  	g, ctx := errgroup.WithContext(context.Background())
   551  	for key := range controllers {
   552  		if !c.Own[key] {
   553  			// cgroup is managed by caller, don't touch it.
   554  			continue
   555  		}
   556  		path := c.MakePath(key)
   557  		log.Debugf("Removing cgroup controller for key=%q path=%q", key, path)
   558  
   559  		// If we try to remove the cgroup too soon after killing the sandbox we
   560  		// might get EBUSY, so we retry for a few seconds until it succeeds.
   561  		ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
   562  		defer cancel()
   563  		b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
   564  		fn := func() error {
   565  			err := unix.Rmdir(path)
   566  			if os.IsNotExist(err) {
   567  				return nil
   568  			}
   569  			return err
   570  		}
   571  		// Run deletions in parallel to remove all directories even if there are
   572  		// failures/timeouts in other directories.
   573  		g.Go(func() error {
   574  			if err := backoff.Retry(fn, b); err != nil {
   575  				return fmt.Errorf("removing cgroup path %q: %w", path, err)
   576  			}
   577  			return nil
   578  		})
   579  	}
   580  	return g.Wait()
   581  }
   582  
   583  // Join adds the current process to the all controllers. Returns function that
   584  // restores cgroup to the original state.
   585  func (c *cgroupV1) Join() (func(), error) {
   586  	// First save the current state so it can be restored.
   587  	paths, err := loadPaths("self")
   588  	if err != nil {
   589  		return nil, err
   590  	}
   591  	var undoPaths []string
   592  	for ctrlr, path := range paths {
   593  		// Skip controllers we don't handle.
   594  		if _, ok := controllers[ctrlr]; ok {
   595  			fullPath := filepath.Join(cgroupRoot, ctrlr, path)
   596  			undoPaths = append(undoPaths, fullPath)
   597  		}
   598  	}
   599  
   600  	cu := cleanup.Make(func() {
   601  		for _, path := range undoPaths {
   602  			log.Debugf("Restoring cgroup %q", path)
   603  			// Writing the value 0 to a cgroup.procs file causes
   604  			// the writing process to be moved to the corresponding
   605  			// cgroup. - cgroups(7).
   606  			if err := setValue(path, "cgroup.procs", "0"); err != nil {
   607  				log.Warningf("Error restoring cgroup %q: %v", path, err)
   608  			}
   609  		}
   610  	})
   611  	defer cu.Clean()
   612  
   613  	// Now join the cgroups.
   614  	for key, ctrlr := range controllers {
   615  		path := c.MakePath(key)
   616  		log.Debugf("Joining cgroup %q", path)
   617  		// Writing the value 0 to a cgroup.procs file causes the writing process to
   618  		// be moved to the corresponding cgroup - cgroups(7).
   619  		if err := setValue(path, "cgroup.procs", "0"); err != nil {
   620  			if ctrlr.optional() && os.IsNotExist(err) {
   621  				continue
   622  			}
   623  			return nil, err
   624  		}
   625  	}
   626  	return cu.Release(), nil
   627  }
   628  
   629  // CPUQuota returns the CFS CPU quota.
   630  func (c *cgroupV1) CPUQuota() (float64, error) {
   631  	path := c.MakePath("cpu")
   632  	quota, err := getInt(path, "cpu.cfs_quota_us")
   633  	if err != nil {
   634  		return -1, err
   635  	}
   636  	period, err := getInt(path, "cpu.cfs_period_us")
   637  	if err != nil {
   638  		return -1, err
   639  	}
   640  	if quota <= 0 || period <= 0 {
   641  		return -1, err
   642  	}
   643  	return float64(quota) / float64(period), nil
   644  }
   645  
   646  // CPUUsage returns the total CPU usage of the cgroup in nanoseconds.
   647  func (c *cgroupV1) CPUUsage() (uint64, error) {
   648  	path := c.MakePath("cpuacct")
   649  	usage, err := getValue(path, "cpuacct.usage")
   650  	if err != nil {
   651  		return 0, err
   652  	}
   653  	return strconv.ParseUint(strings.TrimSpace(usage), 10, 64)
   654  }
   655  
   656  // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
   657  func (c *cgroupV1) NumCPU() (int, error) {
   658  	path := c.MakePath("cpuset")
   659  	cpuset, err := getValue(path, "cpuset.cpus")
   660  	if err != nil {
   661  		return 0, err
   662  	}
   663  	return countCpuset(strings.TrimSpace(cpuset))
   664  }
   665  
   666  // MemoryLimit returns the memory limit.
   667  func (c *cgroupV1) MemoryLimit() (uint64, error) {
   668  	path := c.MakePath("memory")
   669  	limStr, err := getValue(path, "memory.limit_in_bytes")
   670  	if err != nil {
   671  		return 0, err
   672  	}
   673  	return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64)
   674  }
   675  
   676  // MakePath builds a path to the given controller.
   677  func (c *cgroupV1) MakePath(controllerName string) string {
   678  	path := c.Name
   679  	if parent, ok := c.Parents[controllerName]; ok {
   680  		path = filepath.Join(parent, c.Name)
   681  	}
   682  	return filepath.Join(cgroupRoot, controllerName, path)
   683  }
   684  
// controller configures a single cgroup controller from the resources in the
// OCI spec.
type controller interface {
	// optional controllers don't fail if not found.
	optional() bool
	// set applies resource limits to controller. The string argument is the
	// path of the cgroup directory for this controller.
	set(*specs.LinuxResources, string) error
	// skip is called when controller is not found to check if it can be safely
	// skipped or not based on the spec.
	skip(*specs.LinuxResources) error
}
   694  
   695  type noop struct{}
   696  
   697  func (n *noop) optional() bool {
   698  	return true
   699  }
   700  
   701  func (*noop) set(*specs.LinuxResources, string) error {
   702  	return nil
   703  }
   704  
   705  func (n *noop) skip(*specs.LinuxResources) error {
   706  	return nil
   707  }
   708  
// mandatory is embedded by controllers that must be present. It provides the
// optional() and skip() halves of the controller interface.
type mandatory struct{}

// optional always reports false.
func (*mandatory) optional() bool {
	return false
}

// skip panics: it is only meant to be called for controllers whose optional()
// reports true.
func (*mandatory) skip(*specs.LinuxResources) error {
	panic("cgroup controller is not optional")
}
   718  
   719  type memory struct {
   720  	mandatory
   721  }
   722  
   723  func (*memory) set(spec *specs.LinuxResources, path string) error {
   724  	if spec == nil || spec.Memory == nil {
   725  		return nil
   726  	}
   727  	if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil {
   728  		return err
   729  	}
   730  	if err := setOptionalValueInt(path, "memory.soft_limit_in_bytes", spec.Memory.Reservation); err != nil {
   731  		return err
   732  	}
   733  	if err := setOptionalValueInt(path, "memory.memsw.limit_in_bytes", spec.Memory.Swap); err != nil {
   734  		return err
   735  	}
   736  	if err := setOptionalValueInt(path, "memory.kmem.limit_in_bytes", spec.Memory.Kernel); err != nil {
   737  		return err
   738  	}
   739  	if err := setOptionalValueInt(path, "memory.kmem.tcp.limit_in_bytes", spec.Memory.KernelTCP); err != nil {
   740  		return err
   741  	}
   742  	if err := setOptionalValueUint(path, "memory.swappiness", spec.Memory.Swappiness); err != nil {
   743  		return err
   744  	}
   745  
   746  	if spec.Memory.DisableOOMKiller != nil && *spec.Memory.DisableOOMKiller {
   747  		if err := setValue(path, "memory.oom_control", "1"); err != nil {
   748  			return err
   749  		}
   750  	}
   751  	return nil
   752  }
   753  
   754  type cpu struct {
   755  	mandatory
   756  }
   757  
   758  func (*cpu) set(spec *specs.LinuxResources, path string) error {
   759  	if spec == nil || spec.CPU == nil {
   760  		return nil
   761  	}
   762  	if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil {
   763  		return err
   764  	}
   765  	if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil {
   766  		return err
   767  	}
   768  	if err := setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period); err != nil {
   769  		return err
   770  	}
   771  	if err := setOptionalValueUint(path, "cpu.rt_period_us", spec.CPU.RealtimePeriod); err != nil {
   772  		return err
   773  	}
   774  	return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime)
   775  }
   776  
   777  type cpuSet struct {
   778  	mandatory
   779  }
   780  
   781  func (*cpuSet) set(spec *specs.LinuxResources, path string) error {
   782  	// cpuset.cpus and mems are required fields, but are not set on a new cgroup.
   783  	// If not set in the spec, get it from one of the ancestors cgroup.
   784  	if spec == nil || spec.CPU == nil || spec.CPU.Cpus == "" {
   785  		if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil {
   786  			return err
   787  		}
   788  	} else {
   789  		if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil {
   790  			return err
   791  		}
   792  	}
   793  
   794  	if spec == nil || spec.CPU == nil || spec.CPU.Mems == "" {
   795  		_, err := fillFromAncestor(filepath.Join(path, "cpuset.mems"))
   796  		return err
   797  	}
   798  	return setValue(path, "cpuset.mems", spec.CPU.Mems)
   799  }
   800  
   801  type blockIO struct {
   802  	mandatory
   803  }
   804  
   805  func (*blockIO) set(spec *specs.LinuxResources, path string) error {
   806  	if spec == nil || spec.BlockIO == nil {
   807  		return nil
   808  	}
   809  
   810  	if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil {
   811  		return err
   812  	}
   813  	if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil {
   814  		return err
   815  	}
   816  
   817  	for _, dev := range spec.BlockIO.WeightDevice {
   818  		if dev.Weight != nil {
   819  			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
   820  			if err := setValue(path, "blkio.weight_device", val); err != nil {
   821  				return err
   822  			}
   823  		}
   824  		if dev.LeafWeight != nil {
   825  			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight)
   826  			if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
   827  				return err
   828  			}
   829  		}
   830  	}
   831  	if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil {
   832  		return err
   833  	}
   834  	if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil {
   835  		return err
   836  	}
   837  	if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil {
   838  		return err
   839  	}
   840  	return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice)
   841  }
   842  
   843  func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error {
   844  	for _, dev := range devs {
   845  		val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate)
   846  		if err := setValue(path, name, val); err != nil {
   847  			return err
   848  		}
   849  	}
   850  	return nil
   851  }
   852  
   853  type networkClass struct{}
   854  
   855  func (*networkClass) optional() bool {
   856  	return true
   857  }
   858  
   859  func (*networkClass) set(spec *specs.LinuxResources, path string) error {
   860  	if spec == nil || spec.Network == nil {
   861  		return nil
   862  	}
   863  	return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID)
   864  }
   865  
   866  func (*networkClass) skip(spec *specs.LinuxResources) error {
   867  	if spec != nil && spec.Network != nil && spec.Network.ClassID != nil {
   868  		return fmt.Errorf("Network.ClassID set but net_cls cgroup controller not found")
   869  	}
   870  	return nil
   871  }
   872  
   873  type networkPrio struct{}
   874  
   875  func (*networkPrio) optional() bool {
   876  	return true
   877  }
   878  
   879  func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
   880  	if spec == nil || spec.Network == nil {
   881  		return nil
   882  	}
   883  	for _, prio := range spec.Network.Priorities {
   884  		val := fmt.Sprintf("%s %d", prio.Name, prio.Priority)
   885  		if err := setValue(path, "net_prio.ifpriomap", val); err != nil {
   886  			return err
   887  		}
   888  	}
   889  	return nil
   890  }
   891  
   892  func (*networkPrio) skip(spec *specs.LinuxResources) error {
   893  	if spec != nil && spec.Network != nil && len(spec.Network.Priorities) > 0 {
   894  		return fmt.Errorf("Network.Priorities set but net_prio cgroup controller not found")
   895  	}
   896  	return nil
   897  }
   898  
   899  type pids struct{}
   900  
   901  func (*pids) optional() bool {
   902  	return true
   903  }
   904  
   905  func (*pids) skip(spec *specs.LinuxResources) error {
   906  	if spec != nil && spec.Pids != nil && spec.Pids.Limit > 0 {
   907  		return fmt.Errorf("Pids.Limit set but pids cgroup controller not found")
   908  	}
   909  	return nil
   910  }
   911  
   912  func (*pids) set(spec *specs.LinuxResources, path string) error {
   913  	if spec == nil || spec.Pids == nil || spec.Pids.Limit <= 0 {
   914  		return nil
   915  	}
   916  	val := strconv.FormatInt(spec.Pids.Limit, 10)
   917  	return setValue(path, "pids.max", val)
   918  }
   919  
   920  type hugeTLB struct{}
   921  
   922  func (*hugeTLB) optional() bool {
   923  	return true
   924  }
   925  
   926  func (*hugeTLB) skip(spec *specs.LinuxResources) error {
   927  	if spec != nil && len(spec.HugepageLimits) > 0 {
   928  		return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found")
   929  	}
   930  	return nil
   931  }
   932  
   933  func (*hugeTLB) set(spec *specs.LinuxResources, path string) error {
   934  	if spec == nil {
   935  		return nil
   936  	}
   937  	for _, limit := range spec.HugepageLimits {
   938  		name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize)
   939  		val := strconv.FormatUint(limit.Limit, 10)
   940  		if err := setValue(path, name, val); err != nil {
   941  			return err
   942  		}
   943  	}
   944  	return nil
   945  }