github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/systemd/v2.go (about)

     1  package systemd
     2  
     3  import (
     4  	"bufio"
     5  	"errors"
     6  	"fmt"
     7  	"math"
     8  	"os"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  
    14  	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    15  	securejoin "github.com/cyphar/filepath-securejoin"
    16  	"github.com/sirupsen/logrus"
    17  
    18  	"github.com/opencontainers/runc/libcontainer/cgroups"
    19  	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
    20  	"github.com/opencontainers/runc/libcontainer/configs"
    21  )
    22  
    23  const (
    24  	cpuIdleSupportedVersion = 252
    25  )
    26  
    27  type UnifiedManager struct {
    28  	mu      sync.Mutex
    29  	cgroups *configs.Cgroup
    30  	// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
    31  	path  string
    32  	dbus  *dbusConnManager
    33  	fsMgr cgroups.Manager
    34  }
    35  
    36  func NewUnifiedManager(config *configs.Cgroup, path string) (*UnifiedManager, error) {
    37  	m := &UnifiedManager{
    38  		cgroups: config,
    39  		path:    path,
    40  		dbus:    newDbusConnManager(config.Rootless),
    41  	}
    42  	if err := m.initPath(); err != nil {
    43  		return nil, err
    44  	}
    45  
    46  	fsMgr, err := fs2.NewManager(config, m.path)
    47  	if err != nil {
    48  		return nil, err
    49  	}
    50  	m.fsMgr = fsMgr
    51  
    52  	return m, nil
    53  }
    54  
    55  func shouldSetCPUIdle(cm *dbusConnManager, v string) bool {
    56  	// The only valid values for cpu.idle are 0 and 1. As it is
    57  	// not possible to directly set cpu.idle to 0 via systemd,
    58  	// ignore 0. Ignore other values as we'll error out later
    59  	// in Set() while calling fsMgr.Set().
    60  	return v == "1" && systemdVersion(cm) >= cpuIdleSupportedVersion
    61  }
    62  
    63  // unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
    64  // key/value map (where key is cgroupfs file name) to systemd unit properties.
    65  // This is on a best-effort basis, so the properties that are not known
    66  // (to this function and/or systemd) are ignored (but logged with "debug"
    67  // log level).
    68  //
    69  // For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
    70  //
    71  // For the list of systemd unit properties, see systemd.resource-control(5).
    72  func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
    73  	var err error
    74  
    75  	for k, v := range res {
    76  		if strings.Contains(k, "/") {
    77  			return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
    78  		}
    79  		if strings.IndexByte(k, '.') <= 0 {
    80  			return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
    81  		}
    82  		// Kernel is quite forgiving to extra whitespace
    83  		// around the value, and so should we.
    84  		v = strings.TrimSpace(v)
    85  		// Please keep cases in alphabetical order.
    86  		switch k {
    87  		case "cpu.idle":
    88  			if shouldSetCPUIdle(cm, v) {
    89  				// Setting CPUWeight to 0 tells systemd
    90  				// to set cpu.idle to 1.
    91  				props = append(props,
    92  					newProp("CPUWeight", uint64(0)))
    93  			}
    94  
    95  		case "cpu.max":
    96  			// value: quota [period]
    97  			quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
    98  			period := defCPUQuotaPeriod
    99  			sv := strings.Fields(v)
   100  			if len(sv) < 1 || len(sv) > 2 {
   101  				return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
   102  			}
   103  			// quota
   104  			if sv[0] != "max" {
   105  				quota, err = strconv.ParseInt(sv[0], 10, 64)
   106  				if err != nil {
   107  					return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
   108  				}
   109  			}
   110  			// period
   111  			if len(sv) == 2 {
   112  				period, err = strconv.ParseUint(sv[1], 10, 64)
   113  				if err != nil {
   114  					return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
   115  				}
   116  			}
   117  			addCpuQuota(cm, &props, quota, period)
   118  
   119  		case "cpu.weight":
   120  			if shouldSetCPUIdle(cm, strings.TrimSpace(res["cpu.idle"])) {
   121  				// Do not add duplicate CPUWeight property
   122  				// (see case "cpu.idle" above).
   123  				logrus.Warn("unable to apply both cpu.weight and cpu.idle to systemd, ignoring cpu.weight")
   124  				continue
   125  			}
   126  			num, err := strconv.ParseUint(v, 10, 64)
   127  			if err != nil {
   128  				return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
   129  			}
   130  			props = append(props,
   131  				newProp("CPUWeight", num))
   132  
   133  		case "cpuset.cpus", "cpuset.mems":
   134  			bits, err := RangeToBits(v)
   135  			if err != nil {
   136  				return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
   137  			}
   138  			m := map[string]string{
   139  				"cpuset.cpus": "AllowedCPUs",
   140  				"cpuset.mems": "AllowedMemoryNodes",
   141  			}
   142  			// systemd only supports these properties since v244
   143  			sdVer := systemdVersion(cm)
   144  			if sdVer >= 244 {
   145  				props = append(props,
   146  					newProp(m[k], bits))
   147  			} else {
   148  				logrus.Debugf("systemd v%d is too old to support %s"+
   149  					" (setting will still be applied to cgroupfs)",
   150  					sdVer, m[k])
   151  			}
   152  
   153  		case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
   154  			num := uint64(math.MaxUint64)
   155  			if v != "max" {
   156  				num, err = strconv.ParseUint(v, 10, 64)
   157  				if err != nil {
   158  					return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
   159  				}
   160  			}
   161  			m := map[string]string{
   162  				"memory.high":     "MemoryHigh",
   163  				"memory.low":      "MemoryLow",
   164  				"memory.min":      "MemoryMin",
   165  				"memory.max":      "MemoryMax",
   166  				"memory.swap.max": "MemorySwapMax",
   167  			}
   168  			props = append(props,
   169  				newProp(m[k], num))
   170  
   171  		case "pids.max":
   172  			num := uint64(math.MaxUint64)
   173  			if v != "max" {
   174  				var err error
   175  				num, err = strconv.ParseUint(v, 10, 64)
   176  				if err != nil {
   177  					return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
   178  				}
   179  			}
   180  			props = append(props,
   181  				newProp("TasksMax", num))
   182  
   183  		case "memory.oom.group":
   184  			// Setting this to 1 is roughly equivalent to OOMPolicy=kill
   185  			// (as per systemd.service(5) and
   186  			// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
   187  			// but it's not clear what to do if it is unset or set
   188  			// to 0 in runc update, as there are two other possible
   189  			// values for OOMPolicy (continue/stop).
   190  			fallthrough
   191  
   192  		default:
   193  			// Ignore the unknown resource here -- will still be
   194  			// applied in Set which calls fs2.Set.
   195  			logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
   196  		}
   197  	}
   198  
   199  	return props, nil
   200  }
   201  
   202  func genV2ResourcesProperties(dirPath string, r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
   203  	// We need this check before setting systemd properties, otherwise
   204  	// the container is OOM-killed and the systemd unit is removed
   205  	// before we get to fsMgr.Set().
   206  	if err := fs2.CheckMemoryUsage(dirPath, r); err != nil {
   207  		return nil, err
   208  	}
   209  
   210  	var properties []systemdDbus.Property
   211  
   212  	// NOTE: This is of questionable correctness because we insert our own
   213  	//       devices eBPF program later. Two programs with identical rules
   214  	//       aren't the end of the world, but it is a bit concerning. However
   215  	//       it's unclear if systemd removes all eBPF programs attached when
   216  	//       doing SetUnitProperties...
   217  	deviceProperties, err := generateDeviceProperties(r, cm)
   218  	if err != nil {
   219  		return nil, err
   220  	}
   221  	properties = append(properties, deviceProperties...)
   222  
   223  	if r.Memory != 0 {
   224  		properties = append(properties,
   225  			newProp("MemoryMax", uint64(r.Memory)))
   226  	}
   227  	if r.MemoryReservation != 0 {
   228  		properties = append(properties,
   229  			newProp("MemoryLow", uint64(r.MemoryReservation)))
   230  	}
   231  
   232  	swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
   233  	if err != nil {
   234  		return nil, err
   235  	}
   236  	if swap != 0 {
   237  		properties = append(properties,
   238  			newProp("MemorySwapMax", uint64(swap)))
   239  	}
   240  
   241  	idleSet := false
   242  	// The logic here is the same as in shouldSetCPUIdle.
   243  	if r.CPUIdle != nil && *r.CPUIdle == 1 && systemdVersion(cm) >= cpuIdleSupportedVersion {
   244  		properties = append(properties,
   245  			newProp("CPUWeight", uint64(0)))
   246  		idleSet = true
   247  	}
   248  	if r.CpuWeight != 0 {
   249  		if idleSet {
   250  			// Ignore CpuWeight if CPUIdle is already set.
   251  			logrus.Warn("unable to apply both CPUWeight and CpuIdle to systemd, ignoring CPUWeight")
   252  		} else {
   253  			properties = append(properties,
   254  				newProp("CPUWeight", r.CpuWeight))
   255  		}
   256  	}
   257  
   258  	addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
   259  
   260  	if r.PidsLimit > 0 || r.PidsLimit == -1 {
   261  		properties = append(properties,
   262  			newProp("TasksMax", uint64(r.PidsLimit)))
   263  	}
   264  
   265  	err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
   266  	if err != nil {
   267  		return nil, err
   268  	}
   269  
   270  	// ignore r.KernelMemory
   271  
   272  	// convert Resources.Unified map to systemd properties
   273  	if r.Unified != nil {
   274  		unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
   275  		if err != nil {
   276  			return nil, err
   277  		}
   278  		properties = append(properties, unifiedProps...)
   279  	}
   280  
   281  	return properties, nil
   282  }
   283  
   284  func (m *UnifiedManager) Apply(pid int) error {
   285  	var (
   286  		c          = m.cgroups
   287  		unitName   = getUnitName(c)
   288  		properties []systemdDbus.Property
   289  	)
   290  
   291  	slice := "system.slice"
   292  	if m.cgroups.Rootless {
   293  		slice = "user.slice"
   294  	}
   295  	if c.Parent != "" {
   296  		slice = c.Parent
   297  	}
   298  
   299  	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
   300  
   301  	if strings.HasSuffix(unitName, ".slice") {
   302  		// If we create a slice, the parent is defined via a Wants=.
   303  		properties = append(properties, systemdDbus.PropWants(slice))
   304  	} else {
   305  		// Otherwise it's a scope, which we put into a Slice=.
   306  		properties = append(properties, systemdDbus.PropSlice(slice))
   307  		// Assume scopes always support delegation (supported since systemd v218).
   308  		properties = append(properties, newProp("Delegate", true))
   309  	}
   310  
   311  	// only add pid if its valid, -1 is used w/ general slice creation.
   312  	if pid != -1 {
   313  		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
   314  	}
   315  
   316  	// Always enable accounting, this gets us the same behaviour as the fs implementation,
   317  	// plus the kernel has some problems with joining the memory cgroup at a later time.
   318  	properties = append(properties,
   319  		newProp("MemoryAccounting", true),
   320  		newProp("CPUAccounting", true),
   321  		newProp("IOAccounting", true),
   322  		newProp("TasksAccounting", true),
   323  	)
   324  
   325  	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
   326  	properties = append(properties,
   327  		newProp("DefaultDependencies", false))
   328  
   329  	properties = append(properties, c.SystemdProps...)
   330  
   331  	if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
   332  		return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
   333  	}
   334  
   335  	if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
   336  		return err
   337  	}
   338  
   339  	if c.OwnerUID != nil {
   340  		// The directory itself must be chowned.
   341  		err := os.Chown(m.path, *c.OwnerUID, -1)
   342  		if err != nil {
   343  			return err
   344  		}
   345  
   346  		filesToChown, err := cgroupFilesToChown()
   347  		if err != nil {
   348  			return err
   349  		}
   350  
   351  		for _, v := range filesToChown {
   352  			err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
   353  			// Some files might not be present.
   354  			if err != nil && !errors.Is(err, os.ErrNotExist) {
   355  				return err
   356  			}
   357  		}
   358  	}
   359  
   360  	return nil
   361  }
   362  
   363  // The kernel exposes a list of files that should be chowned to the delegate
   364  // uid in /sys/kernel/cgroup/delegate.  If the file is not present
   365  // (Linux < 4.15), use the initial values mentioned in cgroups(7).
   366  func cgroupFilesToChown() ([]string, error) {
   367  	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
   368  
   369  	f, err := os.Open(cgroupDelegateFile)
   370  	if err != nil {
   371  		return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
   372  	}
   373  	defer f.Close()
   374  
   375  	filesToChown := []string{}
   376  	scanner := bufio.NewScanner(f)
   377  	for scanner.Scan() {
   378  		filesToChown = append(filesToChown, scanner.Text())
   379  	}
   380  	if err := scanner.Err(); err != nil {
   381  		return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
   382  	}
   383  
   384  	return filesToChown, nil
   385  }
   386  
   387  func (m *UnifiedManager) Destroy() error {
   388  	m.mu.Lock()
   389  	defer m.mu.Unlock()
   390  
   391  	unitName := getUnitName(m.cgroups)
   392  	if err := stopUnit(m.dbus, unitName); err != nil {
   393  		return err
   394  	}
   395  
   396  	// systemd 239 do not remove sub-cgroups.
   397  	err := m.fsMgr.Destroy()
   398  	// fsMgr.Destroy has handled ErrNotExist
   399  	if err != nil {
   400  		return err
   401  	}
   402  
   403  	return nil
   404  }
   405  
   406  func (m *UnifiedManager) Path(_ string) string {
   407  	return m.path
   408  }
   409  
   410  // getSliceFull value is used in initPath.
   411  // The value is incompatible with systemdDbus.PropSlice.
   412  func (m *UnifiedManager) getSliceFull() (string, error) {
   413  	c := m.cgroups
   414  	slice := "system.slice"
   415  	if c.Rootless {
   416  		slice = "user.slice"
   417  	}
   418  	if c.Parent != "" {
   419  		var err error
   420  		slice, err = ExpandSlice(c.Parent)
   421  		if err != nil {
   422  			return "", err
   423  		}
   424  	}
   425  
   426  	if c.Rootless {
   427  		// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
   428  		managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
   429  		if err != nil {
   430  			return "", err
   431  		}
   432  		slice = filepath.Join(managerCG, slice)
   433  	}
   434  
   435  	// an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
   436  	// NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
   437  	return slice, nil
   438  }
   439  
   440  func (m *UnifiedManager) initPath() error {
   441  	if m.path != "" {
   442  		return nil
   443  	}
   444  
   445  	sliceFull, err := m.getSliceFull()
   446  	if err != nil {
   447  		return err
   448  	}
   449  
   450  	c := m.cgroups
   451  	path := filepath.Join(sliceFull, getUnitName(c))
   452  	path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
   453  	if err != nil {
   454  		return err
   455  	}
   456  
   457  	// an example of the final path in rootless:
   458  	// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
   459  	m.path = path
   460  
   461  	return nil
   462  }
   463  
   464  func (m *UnifiedManager) Freeze(state configs.FreezerState) error {
   465  	return m.fsMgr.Freeze(state)
   466  }
   467  
   468  func (m *UnifiedManager) GetPids() ([]int, error) {
   469  	return cgroups.GetPids(m.path)
   470  }
   471  
   472  func (m *UnifiedManager) GetAllPids() ([]int, error) {
   473  	return cgroups.GetAllPids(m.path)
   474  }
   475  
   476  func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) {
   477  	return m.fsMgr.GetStats()
   478  }
   479  
   480  func (m *UnifiedManager) Set(r *configs.Resources) error {
   481  	if r == nil {
   482  		return nil
   483  	}
   484  	properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus)
   485  	if err != nil {
   486  		return err
   487  	}
   488  
   489  	if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
   490  		return fmt.Errorf("unable to set unit properties: %w", err)
   491  	}
   492  
   493  	return m.fsMgr.Set(r)
   494  }
   495  
   496  func (m *UnifiedManager) GetPaths() map[string]string {
   497  	paths := make(map[string]string, 1)
   498  	paths[""] = m.path
   499  	return paths
   500  }
   501  
   502  func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) {
   503  	return m.cgroups, nil
   504  }
   505  
   506  func (m *UnifiedManager) GetFreezerState() (configs.FreezerState, error) {
   507  	return m.fsMgr.GetFreezerState()
   508  }
   509  
   510  func (m *UnifiedManager) Exists() bool {
   511  	return cgroups.PathExists(m.path)
   512  }
   513  
   514  func (m *UnifiedManager) OOMKillCount() (uint64, error) {
   515  	return m.fsMgr.OOMKillCount()
   516  }
   517  
   518  func (m *UnifiedManager) GetEffectiveCPUs() string {
   519  	return m.fsMgr.GetEffectiveCPUs()
   520  }