github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/mergeCode/runc/libcontainer/cgroups/systemd/apply_systemd.go (about)

     1  // +build linux
     2  
     3  package systemd
     4  
     5  import (
     6  	"errors"
     7  	"fmt"
     8  	"io/ioutil"
     9  	"os"
    10  	"path/filepath"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	systemdDbus "github.com/coreos/go-systemd/dbus"
    16  	systemdUtil "github.com/coreos/go-systemd/util"
    17  	"github.com/godbus/dbus"
    18  	"github.com/opencontainers/runc/libcontainer/cgroups"
    19  	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
    20  	"github.com/opencontainers/runc/libcontainer/configs"
    21  )
    22  
// Manager is the systemd-backed cgroup manager for a single container.
type Manager struct {
	// mu is held by Destroy, GetPaths and GetStats while they access Paths.
	mu sync.Mutex
	// Cgroups is the cgroup configuration this manager operates on.
	Cgroups *configs.Cgroup
	// Paths maps a subsystem name to the cgroup directory recorded by Apply.
	Paths map[string]string
}
    28  
// subsystem is the set of operations this package needs from each cgroup
// controller implementation listed in the subsystems variable.
type subsystem interface {
	// Name returns the name of the subsystem.
	Name() string
	// GetStats fills 'stats' with the statistics of the cgroup under 'path'.
	GetStats(path string, stats *cgroups.Stats) error
	// Set applies the settings in 'cgroup' to the cgroup located at 'path'.
	Set(path string, cgroup *configs.Cgroup) error
}
    37  
// errSubsystemDoesNotExist is returned by subsystemSet.Get when no subsystem
// with the requested name is registered.
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
    39  
// subsystemSet is an ordered list of subsystems that can be searched by name.
type subsystemSet []subsystem
    41  
    42  func (s subsystemSet) Get(name string) (subsystem, error) {
    43  	for _, ss := range s {
    44  		if ss.Name() == name {
    45  			return ss, nil
    46  		}
    47  	}
    48  	return nil, errSubsystemDoesNotExist
    49  }
    50  
// subsystems lists every cgroup subsystem handled by this package. Apply,
// joinCgroups and Set iterate over it in the order given here, and Freeze and
// GetStats look entries up by name.
var subsystems = subsystemSet{
	&fs.CpusetGroup{},
	&fs.DevicesGroup{},
	&fs.MemoryGroup{},
	&fs.CpuGroup{},
	&fs.CpuacctGroup{},
	&fs.PidsGroup{},
	&fs.BlkioGroup{},
	&fs.HugetlbGroup{},
	&fs.PerfEventGroup{},
	&fs.FreezerGroup{},
	&fs.NetPrioGroup{},
	&fs.NetClsGroup{},
	// The named systemd hierarchy; joinCgroups leaves joining it to systemd.
	&fs.NameGroup{GroupName: "name=systemd"},
}
    66  
// Upper loop bounds used by UseSystemd when stopping its probe units. Each
// loop runs for i = 0..N inclusive (N+1 attempts) with a one millisecond
// sleep between attempts.
const (
	// testScopeWait bounds the attempts to stop the probe scope.
	testScopeWait = 4
	// testSliceWait bounds the attempts to stop the probe slice.
	testSliceWait = 4
)
    71  
// Package-level state initialised by UseSystemd the first time it manages to
// open a systemd D-Bus connection.
var (
	// connLock is held by UseSystemd while it creates theConn and runs the
	// capability probes.
	connLock sync.Mutex
	// theConn is the shared systemd D-Bus connection used by this package.
	theConn *systemdDbus.Conn
	// hasStartTransientUnit reports whether StartTransientUnit is available.
	hasStartTransientUnit bool
	// hasStartTransientSliceUnit reports whether a slice could be started as
	// a transient unit during probing.
	hasStartTransientSliceUnit bool
	// hasTransientDefaultDependencies reports whether DefaultDependencies may
	// be set on a transient scope.
	hasTransientDefaultDependencies bool
	// hasDelegate reports whether Delegate may be set on a transient scope.
	hasDelegate bool
)
    80  
    81  func newProp(name string, units interface{}) systemdDbus.Property {
    82  	return systemdDbus.Property{
    83  		Name:  name,
    84  		Value: dbus.MakeVariant(units),
    85  	}
    86  }
    87  
// UseSystemd reports whether systemd can be used to manage cgroups.
//
// It returns false when the host is not running systemd or when no D-Bus
// connection to systemd can be opened. On the first successful connection it
// also probes systemd for optional features and records the results in the
// package-level flags hasStartTransientUnit, hasTransientDefaultDependencies,
// hasDelegate and hasStartTransientSliceUnit. Later calls reuse the existing
// connection and simply return hasStartTransientUnit.
func UseSystemd() bool {
	if !systemdUtil.IsRunningSystemd() {
		return false
	}

	// Serialise connection setup and probing across concurrent callers.
	connLock.Lock()
	defer connLock.Unlock()

	if theConn == nil {
		var err error
		theConn, err = systemdDbus.New()
		if err != nil {
			return false
		}

		// Assume we have StartTransientUnit
		hasStartTransientUnit = true

		// But if we get UnknownMethod error we don't.
		// NOTE(review): the "invalid" mode presumably makes this call fail on
		// purpose so that no unit is created; only the error name matters.
		if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil {
			if dbusError, ok := err.(dbus.Error); ok {
				if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
					hasStartTransientUnit = false
					return hasStartTransientUnit
				}
			}
		}

		// Ensure the scope name we use doesn't exist. Use the Pid to
		// avoid collisions between multiple libcontainer users on a
		// single host.
		scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid())
		testScopeExists := true
		// Keep asking systemd to stop the scope until it answers NoSuchUnit,
		// giving up after testScopeWait+1 attempts.
		for i := 0; i <= testScopeWait; i++ {
			if _, err := theConn.StopUnit(scope, "replace", nil); err != nil {
				if dbusError, ok := err.(dbus.Error); ok {
					if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
						testScopeExists = false
						break
					}
				}
			}
			time.Sleep(time.Millisecond)
		}

		// Bail out if we can't kill this scope without testing for DefaultDependencies
		if testScopeExists {
			return hasStartTransientUnit
		}

		// Assume StartTransientUnit on a scope allows DefaultDependencies
		hasTransientDefaultDependencies = true
		ddf := newProp("DefaultDependencies", false)
		// Only a PropertyReadOnly error clears the flag; any other outcome
		// leaves the optimistic assumption in place.
		if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil {
			if dbusError, ok := err.(dbus.Error); ok {
				if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
					hasTransientDefaultDependencies = false
				}
			}
		}

		// Not critical because of the stop unit logic above.
		theConn.StopUnit(scope, "replace", nil)

		// Assume StartTransientUnit on a scope allows Delegate
		hasDelegate = true
		dl := newProp("Delegate", true)
		// Same rule as above: only PropertyReadOnly clears the flag.
		if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
			if dbusError, ok := err.(dbus.Error); ok {
				if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
					hasDelegate = false
				}
			}
		}

		// Assume we have the ability to start a transient unit as a slice
		// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
		// For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299
		hasStartTransientSliceUnit = true

		// To ensure simple clean-up, we create a slice off the root with no hierarchy
		slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid())
		// Any D-Bus error while starting the slice clears the flag.
		if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil {
			if _, ok := err.(dbus.Error); ok {
				hasStartTransientSliceUnit = false
			}
		}

		// Stop the probe slice. A NoSuchUnit answer means the slice was never
		// there, so the flag is cleared; a successful stop ends the loop; any
		// other error is retried after a short sleep.
		for i := 0; i <= testSliceWait; i++ {
			if _, err := theConn.StopUnit(slice, "replace", nil); err != nil {
				if dbusError, ok := err.(dbus.Error); ok {
					if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
						hasStartTransientSliceUnit = false
						break
					}
				}
			} else {
				break
			}
			time.Sleep(time.Millisecond)
		}

		// Not critical because of the stop unit logic above.
		// Final best-effort clean-up of both probe units; errors are ignored.
		theConn.StopUnit(scope, "replace", nil)
		theConn.StopUnit(slice, "replace", nil)
	}
	return hasStartTransientUnit
}
   196  
// Apply places pid into the cgroups described by m.Cgroups.
//
// When m.Cgroups.Paths is set, no systemd unit is created: pid is simply
// entered into the configured paths. Otherwise a transient systemd unit (a
// scope, or a slice when the cgroup name ends in ".slice") is started with
// properties derived from the configuration, pid is joined to the remaining
// cgroup hierarchies, and the resulting per-subsystem paths are stored in
// m.Paths.
func (m *Manager) Apply(pid int) error {
	var (
		c          = m.Cgroups
		unitName   = getUnitName(c)
		slice      = "system.slice"
		properties []systemdDbus.Property
	)

	if c.Paths != nil {
		paths := make(map[string]string)
		for name, path := range c.Paths {
			// The computed path is discarded; the call is only used to find
			// out whether the hierarchy for this subsystem can be resolved.
			_, err := getSubsystemPath(m.Cgroups, name)
			if err != nil {
				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
				if cgroups.IsNotFound(err) {
					continue
				}
				return err
			}
			paths[name] = path
		}
		m.Paths = paths
		return cgroups.EnterPid(m.Paths, pid)
	}

	// A configured parent replaces the default "system.slice".
	if c.Parent != "" {
		slice = c.Parent
	}

	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))

	// if we create a slice, the parent is defined via a Wants=
	if strings.HasSuffix(unitName, ".slice") {
		// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
		if !hasStartTransientSliceUnit {
			return fmt.Errorf("systemd version does not support ability to start a slice as transient unit")
		}
		properties = append(properties, systemdDbus.PropWants(slice))
	} else {
		// otherwise, we use Slice=
		properties = append(properties, systemdDbus.PropSlice(slice))
	}

	// only add pid if its valid, -1 is used w/ general slice creation.
	if pid != -1 {
		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
	}

	if hasDelegate {
		// This is only supported on systemd versions 218 and above.
		properties = append(properties, newProp("Delegate", true))
	}

	// Always enable accounting, this gets us the same behaviour as the fs implementation,
	// plus the kernel has some problems with joining the memory cgroup at a later time.
	properties = append(properties,
		newProp("MemoryAccounting", true),
		newProp("CPUAccounting", true),
		newProp("BlockIOAccounting", true))

	if hasTransientDefaultDependencies {
		properties = append(properties,
			newProp("DefaultDependencies", false))
	}

	// Resource limits are only passed to systemd when they are non-zero.
	if c.Resources.Memory != 0 {
		properties = append(properties,
			newProp("MemoryLimit", uint64(c.Resources.Memory)))
	}

	if c.Resources.CpuShares != 0 {
		properties = append(properties,
			newProp("CPUShares", uint64(c.Resources.CpuShares)))
	}

	if c.Resources.BlkioWeight != 0 {
		properties = append(properties,
			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
	}

	// We have to set kernel memory here, as we can't change it once
	// processes have been attached to the cgroup.
	if c.Resources.KernelMemory != 0 {
		if err := setKernelMemory(c); err != nil {
			return err
		}
	}

	// An "already exists" answer from systemd is tolerated; any other error
	// aborts.
	if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) {
		return err
	}

	if err := joinCgroups(c, pid); err != nil {
		return err
	}

	// Record the path of every subsystem whose hierarchy could be resolved.
	paths := make(map[string]string)
	for _, s := range subsystems {
		subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
		if err != nil {
			// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
			if cgroups.IsNotFound(err) {
				continue
			}
			return err
		}
		paths[s.Name()] = subsystemPath
	}
	m.Paths = paths
	return nil
}
   308  
   309  func (m *Manager) Destroy() error {
   310  	if m.Cgroups.Paths != nil {
   311  		return nil
   312  	}
   313  	m.mu.Lock()
   314  	defer m.mu.Unlock()
   315  	theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
   316  	if err := cgroups.RemovePaths(m.Paths); err != nil {
   317  		return err
   318  	}
   319  	m.Paths = make(map[string]string)
   320  	return nil
   321  }
   322  
   323  func (m *Manager) GetPaths() map[string]string {
   324  	m.mu.Lock()
   325  	paths := m.Paths
   326  	m.mu.Unlock()
   327  	return paths
   328  }
   329  
   330  func writeFile(dir, file, data string) error {
   331  	// Normally dir should not be empty, one case is that cgroup subsystem
   332  	// is not mounted, we will get empty dir, and we want it fail here.
   333  	if dir == "" {
   334  		return fmt.Errorf("no such directory for %s", file)
   335  	}
   336  	return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
   337  }
   338  
   339  func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
   340  	path, err := getSubsystemPath(c, subsystem)
   341  	if err != nil {
   342  		return "", err
   343  	}
   344  	if err := os.MkdirAll(path, 0755); err != nil {
   345  		return "", err
   346  	}
   347  	if err := cgroups.WriteCgroupProc(path, pid); err != nil {
   348  		return "", err
   349  	}
   350  	return path, nil
   351  }
   352  
   353  func joinCgroups(c *configs.Cgroup, pid int) error {
   354  	for _, sys := range subsystems {
   355  		name := sys.Name()
   356  		switch name {
   357  		case "name=systemd":
   358  			// let systemd handle this
   359  			break
   360  		case "cpuset":
   361  			path, err := getSubsystemPath(c, name)
   362  			if err != nil && !cgroups.IsNotFound(err) {
   363  				return err
   364  			}
   365  			s := &fs.CpusetGroup{}
   366  			if err := s.ApplyDir(path, c, pid); err != nil {
   367  				return err
   368  			}
   369  			break
   370  		default:
   371  			_, err := join(c, name, pid)
   372  			if err != nil {
   373  				// Even if it's `not found` error, we'll return err
   374  				// because devices cgroup is hard requirement for
   375  				// container security.
   376  				if name == "devices" {
   377  					return err
   378  				}
   379  				// For other subsystems, omit the `not found` error
   380  				// because they are optional.
   381  				if !cgroups.IsNotFound(err) {
   382  					return err
   383  				}
   384  			}
   385  		}
   386  	}
   387  
   388  	return nil
   389  }
   390  
   391  // systemd represents slice hierarchy using `-`, so we need to follow suit when
   392  // generating the path of slice. Essentially, test-a-b.slice becomes
   393  // test.slice/test-a.slice/test-a-b.slice.
   394  func ExpandSlice(slice string) (string, error) {
   395  	suffix := ".slice"
   396  	// Name has to end with ".slice", but can't be just ".slice".
   397  	if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
   398  		return "", fmt.Errorf("invalid slice name: %s", slice)
   399  	}
   400  
   401  	// Path-separators are not allowed.
   402  	if strings.Contains(slice, "/") {
   403  		return "", fmt.Errorf("invalid slice name: %s", slice)
   404  	}
   405  
   406  	var path, prefix string
   407  	sliceName := strings.TrimSuffix(slice, suffix)
   408  	// if input was -.slice, we should just return root now
   409  	if sliceName == "-" {
   410  		return "/", nil
   411  	}
   412  	for _, component := range strings.Split(sliceName, "-") {
   413  		// test--a.slice isn't permitted, nor is -test.slice.
   414  		if component == "" {
   415  			return "", fmt.Errorf("invalid slice name: %s", slice)
   416  		}
   417  
   418  		// Append the component to the path and to the prefix.
   419  		path += prefix + component + suffix + "/"
   420  		prefix += component + "-"
   421  	}
   422  
   423  	return path, nil
   424  }
   425  
   426  func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
   427  	mountpoint, err := cgroups.FindCgroupMountpoint(subsystem)
   428  	if err != nil {
   429  		return "", err
   430  	}
   431  
   432  	initPath, err := cgroups.GetInitCgroupDir(subsystem)
   433  	if err != nil {
   434  		return "", err
   435  	}
   436  	// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
   437  	initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
   438  
   439  	slice := "system.slice"
   440  	if c.Parent != "" {
   441  		slice = c.Parent
   442  	}
   443  
   444  	slice, err = ExpandSlice(slice)
   445  	if err != nil {
   446  		return "", err
   447  	}
   448  
   449  	return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
   450  }
   451  
   452  func (m *Manager) Freeze(state configs.FreezerState) error {
   453  	path, err := getSubsystemPath(m.Cgroups, "freezer")
   454  	if err != nil {
   455  		return err
   456  	}
   457  	prevState := m.Cgroups.Resources.Freezer
   458  	m.Cgroups.Resources.Freezer = state
   459  	freezer, err := subsystems.Get("freezer")
   460  	if err != nil {
   461  		return err
   462  	}
   463  	err = freezer.Set(path, m.Cgroups)
   464  	if err != nil {
   465  		m.Cgroups.Resources.Freezer = prevState
   466  		return err
   467  	}
   468  	return nil
   469  }
   470  
   471  func (m *Manager) GetPids() ([]int, error) {
   472  	path, err := getSubsystemPath(m.Cgroups, "devices")
   473  	if err != nil {
   474  		return nil, err
   475  	}
   476  	return cgroups.GetPids(path)
   477  }
   478  
   479  func (m *Manager) GetAllPids() ([]int, error) {
   480  	path, err := getSubsystemPath(m.Cgroups, "devices")
   481  	if err != nil {
   482  		return nil, err
   483  	}
   484  	return cgroups.GetAllPids(path)
   485  }
   486  
   487  func (m *Manager) GetStats() (*cgroups.Stats, error) {
   488  	m.mu.Lock()
   489  	defer m.mu.Unlock()
   490  	stats := cgroups.NewStats()
   491  	for name, path := range m.Paths {
   492  		sys, err := subsystems.Get(name)
   493  		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
   494  			continue
   495  		}
   496  		if err := sys.GetStats(path, stats); err != nil {
   497  			return nil, err
   498  		}
   499  	}
   500  
   501  	return stats, nil
   502  }
   503  
   504  func (m *Manager) Set(container *configs.Config) error {
   505  	// If Paths are set, then we are just joining cgroups paths
   506  	// and there is no need to set any values.
   507  	if m.Cgroups.Paths != nil {
   508  		return nil
   509  	}
   510  	for _, sys := range subsystems {
   511  		// Get the subsystem path, but don't error out for not found cgroups.
   512  		path, err := getSubsystemPath(container.Cgroups, sys.Name())
   513  		if err != nil && !cgroups.IsNotFound(err) {
   514  			return err
   515  		}
   516  
   517  		if err := sys.Set(path, container.Cgroups); err != nil {
   518  			return err
   519  		}
   520  	}
   521  
   522  	if m.Paths["cpu"] != "" {
   523  		if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
   524  			return err
   525  		}
   526  	}
   527  	return nil
   528  }
   529  
   530  func getUnitName(c *configs.Cgroup) string {
   531  	// by default, we create a scope unless the user explicitly asks for a slice.
   532  	if !strings.HasSuffix(c.Name, ".slice") {
   533  		return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
   534  	}
   535  	return c.Name
   536  }
   537  
   538  func setKernelMemory(c *configs.Cgroup) error {
   539  	path, err := getSubsystemPath(c, "memory")
   540  	if err != nil && !cgroups.IsNotFound(err) {
   541  		return err
   542  	}
   543  
   544  	if err := os.MkdirAll(path, 0755); err != nil {
   545  		return err
   546  	}
   547  	return fs.EnableKernelMemoryAccounting(path)
   548  }
   549  
   550  // isUnitExists returns true if the error is that a systemd unit already exists.
   551  func isUnitExists(err error) bool {
   552  	if err != nil {
   553  		if dbusError, ok := err.(dbus.Error); ok {
   554  			return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
   555  		}
   556  	}
   557  	return false
   558  }