github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/cgroup/systemd.go (about)

     1  // Copyright 2022 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cgroup
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"os"
    22  	"path/filepath"
    23  	"regexp"
    24  	"strconv"
    25  	"strings"
    26  	"sync"
    27  	"time"
    28  
    29  	"github.com/MerlinKodo/gvisor/pkg/cleanup"
    30  	"github.com/MerlinKodo/gvisor/pkg/log"
    31  	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    32  	dbus "github.com/godbus/dbus/v5"
    33  	specs "github.com/opencontainers/runtime-spec/specs-go"
    34  )
    35  
    36  var (
    37  	// ErrBadResourceSpec indicates that a cgroupSystemd function was
    38  	// passed a specs.LinuxResources object that is impossible or illegal
    39  	// to process.
    40  	ErrBadResourceSpec = errors.New("misconfigured resource spec")
    41  	// ErrInvalidSlice indicates that the slice name passed via cgroup.Path is
    42  	// invalid.
    43  	ErrInvalidSlice = errors.New("invalid slice name")
    44  )
    45  
// cgroupSystemd represents a cgroupv2 managed by systemd.
//
// The embedded cgroupV2 supplies the mountpoint, path and controller list;
// the fields below describe the systemd scope unit that owns the cgroup. The
// unit name is derived from ScopePrefix and Name (see unitName).
type cgroupSystemd struct {
	cgroupV2
	// Name is the name of the systemd scope that controls the cgroups.
	Name string
	// Parent is the encapsulating slice.
	Parent string
	// ScopePrefix is the prefix for the scope name.
	ScopePrefix string

	// properties accumulates the unit properties gathered by Install; Join
	// passes them to systemd when it starts the transient unit.
	properties []systemdDbus.Property
	// dbusConn is the D-Bus connection to systemd. It is set by
	// newCgroupV2Systemd and replaced with a new connection by Join.
	dbusConn *systemdDbus.Conn
}
    59  
    60  func newCgroupV2Systemd(cgv2 *cgroupV2) (*cgroupSystemd, error) {
    61  	if !isRunningSystemd() {
    62  		return nil, fmt.Errorf("systemd not running on host")
    63  	}
    64  	ctx := context.Background()
    65  	cg := &cgroupSystemd{cgroupV2: *cgv2}
    66  	// Parse the path from expected "slice:prefix:name"
    67  	// for e.g. "system.slice:docker:1234"
    68  	parts := strings.Split(cg.Path, ":")
    69  	if len(parts) != 3 {
    70  		return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", cg.Path)
    71  	}
    72  	cg.Parent = parts[0]
    73  	cg.ScopePrefix = parts[1]
    74  	cg.Name = parts[2]
    75  	if err := validSlice(cg.Parent); err != nil {
    76  		return nil, fmt.Errorf("%w: %v", ErrInvalidGroupPath, err)
    77  	}
    78  	// Rewrite Path so that it is compatible with cgroupv2 methods.
    79  	cg.Path = filepath.Join(expandSlice(cg.Parent), cg.unitName())
    80  	conn, err := systemdDbus.NewWithContext(ctx)
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  	var version int
    85  	if version, err = systemdVersion(conn); err != nil {
    86  		return nil, fmt.Errorf("error parsing systemd version: %v", err)
    87  	}
    88  	if version < 244 {
    89  		return nil, fmt.Errorf("systemd version %d not supported, please upgrade to at least 244", version)
    90  	}
    91  	cg.dbusConn = conn
    92  	return cg, err
    93  }
    94  
    95  // Install configures the properties for a scope unit but does not start the
    96  // unit.
    97  func (c *cgroupSystemd) Install(res *specs.LinuxResources) error {
    98  	log.Debugf("Installing systemd cgroup resource controller under %v", c.Parent)
    99  	c.properties = append(c.properties, systemdDbus.PropSlice(c.Parent))
   100  	c.properties = append(c.properties, systemdDbus.PropDescription("Secure container "+c.Name))
   101  	pid := os.Getpid()
   102  	c.properties = append(c.properties, systemdDbus.PropPids(uint32(pid)))
   103  	// We always want proper accounting for the container for reporting resource
   104  	// usage.
   105  	c.addProp("MemoryAccounting", true)
   106  	c.addProp("CPUAccounting", true)
   107  	c.addProp("TasksAccounting", true)
   108  	c.addProp("IOAccounting", true)
   109  	// Delegate must be true so that the container can manage its own cgroups.
   110  	c.addProp("Delegate", true)
   111  	// For compatibility with runc.
   112  	c.addProp("DefaultDependencies", false)
   113  
   114  	for controllerName, ctrlr := range controllers2 {
   115  		// First check if our controller is found in the system.
   116  		found := false
   117  		for _, knownController := range c.Controllers {
   118  			if controllerName == knownController {
   119  				found = true
   120  			}
   121  		}
   122  		if found {
   123  			props, err := ctrlr.generateProperties(res)
   124  			if err != nil {
   125  				return err
   126  			}
   127  			c.properties = append(c.properties, props...)
   128  			continue
   129  		}
   130  		if ctrlr.optional() {
   131  			if err := ctrlr.skip(res); err != nil {
   132  				return err
   133  			}
   134  		} else {
   135  			return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.Path)
   136  		}
   137  	}
   138  	return nil
   139  }
   140  
   141  func (c *cgroupSystemd) unitName() string {
   142  	return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
   143  }
   144  
   145  // MakePath builds a path to the given controller.
   146  func (c *cgroupSystemd) MakePath(string) string {
   147  	fullSlicePath := expandSlice(c.Parent)
   148  	path := filepath.Join(c.Mountpoint, fullSlicePath, c.unitName())
   149  	return path
   150  }
   151  
   152  // Join implements Cgroup.Join.
   153  func (c *cgroupSystemd) Join() (func(), error) {
   154  	log.Debugf("Joining systemd cgroup %v", c.unitName())
   155  	timeout := 30 * time.Second
   156  	ctx := context.Background()
   157  	// Clean up partially created cgroups on error. Errors during cleanup itself
   158  	// are ignored.
   159  	clean := cleanup.Make(func() { _ = c.Uninstall() })
   160  	defer clean.Clean()
   161  
   162  	conn, err := systemdDbus.NewWithContext(ctx)
   163  	if err != nil {
   164  		return nil, err
   165  	}
   166  	c.dbusConn = conn
   167  	unitName := c.unitName()
   168  	statusChan := make(chan string)
   169  	timedCtx, cancel := context.WithTimeout(ctx, timeout)
   170  	defer cancel()
   171  	if _, err := c.dbusConn.StartTransientUnitContext(timedCtx, unitName, "replace", c.properties, statusChan); err == nil {
   172  		s := <-statusChan
   173  		close(statusChan)
   174  		switch s {
   175  		case "done":
   176  		// All cases that are not "done" according to the dbus package.
   177  		case "cancelled", "timeout", "failed", "dependency", "skipped":
   178  			c.dbusConn.ResetFailedUnitContext(ctx, unitName)
   179  			return nil, fmt.Errorf("error creating systemd unit `%s`: got %s", unitName, s)
   180  		default:
   181  			c.dbusConn.ResetFailedUnitContext(ctx, unitName)
   182  			return nil, fmt.Errorf("unknown job completion status %q", s)
   183  		}
   184  	} else if unitAlreadyExists(err) {
   185  		return clean.Release(), nil
   186  	} else {
   187  		return nil, fmt.Errorf("systemd error: %v", err)
   188  	}
   189  	if _, err = c.createCgroupPaths(); err != nil {
   190  		return nil, err
   191  	}
   192  	return clean.Release(), nil
   193  }
   194  
   195  // unitAlreadyExists returns true if the error is that a systemd unit already
   196  // exists.
   197  func unitAlreadyExists(err error) bool {
   198  	if err != nil {
   199  		var derr dbus.Error
   200  		if errors.As(err, &derr) {
   201  			return strings.Contains(derr.Name, "org.freedesktop.systemd1.UnitExists")
   202  		}
   203  	}
   204  	return false
   205  }
   206  
   207  // systemd represents slice hierarchy using `-`, so we need to follow suit when
   208  // generating the path of slice. Essentially, test-a-b.slice becomes
   209  // /test.slice/test-a.slice/test-a-b.slice.
   210  func expandSlice(slice string) string {
   211  	var path, prefix string
   212  	suffix := ".slice"
   213  	sliceName := strings.TrimSuffix(slice, suffix)
   214  	// If input was -.slice, we should just return root now.
   215  	if sliceName == "-" {
   216  		return "/"
   217  	}
   218  	for _, component := range strings.Split(sliceName, "-") {
   219  		// Append the component to the path and to the prefix.
   220  		path += "/" + prefix + component + suffix
   221  		prefix += component + "-"
   222  	}
   223  	return path
   224  }
   225  
   226  func validSlice(slice string) error {
   227  	suffix := ".slice"
   228  	// Name has to end with ".slice", but can't be just ".slice".
   229  	if slice == suffix || !strings.HasSuffix(slice, suffix) {
   230  		return fmt.Errorf("%w: %s", ErrInvalidSlice, slice)
   231  	}
   232  
   233  	// Path-separators are not allowed.
   234  	if strings.Contains(slice, "/") {
   235  		return fmt.Errorf("%w: %s", ErrInvalidSlice, slice)
   236  	}
   237  
   238  	sliceName := strings.TrimSuffix(slice, suffix)
   239  	// If input was -.slice, we should just return root now.
   240  	if sliceName == "-" {
   241  		return nil
   242  	}
   243  	for _, component := range strings.Split(sliceName, "-") {
   244  		// test--a.slice isn't permitted, nor is -test.slice.
   245  		if component == "" {
   246  			return fmt.Errorf("%w: %s", ErrInvalidSlice, slice)
   247  		}
   248  	}
   249  	return nil
   250  }
   251  
   252  var systemdCheck struct {
   253  	once  sync.Once
   254  	cache bool
   255  }
   256  
   257  func isRunningSystemd() bool {
   258  	systemdCheck.once.Do(func() {
   259  		fi, err := os.Lstat("/run/systemd/system")
   260  		systemdCheck.cache = err == nil && fi.IsDir()
   261  	})
   262  	return systemdCheck.cache
   263  }
   264  
   265  func systemdVersion(conn *systemdDbus.Conn) (int, error) {
   266  	vStr, err := conn.GetManagerProperty("Version")
   267  	if err != nil {
   268  		return -1, errors.New("unable to get systemd version")
   269  	}
   270  	// vStr should be of the form:
   271  	// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
   272  	// The result for all of the above should be 245.
   273  	// Thus, we unconditionally remove the "v" prefix
   274  	// and then match on the first integer we can grab.
   275  	re := regexp.MustCompile(`v?([0-9]+)`)
   276  	matches := re.FindStringSubmatch(vStr)
   277  	if len(matches) < 2 {
   278  		return -1, fmt.Errorf("can't parse version %q: incorrect number of matches %d", vStr, len(matches))
   279  	}
   280  	version, err := strconv.Atoi(matches[1])
   281  	if err != nil {
   282  		return -1, fmt.Errorf("%w: can't parse version %q", err, vStr)
   283  	}
   284  	return version, nil
   285  }
   286  
   287  func addIOProps(props []systemdDbus.Property, name string, devs []specs.LinuxThrottleDevice) []systemdDbus.Property {
   288  	for _, dev := range devs {
   289  		val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate)
   290  		props = append(props, newProp(name, val))
   291  	}
   292  	return props
   293  }
   294  
   295  func (c *cgroupSystemd) addProp(name string, value any) {
   296  	if value == nil {
   297  		return
   298  	}
   299  	c.properties = append(c.properties, newProp(name, value))
   300  }
   301  
   302  func newProp(name string, units any) systemdDbus.Property {
   303  	return systemdDbus.Property{
   304  		Name:  name,
   305  		Value: dbus.MakeVariant(units),
   306  	}
   307  }
   308  
   309  // CreateMockSystemdCgroup returns a mock Cgroup configured for systemd. This
   310  // is useful for testing.
   311  func CreateMockSystemdCgroup() Cgroup {
   312  	return &cgroupSystemd{
   313  		Name:        "test",
   314  		ScopePrefix: "runsc",
   315  		Parent:      "system.slice",
   316  		cgroupV2: cgroupV2{
   317  			Mountpoint: "/sys/fs/cgroup",
   318  			Path:       "/a/random/path",
   319  		},
   320  	}
   321  }