github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/devices/systemd.go (about)

     1  package devices
     2  
     3  import (
     4  	"bufio"
     5  	"fmt"
     6  	"os"
     7  	"strconv"
     8  	"strings"
     9  
    10  	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    11  	"github.com/godbus/dbus/v5"
    12  	"github.com/sirupsen/logrus"
    13  
    14  	"github.com/opencontainers/runc/libcontainer/configs"
    15  	"github.com/opencontainers/runc/libcontainer/devices"
    16  )
    17  
    18  // systemdProperties takes the configured device rules and generates a
    19  // corresponding set of systemd properties to configure the devices correctly.
    20  func systemdProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
    21  	if r.SkipDevices {
    22  		return nil, nil
    23  	}
    24  
    25  	properties := []systemdDbus.Property{
    26  		// Always run in the strictest white-list mode.
    27  		newProp("DevicePolicy", "strict"),
    28  		// Empty the DeviceAllow array before filling it.
    29  		newProp("DeviceAllow", []deviceAllowEntry{}),
    30  	}
    31  
    32  	// Figure out the set of rules.
    33  	configEmu := emulator{}
    34  	for _, rule := range r.Devices {
    35  		if err := configEmu.Apply(*rule); err != nil {
    36  			return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
    37  		}
    38  	}
    39  	// systemd doesn't support blacklists. So we log a warning, and tell
    40  	// systemd to act as a deny-all whitelist. This ruleset will be replaced
    41  	// with our normal fallback code. This may result in spurious errors, but
    42  	// the only other option is to error out here.
    43  	if configEmu.IsBlacklist() {
    44  		// However, if we're dealing with an allow-all rule then we can do it.
    45  		if configEmu.IsAllowAll() {
    46  			return allowAllDevices(), nil
    47  		}
    48  		logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
    49  		return properties, nil
    50  	}
    51  
    52  	// Now generate the set of rules we actually need to apply. Unlike the
    53  	// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
    54  	// whitelist which is the default for devices.Emulator.
    55  	finalRules, err := configEmu.Rules()
    56  	if err != nil {
    57  		return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
    58  	}
    59  	var deviceAllowList []deviceAllowEntry
    60  	for _, rule := range finalRules {
    61  		if !rule.Allow {
    62  			// Should never happen.
    63  			return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
    64  		}
    65  		switch rule.Type {
    66  		case devices.BlockDevice, devices.CharDevice:
    67  		default:
    68  			// Should never happen.
    69  			return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
    70  		}
    71  
    72  		entry := deviceAllowEntry{
    73  			Perms: string(rule.Permissions),
    74  		}
    75  
    76  		// systemd has a fairly odd (though understandable) syntax here, and
    77  		// because of the OCI configuration format we have to do quite a bit of
    78  		// trickery to convert things:
    79  		//
    80  		//  * Concrete rules with non-wildcard major/minor numbers have to use
    81  		//    /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
    82  		//    stat(2) on such paths to look up device properties, meaning we
    83  		//    cannot add whitelist rules for devices that don't exist. Since v240,
    84  		//    device properties are parsed from the path string.
    85  		//
    86  		//    However, path globbing is not supported for path-based rules so we
    87  		//    need to handle wildcards in some other manner.
    88  		//
    89  		//  * If systemd older than v240 is used, wildcard-minor rules
    90  		//    have to specify a "device group name" (the second column
    91  		//    in /proc/devices).
    92  		//
    93  		//  * Wildcard (major and minor) rules can just specify a glob with the
    94  		//    type ("char-*" or "block-*").
    95  		//
    96  		// The only type of rule we can't handle is wildcard-major rules, and
    97  		// so we'll give a warning in that case (note that the fallback code
    98  		// will insert any rules systemd couldn't handle). What amazing fun.
    99  
   100  		if rule.Major == devices.Wildcard {
   101  			// "_ *:n _" rules aren't supported by systemd.
   102  			if rule.Minor != devices.Wildcard {
   103  				logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
   104  				continue
   105  			}
   106  
   107  			// "_ *:* _" rules just wildcard everything.
   108  			prefix, err := groupPrefix(rule.Type)
   109  			if err != nil {
   110  				return nil, err
   111  			}
   112  			entry.Path = prefix + "*"
   113  		} else if rule.Minor == devices.Wildcard {
   114  			if sdVer >= 240 {
   115  				// systemd v240+ allows for {block,char}-MAJOR syntax.
   116  				prefix, err := groupPrefix(rule.Type)
   117  				if err != nil {
   118  					return nil, err
   119  				}
   120  				entry.Path = prefix + strconv.FormatInt(rule.Major, 10)
   121  			} else {
   122  				// For older systemd, "_ n:* _" rules require a device group from /proc/devices.
   123  				group, err := findDeviceGroup(rule.Type, rule.Major)
   124  				if err != nil {
   125  					return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
   126  				}
   127  				if group == "" {
   128  					// Couldn't find a group.
   129  					logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
   130  					continue
   131  				}
   132  				entry.Path = group
   133  			}
   134  		} else {
   135  			// "_ n:m _" rules are just a path in /dev/{block,char}/.
   136  			switch rule.Type {
   137  			case devices.BlockDevice:
   138  				entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
   139  			case devices.CharDevice:
   140  				entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
   141  			}
   142  			if sdVer < 240 {
   143  				// Old systemd versions use stat(2) on path to find out device major:minor
   144  				// numbers and type. If the path doesn't exist, it will not add the rule,
   145  				// emitting a warning instead.
   146  				// Since all of this logic is best-effort anyway (we manually set these
   147  				// rules separately to systemd) we can safely skip entries that don't
   148  				// have a corresponding path.
   149  				if _, err := os.Stat(entry.Path); err != nil {
   150  					continue
   151  				}
   152  			}
   153  		}
   154  		deviceAllowList = append(deviceAllowList, entry)
   155  	}
   156  
   157  	properties = append(properties, newProp("DeviceAllow", deviceAllowList))
   158  	return properties, nil
   159  }
   160  
   161  func newProp(name string, units interface{}) systemdDbus.Property {
   162  	return systemdDbus.Property{
   163  		Name:  name,
   164  		Value: dbus.MakeVariant(units),
   165  	}
   166  }
   167  
   168  func groupPrefix(ruleType devices.Type) (string, error) {
   169  	switch ruleType {
   170  	case devices.BlockDevice:
   171  		return "block-", nil
   172  	case devices.CharDevice:
   173  		return "char-", nil
   174  	default:
   175  		return "", fmt.Errorf("device type %v has no group prefix", ruleType)
   176  	}
   177  }
   178  
   179  // findDeviceGroup tries to find the device group name (as listed in
   180  // /proc/devices) with the type prefixed as required for DeviceAllow, for a
   181  // given (type, major) combination. If more than one device group exists, an
   182  // arbitrary one is chosen.
   183  func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
   184  	fh, err := os.Open("/proc/devices")
   185  	if err != nil {
   186  		return "", err
   187  	}
   188  	defer fh.Close()
   189  
   190  	prefix, err := groupPrefix(ruleType)
   191  	if err != nil {
   192  		return "", err
   193  	}
   194  	ruleMajorStr := strconv.FormatInt(ruleMajor, 10) + " "
   195  
   196  	scanner := bufio.NewScanner(fh)
   197  	var currentType devices.Type
   198  	for scanner.Scan() {
   199  		// We need to strip spaces because the first number is column-aligned.
   200  		line := strings.TrimSpace(scanner.Text())
   201  
   202  		// Handle the "header" lines.
   203  		switch line {
   204  		case "Block devices:":
   205  			currentType = devices.BlockDevice
   206  			continue
   207  		case "Character devices:":
   208  			currentType = devices.CharDevice
   209  			continue
   210  		case "":
   211  			continue
   212  		}
   213  
   214  		// Skip lines unrelated to our type.
   215  		if currentType != ruleType {
   216  			continue
   217  		}
   218  
   219  		group := strings.TrimPrefix(line, ruleMajorStr)
   220  		if len(group) < len(line) { // got it
   221  			return prefix + group, nil
   222  		}
   223  	}
   224  	if err := scanner.Err(); err != nil {
   225  		return "", fmt.Errorf("reading /proc/devices: %w", err)
   226  	}
   227  	// Couldn't find the device group.
   228  	return "", nil
   229  }
   230  
   231  // DeviceAllow is the dbus type "a(ss)" which means we need a struct
   232  // to represent it in Go.
   233  type deviceAllowEntry struct {
   234  	Path  string
   235  	Perms string
   236  }
   237  
   238  func allowAllDevices() []systemdDbus.Property {
   239  	// Setting mode to auto and removing all DeviceAllow rules
   240  	// results in allowing access to all devices.
   241  	return []systemdDbus.Property{
   242  		newProp("DevicePolicy", "auto"),
   243  		newProp("DeviceAllow", []deviceAllowEntry{}),
   244  	}
   245  }