github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/devices/devicefilter.go (about)

     1  // Implements creation of eBPF device filter program.
     2  //
     3  // Based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
     4  //
     5  // Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
     6  // agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
     7  package devices
     8  
     9  import (
    10  	"errors"
    11  	"fmt"
    12  	"math"
    13  	"strconv"
    14  
    15  	"github.com/cilium/ebpf/asm"
    16  	"github.com/opencontainers/runc/libcontainer/devices"
    17  	"golang.org/x/sys/unix"
    18  )
    19  
    20  const (
    21  	// license string format is same as kernel MODULE_LICENSE macro
    22  	license = "Apache"
    23  )
    24  
    25  // deviceFilter returns eBPF device filter program and its license string.
    26  func deviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
    27  	// Generate the minimum ruleset for the device rules we are given. While we
    28  	// don't care about minimum transitions in cgroupv2, using the emulator
    29  	// gives us a guarantee that the behaviour of devices filtering is the same
    30  	// as cgroupv1, including security hardenings to avoid misconfiguration
    31  	// (such as punching holes in wildcard rules).
    32  	emu := new(emulator)
    33  	for _, rule := range rules {
    34  		if err := emu.Apply(*rule); err != nil {
    35  			return nil, "", err
    36  		}
    37  	}
    38  	cleanRules, err := emu.Rules()
    39  	if err != nil {
    40  		return nil, "", err
    41  	}
    42  
    43  	p := &program{
    44  		defaultAllow: emu.IsBlacklist(),
    45  	}
    46  	p.init()
    47  
    48  	for idx, rule := range cleanRules {
    49  		if rule.Type == devices.WildcardDevice {
    50  			// We can safely skip over wildcard entries because there should
    51  			// only be one (at most) at the very start to instruct cgroupv1 to
    52  			// go into allow-list mode. However we do double-check this here.
    53  			if idx != 0 || rule.Allow != emu.IsBlacklist() {
    54  				return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
    55  			}
    56  			continue
    57  		}
    58  		if rule.Allow == p.defaultAllow {
    59  			// There should be no rules which have an action equal to the
    60  			// default action, the emulator removes those.
    61  			return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
    62  		}
    63  		if err := p.appendRule(rule); err != nil {
    64  			return nil, "", err
    65  		}
    66  	}
    67  	return p.finalize(), license, nil
    68  }
    69  
// program is the in-progress eBPF device filter: init emits the prologue,
// appendRule adds one block per rule, and finalize emits the default verdict.
type program struct {
	// insts holds the instructions emitted so far.
	insts asm.Instructions
	// defaultAllow selects the verdict finalize emits for accesses that no
	// rule block returned on: R0 = 1 when true, R0 = 0 when false.
	defaultAllow bool
	// blockID is the number used for the next "block-N" symbol. finalize sets
	// it to -1, after which appendRule refuses to add further rules.
	blockID int
}
    75  
    76  func (p *program) init() {
    77  	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
    78  	/*
    79  		u32 access_type
    80  		u32 major
    81  		u32 minor
    82  	*/
    83  	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
    84  	p.insts = append(p.insts,
    85  		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
    86  		asm.And.Imm32(asm.R2, 0xFFFF))
    87  
    88  	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
    89  	p.insts = append(p.insts,
    90  		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
    91  		// RSh: bitwise shift right
    92  		asm.RSh.Imm32(asm.R3, 16))
    93  
    94  	// R4 <- major (u32 major at R1[4])
    95  	p.insts = append(p.insts,
    96  		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
    97  
    98  	// R5 <- minor (u32 minor at R1[8])
    99  	p.insts = append(p.insts,
   100  		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
   101  }
   102  
   103  // appendRule rule converts an OCI rule to the relevant eBPF block and adds it
   104  // to the in-progress filter program. In order to operate properly, it must be
   105  // called with a "clean" rule list (generated by devices.Emulator.Rules() --
   106  // with any "a" rules removed).
   107  func (p *program) appendRule(rule *devices.Rule) error {
   108  	if p.blockID < 0 {
   109  		return errors.New("the program is finalized")
   110  	}
   111  
   112  	var bpfType int32
   113  	switch rule.Type {
   114  	case devices.CharDevice:
   115  		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
   116  	case devices.BlockDevice:
   117  		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
   118  	default:
   119  		// We do not permit 'a', nor any other types we don't know about.
   120  		return fmt.Errorf("invalid type %q", string(rule.Type))
   121  	}
   122  	if rule.Major > math.MaxUint32 {
   123  		return fmt.Errorf("invalid major %d", rule.Major)
   124  	}
   125  	if rule.Minor > math.MaxUint32 {
   126  		return fmt.Errorf("invalid minor %d", rule.Major)
   127  	}
   128  	hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
   129  	hasMinor := rule.Minor >= 0
   130  	bpfAccess := int32(0)
   131  	for _, r := range rule.Permissions {
   132  		switch r {
   133  		case 'r':
   134  			bpfAccess |= unix.BPF_DEVCG_ACC_READ
   135  		case 'w':
   136  			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
   137  		case 'm':
   138  			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
   139  		default:
   140  			return fmt.Errorf("unknown device access %v", r)
   141  		}
   142  	}
   143  	// If the access is rwm, skip the check.
   144  	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
   145  
   146  	var (
   147  		blockSym         = "block-" + strconv.Itoa(p.blockID)
   148  		nextBlockSym     = "block-" + strconv.Itoa(p.blockID+1)
   149  		prevBlockLastIdx = len(p.insts) - 1
   150  	)
   151  	p.insts = append(p.insts,
   152  		// if (R2 != bpfType) goto next
   153  		asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
   154  	)
   155  	if hasAccess {
   156  		p.insts = append(p.insts,
   157  			// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
   158  			asm.Mov.Reg32(asm.R1, asm.R3),
   159  			asm.And.Imm32(asm.R1, bpfAccess),
   160  			asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
   161  		)
   162  	}
   163  	if hasMajor {
   164  		p.insts = append(p.insts,
   165  			// if (R4 != major) goto next
   166  			asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
   167  		)
   168  	}
   169  	if hasMinor {
   170  		p.insts = append(p.insts,
   171  			// if (R5 != minor) goto next
   172  			asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
   173  		)
   174  	}
   175  	p.insts = append(p.insts, acceptBlock(rule.Allow)...)
   176  	// set blockSym to the first instruction we added in this iteration
   177  	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].WithSymbol(blockSym)
   178  	p.blockID++
   179  	return nil
   180  }
   181  
   182  func (p *program) finalize() asm.Instructions {
   183  	var v int32
   184  	if p.defaultAllow {
   185  		v = 1
   186  	}
   187  	blockSym := "block-" + strconv.Itoa(p.blockID)
   188  	p.insts = append(p.insts,
   189  		// R0 <- v
   190  		asm.Mov.Imm32(asm.R0, v).WithSymbol(blockSym),
   191  		asm.Return(),
   192  	)
   193  	p.blockID = -1
   194  	return p.insts
   195  }
   196  
   197  func acceptBlock(accept bool) asm.Instructions {
   198  	var v int32
   199  	if accept {
   200  		v = 1
   201  	}
   202  	return []asm.Instruction{
   203  		// R0 <- v
   204  		asm.Mov.Imm32(asm.R0, v),
   205  		asm.Return(),
   206  	}
   207  }