github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/devices/ebpf_linux.go (about) 1 package devices 2 3 import ( 4 "errors" 5 "fmt" 6 "os" 7 "runtime" 8 "sync" 9 "unsafe" 10 11 "github.com/cilium/ebpf" 12 "github.com/cilium/ebpf/asm" 13 "github.com/cilium/ebpf/link" 14 "github.com/sirupsen/logrus" 15 "golang.org/x/sys/unix" 16 ) 17 18 func nilCloser() error { 19 return nil 20 } 21 22 func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) { 23 type bpfAttrQuery struct { 24 TargetFd uint32 25 AttachType uint32 26 QueryType uint32 27 AttachFlags uint32 28 ProgIds uint64 // __aligned_u64 29 ProgCnt uint32 30 } 31 32 // Currently you can only have 64 eBPF programs attached to a cgroup. 33 size := 64 34 retries := 0 35 for retries < 10 { 36 progIds := make([]uint32, size) 37 query := bpfAttrQuery{ 38 TargetFd: uint32(dirFd), 39 AttachType: uint32(unix.BPF_CGROUP_DEVICE), 40 ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))), 41 ProgCnt: uint32(len(progIds)), 42 } 43 44 // Fetch the list of program ids. 45 _, _, errno := unix.Syscall(unix.SYS_BPF, 46 uintptr(unix.BPF_PROG_QUERY), 47 uintptr(unsafe.Pointer(&query)), 48 unsafe.Sizeof(query)) 49 size = int(query.ProgCnt) 50 runtime.KeepAlive(query) 51 if errno != 0 { 52 // On ENOSPC we get the correct number of programs. 53 if errno == unix.ENOSPC { 54 retries++ 55 continue 56 } 57 return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) 58 } 59 60 // Convert the ids to program handles. 61 progIds = progIds[:size] 62 programs := make([]*ebpf.Program, 0, len(progIds)) 63 for _, progId := range progIds { 64 program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) 65 if err != nil { 66 // We skip over programs that give us -EACCES or -EPERM. This 67 // is necessary because there may be BPF programs that have 68 // been attached (such as with --systemd-cgroup) which have an 69 // LSM label that blocks us from interacting with the program. 70 // 71 // Because additional BPF_CGROUP_DEVICE programs only can add 72 // restrictions, there's no real issue with just ignoring these 73 // programs (and stops runc from breaking on distributions with 74 // very strict SELinux policies). 75 if errors.Is(err, os.ErrPermission) { 76 logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err) 77 continue 78 } 79 return nil, fmt.Errorf("cannot fetch program from id: %w", err) 80 } 81 programs = append(programs, program) 82 } 83 runtime.KeepAlive(progIds) 84 return programs, nil 85 } 86 87 return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") 88 } 89 90 var ( 91 haveBpfProgReplaceBool bool 92 haveBpfProgReplaceOnce sync.Once 93 ) 94 95 // Loosely based on the BPF_F_REPLACE support check in 96 // https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. 97 // 98 // TODO: move this logic to cilium/ebpf 99 func haveBpfProgReplace() bool { 100 haveBpfProgReplaceOnce.Do(func() { 101 prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ 102 Type: ebpf.CGroupDevice, 103 License: "MIT", 104 Instructions: asm.Instructions{ 105 asm.Mov.Imm(asm.R0, 0), 106 asm.Return(), 107 }, 108 }) 109 if err != nil { 110 logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) 111 return 112 } 113 defer prog.Close() 114 115 devnull, err := os.Open("/dev/null") 116 if err != nil { 117 logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err) 118 return 119 } 120 defer devnull.Close() 121 122 // We know that we have BPF_PROG_ATTACH since we can load 123 // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL 124 // we know that the feature isn't present. 125 err = link.RawAttachProgram(link.RawAttachProgramOptions{ 126 // We rely on this fd being checked after attachFlags. 127 Target: int(devnull.Fd()), 128 // Attempt to "replace" bad fds with this program. 129 Program: prog, 130 Attach: ebpf.AttachCGroupDevice, 131 Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE, 132 }) 133 if errors.Is(err, unix.EINVAL) { 134 // not supported 135 return 136 } 137 // attach_flags test succeeded. 138 if !errors.Is(err, unix.EBADF) { 139 logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) 140 } 141 haveBpfProgReplaceBool = true 142 }) 143 return haveBpfProgReplaceBool 144 } 145 146 // loadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory. 147 // 148 // Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . 149 // 150 // https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 151 func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) { 152 // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). 153 // This limit is not inherited into the container. 154 memlockLimit := &unix.Rlimit{ 155 Cur: unix.RLIM_INFINITY, 156 Max: unix.RLIM_INFINITY, 157 } 158 _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) 159 160 // Get the list of existing programs. 161 oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) 162 if err != nil { 163 return nilCloser, err 164 } 165 useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 166 167 // Generate new program. 168 spec := &ebpf.ProgramSpec{ 169 Type: ebpf.CGroupDevice, 170 Instructions: insts, 171 License: license, 172 } 173 prog, err := ebpf.NewProgram(spec) 174 if err != nil { 175 return nilCloser, err 176 } 177 178 // If there is only one old program, we can just replace it directly. 179 var ( 180 replaceProg *ebpf.Program 181 attachFlags uint32 = unix.BPF_F_ALLOW_MULTI 182 ) 183 if useReplaceProg { 184 replaceProg = oldProgs[0] 185 attachFlags |= unix.BPF_F_REPLACE 186 } 187 err = link.RawAttachProgram(link.RawAttachProgramOptions{ 188 Target: dirFd, 189 Program: prog, 190 Replace: replaceProg, 191 Attach: ebpf.AttachCGroupDevice, 192 Flags: attachFlags, 193 }) 194 if err != nil { 195 return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) 196 } 197 closer := func() error { 198 err = link.RawDetachProgram(link.RawDetachProgramOptions{ 199 Target: dirFd, 200 Program: prog, 201 Attach: ebpf.AttachCGroupDevice, 202 }) 203 if err != nil { 204 return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) 205 } 206 // TODO: Should we attach the old filters back in this case? Otherwise 207 // we fail-open on a security feature, which is a bit scary. 208 return nil 209 } 210 if !useReplaceProg { 211 logLevel := logrus.DebugLevel 212 // If there was more than one old program, give a warning (since this 213 // really shouldn't happen with runc-managed cgroups) and then detach 214 // all the old programs. 215 if len(oldProgs) > 1 { 216 // NOTE: Ideally this should be a warning but it turns out that 217 // systemd-managed cgroups trigger this warning (apparently 218 // systemd doesn't delete old non-systemd programs when 219 // setting properties). 220 logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) 221 logLevel = logrus.InfoLevel 222 } 223 for idx, oldProg := range oldProgs { 224 // Output some extra debug info. 225 if info, err := oldProg.Info(); err == nil { 226 fields := logrus.Fields{ 227 "type": info.Type.String(), 228 "tag": info.Tag, 229 "name": info.Name, 230 } 231 if id, ok := info.ID(); ok { 232 fields["id"] = id 233 } 234 if runCount, ok := info.RunCount(); ok { 235 fields["run_count"] = runCount 236 } 237 if runtime, ok := info.Runtime(); ok { 238 fields["runtime"] = runtime.String() 239 } 240 logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) 241 } 242 err = link.RawDetachProgram(link.RawDetachProgramOptions{ 243 Target: dirFd, 244 Program: oldProg, 245 Attach: ebpf.AttachCGroupDevice, 246 }) 247 if err != nil { 248 return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) 249 } 250 } 251 } 252 return closer, nil 253 }