github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/gadgets/trace/capabilities/tracer/bpf/capable.bpf.c (about) 1 // SPDX-License-Identifier: GPL-2.0 2 // 3 // Unique filtering based on 4 // https://github.com/libbpf/libbpf-rs/tree/master/examples/capable 5 // 6 // Copyright 2022 Sony Group Corporation 7 8 #include <vmlinux.h> 9 #include <bpf/bpf_core_read.h> 10 #include <bpf/bpf_helpers.h> 11 #include <bpf/bpf_tracing.h> 12 #include "capable.h" 13 #include <gadget/mntns_filter.h> 14 15 // include/linux/security.h 16 #ifndef CAP_OPT_NOAUDIT 17 #define CAP_OPT_NOAUDIT 1 << 1 18 #endif 19 20 #define MAX_ENTRIES 10240 21 22 const volatile pid_t my_pid = -1; 23 const volatile pid_t targ_pid = -1; 24 const volatile u32 linux_version_code = 0; 25 const volatile bool audit_only = false; 26 const volatile bool unique = false; 27 28 extern int LINUX_KERNEL_VERSION __kconfig; 29 30 // we need this to make sure the compiler doesn't remove our struct 31 const struct cap_event *unusedcapevent __attribute__((unused)); 32 33 struct args_t { 34 u64 current_userns; 35 u64 target_userns; 36 u64 cap_effective; 37 int cap; 38 int cap_opt; 39 }; 40 41 struct { 42 __uint(type, BPF_MAP_TYPE_HASH); 43 __uint(max_entries, 10240); 44 __type(key, u64); 45 __type(value, struct args_t); 46 } start SEC(".maps"); 47 48 struct { 49 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 50 __uint(key_size, sizeof(__u32)); 51 __uint(value_size, sizeof(__u32)); 52 } events SEC(".maps"); 53 54 struct unique_key { 55 int cap; 56 u64 mntns_id; 57 }; 58 59 struct { 60 __uint(type, BPF_MAP_TYPE_HASH); 61 __uint(max_entries, 10240); 62 __type(key, struct unique_key); 63 __type(value, u64); 64 } seen SEC(".maps"); 65 66 struct syscall_context { 67 // Syscall id 68 // -1 for unknown syscall 69 u64 nr; 70 71 // We could add more fields for the arguments if desired 72 }; 73 74 struct { 75 __uint(type, BPF_MAP_TYPE_HASH); 76 __uint(key_size, sizeof(u64)); 77 __uint(value_size, sizeof(struct syscall_context)); 78 __uint(max_entries, 79 1048576); // There can be many threads sleeping in some futex/poll syscalls 80 } current_syscall SEC(".maps"); 81 82 SEC("kprobe/cap_capable") 83 int BPF_KPROBE(ig_trace_cap_e, const struct cred *cred, 84 struct user_namespace *targ_ns, int cap, int cap_opt) 85 { 86 __u32 pid; 87 u64 mntns_id; 88 __u64 pid_tgid; 89 struct task_struct *task; 90 91 task = (struct task_struct *)bpf_get_current_task(); 92 mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum); 93 94 if (gadget_should_discard_mntns_id(mntns_id)) 95 return 0; 96 97 const struct cred *real_cred = BPF_CORE_READ(task, real_cred); 98 if (cred != real_cred) { 99 // the subjective credentials are in an overridden state with 100 // override_creds/revert_creds (e.g. during overlayfs cache or copyup) 101 // https://kernel.org/doc/html/v6.2-rc8/security/credentials.html#overriding-the-vfs-s-use-of-credentials 102 return 0; 103 } 104 105 pid_tgid = bpf_get_current_pid_tgid(); 106 pid = pid_tgid >> 32; 107 108 if (pid == my_pid) 109 return 0; 110 111 if (targ_pid != -1 && targ_pid != pid) 112 return 0; 113 114 if (audit_only) { 115 if (LINUX_KERNEL_VERSION >= KERNEL_VERSION(5, 1, 0)) { 116 if (cap_opt & CAP_OPT_NOAUDIT) 117 return 0; 118 } else { 119 if (!cap_opt) 120 return 0; 121 } 122 } 123 124 if (unique) { 125 struct unique_key key = { 126 .cap = cap, 127 .mntns_id = mntns_id, 128 }; 129 130 if (bpf_map_lookup_elem(&seen, &key) != NULL) { 131 return 0; 132 } 133 u64 zero = 0; 134 bpf_map_update_elem(&seen, &key, &zero, 0); 135 } 136 137 struct args_t args = {}; 138 args.current_userns = 139 (u64)BPF_CORE_READ(task, real_cred, user_ns, ns.inum); 140 args.target_userns = (u64)BPF_CORE_READ(targ_ns, ns.inum); 141 /* 142 * cap_effective has kernel_cap_t for type. 143 * This type definition changed along the time: 144 * 1. It was defined as a __u32 in: 145 * https://github.com/torvalds/linux/commit/1da177e4c3f4 146 * 2. It later was modified to be an array of __u32, so 64 bits kernel 147 * can use 64 bits for capabilities while supporting legacy 32 bits 148 * ones: 149 * https://github.com/torvalds/linux/commit/e338d263a76a 150 * 3. It was recently defined to be a simple u64: 151 * https://github.com/torvalds/linux/commit/f122a08b197d 152 * BPF_CORE_READ_INTO() will handle the different size for us and in any 153 * case, we define args.cap_effective as u64 which is enough to contain 154 * the information. 155 */ 156 BPF_CORE_READ_INTO(&args.cap_effective, task, real_cred, cap_effective); 157 args.cap = cap; 158 args.cap_opt = cap_opt; 159 bpf_map_update_elem(&start, &pid_tgid, &args, 0); 160 161 return 0; 162 } 163 164 SEC("kretprobe/cap_capable") 165 int BPF_KRETPROBE(ig_trace_cap_x) 166 { 167 __u64 pid_tgid; 168 __u64 uid_gid = bpf_get_current_uid_gid(); 169 struct args_t *ap; 170 int ret; 171 172 pid_tgid = bpf_get_current_pid_tgid(); 173 ap = bpf_map_lookup_elem(&start, &pid_tgid); 174 if (!ap) 175 return 0; /* missed entry */ 176 177 struct cap_event event = {}; 178 event.current_userns = ap->current_userns; 179 event.target_userns = ap->target_userns; 180 event.cap_effective = ap->cap_effective; 181 event.pid = pid_tgid >> 32; 182 event.tgid = pid_tgid; 183 event.cap = ap->cap; 184 event.uid = (u32)uid_gid; 185 event.gid = (u32)(uid_gid >> 32); 186 event.mntnsid = gadget_get_mntns_id(); 187 bpf_get_current_comm(&event.task, sizeof(event.task)); 188 event.ret = PT_REGS_RC(ctx); 189 event.timestamp = bpf_ktime_get_boot_ns(); 190 191 if (LINUX_KERNEL_VERSION >= KERNEL_VERSION(5, 1, 0)) { 192 event.audit = (ap->cap_opt & CAP_OPT_NOAUDIT) == 0; 193 event.insetid = (ap->cap_opt & CAP_OPT_INSETID) != 0; 194 } else { 195 event.audit = ap->cap_opt; 196 event.insetid = -1; 197 } 198 199 struct syscall_context *sc_ctx; 200 sc_ctx = bpf_map_lookup_elem(¤t_syscall, &pid_tgid); 201 if (sc_ctx) { 202 event.syscall = sc_ctx->nr; 203 } else { 204 event.syscall = -1; 205 } 206 207 bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, 208 sizeof(event)); 209 210 bpf_map_delete_elem(&start, &pid_tgid); 211 212 return 0; 213 } 214 215 /* 216 * Taken from: 217 * https://github.com/seccomp/libseccomp/blob/afbde6ddaec7c58c3b281d43b0b287269ffca9bd/src/syscalls.csv 218 */ 219 #if defined(__TARGET_ARCH_arm64) 220 #define __NR_rt_sigreturn 139 221 #define __NR_exit_group 94 222 #define __NR_exit 93 223 #elif defined(__TARGET_ARCH_x86) 224 #define __NR_rt_sigreturn 15 225 #define __NR_exit_group 231 226 #define __NR_exit 60 227 #else 228 #error "The trace capabilities gadget is not supported on your architecture." 229 #endif 230 231 static __always_inline int skip_exit_probe(int nr) 232 { 233 return !!(nr == __NR_exit || nr == __NR_exit_group || 234 nr == __NR_rt_sigreturn); 235 } 236 237 SEC("raw_tracepoint/sys_enter") 238 int ig_cap_sys_enter(struct bpf_raw_tracepoint_args *ctx) 239 { 240 u64 pid_tgid = bpf_get_current_pid_tgid(); 241 struct pt_regs regs = {}; 242 struct syscall_context sc_ctx = {}; 243 244 u64 mntns_id = gadget_get_mntns_id(); 245 246 if (gadget_should_discard_mntns_id(mntns_id)) 247 return 0; 248 249 u64 nr = ctx->args[1]; 250 sc_ctx.nr = nr; 251 252 // The sys_exit tracepoint is not called for some syscalls. 253 if (!skip_exit_probe(nr)) 254 bpf_map_update_elem(¤t_syscall, &pid_tgid, &sc_ctx, 255 BPF_ANY); 256 257 return 0; 258 } 259 260 SEC("raw_tracepoint/sys_exit") 261 int ig_cap_sys_exit(struct bpf_raw_tracepoint_args *ctx) 262 { 263 u64 pid_tgid = bpf_get_current_pid_tgid(); 264 bpf_map_delete_elem(¤t_syscall, &pid_tgid); 265 return 0; 266 } 267 268 char LICENSE[] SEC("license") = "GPL";