github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/gadgets/traceloop/tracer/bpf/traceloop.bpf.c

// SPDX-License-Identifier: GPL-2.0
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>
#include <gadget/mntns_filter.h>
#include "traceloop.h"

/*
 * Taken from:
 * https://github.com/seccomp/libseccomp/blob/afbde6ddaec7c58c3b281d43b0b287269ffca9bd/src/syscalls.csv
 */
#if defined(__TARGET_ARCH_arm64)
#define __NR_rt_sigreturn 139
#define __NR_exit_group 94
#define __NR_exit 93
#elif defined(__TARGET_ARCH_x86)
#define __NR_rt_sigreturn 15
#define __NR_exit_group 231
#define __NR_exit 60
#else
#error "Traceloop is not supported on your architecture."
#endif

/* Compile with -DSHOW_DEBUG to print debug messages. */
#if defined(SHOW_DEBUG)
#define bpf_debug_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
#else /* !defined(SHOW_DEBUG) */
#define bpf_debug_printk(fmt, ...)
#endif /* !defined(SHOW_DEBUG) */

/* Compile with -DSHOW_ERROR to print error messages. */
#if defined(SHOW_ERROR)
#define bpf_error_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
#else /* !defined(SHOW_ERROR) */
#define bpf_error_printk(fmt, ...)
#endif /* !defined(SHOW_ERROR) */

const volatile bool filter_syscall = false;

const struct syscall_event_t *unused_event __attribute__((unused));
const struct syscall_event_cont_t *unused_event_cont __attribute__((unused));

/*
 * We need this to avoid hitting the 512-byte stack limit.
 * Indeed, pt_regs contains several u64 fields, so it is quite big.
 */
static const struct pt_regs empty;
static struct syscall_def_t default_definition;

struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	/*
	 * We use the mount namespace ID to get the perf buffer corresponding
	 * to this container.
	 */
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(u32));
	__uint(max_entries, 1024);
	__array(
		values, struct {
			__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
			__uint(key_size, sizeof(u32));
			__uint(value_size, sizeof(u32));
		});
} map_of_perf_buffers SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct syscall_def_t));
	/*
	 * There are around 300 syscalls, so use the next greater power of
	 * two.
	 */
	__uint(max_entries, 512);
} syscalls SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	/*
	 * We do not care about the value here, so use a bool to consume only
	 * one byte per value.
	 */
	__uint(value_size, sizeof(bool));
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__uint(max_entries, SYSCALL_FILTERS);
} syscall_filters SEC(".maps");

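/*
 * Event flow: for each traced syscall invocation, this program emits one
 * SYSCALL_EVENT_TYPE_ENTER event, up to SYSCALL_ARGS SYSCALL_EVENT_TYPE_CONT
 * events carrying the contents of pointed-to buffers, and one
 * SYSCALL_EVENT_TYPE_EXIT event. Userspace groups them by their shared
 * monotonic_timestamp and reads them from the per-container perf buffer
 * selected via map_of_perf_buffers (keyed by mount namespace ID).
 */
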
/*
 * This key/value store maps thread PIDs to the syscall arg arrays that were
 * remembered at sys_enter, so that sys_exit can probe the buffer contents
 * and generate syscall events showing the resulting content.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct remembered_args));
	__uint(max_entries, 1024);
} probe_at_sys_exit SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct pt_regs));
	__uint(max_entries, 1024);
} regs_map SEC(".maps");

static __always_inline int skip_exit_probe(int nr)
{
	return !!(nr == __NR_exit || nr == __NR_exit_group ||
		  nr == __NR_rt_sigreturn);
}

/*
 * Highly inspired by ksnoop.bpf.c:
 * https://github.com/iovisor/bcc/blob/f90126bb3770ea1bdd915ff3b47e451c6dde5c40/libbpf-tools/ksnoop.bpf.c#L280
 */
static __always_inline u64 get_arg(struct pt_regs *regs, int i)
{
	switch (i) {
	case 1:
		return PT_REGS_PARM1_CORE_SYSCALL(regs);
	case 2:
		return PT_REGS_PARM2_CORE_SYSCALL(regs);
	case 3:
		return PT_REGS_PARM3_CORE_SYSCALL(regs);
	case 4:
		return PT_REGS_PARM4_CORE_SYSCALL(regs);
	case 5:
		return PT_REGS_PARM5_CORE_SYSCALL(regs);
	case 6:
		return PT_REGS_PARM6_CORE_SYSCALL(regs);
	default:
		bpf_error_printk(
			"There is no PT_REGS_PARM%d_SYSCALL macro, check the argument!\n",
			i);
		return 0;
	}
}

static __always_inline bool should_filter_out_syscall(u64 syscall_nr)
{
	return filter_syscall &&
	       bpf_map_lookup_elem(&syscall_filters, &syscall_nr) == NULL;
}

/*
 * sys_enter is defined as:
 * TP_PROTO(struct pt_regs *regs, long id)
 * (https://elixir.bootlin.com/linux/v5.19/source/include/trace/events/syscalls.h#L20)
 * So, ctx->args[0] contains a struct pt_regs and ctx->args[1] the syscall ID.
 */
SEC("raw_tracepoint/sys_enter")
int ig_traceloop_e(struct bpf_raw_tracepoint_args *ctx)
{
	struct remembered_args remembered = {};
	u64 pid = bpf_get_current_pid_tgid();
	struct syscall_def_t *syscall_def;
	/*
	 * Initialize the struct to empty to be sure all fields (even padding)
	 * are zeroed:
	 * https://github.com/iovisor/bcc/issues/2623#issuecomment-560214481
	 */
	struct syscall_event_t sc = {};
	struct task_struct *task;
	u64 nr = ctx->args[1];
	struct pt_regs *args;
	void *perf_buffer;
	u64 mntns_id;
	int ret;
	int i;

	if (should_filter_out_syscall(nr))
		return 0;

	/* The boot time timestamp is used to give the timestamp to users. It
	 * is converted to the wall-clock time in userspace. It only works
	 * from Linux 5.7. On older kernels, the BPF bytecode for
	 * bpf_ktime_get_boot_ns is automatically removed by the BPF loader,
	 * see FixBpfKtimeGetBootNs. In this way, this BPF program can still
	 * be loaded on older kernels. */
	u64 boot_ts = bpf_ktime_get_boot_ns();

	/* The monotonic timestamp is used by traceloop to match the sys_enter
	 * event with the cont and sys_exit events. This is an internal
	 * implementation detail not exposed to the user. */
	u64 monotonic_ts = bpf_ktime_get_ns();

	sc.boot_timestamp = boot_ts;
	sc.monotonic_timestamp = monotonic_ts;
	sc.cont_nr = 0;
	sc.cpu = bpf_get_smp_processor_id();
	sc.pid = pid >> 32;
	sc.typ = SYSCALL_EVENT_TYPE_ENTER;
	sc.id = nr;

	remembered.monotonic_timestamp = monotonic_ts;
	remembered.nr = nr;

	syscall_def = bpf_map_lookup_elem(&syscalls, &nr);
	/*
	 * The syscalls map contains definitions for specific syscalls like
	 * read or write. All other syscalls, like nanosleep, are not in this
	 * map because there is nothing specific about their signature; in
	 * that case, we use the default definition.
	 */
	if (syscall_def == NULL)
		syscall_def = &default_definition;

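	/*
	 * Resolve the mount namespace ID of the current task to pick the
	 * per-container perf buffer. If map_of_perf_buffers has no entry for
	 * this mount namespace, the process is not traced and we bail out.
	 */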
	task = (struct task_struct *)bpf_get_current_task();
	mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);

	perf_buffer = bpf_map_lookup_elem(&map_of_perf_buffers, &mntns_id);
	if (!perf_buffer)
		return 0;

	bpf_get_current_comm(sc.comm, sizeof(sc.comm));

	ret = bpf_map_update_elem(&regs_map, &pid, &empty, BPF_NOEXIST);
	if (ret) {
		bpf_error_printk(
			"enter: there should not be any pt_regs for key %lu: %d\n",
			pid, ret);

		return 0;
	}

	args = bpf_map_lookup_elem(&regs_map, &pid);
	if (!args) {
		bpf_error_printk(
			"enter: there should be a pt_regs for key %lu\n", pid);

		goto end;
	}

	bpf_probe_read(args, sizeof(*args), (void *)ctx->args[0]);

	for (i = 0; i < SYSCALL_ARGS; i++) {
		/* + 1 because PT_REGS_PARM begins from 1. */
		u64 arg = get_arg(args, i + 1);
		sc.args[i] = arg;
		remembered.args[i] = arg;
		if (syscall_def->args_len[i])
			sc.cont_nr++;
	}

	bpf_debug_printk(
		"Perf event output: sc.id: %d; sc.comm: %s; sizeof(sc): %d\n",
		sc.id, sc.comm, sizeof(sc));
	ret = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU, &sc,
				    sizeof(sc));
	if (ret != 0) {
		bpf_error_printk("Problem outputting perf event: %d", ret);
	}

	// Avoid using probe_at_sys_exit for exit() and exit_group(): sys_exit
	// is never called for them, so the map entry would never be cleaned
	// up and the map would fill up.
	// Note that a process can still get killed in the middle, so we would
	// need a userspace cleaner for this case (TODO).
	if (!skip_exit_probe(nr))
		bpf_map_update_elem(&probe_at_sys_exit, &pid, &remembered,
				    BPF_ANY);

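	/*
	 * Emit one "cont" event per argument that points to data we can
	 * already read at sys_enter. The args_len values are encoded with the
	 * constants from traceloop.h: USE_NULL_BYTE_LENGTH marks
	 * NULL-terminated strings, PARAM_PROBE_AT_EXIT_MASK and
	 * USE_RET_AS_PARAM_LENGTH mark arguments that can only be read at
	 * sys_exit (handled there), USE_ARG_INDEX_AS_PARAM_LENGTH encodes the
	 * index of another argument holding the buffer length, and any other
	 * value is a fixed length.
	 */
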
	// We need to unroll this loop to make this work on kernels 5.4.0-x
	// on Ubuntu, see
	// https://github.com/inspektor-gadget/inspektor-gadget/issues/1465
	// for more details.
#pragma unroll
	for (i = 0; i < SYSCALL_ARGS; i++) {
		__u64 arg_len = syscall_def->args_len[i];

		if (!arg_len || (arg_len & PARAM_PROBE_AT_EXIT_MASK) ||
		    arg_len == USE_RET_AS_PARAM_LENGTH)
			continue;

		bool null_terminated = false;
		struct syscall_event_cont_t sc_cont = {};

		sc_cont.monotonic_timestamp = monotonic_ts;
		sc_cont.index = i;
		sc_cont.failed = false;

		if (arg_len == USE_NULL_BYTE_LENGTH) {
			null_terminated = true;
			arg_len = 0;
		} else if (arg_len >= USE_ARG_INDEX_AS_PARAM_LENGTH) {
			__u64 idx = arg_len &
				    USE_ARG_INDEX_AS_PARAM_LENGTH_MASK;

			/*
			 * Access args via the previously saved map entry
			 * instead of the ctx pointer or 'remembered' struct
			 * to avoid this verifier issue (which does not occur
			 * in sys_exit for the same code):
			 * "variable ctx access var_off=(0x0; 0x38) disallowed"
			 */
			struct remembered_args *remembered_ctx_workaround;
			if (idx < SYSCALL_ARGS) {
				remembered_ctx_workaround = bpf_map_lookup_elem(
					&probe_at_sys_exit, &pid);
				if (remembered_ctx_workaround)
					arg_len = remembered_ctx_workaround
							  ->args[idx];
				else
					arg_len = 0;
			} else {
				arg_len = PARAM_LEN;
			}
		}

		if (arg_len > sizeof(sc_cont.param))
			arg_len = sizeof(sc_cont.param);

		if (null_terminated)
			sc_cont.length = USE_NULL_BYTE_LENGTH;
		else
			sc_cont.length = arg_len;

		/* + 1 because PT_REGS_PARM begins from 1. */
		u64 arg = get_arg(args, i + 1);

		if (!arg_len &&
		    null_terminated /* NULL-terminated argument like a string */
		    && bpf_probe_read_user_str(sc_cont.param, PARAM_LEN,
					       (void *)(arg)) < 0)
			sc_cont.failed = true;
		else if (sizeof(u8) <= arg_len &&
			 arg_len <=
				 sizeof(u64) /* Conventional arguments like type (char, int, etc.) */
			 && bpf_probe_read_user(sc_cont.param, arg_len,
						(void *)(arg)))
			sc_cont.failed = true;
		else if (bpf_probe_read_user(
				 sc_cont.param, PARAM_LEN,
				 (void *)(arg))) /* TODO Struct arguments? */
			sc_cont.failed = true;

		bpf_debug_printk(
			"Perf event output: sc_cont.index: %d; sizeof(sc_cont): %d\n",
			sc_cont.index, sizeof(sc_cont));
		ret = bpf_perf_event_output(ctx, perf_buffer,
					    BPF_F_CURRENT_CPU, &sc_cont,
					    sizeof(sc_cont));
		if (ret != 0) {
			bpf_error_printk(
				"Problem outputting continued perf event: %d",
				ret);
		}
	}

end:
	bpf_map_delete_elem(&regs_map, &pid);

	return 0;
}

/*
 * syscall_get_nr() is defined for each architecture in the Linux kernel.
 * As we cannot use trace_event_raw_sys_exit, we need to get the current
 * syscall number from the registers.
 * So this function has to be extended for each architecture we want to
 * support.
 */
static __always_inline int syscall_get_nr(struct pt_regs *regs)
{
#if defined(__TARGET_ARCH_arm64)
	return regs->syscallno;
#elif defined(__TARGET_ARCH_x86)
	return regs->orig_ax;
#else
#error "Traceloop is not supported on your architecture."
#endif
}

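/*
 * The sys_exit handler below completes the picture: it reads the arguments
 * remembered at sys_enter, emits "cont" events for the arguments flagged
 * with PARAM_PROBE_AT_EXIT_MASK (buffers that are only valid once the
 * syscall has returned), and finally emits the exit event with the syscall
 * return value stored in args[0].
 */
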
/*
 * sys_exit is defined as:
 * TP_PROTO(struct pt_regs *regs, long ret),
 * (https://elixir.bootlin.com/linux/v5.19/source/include/trace/events/syscalls.h#L46)
 * So, ctx->args[0] contains a struct pt_regs and ctx->args[1] the syscall
 * return value.
 */
SEC("raw_tracepoint/sys_exit")
int ig_traceloop_x(struct bpf_raw_tracepoint_args *ctx)
{
	u64 pid = bpf_get_current_pid_tgid();
	struct remembered_args *remembered;
	struct syscall_def_t *syscall_def;
	struct task_struct *task;
	long ret = ctx->args[1];
	struct pt_regs *args;
	void *perf_buffer;
	u64 mntns_id;
	int i, r;
	u64 nr;

	r = bpf_map_update_elem(&regs_map, &pid, &empty, BPF_NOEXIST);
	if (r) {
		bpf_error_printk(
			"exit: there should not be any pt_regs for key %lu: %d\n",
			pid, r);

		return 0;
	}

	args = bpf_map_lookup_elem(&regs_map, &pid);
	if (!args) {
		bpf_error_printk(
			"exit: there should be a pt_regs for key %lu\n", pid);

		goto end;
	}

	bpf_probe_read(args, sizeof(*args), (void *)ctx->args[0]);
	nr = syscall_get_nr(args);
	/* TODO Why can this occur? */
	if (nr == -1)
		goto end;

	struct syscall_event_t sc = {
		.boot_timestamp = bpf_ktime_get_boot_ns(),
		.cpu = bpf_get_smp_processor_id(),
		.pid = pid >> 32,
		.typ = SYSCALL_EVENT_TYPE_EXIT,
		.id = nr,
	};
	sc.args[0] = ret;

	syscall_def = bpf_map_lookup_elem(&syscalls, &nr);
	if (syscall_def == NULL)
		syscall_def = &default_definition;

	task = (struct task_struct *)bpf_get_current_task();
	mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);

	perf_buffer = bpf_map_lookup_elem(&map_of_perf_buffers, &mntns_id);
	if (!perf_buffer)
		goto end;

	remembered = bpf_map_lookup_elem(&probe_at_sys_exit, &pid);
	if (!remembered)
		goto end;

	/*
	 * This ensures all events (enter, exit and cont) related to a given
	 * syscall have the same timestamp.
	 */
	sc.monotonic_timestamp = remembered->monotonic_timestamp;

	for (i = 0; i < SYSCALL_ARGS; i++) {
		__u64 arg_len = syscall_def->args_len[i];

		if (!arg_len || !(arg_len & PARAM_PROBE_AT_EXIT_MASK))
			goto end_loop;

		bool null_terminated = false;
		struct syscall_event_cont_t sc_cont = {
			.monotonic_timestamp = remembered->monotonic_timestamp,
			.index = i,
			.failed = false,
		};

		arg_len &= ~PARAM_PROBE_AT_EXIT_MASK;

		if (arg_len == USE_RET_AS_PARAM_LENGTH) {
			if ((signed long)ret < 0)
				arg_len = 0;
			else
				arg_len = ret;
		} else if (arg_len == USE_NULL_BYTE_LENGTH) {
			null_terminated = true;
			arg_len = 0;
		} else if (arg_len >= USE_ARG_INDEX_AS_PARAM_LENGTH) {
			__u64 idx = arg_len &
				    USE_ARG_INDEX_AS_PARAM_LENGTH_MASK;
			if (idx < SYSCALL_ARGS)
				arg_len = remembered->args[idx];
			else
				arg_len = PARAM_LEN;
		}

		if (arg_len > sizeof(sc_cont.param))
			arg_len = sizeof(sc_cont.param);

		if (null_terminated)
			sc_cont.length = USE_NULL_BYTE_LENGTH;
		else
			sc_cont.length = arg_len;

		if (arg_len == 0 && null_terminated) {
			if (bpf_probe_read_user_str(
				    sc_cont.param, PARAM_LEN,
				    (void *)(remembered->args[i])) < 0)
				sc_cont.failed = true;
		} else if (sizeof(u8) <= arg_len && arg_len <= sizeof(u64) &&
			   bpf_probe_read_user(sc_cont.param, arg_len,
					       (void *)(remembered->args[i]))) {
			sc_cont.failed = true;
		} else if (bpf_probe_read_user(sc_cont.param, PARAM_LEN,
					       (void *)(remembered->args[i]))) {
			sc_cont.failed = true;
		}

		bpf_debug_printk(
			"Perf event output (exit): sc_cont.index: %d; sizeof(sc_cont): %d\n",
			sc_cont.index, sizeof(sc_cont));
		r = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU,
					  &sc_cont, sizeof(sc_cont));
		if (r != 0) {
			bpf_error_printk(
				"Problem outputting continued perf event: %d",
				r);
		}
end_loop:
		bpf_map_delete_elem(&probe_at_sys_exit, &pid);
	}

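	/*
	 * Finally emit the exit event itself. It carries the monotonic
	 * timestamp remembered at sys_enter so that userspace can group the
	 * enter, cont and exit records of one syscall invocation.
	 */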
	bpf_get_current_comm(sc.comm, sizeof(sc.comm));

	bpf_debug_printk(
		"Perf event output (exit): sc.id: %d; sc.comm: %s; sizeof(sc): %d\n",
		sc.id, sc.comm, sizeof(sc));
	r = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU, &sc,
				  sizeof(sc));
	if (r != 0) {
		bpf_error_printk("Problem outputting perf event: %d", r);
	}
end:
	bpf_map_delete_elem(&regs_map, &pid);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";

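/*
 * For reference only, not compiled into this BPF object: a minimal sketch,
 * assuming libbpf and a hypothetical skeleton pointer named "skel", of how a
 * userspace loader could plug a per-container perf buffer into
 * map_of_perf_buffers. The actual traceloop tracer sets this up from its Go
 * side.
 *
 *   int ncpu = libbpf_num_possible_cpus();
 *   // Inner map matching the __array() template declared above.
 *   int inner_fd = bpf_map_create(BPF_MAP_TYPE_PERF_EVENT_ARRAY, NULL,
 *                                 sizeof(__u32), sizeof(__u32), ncpu, NULL);
 *   __u64 mntns_id = ...; // mount namespace inode number of the container
 *   int outer_fd = bpf_map__fd(skel->maps.map_of_perf_buffers);
 *   bpf_map_update_elem(outer_fd, &mntns_id, &inner_fd, BPF_ANY);
 */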