github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/pkg/ebpftracer/c/tracee.bpf.c

// +build ignore

// Note: This file is licensed differently from the rest of the project
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) Aqua Security inc.

#include <vmlinux.h>
#include <vmlinux_flavors.h>
#include <vmlinux_missing.h>

#undef container_of

#include <bpf/bpf_core_read.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <maps.h>
#include <types.h>
#include <capture_filtering.h>
#include <tracee.h>

#include <common/arch.h>
#include <common/arguments.h>
#include <common/binprm.h>
#include <common/bpf_prog.h>
#include <common/buffer.h>
#include <common/capabilities.h>
#include <common/cgroups.h>
#include <common/common.h>
#include <common/consts.h>
#include <common/context.h>
#include <common/filesystem.h>
#include <common/filtering.h>
#include <common/kconfig.h>
#include <common/ksymbols.h>
#include <common/logging.h>
#include <common/memory.h>
#include <common/network.h>
#include <common/probes.h>
#include <common/signal.h>
#include <common/debug.h>
#include <common/stats.h>

char LICENSE[] SEC("license") = "GPL";

extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig;

// trace/events/syscalls.h: TP_PROTO(struct pt_regs *regs, long id)
// initial entry for sys_enter syscall logic
SEC("raw_tracepoint/sys_enter")
int tracepoint__raw_syscalls__sys_enter(struct bpf_raw_tracepoint_args *ctx)
{
    struct task_struct *task = (struct task_struct *) bpf_get_current_task();
    int id = ctx->args[1];
    if (is_compat(task)) {
        // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
        u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id);
        if (id_64 == 0)
            return 0;

        id = *id_64;
    }

    int zero = 0;
    config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero);
    if (unlikely(config == NULL))
        return 0;
    u64 cgroup_id = 0;
    if (config->options & OPT_CGROUP_V1) {
        cgroup_id = get_cgroup_v1_subsys0_id(task);
    } else {
        cgroup_id = bpf_get_current_cgroup_id();
    }
    // Skip if cgroup is muted
    if (bpf_map_lookup_elem(&ignored_cgroups_map, &cgroup_id) != NULL) {
        return 0;
    }
    // Update container syscall stats.
    update_syscall_stats(ctx, cgroup_id, id);

    // Continue to tail calls.
    bpf_tail_call(ctx, &sys_enter_init_tail, id);
    return 0;
}

// initial tail call entry from sys_enter.
// purpose is to save the syscall info of relevant syscalls through the task_info map.
// can move to one of:
// 1. sys_enter_submit, general event submit logic from sys_enter
// 2.
directly to syscall tail handler in sys_enter_tails 91 SEC("raw_tracepoint/sys_enter_init") 92 int sys_enter_init(struct bpf_raw_tracepoint_args *ctx) 93 { 94 struct task_struct *task = (struct task_struct *) bpf_get_current_task(); 95 96 u64 pid_tgid = bpf_get_current_pid_tgid(); 97 u32 tid = pid_tgid; 98 task_info_t *task_info = bpf_map_lookup_elem(&task_info_map, &tid); 99 if (unlikely(task_info == NULL)) { 100 task_info = init_task_info(tid, 0); 101 if (unlikely(task_info == NULL)) { 102 return 0; 103 } 104 int zero = 0; 105 config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero); 106 if (unlikely(config == NULL)) 107 return 0; 108 109 init_task_context(&task_info->context, task, config->options); 110 } 111 112 syscall_data_t *sys = &(task_info->syscall_data); 113 sys->id = ctx->args[1]; 114 115 if (LINUX_HAS_SYSCALL_WRAPPER) { 116 struct pt_regs *regs = (struct pt_regs *) ctx->args[0]; 117 118 if (is_x86_compat(task)) { 119 #if defined(bpf_target_x86) 120 sys->args.args[0] = BPF_CORE_READ(regs, bx); 121 sys->args.args[1] = BPF_CORE_READ(regs, cx); 122 sys->args.args[2] = BPF_CORE_READ(regs, dx); 123 sys->args.args[3] = BPF_CORE_READ(regs, si); 124 sys->args.args[4] = BPF_CORE_READ(regs, di); 125 sys->args.args[5] = BPF_CORE_READ(regs, bp); 126 #endif // bpf_target_x86 127 } else { 128 sys->args.args[0] = PT_REGS_PARM1_CORE_SYSCALL(regs); 129 sys->args.args[1] = PT_REGS_PARM2_CORE_SYSCALL(regs); 130 sys->args.args[2] = PT_REGS_PARM3_CORE_SYSCALL(regs); 131 sys->args.args[3] = PT_REGS_PARM4_CORE_SYSCALL(regs); 132 sys->args.args[4] = PT_REGS_PARM5_CORE_SYSCALL(regs); 133 sys->args.args[5] = PT_REGS_PARM6_CORE_SYSCALL(regs); 134 } 135 } else { 136 bpf_probe_read(sys->args.args, sizeof(6 * sizeof(u64)), (void *) ctx->args); 137 } 138 139 if (is_compat(task)) { 140 // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler 141 u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &sys->id); 142 if (id_64 == 0) 143 return 0; 144 145 sys->id = *id_64; 146 } 147 148 // exit, exit_group and rt_sigreturn syscalls don't return 149 if (sys->id != SYSCALL_EXIT && sys->id != SYSCALL_EXIT_GROUP && 150 sys->id != SYSCALL_RT_SIGRETURN) { 151 sys->ts = bpf_ktime_get_ns(); 152 task_info->syscall_traced = true; 153 } 154 155 // if id is irrelevant continue to next tail call 156 bpf_tail_call(ctx, &sys_enter_submit_tail, sys->id); 157 158 // call syscall handler, if exists 159 bpf_tail_call(ctx, &sys_enter_tails, sys->id); 160 return 0; 161 } 162 163 // submit tail call part of sys_enter. 164 // events that are required for submission go through two logics here: 165 // 1. parsing their FD filepath if requested as an option 166 // 2. 
submitting the event if relevant 167 // may move to the direct syscall handler in sys_enter_tails 168 SEC("raw_tracepoint/sys_enter_submit") 169 int sys_enter_submit(struct bpf_raw_tracepoint_args *ctx) 170 { 171 program_data_t p = {}; 172 if (!init_program_data(&p, ctx)) 173 return 0; 174 175 if (!should_trace(&p)) 176 return 0; 177 178 syscall_data_t *sys = &p.task_info->syscall_data; 179 180 if (p.config->options & OPT_TRANSLATE_FD_FILEPATH && has_syscall_fd_arg(sys->id)) { 181 // Process filepath related to fd argument 182 uint fd_num = get_syscall_fd_num_from_arg(sys->id, &sys->args); 183 struct file *file = get_struct_file_from_fd(fd_num); 184 185 if (file) { 186 u64 ts = sys->ts; 187 fd_arg_path_t fd_arg_path = {}; 188 void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 189 190 bpf_probe_read_kernel_str(&fd_arg_path.path, sizeof(fd_arg_path.path), file_path); 191 bpf_map_update_elem(&fd_arg_path_map, &ts, &fd_arg_path, BPF_ANY); 192 } 193 } 194 195 if (sys->id != SYSCALL_RT_SIGRETURN && !p.task_info->syscall_traced) { 196 save_to_submit_buf(&p.event->args_buf, (void *) &(sys->args.args[0]), sizeof(int), 0); 197 events_perf_submit(&p, sys->id, 0); 198 } 199 200 // call syscall handler, if exists 201 bpf_tail_call(ctx, &sys_enter_tails, sys->id); 202 return 0; 203 } 204 205 // trace/events/syscalls.h: TP_PROTO(struct pt_regs *regs, long ret) 206 // initial entry for sys_exit syscall logic 207 SEC("raw_tracepoint/sys_exit") 208 int tracepoint__raw_syscalls__sys_exit(struct bpf_raw_tracepoint_args *ctx) 209 { 210 struct pt_regs *regs = (struct pt_regs *) ctx->args[0]; 211 int id = get_syscall_id_from_regs(regs); 212 struct task_struct *task = (struct task_struct *) bpf_get_current_task(); 213 if (is_compat(task)) { 214 // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler 215 u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id); 216 if (id_64 == 0) 217 return 0; 218 219 id = *id_64; 220 } 221 222 // Skip if cgroup is muted. 223 int zero = 0; 224 config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero); 225 if (unlikely(config == NULL)) 226 return 0; 227 u64 cgroup_id = 0; 228 if (config->options & OPT_CGROUP_V1) { 229 cgroup_id = get_cgroup_v1_subsys0_id(task); 230 } else { 231 cgroup_id = bpf_get_current_cgroup_id(); 232 } 233 if (bpf_map_lookup_elem(&ignored_cgroups_map, &cgroup_id) != NULL) { 234 return 0; 235 } 236 237 bpf_tail_call(ctx, &sys_exit_init_tail, id); 238 return 0; 239 } 240 241 // initial tail call entry from sys_exit. 242 // purpose is to "confirm" the syscall data saved by marking it as complete(see 243 // task_info->syscall_traced) and adding the return value to the syscall_info struct. can move to 244 // one of: 245 // 1. sys_exit, general event submit logic from sys_exit 246 // 2. 
//    directly to syscall tail handler in sys_exit_tails
SEC("raw_tracepoint/sys_exit_init")
int sys_exit_init(struct bpf_raw_tracepoint_args *ctx)
{
    struct task_struct *task = (struct task_struct *) bpf_get_current_task();

    u64 pid_tgid = bpf_get_current_pid_tgid();
    u32 tid = pid_tgid;
    task_info_t *task_info = bpf_map_lookup_elem(&task_info_map, &tid);
    if (unlikely(task_info == NULL)) {
        task_info = init_task_info(tid, 0);
        if (unlikely(task_info == NULL))
            return 0;

        int zero = 0;
        config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero);
        if (unlikely(config == NULL))
            return 0;

        init_task_context(&task_info->context, task, config->options);
    }

    // check if syscall is being traced and mark that it finished
    if (!task_info->syscall_traced)
        return 0;
    task_info->syscall_traced = false;

    syscall_data_t *sys = &task_info->syscall_data;

    long ret = ctx->args[1];
    struct pt_regs *regs = (struct pt_regs *) ctx->args[0];
    int id = get_syscall_id_from_regs(regs);

    if (is_compat(task)) {
        // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
        u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id);
        if (id_64 == 0)
            return 0;

        id = *id_64;
    }

    // Sanity check - we returned from the expected syscall this task was executing
    if (sys->id != id)
        return 0;

    sys->ret = ret;

    // move to submit tail call if needed
    bpf_tail_call(ctx, &sys_exit_submit_tail, id);

    // otherwise move to direct syscall handler
    bpf_tail_call(ctx, &sys_exit_tails, id);
    return 0;
}

// submit tail call part of sys_exit.
// most syscall events are submitted at this point, and if not,
// they are submitted through direct syscall handlers in sys_exit_tails
SEC("raw_tracepoint/sys_exit_submit")
int sys_exit_submit(struct bpf_raw_tracepoint_args *ctx)
{
    program_data_t p = {};
    if (!init_program_data(&p, ctx))
        return 0;

    if (!should_trace(&p))
        return 0;

    syscall_data_t *sys = &p.task_info->syscall_data;
    long ret = ctx->args[1];

    if (!should_submit(sys->id, p.event))
        goto out;

    // We can't use saved args after execve syscall, as pointers are invalid.
    // To avoid showing execve event both on entry and exit, we only output failed execs.
    if ((sys->id == SYSCALL_EXECVE || sys->id == SYSCALL_EXECVEAT) && (ret == 0))
        goto out;

    save_args_to_submit_buf(p.event, &sys->args);
    p.event->context.ts = sys->ts;
    events_perf_submit(&p, sys->id, ret);

out:
    bpf_tail_call(ctx, &sys_exit_tails, sys->id);
    return 0;
}

// here are the direct hook points for sys_enter and sys_exit.
// They are used not for submitting syscall events but for the enter and exit events themselves.
// As such they are usually not attached, and will only be used if sys_enter or sys_exit events are
// given as tracing arguments.
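// The whole syscall pipeline above is glued together by tail calls through
// BPF_MAP_TYPE_PROG_ARRAY maps keyed by the (32->64 translated) syscall id:
//   sys_enter -> sys_enter_init_tail -> sys_enter_submit_tail -> sys_enter_tails
//   sys_exit  -> sys_exit_init_tail  -> sys_exit_submit_tail  -> sys_exit_tails
// A rough userspace sketch of wiring one slot of such a prog array (assuming libbpf;
// `obj` and `prog` are illustrative names, not this project's actual loader code):
//
//   int map_fd  = bpf_map__fd(bpf_object__find_map_by_name(obj, "sys_enter_tails"));
//   int prog_fd = bpf_program__fd(prog);  // the handler for one specific syscall
//   __u32 id    = 59;                     // e.g. execve on x86_64
//   bpf_map_update_elem(map_fd, &id, &prog_fd, BPF_ANY);
//
// Empty slots simply make bpf_tail_call() fall through to the next statement.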
339 340 // separate hook point for sys_enter event tracing 341 SEC("raw_tracepoint/trace_sys_enter") 342 int trace_sys_enter(struct bpf_raw_tracepoint_args *ctx) 343 { 344 program_data_t p = {}; 345 if (!init_program_data(&p, ctx)) 346 return 0; 347 348 if (!should_trace(&p)) 349 return 0; 350 351 if (!should_submit(RAW_SYS_ENTER, p.event)) 352 return 0; 353 354 // always submit since this won't be attached otherwise 355 int id = ctx->args[1]; 356 struct task_struct *task = (struct task_struct *) bpf_get_current_task(); 357 if (is_compat(task)) { 358 // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler 359 u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id); 360 if (id_64 == 0) 361 return 0; 362 363 id = *id_64; 364 } 365 save_to_submit_buf(&p.event->args_buf, (void *) &id, sizeof(int), 0); 366 events_perf_submit(&p, RAW_SYS_ENTER, 0); 367 return 0; 368 } 369 370 // separate hook point for sys_exit event tracing 371 SEC("raw_tracepoint/trace_sys_exit") 372 int trace_sys_exit(struct bpf_raw_tracepoint_args *ctx) 373 { 374 program_data_t p = {}; 375 if (!init_program_data(&p, ctx)) 376 return 0; 377 378 if (!should_trace(&p)) 379 return 0; 380 381 if (!should_submit(RAW_SYS_EXIT, p.event)) 382 return 0; 383 384 // always submit since this won't be attached otherwise 385 struct pt_regs *regs = (struct pt_regs *) ctx->args[0]; 386 int id = get_syscall_id_from_regs(regs); 387 struct task_struct *task = (struct task_struct *) bpf_get_current_task(); 388 if (is_compat(task)) { 389 // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler 390 u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id); 391 if (id_64 == 0) 392 return 0; 393 394 id = *id_64; 395 } 396 save_to_submit_buf(&p.event->args_buf, (void *) &id, sizeof(int), 0); 397 events_perf_submit(&p, RAW_SYS_EXIT, 0); 398 return 0; 399 } 400 401 SEC("raw_tracepoint/sys_execve") 402 int syscall__execve(void *ctx) 403 { 404 program_data_t p = {}; 405 if (!init_tailcall_program_data(&p, ctx)) 406 return 0; 407 408 if (!p.task_info->syscall_traced) 409 return -1; 410 syscall_data_t *sys = &p.task_info->syscall_data; 411 p.event->context.ts = sys->ts; 412 413 if (!should_submit(SYSCALL_EXECVE, p.event)) 414 return 0; 415 416 reset_event_args(&p); 417 save_str_to_buf(&p.event->args_buf, (void *) sys->args.args[0] /*filename*/, 0); 418 save_str_arr_to_buf(&p.event->args_buf, (const char *const *) sys->args.args[1] /*argv*/, 1); 419 if (p.config->options & OPT_EXEC_ENV) { 420 save_str_arr_to_buf( 421 &p.event->args_buf, (const char *const *) sys->args.args[2] /*envp*/, 2); 422 } 423 424 return events_perf_submit(&p, SYSCALL_EXECVE, 0); 425 } 426 427 SEC("raw_tracepoint/sys_execveat") 428 int syscall__execveat(void *ctx) 429 { 430 program_data_t p = {}; 431 if (!init_tailcall_program_data(&p, ctx)) 432 return 0; 433 434 if (!p.task_info->syscall_traced) 435 return -1; 436 syscall_data_t *sys = &p.task_info->syscall_data; 437 p.event->context.ts = sys->ts; 438 439 if (!should_submit(SYSCALL_EXECVEAT, p.event)) 440 return 0; 441 442 reset_event_args(&p); 443 save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0] /*dirfd*/, sizeof(int), 0); 444 save_str_to_buf(&p.event->args_buf, (void *) sys->args.args[1] /*pathname*/, 1); 445 save_str_arr_to_buf(&p.event->args_buf, (const char *const *) sys->args.args[2] /*argv*/, 2); 446 if (p.config->options & OPT_EXEC_ENV) { 447 save_str_arr_to_buf( 448 &p.event->args_buf, (const char *const *) sys->args.args[3] /*envp*/, 3); 449 } 450 
save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[4] /*flags*/, sizeof(int), 4); 451 452 return events_perf_submit(&p, SYSCALL_EXECVEAT, 0); 453 } 454 455 statfunc int send_socket_dup(program_data_t *p, u64 oldfd, u64 newfd) 456 { 457 if (!should_submit(SOCKET_DUP, p->event)) { 458 return 0; 459 } 460 461 if (!check_fd_type(oldfd, S_IFSOCK)) { 462 return 0; 463 } 464 465 struct file *f = get_struct_file_from_fd(oldfd); 466 if (f == NULL) { 467 return -1; 468 } 469 470 // this is a socket - submit the SOCKET_DUP event 471 472 reset_event_args(p); 473 save_to_submit_buf(&(p->event->args_buf), &oldfd, sizeof(u32), 0); 474 save_to_submit_buf(&(p->event->args_buf), &newfd, sizeof(u32), 1); 475 476 // get the address 477 struct socket *socket_from_file = (struct socket *) BPF_CORE_READ(f, private_data); 478 if (socket_from_file == NULL) { 479 return -1; 480 } 481 482 struct sock *sk = get_socket_sock(socket_from_file); 483 u16 family = get_sock_family(sk); 484 if ((family != AF_INET) && (family != AF_INET6) && (family != AF_UNIX)) { 485 return 0; 486 } 487 488 if (family == AF_INET) { 489 net_conn_v4_t net_details = {}; 490 struct sockaddr_in remote; 491 492 get_network_details_from_sock_v4(sk, &net_details, 0); 493 get_remote_sockaddr_in_from_network_details(&remote, &net_details, family); 494 495 save_to_submit_buf(&(p->event->args_buf), &remote, sizeof(struct sockaddr_in), 2); 496 } else if (family == AF_INET6) { 497 net_conn_v6_t net_details = {}; 498 struct sockaddr_in6 remote; 499 500 get_network_details_from_sock_v6(sk, &net_details, 0); 501 get_remote_sockaddr_in6_from_network_details(&remote, &net_details, family); 502 503 save_to_submit_buf(&(p->event->args_buf), &remote, sizeof(struct sockaddr_in6), 2); 504 } else if (family == AF_UNIX) { 505 struct unix_sock *unix_sk = (struct unix_sock *) sk; 506 struct sockaddr_un sockaddr = get_unix_sock_addr(unix_sk); 507 508 save_to_submit_buf(&(p->event->args_buf), &sockaddr, sizeof(struct sockaddr_un), 2); 509 } 510 511 return events_perf_submit(p, SOCKET_DUP, 0); 512 } 513 514 SEC("raw_tracepoint/sys_dup") 515 int sys_dup_exit_tail(void *ctx) 516 { 517 program_data_t p = {}; 518 if (!init_tailcall_program_data(&p, ctx)) 519 return 0; 520 521 if (!should_trace(&p)) 522 return 0; 523 524 syscall_data_t *sys = &p.task_info->syscall_data; 525 526 if (sys->ret < 0) { 527 // dup failed 528 return 0; 529 } 530 531 if (sys->id == SYSCALL_DUP) { 532 // args.args[0]: oldfd 533 // retval: newfd 534 send_socket_dup(&p, sys->args.args[0], sys->ret); 535 } else if (sys->id == SYSCALL_DUP2 || sys->id == SYSCALL_DUP3) { 536 // args.args[0]: oldfd 537 // args.args[1]: newfd 538 // retval: retval 539 send_socket_dup(&p, sys->args.args[0], sys->args.args[1]); 540 } 541 542 return 0; 543 } 544 545 // trace/events/sched.h: TP_PROTO(struct task_struct *parent, struct task_struct *child) 546 SEC("raw_tracepoint/sched_process_fork") 547 int tracepoint__sched__sched_process_fork(struct bpf_raw_tracepoint_args *ctx) 548 { 549 long ret = 0; 550 program_data_t p = {}; 551 if (!init_program_data(&p, ctx)) 552 return 0; 553 554 // NOTE: proc_info_map updates before should_trace() as the entries are needed in other places. 
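    // (Presumably the ordering matters because later hooks - the exec and exit handlers below,
    //  for example - look up task_info_map/proc_info_map for this child even when the fork
    //  itself is filtered out, so the bookkeeping has to happen unconditionally.)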
555 556 struct task_struct *parent = (struct task_struct *) ctx->args[0]; 557 struct task_struct *child = (struct task_struct *) ctx->args[1]; 558 559 // Information needed before the event: 560 int parent_pid = get_task_host_tgid(parent); 561 u64 child_start_time = get_task_start_time(child); 562 int child_pid = get_task_host_tgid(child); 563 int child_tid = get_task_host_pid(child); 564 int child_ns_pid = get_task_ns_tgid(child); 565 int child_ns_tid = get_task_ns_pid(child); 566 567 // Update the task_info map with the new task's info 568 569 ret = bpf_map_update_elem(&task_info_map, &child_tid, p.task_info, BPF_ANY); 570 if (ret < 0) 571 tracee_log(ctx, BPF_LOG_LVL_DEBUG, BPF_LOG_ID_MAP_UPDATE_ELEM, ret); 572 task_info_t *task = bpf_map_lookup_elem(&task_info_map, &child_tid); 573 if (unlikely(task == NULL)) { 574 // this should never happen - we just updated the map with this key 575 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0); 576 return 0; 577 } 578 579 task->context.tid = child_ns_tid; 580 task->context.host_tid = child_tid; 581 task->context.start_time = child_start_time; 582 583 // Update the proc_info_map with the new process's info (from parent) 584 585 proc_info_t *c_proc_info = bpf_map_lookup_elem(&proc_info_map, &child_pid); 586 if (c_proc_info == NULL) { 587 // It is a new process (not another thread): add it to proc_info_map. 588 proc_info_t *p_proc_info = bpf_map_lookup_elem(&proc_info_map, &parent_pid); 589 if (unlikely(p_proc_info == NULL)) { 590 // parent should exist in proc_info_map (init_program_data sets it) 591 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0); 592 return 0; 593 } 594 595 // Copy the parent's proc_info to the child's entry. 596 bpf_map_update_elem(&proc_info_map, &child_pid, p_proc_info, BPF_NOEXIST); 597 c_proc_info = bpf_map_lookup_elem(&proc_info_map, &child_pid); 598 if (unlikely(c_proc_info == NULL)) { 599 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0); 600 return 0; 601 } 602 603 c_proc_info->follow_in_scopes = 0; // updated later if should_trace() passes (follow filter) 604 c_proc_info->new_proc = true; // started after tracee (new_pid filter) 605 } 606 607 // Update the process tree map (filter related) if the parent has an entry. 608 609 policies_config_t *policies_cfg = &p.config->policies_config; 610 611 if (policies_cfg->proc_tree_filter_enabled_scopes) { 612 eq_t *tgid_filtered = bpf_map_lookup_elem(&process_tree_map, &parent_pid); 613 if (tgid_filtered) { 614 ret = bpf_map_update_elem(&process_tree_map, &child_pid, tgid_filtered, BPF_ANY); 615 if (ret < 0) 616 tracee_log(ctx, BPF_LOG_LVL_DEBUG, BPF_LOG_ID_MAP_UPDATE_ELEM, ret); 617 } 618 } 619 620 if (!should_trace(&p)) 621 return 0; 622 623 // Always follow every pid that passed the should_trace() checks (follow filter) 624 c_proc_info->follow_in_scopes = p.event->context.matched_policies; 625 626 // Submit the event 627 628 if (should_submit(SCHED_PROCESS_FORK, p.event)) { 629 // Parent information. 630 u64 parent_start_time = get_task_start_time(parent); 631 int parent_tid = get_task_host_pid(parent); 632 int parent_ns_pid = get_task_ns_tgid(parent); 633 int parent_ns_tid = get_task_ns_pid(parent); 634 635 // Parent (might be a thread or a process). 
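        // Argument index layout used below:
        //   0-4   parent: tid, ns_tid, pid, ns_pid, start_time
        //   5-9   child:  tid, ns_tid, pid, ns_pid, start_time
        //   10-19 up_parent and leader quintets, only when OPT_FORK_PROCTREE is set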
        save_to_submit_buf(&p.event->args_buf, (void *) &parent_tid, sizeof(int), 0);
        save_to_submit_buf(&p.event->args_buf, (void *) &parent_ns_tid, sizeof(int), 1);
        save_to_submit_buf(&p.event->args_buf, (void *) &parent_pid, sizeof(int), 2);
        save_to_submit_buf(&p.event->args_buf, (void *) &parent_ns_pid, sizeof(int), 3);
        save_to_submit_buf(&p.event->args_buf, (void *) &parent_start_time, sizeof(u64), 4);

        // Child (might be a lwp or a process, the sched_process_fork trace is called by clone() also).
        save_to_submit_buf(&p.event->args_buf, (void *) &child_tid, sizeof(int), 5);
        save_to_submit_buf(&p.event->args_buf, (void *) &child_ns_tid, sizeof(int), 6);
        save_to_submit_buf(&p.event->args_buf, (void *) &child_pid, sizeof(int), 7);
        save_to_submit_buf(&p.event->args_buf, (void *) &child_ns_pid, sizeof(int), 8);
        save_to_submit_buf(&p.event->args_buf, (void *) &child_start_time, sizeof(u64), 9);

        // Process tree information (if needed).
        if (p.config->options & OPT_FORK_PROCTREE) {
            // Both the thread group leader and the "up_parent" (the first process, not lwp, found
            // as a parent of the child in the hierarchy) are needed by the userland process tree.
            // The userland process tree's default source of events is the signal events, but there is
            // an option to use regular events for maintaining it as well (and it is needed for some
            // situations). These arguments will always be removed by userland event processors.
            struct task_struct *leader = get_leader_task(child);
            struct task_struct *up_parent = get_leader_task(get_parent_task(leader));

            // Up Parent information: Go up in hierarchy until parent is process.
            u64 up_parent_start_time = get_task_start_time(up_parent);
            int up_parent_pid = get_task_host_tgid(up_parent);
            int up_parent_tid = get_task_host_pid(up_parent);
            int up_parent_ns_pid = get_task_ns_tgid(up_parent);
            int up_parent_ns_tid = get_task_ns_pid(up_parent);
            // Leader information.
            u64 leader_start_time = get_task_start_time(leader);
            int leader_pid = get_task_host_tgid(leader);
            int leader_tid = get_task_host_pid(leader);
            int leader_ns_pid = get_task_ns_tgid(leader);
            int leader_ns_tid = get_task_ns_pid(leader);

            // Up Parent: always a process (might be the same as Parent if parent is a process).
            save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_tid, sizeof(int), 10);
            save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_ns_tid, sizeof(int), 11);
            save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_pid, sizeof(int), 12);
            save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_ns_pid, sizeof(int), 13);
            save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_start_time, sizeof(u64), 14);
            // Leader: always a process (might be the same as the Child if child is a process).
679 save_to_submit_buf(&p.event->args_buf, (void *) &leader_tid, sizeof(int), 15); 680 save_to_submit_buf(&p.event->args_buf, (void *) &leader_ns_tid, sizeof(int), 16); 681 save_to_submit_buf(&p.event->args_buf, (void *) &leader_pid, sizeof(int), 17); 682 save_to_submit_buf(&p.event->args_buf, (void *) &leader_ns_pid, sizeof(int), 18); 683 save_to_submit_buf(&p.event->args_buf, (void *) &leader_start_time, sizeof(u64), 19); 684 } 685 686 // Submit 687 events_perf_submit(&p, SCHED_PROCESS_FORK, 0); 688 } 689 690 return 0; 691 } 692 693 // number of iterations - value that the verifier was seen to cope with - the higher, the better 694 #define MAX_NUM_MODULES 100 695 696 enum { 697 PROC_MODULES = 1 << 0, 698 KSET = 1 << 1, 699 MOD_TREE = 1 << 2, 700 NEW_MOD = 1 << 3, 701 FULL_SCAN = 1 << 30, 702 HIDDEN_MODULE = 1 << 31, 703 }; 704 705 // Forcibly create the map in all kernels, even when not needed, due to lack of 706 // support for kernel version awareness about map loading errors. 707 708 BPF_HASH(modules_map, u64, kernel_module_t, MAX_NUM_MODULES); 709 BPF_HASH(new_module_map, u64, kernel_new_mod_t, MAX_NUM_MODULES); 710 711 // We only care for modules that got deleted or inserted between our scan and if 712 // we detected something suspicious. Since it's a very small time frame, it's 713 // not likely that a large amount of modules will be deleted. Instead of saving 714 // a map of deleted modules, we could have saved the last deleted module 715 // timestamp and, if we detected something suspicious, verify that no modules 716 // got deleted between our check. This is preferable space-wise (u64 instead of 717 // a map), but an attacker might start unloading modules in the background and 718 // race with the check in order to abort reporting for hidden modules. 719 720 BPF_LRU_HASH(recent_deleted_module_map, u64, kernel_deleted_mod_t, 50); 721 BPF_LRU_HASH(recent_inserted_module_map, 722 u64, 723 kernel_new_mod_t, 724 50); // Likewise for module insertion 725 726 u64 start_scan_time_init_shown_mods = 0; 727 u64 last_module_insert_time = 0; 728 bool hidden_old_mod_scan_done = false; 729 static const int HID_MOD_RACE_CONDITION = -1; 730 static const int HID_MOD_UNCOMPLETED_ITERATIONS = -2; 731 static const int HID_MOD_MEM_ZEROED = -3; 732 static const int MOD_HIDDEN = 1; 733 static const int MOD_NOT_HIDDEN = 0; 734 735 void __always_inline lkm_seeker_send_to_userspace(struct module *mod, u32 *flags, program_data_t *p) 736 { 737 reset_event_args(p); 738 u64 mod_addr = (u64) mod; 739 char *mod_name = mod->name; 740 const char *mod_srcversion = BPF_CORE_READ(mod, srcversion); 741 742 save_to_submit_buf(&(p->event->args_buf), &mod_addr, sizeof(u64), 0); 743 save_bytes_to_buf(&(p->event->args_buf), 744 (void *) mod_name, 745 MODULE_NAME_LEN & MAX_MEM_DUMP_SIZE, 746 1); // string saved as bytes (verifier issues). 747 save_to_submit_buf(&(p->event->args_buf), flags, sizeof(u32), 2); 748 save_bytes_to_buf(&(p->event->args_buf), 749 (void *) mod_srcversion, 750 MODULE_SRCVERSION_MAX_LENGTH & MAX_MEM_DUMP_SIZE, 751 3); // string saved as bytes (verifier issues). 752 753 events_perf_submit(p, HIDDEN_KERNEL_MODULE_SEEKER, 0); 754 } 755 756 // Populate all the modules to an efficient query-able hash map. 757 // We can't read it once and then hook on do_init_module and free_module since a hidden module will 758 // remove itself from the list directly and we wouldn't know (hence from our perspective the module 759 // will reside in the modules list, which could be false). 
// So on every trigger, we go over the
// modules list and populate the map. It gets cleaned in userspace before every run.
// Since this mechanism is supposed to be triggered every once in a while,
// this should be ok.
statfunc int init_shown_modules()
{
    char modules_sym[8] = "modules";
    struct list_head *head = (struct list_head *) get_symbol_addr(modules_sym);
    kernel_module_t ker_mod = {};
    bool iterated_all_modules = false;
    struct module *pos, *n;

    pos = list_first_entry_ebpf(head, typeof(*pos), list);
    n = pos;

#pragma unroll
    for (int i = 0; i < MAX_NUM_MODULES; i++) {
        pos = n;
        n = list_next_entry_ebpf(n, list);

        if (&pos->list == head) {
            return 0;
        }

        bpf_map_update_elem(&modules_map, &pos, &ker_mod, BPF_ANY);
    }

    return HID_MOD_UNCOMPLETED_ITERATIONS;
}

statfunc int is_hidden(u64 mod)
{
    if (bpf_map_lookup_elem(&modules_map, &mod) != NULL) {
        return MOD_NOT_HIDDEN;
    }

    // Verify that this module wasn't removed after we initialized modules_map
    kernel_deleted_mod_t *deleted_mod = bpf_map_lookup_elem(&recent_deleted_module_map, &mod);
    if (deleted_mod && deleted_mod->deleted_time > start_scan_time_init_shown_mods) {
        // This module got deleted after the start of the scan time.. So there
        // was a valid remove, and it's not hidden.
        return false;
    }

    // Check if some module was inserted after we started scanning.
    // If that's the case, then if the module got inserted to the modules list after we walked on
    // the list, it'll be missing from our eBPF map. If it got inserted to other places (kset for
    // example), then it will appear as if the module is hidden (in kset but not in module's list),
    // but in fact it only got added in the midst of our scan. Thus, we need to monitor for this
    // situation.
    if (start_scan_time_init_shown_mods < last_module_insert_time) {
        // No point in checking other modules in this scan... abort
        return HID_MOD_RACE_CONDITION;
    }

    return MOD_HIDDEN;
}

statfunc int find_modules_from_module_kset_list(program_data_t *p)
{
    char module_kset_sym[12] = "module_kset";
    struct module *first_mod = NULL;
    struct kset *mod_kset = (struct kset *) get_symbol_addr(module_kset_sym);
    struct list_head *head = &(mod_kset->list);
    struct kobject *pos = list_first_entry_ebpf(head, typeof(*pos), entry);
    struct kobject *n = list_next_entry_ebpf(pos, entry);
    u32 flags = KSET | HIDDEN_MODULE;

    for (int i = 0; i < MAX_NUM_MODULES; i++) {
        if (BPF_CORE_READ(n, name) ==
            NULL) { // Without this the list seems infinite. Also, using pos
                    // here seems incorrect as it starts from a weird member
            return 0;
        }

        struct module_kobject *mod_kobj =
            (struct module_kobject *) container_of(n, struct module_kobject, kobj);
        if (mod_kobj) {
            struct module *mod = BPF_CORE_READ(mod_kobj, mod);
            if (mod) {
                if (first_mod == NULL) {
                    first_mod = mod;
                } else if (first_mod == mod) { // Iterated over all modules - stop.
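                    // first_mod remembers the first module seen on this circular kset list;
                    // meeting it again means a full lap was completed, so the walk stops here.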
842 return 0; 843 } 844 int ret = is_hidden((u64) mod); 845 if (ret == MOD_HIDDEN) { 846 lkm_seeker_send_to_userspace(mod, &flags, p); 847 } else if (ret == HID_MOD_RACE_CONDITION) { 848 return ret; 849 } 850 } 851 } 852 853 pos = n; 854 n = list_next_entry_ebpf(n, entry); 855 } 856 857 return HID_MOD_UNCOMPLETED_ITERATIONS; 858 } 859 860 BPF_QUEUE(walk_mod_tree_queue, rb_node_t, MAX_NUM_MODULES); // used to walk a rb tree 861 862 statfunc struct latch_tree_node *__lt_from_rb(struct rb_node *node, int idx) 863 { 864 return container_of(node, struct latch_tree_node, node[idx]); 865 } 866 867 statfunc int walk_mod_tree(program_data_t *p, struct rb_node *root, int idx) 868 { 869 struct latch_tree_node *ltn; 870 struct module *mod; 871 struct rb_node *curr = root; 872 u32 flags = MOD_TREE | HIDDEN_MODULE; 873 874 #pragma unroll 875 for (int i = 0; i < MAX_NUM_MODULES; i++) { 876 if (curr != NULL) { 877 rb_node_t rb_nod = {.node = curr}; 878 bpf_map_push_elem(&walk_mod_tree_queue, &rb_nod, BPF_EXIST); 879 880 curr = BPF_CORE_READ(curr, rb_left); // Move left 881 } else { 882 rb_node_t rb_nod; 883 if (bpf_map_pop_elem(&walk_mod_tree_queue, &rb_nod) != 0) { 884 return 0; // Finished iterating 885 } else { 886 curr = rb_nod.node; 887 ltn = __lt_from_rb(curr, idx); 888 mod = BPF_CORE_READ(container_of(ltn, struct mod_tree_node, node), mod); 889 890 int ret = is_hidden((u64) mod); 891 if (ret == MOD_HIDDEN) { 892 lkm_seeker_send_to_userspace(mod, &flags, p); 893 } else if (ret == HID_MOD_RACE_CONDITION) { 894 return ret; 895 } 896 897 /* We have visited the node and its left subtree. 898 Now, it's right subtree's turn */ 899 curr = BPF_CORE_READ(curr, rb_right); 900 } 901 } 902 } 903 904 return HID_MOD_UNCOMPLETED_ITERATIONS; 905 } 906 907 struct mod_tree_root { 908 struct latch_tree_root root; 909 }; 910 911 statfunc int find_modules_from_mod_tree(program_data_t *p) 912 { 913 char mod_tree_sym[9] = "mod_tree"; 914 struct mod_tree_root *m_tree = (struct mod_tree_root *) get_symbol_addr(mod_tree_sym); 915 unsigned int seq; 916 917 if (bpf_core_field_exists(m_tree->root.seq.sequence)) { 918 seq = BPF_CORE_READ(m_tree, root.seq.sequence); // below 5.10 919 } else { 920 seq = BPF_CORE_READ(m_tree, root.seq.seqcount.sequence); // version >= v5.10 921 } 922 923 struct rb_node *node = BPF_CORE_READ(m_tree, root.tree[seq & 1].rb_node); 924 925 return walk_mod_tree(p, node, seq & 1); 926 } 927 928 static __always_inline u64 check_new_mods_only(program_data_t *p) 929 { 930 struct module *pos, *n; 931 u64 start_scan_time = bpf_ktime_get_ns(); 932 char modules_sym[8] = "modules"; 933 kernel_new_mod_t *new_mod; 934 u64 mod_addr; 935 struct list_head *head = (struct list_head *) get_symbol_addr(modules_sym); 936 937 pos = list_first_entry_ebpf(head, typeof(*pos), list); 938 n = pos; 939 940 #pragma unroll 941 for (int i = 0; i < MAX_NUM_MODULES; i++) { 942 pos = n; 943 n = list_next_entry_ebpf(n, list); 944 if (&pos->list == head) { 945 return start_scan_time; // To be used in userspace 946 } 947 948 mod_addr = (u64) pos; 949 new_mod = bpf_map_lookup_elem(&new_module_map, &mod_addr); 950 if (new_mod) { 951 new_mod->last_seen_time = bpf_ktime_get_ns(); 952 } 953 } 954 955 return 0; 956 } 957 958 statfunc int check_is_proc_modules_hooked(program_data_t *p) 959 { 960 struct module *pos, *n; 961 u64 mod_base_addr; 962 char modules_sym[8] = "modules"; 963 struct list_head *head = (struct list_head *) get_symbol_addr(modules_sym); 964 u32 flags = PROC_MODULES | HIDDEN_MODULE; 965 966 pos = list_first_entry_ebpf(head, 
                               typeof(*pos), list);
    n = pos;

#pragma unroll
    for (int i = 0; i < MAX_NUM_MODULES; i++) {
        pos = n;
        n = list_next_entry_ebpf(n, list);
        if (&pos->list == head) {
            return 0;
        }

        // Check with the address being the start of the memory area, since
        // this is what is given from /proc/modules.
        if (bpf_core_field_exists(pos->mem)) { // Version >= v6.4
            mod_base_addr = (u64) BPF_CORE_READ(pos, mem[MOD_TEXT].base);
        } else {
            struct module___older_v64 *old_mod = (void *) pos;
            mod_base_addr = (u64) BPF_CORE_READ(old_mod, core_layout.base);
        }

        if (unlikely(mod_base_addr == 0)) { // Module memory was possibly tampered.. submit an error
            return HID_MOD_MEM_ZEROED;
        } else if (bpf_map_lookup_elem(&modules_map, &mod_base_addr) == NULL) {
            // Was there any recent insertion of a module since we populated
            // modules_list? If so, don't report, as there's a possible race
            // condition. Note that this granularity (insertion of any module
            // and not just this particular module) is only for /proc/modules
            // logic, since there's a context switch between userspace and kernel
            // space, it opens a window for more modules to get
            // inserted/deleted, and then the LRU size is not enough - modules
            // get evicted and we report a false-positive. We don't really want
            // the init_shown_mods time, but the time the proc modules map was
            // filled (userspace) - so assume it happened at most 2 seconds prior to
            // that.
            if (start_scan_time_init_shown_mods - (2 * 1000000000) < last_module_insert_time) {
                return 0;
            }

            // Module was not seen in proc modules and there was no recent insertion, report.
            lkm_seeker_send_to_userspace(pos, &flags, p);
        }
    }

    return HID_MOD_UNCOMPLETED_ITERATIONS;
}

statfunc bool kern_ver_below_min_lkm(struct pt_regs *ctx)
{
    // If we're below kernel version 5.2, propagate the error to userspace and return
    if (!bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_sk_storage_get)) {
        goto below_threshold;
    }

    return false; // lkm seeker may run!

    goto below_threshold; // For compiler - avoid "unused label" warning
below_threshold:
    tracee_log(ctx,
               BPF_LOG_LVL_ERROR,
               BPF_LOG_ID_UNSPEC,
               -1); // notify the user that the event logic isn't loaded even though it's requested
    return true;
}

SEC("uprobe/lkm_seeker_submitter")
int uprobe_lkm_seeker_submitter(struct pt_regs *ctx)
{
    // This check is to satisfy the verifier for kernels older than 5.2
    if (kern_ver_below_min_lkm(ctx))
        return 0;

    u64 mod_address = 0;
    u64 received_flags = 0;

#if defined(bpf_target_x86)
    mod_address = ctx->bx;    // 1st arg
    received_flags = ctx->cx; // 2nd arg
#elif defined(bpf_target_arm64)
    mod_address = ctx->user_regs.regs[1];    // 1st arg
    received_flags = ctx->user_regs.regs[2]; // 2nd arg
#else
    return 0;
#endif

    program_data_t p = {};
    if (!init_program_data(&p, ctx))
        return 0;

    // Uprobes are not triggered by syscalls, so we need to override the false value.
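    // (NO_SYSCALL is the sentinel for "this event did not come from a syscall context";
    //  presumably the userspace decoder keys off it. The module address and flags were read
    //  above straight from the registers of the userspace trigger function.)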
1055 p.event->context.syscall = NO_SYSCALL; 1056 1057 u32 trigger_pid = bpf_get_current_pid_tgid() >> 32; 1058 // Uprobe was triggered from other tracee instance 1059 if (p.config->tracee_pid != trigger_pid) 1060 return 0; 1061 1062 u32 flags = 1063 ((u32) received_flags) | HIDDEN_MODULE; // Convert to 32bit and turn on the bit that will 1064 // cause it to be sent as an event to the user 1065 lkm_seeker_send_to_userspace((struct module *) mod_address, &flags, &p); 1066 1067 return 0; 1068 } 1069 1070 // There are 2 types of scans: 1071 // - Scan of modules that were loaded prior tracee started: this is only done once at the start of 1072 // tracee 1073 // - Scan of modules that were loaded after tracee started: runs periodically and on each new module 1074 // insertion 1075 SEC("uprobe/lkm_seeker") 1076 int uprobe_lkm_seeker(struct pt_regs *ctx) 1077 { 1078 if (kern_ver_below_min_lkm(ctx)) 1079 return 0; 1080 1081 program_data_t p = {}; 1082 if (!init_program_data(&p, ctx)) 1083 return 0; 1084 1085 // Uprobes are not triggered by syscalls, so we need to override the false value. 1086 p.event->context.syscall = NO_SYSCALL; 1087 1088 // uprobe was triggered from other tracee instance 1089 if (p.config->tracee_pid != p.task_info->context.pid && 1090 p.config->tracee_pid != p.task_info->context.host_pid) { 1091 return 0; 1092 } 1093 1094 start_scan_time_init_shown_mods = bpf_ktime_get_ns(); 1095 int ret = init_shown_modules(); 1096 if (ret != 0) { 1097 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret); 1098 return 1; 1099 } 1100 1101 // On first run, do a scan only relevant for modules that were inserted prior tracee started. 1102 if (unlikely(!hidden_old_mod_scan_done)) { 1103 hidden_old_mod_scan_done = true; 1104 bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_KSET); 1105 return -1; 1106 } 1107 1108 bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_PROC); 1109 1110 return -1; 1111 } 1112 1113 SEC("uprobe/lkm_seeker_kset_tail") 1114 int lkm_seeker_kset_tail(struct pt_regs *ctx) 1115 { 1116 // This check is to satisfy the verifier for kernels older than 5.2 1117 // as in runtime we'll never get here (the tail call doesn't happen) 1118 if (kern_ver_below_min_lkm(ctx)) 1119 return 0; 1120 1121 program_data_t p = {}; 1122 if (!init_tailcall_program_data(&p, ctx)) 1123 return -1; 1124 1125 int ret = find_modules_from_module_kset_list(&p); 1126 if (ret < 0) { 1127 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret); 1128 return -1; 1129 } 1130 1131 bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_MOD_TREE); 1132 1133 return -1; 1134 } 1135 1136 SEC("uprobe/lkm_seeker_mod_tree_tail") 1137 int lkm_seeker_mod_tree_tail(struct pt_regs *ctx) 1138 { 1139 // This check is to satisfy the verifier for kernels older than 5.2 1140 // as in runtime we'll never get here (the tail call doesn't happen) 1141 if (kern_ver_below_min_lkm(ctx)) 1142 return 0; 1143 1144 program_data_t p = {}; 1145 if (!init_tailcall_program_data(&p, ctx)) 1146 return -1; 1147 1148 // This method is efficient only when the kernel is compiled with 1149 // CONFIG_MODULES_TREE_LOOKUP=y 1150 int ret = find_modules_from_mod_tree(&p); 1151 if (ret < 0) { 1152 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret); 1153 return -1; 1154 } 1155 1156 bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_PROC); 1157 1158 return -1; 1159 } 1160 1161 SEC("uprobe/lkm_seeker_proc_tail") 1162 int lkm_seeker_proc_tail(struct pt_regs *ctx) 1163 { 1164 // This check is to satisfy the verifier 
for kernels older than 5.2 1165 // as in runtime we'll never get here (the tail call doesn't happen) 1166 if (kern_ver_below_min_lkm(ctx)) 1167 return 0; 1168 1169 program_data_t p = {}; 1170 if (!init_tailcall_program_data(&p, ctx)) 1171 return -1; 1172 1173 int ret = check_is_proc_modules_hooked(&p); 1174 if (ret < 0) { 1175 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret); 1176 return -1; 1177 } 1178 1179 bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_NEW_MOD_ONLY); 1180 1181 return -1; 1182 } 1183 1184 // We maintain a map of newly loaded modules. At times, we verify that this module appears in 1185 // modules list. If it is not (and there was no valid deletion), then it's hidden. 1186 SEC("uprobe/lkm_seeker_new_mod_only_tail") 1187 int lkm_seeker_new_mod_only_tail(struct pt_regs *ctx) 1188 { 1189 // This check is to satisfy the verifier for kernels older than 5.2 1190 // as in runtime we'll never get here (the tail call doesn't happen) 1191 if (kern_ver_below_min_lkm(ctx)) 1192 return 0; 1193 1194 program_data_t p = {}; 1195 if (!init_tailcall_program_data(&p, ctx)) 1196 return -1; 1197 1198 u64 start_scan_time = check_new_mods_only(&p); 1199 if (start_scan_time == 0) { 1200 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, HID_MOD_UNCOMPLETED_ITERATIONS); 1201 return 1; 1202 } 1203 1204 struct module *mod = 1205 (struct module *) start_scan_time; // Use the module address field as the start_scan_time 1206 u32 flags = NEW_MOD; 1207 lkm_seeker_send_to_userspace(mod, &flags, &p); 1208 1209 return 0; 1210 } 1211 1212 // clang-format off 1213 1214 // trace/events/sched.h: TP_PROTO(struct task_struct *p, pid_t old_pid, struct linux_binprm *bprm) 1215 SEC("raw_tracepoint/sched_process_exec") 1216 int tracepoint__sched__sched_process_exec(struct bpf_raw_tracepoint_args *ctx) 1217 { 1218 program_data_t p = {}; 1219 if (!init_program_data(&p, ctx)) { 1220 return 0; 1221 } 1222 1223 // Perform checks below before should_trace(), so tracee can filter by newly created containers 1224 // or processes. Assume that a new container, or pod, has started when a process of a newly 1225 // created cgroup and mount ns executed a binary. 
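    // In other words: the first exec performed in a mount namespace different from its parent's,
    // inside a cgroup already marked CONTAINER_CREATED, is treated as the container "start".
    // (A heuristic - a process that unshares its mount namespace by hand would presumably be
    // flagged the same way.)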
1226 1227 if (p.task_info->container_state == CONTAINER_CREATED) { 1228 u32 mntns = get_task_mnt_ns_id(p.task); 1229 struct task_struct *parent = get_parent_task(p.task); 1230 u32 parent_mntns = get_task_mnt_ns_id(parent); 1231 if (mntns != parent_mntns) { 1232 u32 cgroup_id_lsb = p.event->context.task.cgroup_id; 1233 u8 state = CONTAINER_STARTED; 1234 bpf_map_update_elem(&containers_map, &cgroup_id_lsb, &state, BPF_ANY); 1235 p.task_info->container_state = state; 1236 p.event->context.task.flags |= CONTAINER_STARTED_FLAG; // change for current event 1237 p.task_info->context.flags |= CONTAINER_STARTED_FLAG; // change for future task events 1238 } 1239 } 1240 1241 struct linux_binprm *bprm = (struct linux_binprm *) ctx->args[2]; 1242 if (bprm == NULL) { 1243 return -1; 1244 } 1245 struct file *file = get_file_ptr_from_bprm(bprm); 1246 void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 1247 1248 proc_info_t *proc_info = p.proc_info; 1249 proc_info->new_proc = true; // task has started after tracee started running 1250 1251 // extract the binary name to be used in should_trace 1252 __builtin_memset(proc_info->binary.path, 0, MAX_BIN_PATH_SIZE); 1253 bpf_probe_read_kernel_str(proc_info->binary.path, MAX_BIN_PATH_SIZE, file_path); 1254 proc_info->binary.mnt_id = p.event->context.task.mnt_id; 1255 1256 if (!should_trace(&p)) { 1257 return 0; 1258 } 1259 1260 proc_info->follow_in_scopes = p.event->context.matched_policies; // follow task for matched scopes 1261 1262 if (!should_submit(SCHED_PROCESS_EXEC, p.event)) { 1263 return 0; 1264 } 1265 1266 // Note: From v5.9+, there are two interesting fields in bprm that could be added: 1267 // 1. struct file *executable: the executable name passed to an interpreter 1268 // 2. fdpath: generated filename for execveat (after resolving dirfd) 1269 1270 const char *filename = get_binprm_filename(bprm); 1271 dev_t s_dev = get_dev_from_file(file); 1272 unsigned long inode_nr = get_inode_nr_from_file(file); 1273 u64 ctime = get_ctime_nanosec_from_file(file); 1274 umode_t inode_mode = get_inode_mode_from_file(file); 1275 1276 save_str_to_buf(&p.event->args_buf, (void *) filename, 0); 1277 save_str_to_buf(&p.event->args_buf, file_path, 1); 1278 save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 2); 1279 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 3); 1280 save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4); 1281 save_to_submit_buf(&p.event->args_buf, &inode_mode, sizeof(umode_t), 5); 1282 1283 // NOTES: 1284 // - interp is the real interpreter (sh, bash, python, perl, ...) 
1285 // - interpreter is the binary interpreter (ld.so), also known as the loader 1286 // - interpreter might be the same as executable (so there is no interpreter) 1287 1288 // Check if there is an interpreter and if it is different from the executable: 1289 1290 bool itp_inode_exists = proc_info->interpreter.id.inode != 0; 1291 bool itp_dev_diff = proc_info->interpreter.id.device != s_dev; 1292 bool itp_inode_diff = proc_info->interpreter.id.inode != inode_nr; 1293 1294 if (itp_inode_exists && (itp_dev_diff || itp_inode_diff)) { 1295 save_str_to_buf(&p.event->args_buf, &proc_info->interpreter.pathname, 6); // interpreter path 1296 save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 7); // interpreter device number 1297 save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.inode, sizeof(u64), 8); // interpreter inode number 1298 save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.ctime, sizeof(u64), 9); // interpreter changed time 1299 } 1300 1301 bpf_tail_call(ctx, &prog_array_tp, TAIL_SCHED_PROCESS_EXEC_EVENT_SUBMIT); 1302 1303 return 0; 1304 } 1305 1306 // clang-format on 1307 1308 SEC("raw_tracepoint/sched_process_exec_event_submit_tail") 1309 int sched_process_exec_event_submit_tail(struct bpf_raw_tracepoint_args *ctx) 1310 { 1311 program_data_t p = {}; 1312 if (!init_tailcall_program_data(&p, ctx)) 1313 return -1; 1314 1315 struct task_struct *task = (struct task_struct *) ctx->args[0]; 1316 struct linux_binprm *bprm = (struct linux_binprm *) ctx->args[2]; 1317 1318 if (bprm == NULL) 1319 return -1; 1320 1321 // bprm->mm is null at this point (set by begin_new_exec()), and task->mm is already initialized 1322 struct mm_struct *mm = get_mm_from_task(task); 1323 1324 unsigned long arg_start, arg_end; 1325 arg_start = get_arg_start_from_mm(mm); 1326 arg_end = get_arg_end_from_mm(mm); 1327 int argc = get_argc_from_bprm(bprm); 1328 1329 struct file *stdin_file = get_struct_file_from_fd(0); 1330 unsigned short stdin_type = get_inode_mode_from_file(stdin_file) & S_IFMT; 1331 void *stdin_path = get_path_str(__builtin_preserve_access_index(&stdin_file->f_path)); 1332 const char *interp = get_binprm_interp(bprm); 1333 1334 int invoked_from_kernel = 0; 1335 if (get_task_parent_flags(task) & PF_KTHREAD) { 1336 invoked_from_kernel = 1; 1337 } 1338 1339 save_args_str_arr_to_buf(&p.event->args_buf, (void *) arg_start, (void *) arg_end, argc, 10); 1340 save_str_to_buf(&p.event->args_buf, (void *) interp, 11); 1341 save_to_submit_buf(&p.event->args_buf, &stdin_type, sizeof(unsigned short), 12); 1342 save_str_to_buf(&p.event->args_buf, stdin_path, 13); 1343 save_to_submit_buf(&p.event->args_buf, &invoked_from_kernel, sizeof(int), 14); 1344 if (p.config->options & OPT_EXEC_ENV) { 1345 unsigned long env_start, env_end; 1346 env_start = get_env_start_from_mm(mm); 1347 env_end = get_env_end_from_mm(mm); 1348 int envc = get_envc_from_bprm(bprm); 1349 1350 save_args_str_arr_to_buf( 1351 &p.event->args_buf, (void *) env_start, (void *) env_end, envc, 15); 1352 } 1353 1354 events_perf_submit(&p, SCHED_PROCESS_EXEC, 0); 1355 return 0; 1356 } 1357 1358 // trace/events/sched.h: TP_PROTO(struct task_struct *p) 1359 SEC("raw_tracepoint/sched_process_exit") 1360 int tracepoint__sched__sched_process_exit(struct bpf_raw_tracepoint_args *ctx) 1361 { 1362 program_data_t p = {}; 1363 if (!init_program_data(&p, ctx)) 1364 return 0; 1365 1366 // evaluate should_trace before removing this pid from the maps 1367 bool traced = !!should_trace(&p); 1368 1369 
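    // Per-thread bookkeeping (task_info_map) is dropped right below, since every exiting thread
    // passes through this hook; the per-process proc_info_map entry is only dropped later in
    // sched_process_free, once the thread group leader's task struct is actually freed.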
bpf_map_delete_elem(&task_info_map, &p.event->context.task.host_tid); 1370 1371 bool group_dead = false; 1372 struct task_struct *task = p.task; 1373 struct signal_struct *signal = BPF_CORE_READ(task, signal); 1374 atomic_t live = BPF_CORE_READ(signal, live); 1375 // This check could be true for multiple thread exits if the thread count was 0 when the hooks 1376 // were triggered. This could happen for example if the threads performed exit in different CPUs 1377 // simultaneously. 1378 if (live.counter == 0) { 1379 group_dead = true; 1380 } 1381 1382 bool oom_killed = false; 1383 1384 if (bpf_map_lookup_elem(&oom_info, &p.task_info->context.host_pid)) { 1385 oom_killed = true; 1386 bpf_map_delete_elem(&oom_info, &p.task_info->context.host_pid); 1387 } 1388 1389 if (!traced) 1390 return 0; 1391 1392 long exit_code = get_task_exit_code(p.task); 1393 1394 if (oom_killed) { 1395 if (should_submit(PROCESS_OOM_KILLED, p.event)) { 1396 save_to_submit_buf(&p.event->args_buf, (void *) &exit_code, sizeof(long), 0); 1397 save_to_submit_buf(&p.event->args_buf, (void *) &group_dead, sizeof(bool), 1); 1398 1399 events_perf_submit(&p, PROCESS_OOM_KILLED, 0); 1400 } 1401 1402 return 0; 1403 } 1404 1405 if (should_submit(SCHED_PROCESS_EXIT, p.event)) { 1406 save_to_submit_buf(&p.event->args_buf, (void *) &exit_code, sizeof(long), 0); 1407 save_to_submit_buf(&p.event->args_buf, (void *) &group_dead, sizeof(bool), 1); 1408 1409 events_perf_submit(&p, SCHED_PROCESS_EXIT, 0); 1410 } 1411 1412 return 0; 1413 } 1414 1415 // trace/events/sched.h: TP_PROTO(struct task_struct *p) 1416 SEC("raw_tracepoint/sched_process_free") 1417 int tracepoint__sched__sched_process_free(struct bpf_raw_tracepoint_args *ctx) 1418 { 1419 struct task_struct *task = (struct task_struct *) ctx->args[0]; 1420 1421 int pid = get_task_host_pid(task); 1422 int tgid = get_task_host_tgid(task); 1423 1424 if (pid == tgid) { 1425 // we only care about process (and not thread) exit 1426 // if tgid task is freed, we know for sure that the process exited 1427 // so we can safely remove it from the process map 1428 bpf_map_delete_elem(&proc_info_map, &tgid); 1429 1430 u32 zero = 0; 1431 config_entry_t *cfg = bpf_map_lookup_elem(&config_map, &zero); 1432 if (unlikely(cfg == NULL)) { 1433 return 0; 1434 } 1435 1436 bpf_map_delete_elem(&process_tree_map, &tgid); 1437 } 1438 1439 return 0; 1440 } 1441 1442 SEC("raw_tracepoint/syscall__accept4") 1443 int syscall__accept4(void *ctx) 1444 { 1445 args_t saved_args; 1446 if (load_args(&saved_args, SOCKET_ACCEPT) != 0) { 1447 // missed entry or not traced 1448 return 0; 1449 } 1450 del_args(SOCKET_ACCEPT); 1451 1452 program_data_t p = {}; 1453 if (!init_program_data(&p, ctx)) 1454 return 0; 1455 1456 struct socket *old_sock = (struct socket *) saved_args.args[0]; 1457 struct socket *new_sock = (struct socket *) saved_args.args[1]; 1458 u32 sockfd = (u32) saved_args.args[2]; 1459 1460 if (new_sock == NULL) { 1461 return -1; 1462 } 1463 if (old_sock == NULL) { 1464 return -1; 1465 } 1466 1467 reset_event_args(&p); 1468 save_to_submit_buf(&p.event->args_buf, (void *) &sockfd, sizeof(u32), 0); 1469 save_sockaddr_to_buf(&p.event->args_buf, old_sock, 1); 1470 save_sockaddr_to_buf(&p.event->args_buf, new_sock, 2); 1471 1472 return events_perf_submit(&p, SOCKET_ACCEPT, 0); 1473 } 1474 1475 // trace/events/sched.h: TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next) 1476 SEC("raw_tracepoint/sched_switch") 1477 int tracepoint__sched__sched_switch(struct bpf_raw_tracepoint_args *ctx) 1478 { 1479 
program_data_t p = {}; 1480 if (!init_program_data(&p, ctx)) 1481 return 0; 1482 1483 if (!should_trace(&p)) 1484 return 0; 1485 1486 if (!should_submit(SCHED_SWITCH, p.event)) 1487 return 0; 1488 1489 struct task_struct *prev = (struct task_struct *) ctx->args[1]; 1490 struct task_struct *next = (struct task_struct *) ctx->args[2]; 1491 int prev_pid = get_task_host_pid(prev); 1492 int next_pid = get_task_host_pid(next); 1493 int cpu = bpf_get_smp_processor_id(); 1494 1495 save_to_submit_buf(&p.event->args_buf, (void *) &cpu, sizeof(int), 0); 1496 save_to_submit_buf(&p.event->args_buf, (void *) &prev_pid, sizeof(int), 1); 1497 save_str_to_buf(&p.event->args_buf, prev->comm, 2); 1498 save_to_submit_buf(&p.event->args_buf, (void *) &next_pid, sizeof(int), 3); 1499 save_str_to_buf(&p.event->args_buf, next->comm, 4); 1500 1501 return events_perf_submit(&p, SCHED_SWITCH, 0); 1502 } 1503 1504 SEC("kprobe/filldir64") 1505 int BPF_KPROBE(trace_filldir64) 1506 { 1507 program_data_t p = {}; 1508 if (!init_program_data(&p, ctx)) 1509 return 0; 1510 1511 if (!should_trace((&p))) 1512 return 0; 1513 1514 if (!should_submit(HIDDEN_INODES, p.event)) 1515 return 0; 1516 1517 char *process_name = (char *) PT_REGS_PARM2(ctx); 1518 unsigned long process_inode_number = (unsigned long) PT_REGS_PARM5(ctx); 1519 if (process_inode_number == 0) { 1520 save_str_to_buf(&p.event->args_buf, process_name, 0); 1521 return events_perf_submit(&p, HIDDEN_INODES, 0); 1522 } 1523 return 0; 1524 } 1525 1526 SEC("kprobe/call_usermodehelper") 1527 int BPF_KPROBE(trace_call_usermodehelper) 1528 { 1529 program_data_t p = {}; 1530 if (!init_program_data(&p, ctx)) 1531 return 0; 1532 1533 if (!should_trace(&p)) 1534 return 0; 1535 1536 if (!should_submit(CALL_USERMODE_HELPER, p.event)) 1537 return 0; 1538 1539 void *path = (void *) PT_REGS_PARM1(ctx); 1540 unsigned long argv = PT_REGS_PARM2(ctx); 1541 unsigned long envp = PT_REGS_PARM3(ctx); 1542 int wait = PT_REGS_PARM4(ctx); 1543 1544 save_str_to_buf(&p.event->args_buf, path, 0); 1545 save_str_arr_to_buf(&p.event->args_buf, (const char *const *) argv, 1); 1546 save_str_arr_to_buf(&p.event->args_buf, (const char *const *) envp, 2); 1547 save_to_submit_buf(&p.event->args_buf, (void *) &wait, sizeof(int), 3); 1548 1549 return events_perf_submit(&p, CALL_USERMODE_HELPER, 0); 1550 } 1551 1552 SEC("kprobe/do_exit") 1553 int BPF_KPROBE(trace_do_exit) 1554 { 1555 program_data_t p = {}; 1556 if (!init_program_data(&p, ctx)) 1557 return 0; 1558 1559 if (!should_trace(&p)) 1560 return 0; 1561 1562 if (!should_submit(DO_EXIT, p.event)) 1563 return 0; 1564 1565 long code = PT_REGS_PARM1(ctx); 1566 1567 return events_perf_submit(&p, DO_EXIT, code); 1568 } 1569 1570 SEC("uprobe/trigger_seq_ops_event") 1571 int uprobe_seq_ops_trigger(struct pt_regs *ctx) 1572 { 1573 u64 caller_ctx_id = 0; 1574 u64 *address_array = NULL; 1575 u64 struct_address = 0; 1576 1577 // clang-format off 1578 // 1579 // Golang calling convention per architecture 1580 1581 #if defined(bpf_target_x86) 1582 caller_ctx_id = ctx->bx; // 1st arg 1583 address_array = ((void *) ctx->sp + 8); // 2nd arg 1584 #elif defined(bpf_target_arm64) 1585 caller_ctx_id = ctx->user_regs.regs[1]; // 1st arg 1586 address_array = ((void *) ctx->sp + 8); // 2nd arg 1587 1588 #else 1589 return 0; 1590 #endif 1591 // clang-format on 1592 1593 program_data_t p = {}; 1594 if (!init_program_data(&p, ctx)) 1595 return 0; 1596 1597 // Uprobes are not triggered by syscalls, so we need to override the false value. 
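    // How this trigger works: userspace hands the probe an array of seq_operations pointers
    // (presumably the networking /proc seq files) via the register/stack arguments read above.
    // For every ops struct, handler addresses inside core kernel text ([_stext, _etext)) are
    // zeroed as benign, so only suspicious show/start/next/stop pointers reach userspace.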
1598 p.event->context.syscall = NO_SYSCALL; 1599 1600 // uprobe was triggered from other tracee instance 1601 if (p.config->tracee_pid != p.task_info->context.pid && 1602 p.config->tracee_pid != p.task_info->context.host_pid) 1603 return 0; 1604 1605 void *stext_addr = get_stext_addr(); 1606 if (unlikely(stext_addr == NULL)) 1607 return 0; 1608 void *etext_addr = get_etext_addr(); 1609 if (unlikely(etext_addr == NULL)) 1610 return 0; 1611 1612 u32 count_off = p.event->args_buf.offset + 1; 1613 save_u64_arr_to_buf(&p.event->args_buf, NULL, 0, 0); // init u64 array with size 0 1614 1615 #pragma unroll 1616 for (int i = 0; i < NET_SEQ_OPS_TYPES; i++) { 1617 bpf_probe_read_user(&struct_address, 8, (address_array + i)); 1618 struct seq_operations *seq_ops = (struct seq_operations *) struct_address; 1619 1620 u64 show_addr = (u64) BPF_CORE_READ(seq_ops, show); 1621 if (show_addr == 0) 1622 return 0; 1623 if (show_addr >= (u64) stext_addr && show_addr < (u64) etext_addr) 1624 show_addr = 0; 1625 1626 u64 start_addr = (u64) BPF_CORE_READ(seq_ops, start); 1627 if (start_addr == 0) 1628 return 0; 1629 if (start_addr >= (u64) stext_addr && start_addr < (u64) etext_addr) 1630 start_addr = 0; 1631 1632 u64 next_addr = (u64) BPF_CORE_READ(seq_ops, next); 1633 if (next_addr == 0) 1634 return 0; 1635 if (next_addr >= (u64) stext_addr && next_addr < (u64) etext_addr) 1636 next_addr = 0; 1637 1638 u64 stop_addr = (u64) BPF_CORE_READ(seq_ops, stop); 1639 if (stop_addr == 0) 1640 return 0; 1641 if (stop_addr >= (u64) stext_addr && stop_addr < (u64) etext_addr) 1642 stop_addr = 0; 1643 1644 u64 seq_ops_addresses[NET_SEQ_OPS_SIZE + 1] = {show_addr, start_addr, next_addr, stop_addr}; 1645 1646 add_u64_elements_to_buf(&p.event->args_buf, (const u64 *) seq_ops_addresses, 4, count_off); 1647 } 1648 1649 save_to_submit_buf(&p.event->args_buf, (void *) &caller_ctx_id, sizeof(uint64_t), 1); 1650 events_perf_submit(&p, PRINT_NET_SEQ_OPS, 0); 1651 return 0; 1652 } 1653 1654 SEC("uprobe/trigger_mem_dump_event") 1655 int uprobe_mem_dump_trigger(struct pt_regs *ctx) 1656 { 1657 u64 address = 0; 1658 u64 size = 0; 1659 u64 caller_ctx_id = 0; 1660 1661 #if defined(bpf_target_x86) 1662 address = ctx->bx; // 1st arg 1663 size = ctx->cx; // 2nd arg 1664 caller_ctx_id = ctx->di; // 3rd arg 1665 #elif defined(bpf_target_arm64) 1666 address = ctx->user_regs.regs[1]; // 1st arg 1667 size = ctx->user_regs.regs[2]; // 2nd arg 1668 caller_ctx_id = ctx->user_regs.regs[3]; // 3rd arg 1669 #else 1670 return 0; 1671 #endif 1672 1673 program_data_t p = {}; 1674 if (!init_program_data(&p, ctx)) 1675 return 0; 1676 1677 // Uprobes are not triggered by syscalls, so we need to override the false value. 
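    // The address/size/caller_ctx_id triple recovered from the registers above describes a
    // memory range the userspace agent asked to capture; note the requested size is masked with
    // MAX_MEM_DUMP_SIZE before the read, which bounds how much ends up in the event buffer.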
1678 p.event->context.syscall = NO_SYSCALL; 1679 1680 // uprobe was triggered from other tracee instance 1681 if (p.config->tracee_pid != p.task_info->context.pid && 1682 p.config->tracee_pid != p.task_info->context.host_pid) 1683 return 0; 1684 1685 if (size <= 0) 1686 return 0; 1687 1688 int ret = save_bytes_to_buf(&p.event->args_buf, (void *) address, size & MAX_MEM_DUMP_SIZE, 0); 1689 // return in case of failed pointer read 1690 if (ret == 0) { 1691 tracee_log(ctx, BPF_LOG_LVL_ERROR, BPF_LOG_ID_MEM_READ, ret); 1692 return 0; 1693 } 1694 save_to_submit_buf(&p.event->args_buf, (void *) &address, sizeof(void *), 1); 1695 save_to_submit_buf(&p.event->args_buf, &size, sizeof(u64), 2); 1696 save_to_submit_buf(&p.event->args_buf, &caller_ctx_id, sizeof(u64), 3); 1697 1698 return events_perf_submit(&p, PRINT_MEM_DUMP, 0); 1699 } 1700 1701 statfunc struct trace_kprobe *get_trace_kprobe_from_trace_probe(void *tracep) 1702 { 1703 struct trace_kprobe *tracekp = 1704 (struct trace_kprobe *) container_of(tracep, struct trace_kprobe, tp); 1705 1706 return tracekp; 1707 } 1708 1709 statfunc struct trace_uprobe *get_trace_uprobe_from_trace_probe(void *tracep) 1710 { 1711 struct trace_uprobe *traceup = 1712 (struct trace_uprobe *) container_of(tracep, struct trace_uprobe, tp); 1713 1714 return traceup; 1715 } 1716 1717 // This function returns a pointer to struct trace_probe from struct trace_event_call. 1718 statfunc void *get_trace_probe_from_trace_event_call(struct trace_event_call *call) 1719 { 1720 void *tracep_ptr; 1721 1722 struct trace_probe___v53 *legacy_tracep; 1723 if (bpf_core_field_exists(legacy_tracep->call)) { 1724 tracep_ptr = container_of(call, struct trace_probe___v53, call); 1725 } else { 1726 struct trace_probe_event *tpe = container_of(call, struct trace_probe_event, call); 1727 struct list_head probes = BPF_CORE_READ(tpe, probes); 1728 tracep_ptr = container_of(probes.next, struct trace_probe, list); 1729 } 1730 1731 return tracep_ptr; 1732 } 1733 1734 enum bpf_attach_type_e { 1735 BPF_RAW_TRACEPOINT, 1736 PERF_TRACEPOINT, 1737 PERF_KPROBE, 1738 PERF_KRETPROBE, 1739 PERF_UPROBE, 1740 PERF_URETPROBE 1741 }; 1742 1743 statfunc int send_bpf_attach( 1744 program_data_t *p, struct bpf_prog *prog, void *event_name, u64 probe_addr, int perf_type) 1745 { 1746 if (!should_submit(BPF_ATTACH, p->event)) { 1747 return 0; 1748 } 1749 1750 // get bpf prog details 1751 1752 int prog_type = BPF_CORE_READ(prog, type); 1753 struct bpf_prog_aux *prog_aux = BPF_CORE_READ(prog, aux); 1754 u32 prog_id = BPF_CORE_READ(prog_aux, id); 1755 char prog_name[BPF_OBJ_NAME_LEN]; 1756 bpf_probe_read_kernel_str(&prog_name, BPF_OBJ_NAME_LEN, prog_aux->name); 1757 1758 // get usage of helpers 1759 bpf_used_helpers_t *val = bpf_map_lookup_elem(&bpf_attach_map, &prog_id); 1760 if (val == NULL) 1761 return 0; 1762 1763 // submit the event 1764 1765 save_to_submit_buf(&(p->event->args_buf), &prog_type, sizeof(int), 0); 1766 save_str_to_buf(&(p->event->args_buf), (void *) &prog_name, 1); 1767 save_to_submit_buf(&(p->event->args_buf), &prog_id, sizeof(u32), 2); 1768 save_u64_arr_to_buf(&(p->event->args_buf), (const u64 *) val->helpers, 4, 3); 1769 save_str_to_buf(&(p->event->args_buf), event_name, 4); 1770 save_to_submit_buf(&(p->event->args_buf), &probe_addr, sizeof(u64), 5); 1771 save_to_submit_buf(&(p->event->args_buf), &perf_type, sizeof(int), 6); 1772 1773 events_perf_submit(p, BPF_ATTACH, 0); 1774 1775 // delete from map 1776 bpf_map_delete_elem(&bpf_attach_map, &prog_id); 1777 1778 return 0; 1779 } 1780 1781 // 
Inspired by bpf_get_perf_event_info() kernel func. 1782 // https://elixir.bootlin.com/linux/v5.19.2/source/kernel/trace/bpf_trace.c#L2123 1783 statfunc int 1784 send_bpf_perf_attach(program_data_t *p, struct file *bpf_prog_file, struct file *perf_event_file) 1785 { 1786 if (!should_submit(BPF_ATTACH, p->event)) { 1787 return 0; 1788 } 1789 1790 // get real values of TRACE_EVENT_FL_KPROBE and TRACE_EVENT_FL_UPROBE. 1791 // these values were changed in kernels >= 5.15. 1792 int TRACE_EVENT_FL_KPROBE_BIT; 1793 int TRACE_EVENT_FL_UPROBE_BIT; 1794 if (bpf_core_field_exists(((struct trace_event_call *) 0)->module)) { // kernel >= 5.15 1795 TRACE_EVENT_FL_KPROBE_BIT = 6; 1796 TRACE_EVENT_FL_UPROBE_BIT = 7; 1797 } else { // kernel < 5.15 1798 TRACE_EVENT_FL_KPROBE_BIT = 5; 1799 TRACE_EVENT_FL_UPROBE_BIT = 6; 1800 } 1801 int TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT); 1802 int TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT); 1803 1804 // get perf event details 1805 1806 // clang-format off 1807 #define MAX_PERF_EVENT_NAME ((MAX_PATH_PREF_SIZE > MAX_KSYM_NAME_SIZE) ? MAX_PATH_PREF_SIZE : MAX_KSYM_NAME_SIZE) 1808 #define REQUIRED_SYSTEM_LENGTH 9 1809 // clang-format on 1810 1811 struct perf_event *event = (struct perf_event *) BPF_CORE_READ(perf_event_file, private_data); 1812 struct trace_event_call *tp_event = BPF_CORE_READ(event, tp_event); 1813 char event_name[MAX_PERF_EVENT_NAME]; 1814 u64 probe_addr = 0; 1815 int perf_type; 1816 1817 int flags = BPF_CORE_READ(tp_event, flags); 1818 1819 // check if syscall_tracepoint 1820 bool is_syscall_tracepoint = false; 1821 struct trace_event_class *tp_class = BPF_CORE_READ(tp_event, class); 1822 char class_system[REQUIRED_SYSTEM_LENGTH]; 1823 bpf_probe_read_kernel_str( 1824 &class_system, REQUIRED_SYSTEM_LENGTH, BPF_CORE_READ(tp_class, system)); 1825 class_system[REQUIRED_SYSTEM_LENGTH - 1] = '\0'; 1826 if (has_prefix("syscalls", class_system, REQUIRED_SYSTEM_LENGTH)) { 1827 is_syscall_tracepoint = true; 1828 } 1829 1830 if (flags & TRACE_EVENT_FL_TRACEPOINT) { // event is tracepoint 1831 1832 perf_type = PERF_TRACEPOINT; 1833 struct tracepoint *tp = BPF_CORE_READ(tp_event, tp); 1834 bpf_probe_read_kernel_str(&event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tp, name)); 1835 1836 } else if (is_syscall_tracepoint) { // event is syscall tracepoint 1837 1838 perf_type = PERF_TRACEPOINT; 1839 bpf_probe_read_kernel_str(&event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tp_event, name)); 1840 1841 } else { 1842 bool is_ret_probe = false; 1843 void *tracep_ptr = get_trace_probe_from_trace_event_call(tp_event); 1844 1845 if (flags & TRACE_EVENT_FL_KPROBE) { // event is kprobe 1846 1847 struct trace_kprobe *tracekp = get_trace_kprobe_from_trace_probe(tracep_ptr); 1848 1849 // check if probe is a kretprobe 1850 struct kretprobe *krp = &tracekp->rp; 1851 kretprobe_handler_t handler_f = BPF_CORE_READ(krp, handler); 1852 if (handler_f != NULL) 1853 is_ret_probe = true; 1854 1855 if (is_ret_probe) 1856 perf_type = PERF_KRETPROBE; 1857 else 1858 perf_type = PERF_KPROBE; 1859 1860 // get symbol name 1861 bpf_probe_read_kernel_str( 1862 &event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tracekp, symbol)); 1863 1864 // get symbol address 1865 if (!event_name[0]) 1866 probe_addr = (unsigned long) BPF_CORE_READ(krp, kp.addr); 1867 1868 } else if (flags & TRACE_EVENT_FL_UPROBE) { // event is uprobe 1869 1870 struct trace_uprobe *traceup = get_trace_uprobe_from_trace_probe(tracep_ptr); 1871 1872 // determine if ret probe 1873 struct uprobe_consumer *upc = 
&traceup->consumer; 1874 void *handler_f = BPF_CORE_READ(upc, ret_handler); 1875 if (handler_f != NULL) 1876 is_ret_probe = true; 1877 1878 if (is_ret_probe) 1879 perf_type = PERF_URETPROBE; 1880 else 1881 perf_type = PERF_UPROBE; 1882 1883 // get binary path 1884 bpf_probe_read_kernel_str( 1885 &event_name, MAX_PATH_PREF_SIZE, BPF_CORE_READ(traceup, filename)); 1886 1887 // get symbol offset 1888 probe_addr = BPF_CORE_READ(traceup, offset); 1889 1890 } else { 1891 // unsupported perf type 1892 return 0; 1893 } 1894 } 1895 1896 struct bpf_prog *prog = (struct bpf_prog *) BPF_CORE_READ(bpf_prog_file, private_data); 1897 1898 return send_bpf_attach(p, prog, &event_name, probe_addr, perf_type); 1899 } 1900 1901 SEC("kprobe/security_file_ioctl") 1902 int BPF_KPROBE(trace_security_file_ioctl) 1903 { 1904 program_data_t p = {}; 1905 if (!init_program_data(&p, ctx)) 1906 return 0; 1907 1908 if (!should_trace(&p)) 1909 return 0; 1910 1911 unsigned int cmd = PT_REGS_PARM2(ctx); 1912 1913 if (cmd == PERF_EVENT_IOC_SET_BPF) { 1914 struct file *perf_event_file = (struct file *) PT_REGS_PARM1(ctx); 1915 unsigned long fd = PT_REGS_PARM3(ctx); 1916 struct file *bpf_prog_file = get_struct_file_from_fd(fd); 1917 1918 send_bpf_perf_attach(&p, bpf_prog_file, perf_event_file); 1919 } 1920 1921 return 0; 1922 } 1923 1924 SEC("kprobe/tracepoint_probe_register_prio_may_exist") 1925 int BPF_KPROBE(trace_tracepoint_probe_register_prio_may_exist) 1926 { 1927 program_data_t p = {}; 1928 if (!init_program_data(&p, ctx)) 1929 return 0; 1930 1931 if (!should_trace(&p)) 1932 return 0; 1933 1934 struct tracepoint *tp = (struct tracepoint *) PT_REGS_PARM1(ctx); 1935 struct bpf_prog *prog = (struct bpf_prog *) PT_REGS_PARM3(ctx); 1936 1937 char event_name[MAX_PERF_EVENT_NAME]; 1938 bpf_probe_read_kernel_str(&event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tp, name)); 1939 1940 int perf_type = BPF_RAW_TRACEPOINT; 1941 u64 probe_addr = 0; 1942 1943 return send_bpf_attach(&p, prog, &event_name, probe_addr, perf_type); 1944 } 1945 1946 // trace/events/cgroup.h: 1947 // TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup) 1948 SEC("raw_tracepoint/cgroup_attach_task") 1949 int tracepoint__cgroup__cgroup_attach_task(struct bpf_raw_tracepoint_args *ctx) 1950 { 1951 program_data_t p = {}; 1952 if (!init_program_data(&p, ctx)) 1953 return 0; 1954 1955 if (!should_trace(&p)) 1956 return 0; 1957 1958 if (!should_submit(CGROUP_ATTACH_TASK, p.event)) 1959 return 0; 1960 1961 char *path = (char *) ctx->args[1]; 1962 struct task_struct *task = (struct task_struct *) ctx->args[2]; 1963 1964 int pid = get_task_host_pid(task); 1965 char *comm = BPF_CORE_READ(task, comm); 1966 1967 save_str_to_buf(&p.event->args_buf, path, 0); 1968 save_str_to_buf(&p.event->args_buf, comm, 1); 1969 save_to_submit_buf(&p.event->args_buf, (void *) &pid, sizeof(int), 2); 1970 events_perf_submit(&p, CGROUP_ATTACH_TASK, 0); 1971 1972 return 0; 1973 } 1974 1975 // trace/events/cgroup.h: TP_PROTO(struct cgroup *cgrp, const char *path) 1976 SEC("raw_tracepoint/cgroup_mkdir") 1977 int tracepoint__cgroup__cgroup_mkdir(struct bpf_raw_tracepoint_args *ctx) 1978 { 1979 program_data_t p = {}; 1980 if (!init_program_data(&p, ctx)) 1981 return 0; 1982 1983 if (!should_trace(&p)) 1984 return 0; 1985 1986 if (!should_submit(CGROUP_MKDIR, p.event)) 1987 return 0; 1988 1989 struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0]; 1990 char *path = (char *) ctx->args[1]; 1991 1992 u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp); 1993 
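// The submitted arguments below are the full 64-bit cgroup id, the path reported by the
// tracepoint, and the hierarchy id; the cgroup_id_lsb computed next is just the low 32 bits
// of the same id.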
u64 cgroup_id = get_cgroup_id(dst_cgrp); 1994 u32 cgroup_id_lsb = cgroup_id; 1995 1996 save_to_submit_buf(&p.event->args_buf, &cgroup_id, sizeof(u64), 0); 1997 save_str_to_buf(&p.event->args_buf, path, 1); 1998 save_to_submit_buf(&p.event->args_buf, &hierarchy_id, sizeof(u32), 2); 1999 events_perf_submit(&p, CGROUP_MKDIR, 0); 2000 2001 return 0; 2002 } 2003 2004 // trace/events/cgroup.h: TP_PROTO(struct cgroup *cgrp, const char *path) 2005 SEC("raw_tracepoint/cgroup_rmdir") 2006 int tracepoint__cgroup__cgroup_rmdir(struct bpf_raw_tracepoint_args *ctx) 2007 { 2008 program_data_t p = {}; 2009 if (!init_program_data(&p, ctx)) 2010 return 0; 2011 2012 if (!should_trace(&p)) 2013 return 0; 2014 2015 if (!should_submit(CGROUP_MKDIR, p.event)) 2016 return 0; 2017 2018 struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0]; 2019 char *path = (char *) ctx->args[1]; 2020 2021 u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp); 2022 u64 cgroup_id = get_cgroup_id(dst_cgrp); 2023 u32 cgroup_id_lsb = cgroup_id; 2024 2025 save_to_submit_buf(&p.event->args_buf, &cgroup_id, sizeof(u64), 0); 2026 save_str_to_buf(&p.event->args_buf, path, 1); 2027 save_to_submit_buf(&p.event->args_buf, &hierarchy_id, sizeof(u32), 2); 2028 events_perf_submit(&p, CGROUP_RMDIR, 0); 2029 2030 return 0; 2031 } 2032 2033 SEC("kprobe/security_bprm_check") 2034 int BPF_KPROBE(trace_security_bprm_check) 2035 { 2036 program_data_t p = {}; 2037 if (!init_program_data(&p, ctx)) 2038 return 0; 2039 2040 if (!should_trace(&p)) 2041 return 0; 2042 2043 if (!should_submit(SECURITY_BPRM_CHECK, p.event)) 2044 return 0; 2045 2046 struct linux_binprm *bprm = (struct linux_binprm *) PT_REGS_PARM1(ctx); 2047 struct file *file = get_file_ptr_from_bprm(bprm); 2048 dev_t s_dev = get_dev_from_file(file); 2049 unsigned long inode_nr = get_inode_nr_from_file(file); 2050 void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 2051 2052 syscall_data_t *sys = &p.task_info->syscall_data; 2053 const char *const *argv = NULL; 2054 const char *const *envp = NULL; 2055 switch (sys->id) { 2056 case SYSCALL_EXECVE: 2057 argv = (const char *const *) sys->args.args[1]; 2058 envp = (const char *const *) sys->args.args[2]; 2059 break; 2060 case SYSCALL_EXECVEAT: 2061 argv = (const char *const *) sys->args.args[2]; 2062 envp = (const char *const *) sys->args.args[3]; 2063 break; 2064 default: 2065 break; 2066 } 2067 2068 save_str_to_buf(&p.event->args_buf, file_path, 0); 2069 save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 1); 2070 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 2); 2071 save_str_arr_to_buf(&p.event->args_buf, argv, 3); 2072 if (p.config->options & OPT_EXEC_ENV) 2073 save_str_arr_to_buf(&p.event->args_buf, envp, 4); 2074 2075 return events_perf_submit(&p, SECURITY_BPRM_CHECK, 0); 2076 } 2077 2078 SEC("kprobe/security_file_open") 2079 int BPF_KPROBE(trace_security_file_open) 2080 { 2081 program_data_t p = {}; 2082 if (!init_program_data(&p, ctx)) 2083 return 0; 2084 2085 if (!should_trace(&p)) 2086 return 0; 2087 2088 if (!should_submit(SECURITY_FILE_OPEN, p.event)) 2089 return 0; 2090 2091 struct file *file = (struct file *) PT_REGS_PARM1(ctx); 2092 dev_t s_dev = get_dev_from_file(file); 2093 unsigned long inode_nr = get_inode_nr_from_file(file); 2094 void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 2095 u64 ctime = get_ctime_nanosec_from_file(file); 2096 2097 // Load the arguments given to the open syscall (which eventually invokes this function) 2098 
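// The userspace-supplied pathname sits in a different syscall argument slot depending on
// which entry point led here; the switch below handles (with an empty string used when the
// syscall was not traced):
//
//     open(path, ...), execve(path, ...)                  -> sys->args.args[0]
//     openat/openat2(dirfd, path, ...), execveat(...)     -> sys->args.args[1]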
char empty_string[1] = ""; 2099 void *syscall_pathname = &empty_string; 2100 syscall_data_t *sys = NULL; 2101 bool syscall_traced = p.task_info->syscall_traced; 2102 if (syscall_traced) { 2103 sys = &p.task_info->syscall_data; 2104 switch (sys->id) { 2105 case SYSCALL_EXECVE: 2106 case SYSCALL_OPEN: 2107 syscall_pathname = (void *) sys->args.args[0]; 2108 break; 2109 2110 case SYSCALL_EXECVEAT: 2111 case SYSCALL_OPENAT: 2112 case SYSCALL_OPENAT2: 2113 syscall_pathname = (void *) sys->args.args[1]; 2114 break; 2115 } 2116 } 2117 2118 save_str_to_buf(&p.event->args_buf, file_path, 0); 2119 save_to_submit_buf(&p.event->args_buf, 2120 (void *) __builtin_preserve_access_index(&file->f_flags), 2121 sizeof(int), 2122 1); 2123 save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 2); 2124 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 3); 2125 save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4); 2126 save_str_to_buf(&p.event->args_buf, syscall_pathname, 5); 2127 2128 return events_perf_submit(&p, SECURITY_FILE_OPEN, 0); 2129 } 2130 2131 SEC("kprobe/security_sb_mount") 2132 int BPF_KPROBE(trace_security_sb_mount) 2133 { 2134 program_data_t p = {}; 2135 if (!init_program_data(&p, ctx)) 2136 return 0; 2137 2138 if (!should_trace(&p)) 2139 return 0; 2140 2141 if (!should_submit(SECURITY_SB_MOUNT, p.event)) 2142 return 0; 2143 2144 const char *dev_name = (const char *) PT_REGS_PARM1(ctx); 2145 struct path *path = (struct path *) PT_REGS_PARM2(ctx); 2146 const char *type = (const char *) PT_REGS_PARM3(ctx); 2147 unsigned long flags = (unsigned long) PT_REGS_PARM4(ctx); 2148 2149 void *path_str = get_path_str(path); 2150 2151 save_str_to_buf(&p.event->args_buf, (void *) dev_name, 0); 2152 save_str_to_buf(&p.event->args_buf, path_str, 1); 2153 save_str_to_buf(&p.event->args_buf, (void *) type, 2); 2154 save_to_submit_buf(&p.event->args_buf, &flags, sizeof(unsigned long), 3); 2155 2156 return events_perf_submit(&p, SECURITY_SB_MOUNT, 0); 2157 } 2158 2159 SEC("kprobe/security_inode_unlink") 2160 int BPF_KPROBE(trace_security_inode_unlink) 2161 { 2162 program_data_t p = {}; 2163 if (!init_program_data(&p, ctx)) 2164 return 0; 2165 2166 if (!should_trace(&p)) 2167 return 0; 2168 2169 bool should_trace_inode_unlink = should_submit(SECURITY_INODE_UNLINK, p.event); 2170 bool should_capture_io = false; 2171 if ((p.config->options & (OPT_CAPTURE_FILES_READ | OPT_CAPTURE_FILES_WRITE)) != 0) 2172 should_capture_io = true; 2173 2174 if (!should_trace_inode_unlink && !should_capture_io) 2175 return 0; 2176 2177 file_id_t unlinked_file_id = {}; 2178 int ret = 0; 2179 2180 // struct inode *dir = (struct inode *)PT_REGS_PARM1(ctx); 2181 struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx); 2182 unlinked_file_id.inode = get_inode_nr_from_dentry(dentry); 2183 unlinked_file_id.device = get_dev_from_dentry(dentry); 2184 2185 if (should_trace_inode_unlink) { 2186 void *dentry_path = get_dentry_path_str(dentry); 2187 unlinked_file_id.ctime = get_ctime_nanosec_from_dentry(dentry); 2188 2189 save_str_to_buf(&p.event->args_buf, dentry_path, 0); 2190 save_to_submit_buf(&p.event->args_buf, &unlinked_file_id.inode, sizeof(unsigned long), 1); 2191 save_to_submit_buf(&p.event->args_buf, &unlinked_file_id.device, sizeof(dev_t), 2); 2192 save_to_submit_buf(&p.event->args_buf, &unlinked_file_id.ctime, sizeof(u64), 3); 2193 ret = events_perf_submit(&p, SECURITY_INODE_UNLINK, 0); 2194 } 2195 2196 if (should_capture_io) { 2197 // We want to avoid reacquisition of the same inode-device 
affecting capture behavior 2198 unlinked_file_id.ctime = 0; 2199 bpf_map_delete_elem(&elf_files_map, &unlinked_file_id); 2200 } 2201 2202 return ret; 2203 } 2204 2205 SEC("kprobe/commit_creds") 2206 int BPF_KPROBE(trace_commit_creds) 2207 { 2208 program_data_t p = {}; 2209 if (!init_program_data(&p, ctx)) 2210 return 0; 2211 2212 if (!should_trace(&p)) 2213 return 0; 2214 2215 if (!should_submit(COMMIT_CREDS, p.event)) 2216 return 0; 2217 2218 struct cred *new_cred = (struct cred *) PT_REGS_PARM1(ctx); 2219 struct cred *old_cred = (struct cred *) get_task_real_cred(p.task); 2220 2221 slim_cred_t old_slim = {0}; 2222 slim_cred_t new_slim = {0}; 2223 2224 struct user_namespace *userns_old = BPF_CORE_READ(old_cred, user_ns); 2225 struct user_namespace *userns_new = BPF_CORE_READ(new_cred, user_ns); 2226 2227 // old credentials 2228 2229 old_slim.uid = BPF_CORE_READ(old_cred, uid.val); 2230 old_slim.gid = BPF_CORE_READ(old_cred, gid.val); 2231 old_slim.suid = BPF_CORE_READ(old_cred, suid.val); 2232 old_slim.sgid = BPF_CORE_READ(old_cred, sgid.val); 2233 old_slim.euid = BPF_CORE_READ(old_cred, euid.val); 2234 old_slim.egid = BPF_CORE_READ(old_cred, egid.val); 2235 old_slim.fsuid = BPF_CORE_READ(old_cred, fsuid.val); 2236 old_slim.fsgid = BPF_CORE_READ(old_cred, fsgid.val); 2237 old_slim.user_ns = BPF_CORE_READ(userns_old, ns.inum); 2238 old_slim.securebits = BPF_CORE_READ(old_cred, securebits); 2239 2240 old_slim.cap_inheritable = credcap_to_slimcap(&old_cred->cap_inheritable); 2241 old_slim.cap_permitted = credcap_to_slimcap(&old_cred->cap_permitted); 2242 old_slim.cap_effective = credcap_to_slimcap(&old_cred->cap_effective); 2243 old_slim.cap_bset = credcap_to_slimcap(&old_cred->cap_bset); 2244 old_slim.cap_ambient = credcap_to_slimcap(&old_cred->cap_ambient); 2245 2246 // new credentials 2247 2248 new_slim.uid = BPF_CORE_READ(new_cred, uid.val); 2249 new_slim.gid = BPF_CORE_READ(new_cred, gid.val); 2250 new_slim.suid = BPF_CORE_READ(new_cred, suid.val); 2251 new_slim.sgid = BPF_CORE_READ(new_cred, sgid.val); 2252 new_slim.euid = BPF_CORE_READ(new_cred, euid.val); 2253 new_slim.egid = BPF_CORE_READ(new_cred, egid.val); 2254 new_slim.fsuid = BPF_CORE_READ(new_cred, fsuid.val); 2255 new_slim.fsgid = BPF_CORE_READ(new_cred, fsgid.val); 2256 new_slim.user_ns = BPF_CORE_READ(userns_new, ns.inum); 2257 new_slim.securebits = BPF_CORE_READ(new_cred, securebits); 2258 2259 new_slim.cap_inheritable = credcap_to_slimcap(&new_cred->cap_inheritable); 2260 new_slim.cap_permitted = credcap_to_slimcap(&new_cred->cap_permitted); 2261 new_slim.cap_effective = credcap_to_slimcap(&new_cred->cap_effective); 2262 new_slim.cap_bset = credcap_to_slimcap(&new_cred->cap_bset); 2263 new_slim.cap_ambient = credcap_to_slimcap(&new_cred->cap_ambient); 2264 2265 save_to_submit_buf(&p.event->args_buf, (void *) &old_slim, sizeof(slim_cred_t), 0); 2266 save_to_submit_buf(&p.event->args_buf, (void *) &new_slim, sizeof(slim_cred_t), 1); 2267 2268 // clang-format off 2269 if ( 2270 (old_slim.uid != new_slim.uid) || 2271 (old_slim.gid != new_slim.gid) || 2272 (old_slim.suid != new_slim.suid) || 2273 (old_slim.sgid != new_slim.sgid) || 2274 (old_slim.euid != new_slim.euid) || 2275 (old_slim.egid != new_slim.egid) || 2276 (old_slim.fsuid != new_slim.fsuid) || 2277 (old_slim.fsgid != new_slim.fsgid) || 2278 (old_slim.cap_inheritable != new_slim.cap_inheritable) || 2279 (old_slim.cap_permitted != new_slim.cap_permitted) || 2280 (old_slim.cap_effective != new_slim.cap_effective) || 2281 (old_slim.cap_bset != new_slim.cap_bset) || 2282 
(old_slim.cap_ambient != new_slim.cap_ambient) 2283 ) { 2284 events_perf_submit(&p, COMMIT_CREDS, 0); 2285 } 2286 // clang-format on 2287 2288 return 0; 2289 } 2290 2291 SEC("kprobe/switch_task_namespaces") 2292 int BPF_KPROBE(trace_switch_task_namespaces) 2293 { 2294 program_data_t p = {}; 2295 if (!init_program_data(&p, ctx)) 2296 return 0; 2297 2298 if (!should_trace(&p)) 2299 return 0; 2300 2301 if (!should_submit(SWITCH_TASK_NS, p.event)) 2302 return 0; 2303 2304 struct task_struct *task = (struct task_struct *) PT_REGS_PARM1(ctx); 2305 struct nsproxy *new = (struct nsproxy *) PT_REGS_PARM2(ctx); 2306 2307 if (!new) 2308 return 0; 2309 2310 pid_t pid = BPF_CORE_READ(task, pid); 2311 u32 old_mnt = p.event->context.task.mnt_id; 2312 u32 new_mnt = get_mnt_ns_id(new); 2313 u32 old_pid = get_task_pid_ns_for_children_id(task); 2314 u32 new_pid = get_pid_ns_for_children_id(new); 2315 u32 old_uts = get_task_uts_ns_id(task); 2316 u32 new_uts = get_uts_ns_id(new); 2317 u32 old_ipc = get_task_ipc_ns_id(task); 2318 u32 new_ipc = get_ipc_ns_id(new); 2319 u32 old_net = get_task_net_ns_id(task); 2320 u32 new_net = get_net_ns_id(new); 2321 u32 old_cgroup = get_task_cgroup_ns_id(task); 2322 u32 new_cgroup = get_cgroup_ns_id(new); 2323 2324 save_to_submit_buf(&p.event->args_buf, (void *) &pid, sizeof(int), 0); 2325 2326 if (old_mnt != new_mnt) 2327 save_to_submit_buf(&p.event->args_buf, (void *) &new_mnt, sizeof(u32), 1); 2328 if (old_pid != new_pid) 2329 save_to_submit_buf(&p.event->args_buf, (void *) &new_pid, sizeof(u32), 2); 2330 if (old_uts != new_uts) 2331 save_to_submit_buf(&p.event->args_buf, (void *) &new_uts, sizeof(u32), 3); 2332 if (old_ipc != new_ipc) 2333 save_to_submit_buf(&p.event->args_buf, (void *) &new_ipc, sizeof(u32), 4); 2334 if (old_net != new_net) 2335 save_to_submit_buf(&p.event->args_buf, (void *) &new_net, sizeof(u32), 5); 2336 if (old_cgroup != new_cgroup) 2337 save_to_submit_buf(&p.event->args_buf, (void *) &new_cgroup, sizeof(u32), 6); 2338 if (p.event->args_buf.argnum > 1) 2339 events_perf_submit(&p, SWITCH_TASK_NS, 0); 2340 2341 return 0; 2342 } 2343 2344 SEC("kprobe/cap_capable") 2345 int BPF_KPROBE(trace_cap_capable) 2346 { 2347 program_data_t p = {}; 2348 if (!init_program_data(&p, ctx)) 2349 return 0; 2350 2351 if (!should_trace(&p)) 2352 return 0; 2353 2354 if (!should_submit(CAP_CAPABLE, p.event)) 2355 return 0; 2356 2357 int cap = PT_REGS_PARM3(ctx); 2358 int cap_opt = PT_REGS_PARM4(ctx); 2359 2360 if (cap_opt & CAP_OPT_NOAUDIT) 2361 return 0; 2362 2363 save_to_submit_buf(&p.event->args_buf, (void *) &cap, sizeof(int), 0); 2364 2365 return events_perf_submit(&p, CAP_CAPABLE, 0); 2366 } 2367 2368 SEC("kprobe/security_socket_create") 2369 int BPF_KPROBE(trace_security_socket_create) 2370 { 2371 program_data_t p = {}; 2372 if (!init_program_data(&p, ctx)) 2373 return 0; 2374 2375 if (!should_trace(&p)) 2376 return 0; 2377 2378 if (!should_submit(SECURITY_SOCKET_CREATE, p.event)) 2379 return 0; 2380 2381 int family = (int) PT_REGS_PARM1(ctx); 2382 int type = (int) PT_REGS_PARM2(ctx); 2383 int protocol = (int) PT_REGS_PARM3(ctx); 2384 int kern = (int) PT_REGS_PARM4(ctx); 2385 2386 save_to_submit_buf(&p.event->args_buf, (void *) &family, sizeof(int), 0); 2387 save_to_submit_buf(&p.event->args_buf, (void *) &type, sizeof(int), 1); 2388 save_to_submit_buf(&p.event->args_buf, (void *) &protocol, sizeof(int), 2); 2389 save_to_submit_buf(&p.event->args_buf, (void *) &kern, sizeof(int), 3); 2390 2391 return events_perf_submit(&p, SECURITY_SOCKET_CREATE, 0); 2392 } 2393 2394 
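// Most of the kprobe handlers in this file follow the same skeleton; a minimal sketch for
// reference (SOME_EVENT and some_lsm_hook are placeholders, not identifiers from this
// project):
//
//     SEC("kprobe/some_lsm_hook")
//     int BPF_KPROBE(trace_some_lsm_hook)
//     {
//         program_data_t p = {};
//         if (!init_program_data(&p, ctx))         // scratch event buffer + task context
//             return 0;
//         if (!should_trace(&p))                   // scope filters
//             return 0;
//         if (!should_submit(SOME_EVENT, p.event)) // per-event enable check
//             return 0;
//         // read hook args via PT_REGS_PARMn(ctx) / BPF_CORE_READ(), save them with
//         // save_*_to_buf(&p.event->args_buf, ..., arg_index), then:
//         return events_perf_submit(&p, SOME_EVENT, 0);
//     }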
SEC("kprobe/security_inode_symlink") 2395 int BPF_KPROBE(trace_security_inode_symlink) 2396 { 2397 program_data_t p = {}; 2398 if (!init_program_data(&p, ctx)) 2399 return 0; 2400 2401 if (!should_trace(&p)) 2402 return 0; 2403 2404 if (!should_submit(SECURITY_INODE_SYMLINK, p.event)) 2405 return 0; 2406 2407 // struct inode *dir = (struct inode *)PT_REGS_PARM1(ctx); 2408 struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx); 2409 const char *old_name = (const char *) PT_REGS_PARM3(ctx); 2410 2411 void *dentry_path = get_dentry_path_str(dentry); 2412 2413 save_str_to_buf(&p.event->args_buf, dentry_path, 0); 2414 save_str_to_buf(&p.event->args_buf, (void *) old_name, 1); 2415 2416 return events_perf_submit(&p, SECURITY_INODE_SYMLINK, 0); 2417 } 2418 2419 SEC("kprobe/proc_create") 2420 int BPF_KPROBE(trace_proc_create) 2421 { 2422 program_data_t p = {}; 2423 if (!init_program_data(&p, ctx)) 2424 return 0; 2425 2426 if (!should_trace((&p))) 2427 return 0; 2428 2429 if (!should_submit(PROC_CREATE, p.event)) 2430 return 0; 2431 2432 char *name = (char *) PT_REGS_PARM1(ctx); 2433 unsigned long proc_ops_addr = (unsigned long) PT_REGS_PARM4(ctx); 2434 2435 save_str_to_buf(&p.event->args_buf, name, 0); 2436 save_to_submit_buf(&p.event->args_buf, (void *) &proc_ops_addr, sizeof(u64), 1); 2437 2438 return events_perf_submit(&p, PROC_CREATE, 0); 2439 } 2440 2441 SEC("kprobe/debugfs_create_file") 2442 int BPF_KPROBE(trace_debugfs_create_file) 2443 { 2444 program_data_t p = {}; 2445 if (!init_program_data(&p, ctx)) 2446 return 0; 2447 2448 if (!should_trace((&p))) 2449 return 0; 2450 2451 if (!should_submit(DEBUGFS_CREATE_FILE, p.event)) 2452 return 0; 2453 2454 char *name = (char *) PT_REGS_PARM1(ctx); 2455 mode_t mode = (unsigned short) PT_REGS_PARM2(ctx); 2456 struct dentry *dentry = (struct dentry *) PT_REGS_PARM3(ctx); 2457 void *dentry_path = get_dentry_path_str(dentry); 2458 unsigned long proc_ops_addr = (unsigned long) PT_REGS_PARM5(ctx); 2459 2460 save_str_to_buf(&p.event->args_buf, name, 0); 2461 save_str_to_buf(&p.event->args_buf, dentry_path, 1); 2462 save_to_submit_buf(&p.event->args_buf, &mode, sizeof(mode_t), 2); 2463 save_to_submit_buf(&p.event->args_buf, (void *) &proc_ops_addr, sizeof(u64), 3); 2464 2465 return events_perf_submit(&p, DEBUGFS_CREATE_FILE, 0); 2466 } 2467 2468 SEC("kprobe/debugfs_create_dir") 2469 int BPF_KPROBE(trace_debugfs_create_dir) 2470 { 2471 program_data_t p = {}; 2472 if (!init_program_data(&p, ctx)) 2473 return 0; 2474 2475 if (!should_trace((&p))) 2476 return 0; 2477 2478 if (!should_submit(DEBUGFS_CREATE_DIR, p.event)) 2479 return 0; 2480 2481 char *name = (char *) PT_REGS_PARM1(ctx); 2482 struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx); 2483 void *dentry_path = get_dentry_path_str(dentry); 2484 2485 save_str_to_buf(&p.event->args_buf, name, 0); 2486 save_str_to_buf(&p.event->args_buf, dentry_path, 1); 2487 2488 return events_perf_submit(&p, DEBUGFS_CREATE_DIR, 0); 2489 } 2490 2491 SEC("kprobe/security_socket_listen") 2492 int BPF_KPROBE(trace_security_socket_listen) 2493 { 2494 program_data_t p = {}; 2495 if (!init_program_data(&p, ctx)) 2496 return 0; 2497 2498 if (!should_trace(&p)) 2499 return 0; 2500 2501 if (!should_submit(SECURITY_SOCKET_LISTEN, p.event)) 2502 return 0; 2503 2504 struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx); 2505 int backlog = (int) PT_REGS_PARM2(ctx); 2506 2507 // Load the arguments given to the listen syscall (which eventually invokes this function) 2508 syscall_data_t *sys = 
&p.task_info->syscall_data; 2509 if (!p.task_info->syscall_traced) 2510 return 0; 2511 2512 switch (sys->id) { 2513 case SYSCALL_LISTEN: 2514 save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0); 2515 break; 2516 #if defined(bpf_target_x86) // armhf makes use of SYSCALL_LISTEN 2517 case SYSCALL_SOCKETCALL: 2518 save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0); 2519 break; 2520 #endif 2521 default: 2522 return 0; 2523 } 2524 2525 save_sockaddr_to_buf(&p.event->args_buf, sock, 1); 2526 save_to_submit_buf(&p.event->args_buf, (void *) &backlog, sizeof(int), 2); 2527 2528 return events_perf_submit(&p, SECURITY_SOCKET_LISTEN, 0); 2529 } 2530 2531 SEC("kprobe/security_socket_connect") 2532 int BPF_KPROBE(trace_security_socket_connect) 2533 { 2534 program_data_t p = {}; 2535 if (!init_program_data(&p, ctx)) 2536 return 0; 2537 2538 if (!should_trace(&p)) 2539 return 0; 2540 2541 if (!should_submit(SECURITY_SOCKET_CONNECT, p.event)) 2542 return 0; 2543 2544 u64 addr_len = PT_REGS_PARM3(ctx); 2545 2546 struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx); 2547 if (!sock) 2548 return 0; 2549 2550 struct sockaddr *address = (struct sockaddr *) PT_REGS_PARM2(ctx); 2551 if (!address) 2552 return 0; 2553 2554 // Check if the socket type is supported. 2555 u32 type = BPF_CORE_READ(sock, type); 2556 switch (type) { 2557 // TODO: case SOCK_DCCP: 2558 case SOCK_DGRAM: 2559 case SOCK_SEQPACKET: 2560 case SOCK_STREAM: 2561 break; 2562 default: 2563 return 0; 2564 } 2565 2566 // Check if the socket family is supported. 2567 sa_family_t sa_fam = get_sockaddr_family(address); 2568 switch (sa_fam) { 2569 case AF_INET: 2570 case AF_INET6: 2571 case AF_UNIX: 2572 break; 2573 default: 2574 return 0; 2575 } 2576 2577 // Load args given to the syscall that invoked this function. 2578 syscall_data_t *sys = &p.task_info->syscall_data; 2579 if (!p.task_info->syscall_traced) 2580 return 0; 2581 2582 // Reduce line cols by having a few temp pointers. 2583 int (*stsb)(args_buffer_t *, void *, u32, u8) = save_to_submit_buf; 2584 void *args_buf = &p.event->args_buf; 2585 void *to = (void *) &sys->args.args[0]; 2586 2587 if (is_x86_compat(p.task)) // only i386 binaries uses socketcall 2588 to = (void *) sys->args.args[1]; 2589 2590 // Save the socket fd, depending on the syscall. 2591 switch (sys->id) { 2592 case SYSCALL_CONNECT: 2593 case SYSCALL_SOCKETCALL: 2594 break; 2595 default: 2596 return 0; 2597 } 2598 2599 // Save the socket fd argument to the event. 2600 stsb(args_buf, to, sizeof(u32), 0); 2601 2602 // Save the socket type argument to the event. 2603 stsb(args_buf, &type, sizeof(u32), 1); 2604 2605 bool need_workaround = false; 2606 2607 // Save the sockaddr struct, depending on the family. 2608 size_t sockaddr_len = 0; 2609 switch (sa_fam) { 2610 case AF_INET: 2611 sockaddr_len = sizeof(struct sockaddr_in); 2612 break; 2613 case AF_INET6: 2614 sockaddr_len = sizeof(struct sockaddr_in6); 2615 break; 2616 case AF_UNIX: 2617 sockaddr_len = sizeof(struct sockaddr_un); 2618 if (addr_len < sockaddr_len) 2619 need_workaround = true; 2620 2621 break; 2622 } 2623 2624 #if defined(bpf_target_x86) 2625 if (need_workaround) { 2626 // Workaround for sockaddr_un struct length (issue: #1129). 2627 struct sockaddr_un sockaddr = {0}; 2628 bpf_probe_read(&sockaddr, (u32) addr_len, (void *) address); 2629 stsb(args_buf, (void *) &sockaddr, sizeof(struct sockaddr_un), 2); 2630 } 2631 #endif 2632 2633 // Save the sockaddr struct argument to the event. 
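// In the common case the struct is copied directly from the caller's pointer using the
// family-specific length chosen above; the short AF_UNIX address case on x86 was already
// handled above by copying into a zeroed sockaddr_un.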
2634 if (!need_workaround) { 2635 stsb(args_buf, (void *) address, sockaddr_len, 2); 2636 } 2637 2638 // Submit the event. 2639 return events_perf_submit(&p, SECURITY_SOCKET_CONNECT, 0); 2640 } 2641 2642 SEC("kprobe/security_socket_accept") 2643 int BPF_KPROBE(trace_security_socket_accept) 2644 { 2645 program_data_t p = {}; 2646 if (!init_program_data(&p, ctx)) 2647 return 0; 2648 2649 if (!should_trace(&p)) 2650 return 0; 2651 2652 struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx); 2653 struct socket *new_sock = (struct socket *) PT_REGS_PARM2(ctx); 2654 syscall_data_t *sys = &p.task_info->syscall_data; 2655 2656 // save sockets for "socket_accept event" 2657 if (should_submit(SOCKET_ACCEPT, p.event)) { 2658 args_t args = {}; 2659 args.args[0] = (unsigned long) sock; 2660 args.args[1] = (unsigned long) new_sock; 2661 args.args[2] = sys->args.args[0]; // sockfd 2662 save_args(&args, SOCKET_ACCEPT); 2663 } 2664 2665 if (!should_submit(SECURITY_SOCKET_ACCEPT, p.event)) 2666 return 0; 2667 2668 // Load the arguments given to the accept syscall (which eventually invokes this function) 2669 if (!p.task_info->syscall_traced || (sys->id != SYSCALL_ACCEPT && sys->id != SYSCALL_ACCEPT4)) 2670 return 0; 2671 2672 switch (sys->id) { 2673 case SYSCALL_ACCEPT: 2674 case SYSCALL_ACCEPT4: 2675 save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0); 2676 break; 2677 #if defined(bpf_target_x86) // armhf makes use of SYSCALL_ACCEPT/4 2678 case SYSCALL_SOCKETCALL: 2679 save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0); 2680 break; 2681 #endif 2682 default: 2683 return 0; 2684 } 2685 2686 save_sockaddr_to_buf(&p.event->args_buf, sock, 1); 2687 2688 return events_perf_submit(&p, SECURITY_SOCKET_ACCEPT, 0); 2689 } 2690 2691 SEC("kprobe/security_socket_bind") 2692 int BPF_KPROBE(trace_security_socket_bind) 2693 { 2694 program_data_t p = {}; 2695 if (!init_program_data(&p, ctx)) 2696 return 0; 2697 2698 if (!should_trace(&p)) 2699 return 0; 2700 2701 if (!should_submit(SECURITY_SOCKET_BIND, p.event)) 2702 return 0; 2703 2704 struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx); 2705 struct sock *sk = get_socket_sock(sock); 2706 2707 struct sockaddr *address = (struct sockaddr *) PT_REGS_PARM2(ctx); 2708 #if defined(__TARGET_ARCH_x86) // TODO: issue: #1129 2709 uint addr_len = (uint) PT_REGS_PARM3(ctx); 2710 #endif 2711 2712 sa_family_t sa_fam = get_sockaddr_family(address); 2713 if ((sa_fam != AF_INET) && (sa_fam != AF_INET6) && (sa_fam != AF_UNIX)) { 2714 return 0; 2715 } 2716 2717 // Load the arguments given to the bind syscall (which eventually invokes this function) 2718 syscall_data_t *sys = &p.task_info->syscall_data; 2719 if (!p.task_info->syscall_traced || sys->id != SYSCALL_BIND) 2720 return 0; 2721 2722 switch (sys->id) { 2723 case SYSCALL_BIND: 2724 save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0); 2725 break; 2726 #if defined(bpf_target_x86) // armhf makes use of SYSCALL_BIND 2727 case SYSCALL_SOCKETCALL: 2728 save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0); 2729 break; 2730 #endif 2731 default: 2732 return 0; 2733 } 2734 2735 u16 protocol = get_sock_protocol(sk); 2736 net_id_t connect_id = {0}; 2737 connect_id.protocol = protocol; 2738 2739 if (sa_fam == AF_INET) { 2740 save_to_submit_buf(&p.event->args_buf, (void *) address, sizeof(struct sockaddr_in), 1); 2741 2742 struct sockaddr_in *addr = (struct sockaddr_in *) address; 2743 2744 if (protocol == 
IPPROTO_UDP && BPF_CORE_READ(addr, sin_port)) { 2745 connect_id.address.s6_addr32[3] = BPF_CORE_READ(addr, sin_addr).s_addr; 2746 connect_id.address.s6_addr16[5] = 0xffff; 2747 connect_id.port = BPF_CORE_READ(addr, sin_port); 2748 } 2749 } else if (sa_fam == AF_INET6) { 2750 save_to_submit_buf(&p.event->args_buf, (void *) address, sizeof(struct sockaddr_in6), 1); 2751 2752 struct sockaddr_in6 *addr = (struct sockaddr_in6 *) address; 2753 2754 if (protocol == IPPROTO_UDP && BPF_CORE_READ(addr, sin6_port)) { 2755 connect_id.address = BPF_CORE_READ(addr, sin6_addr); 2756 connect_id.port = BPF_CORE_READ(addr, sin6_port); 2757 } 2758 } else if (sa_fam == AF_UNIX) { 2759 #if defined(__TARGET_ARCH_x86) // TODO: this is broken in arm64 (issue: #1129) 2760 if (addr_len <= sizeof(struct sockaddr_un)) { 2761 struct sockaddr_un sockaddr = {}; 2762 bpf_probe_read(&sockaddr, addr_len, (void *) address); 2763 save_to_submit_buf( 2764 &p.event->args_buf, (void *) &sockaddr, sizeof(struct sockaddr_un), 1); 2765 } else 2766 #endif 2767 save_to_submit_buf(&p.event->args_buf, (void *) address, sizeof(struct sockaddr_un), 1); 2768 } 2769 2770 return events_perf_submit(&p, SECURITY_SOCKET_BIND, 0); 2771 } 2772 2773 SEC("kprobe/security_socket_setsockopt") 2774 int BPF_KPROBE(trace_security_socket_setsockopt) 2775 { 2776 program_data_t p = {}; 2777 if (!init_program_data(&p, ctx)) 2778 return 0; 2779 2780 if (!should_trace(&p)) 2781 return 0; 2782 2783 if (!should_submit(SECURITY_SOCKET_SETSOCKOPT, p.event)) 2784 return 0; 2785 2786 struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx); 2787 int level = (int) PT_REGS_PARM2(ctx); 2788 int optname = (int) PT_REGS_PARM3(ctx); 2789 2790 // Load the arguments given to the setsockopt syscall (which eventually invokes this function) 2791 syscall_data_t *sys = &p.task_info->syscall_data; 2792 if (sys == NULL) { 2793 return -1; 2794 } 2795 2796 if (!p.task_info->syscall_traced || sys->id != SYSCALL_SETSOCKOPT) 2797 return 0; 2798 2799 switch (sys->id) { 2800 case SYSCALL_SETSOCKOPT: 2801 save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0); 2802 break; 2803 #if defined(bpf_target_x86) // armhf makes use of SYSCALL_SETSOCKOPT 2804 case SYSCALL_SOCKETCALL: 2805 save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0); 2806 break; 2807 #endif 2808 default: 2809 return 0; 2810 } 2811 2812 save_to_submit_buf(&p.event->args_buf, (void *) &level, sizeof(int), 1); 2813 save_to_submit_buf(&p.event->args_buf, (void *) &optname, sizeof(int), 2); 2814 save_sockaddr_to_buf(&p.event->args_buf, sock, 3); 2815 2816 return events_perf_submit(&p, SECURITY_SOCKET_SETSOCKOPT, 0); 2817 } 2818 2819 enum bin_type_e { 2820 SEND_VFS_WRITE = 1, 2821 SEND_MPROTECT, 2822 SEND_KERNEL_MODULE, 2823 SEND_BPF_OBJECT, 2824 SEND_VFS_READ 2825 }; 2826 2827 statfunc u32 tail_call_send_bin(void *ctx, program_data_t *p, bin_args_t *bin_args, int tail_call) 2828 { 2829 if (p->event->args_buf.offset < ARGS_BUF_SIZE - sizeof(bin_args_t)) { 2830 bpf_probe_read_kernel( 2831 &(p->event->args_buf.args[p->event->args_buf.offset]), sizeof(bin_args_t), bin_args); 2832 if (tail_call == TAIL_SEND_BIN) 2833 bpf_tail_call(ctx, &prog_array, tail_call); 2834 else if (tail_call == TAIL_SEND_BIN_TP) 2835 bpf_tail_call(ctx, &prog_array_tp, tail_call); 2836 } 2837 2838 return 0; 2839 } 2840 2841 statfunc u32 send_bin_helper(void *ctx, void *prog_array, int tail_call) 2842 { 2843 // Note: sending the data to the userspace have the following constraints: 2844 // 2845 // 
1. We need a buffer whose exact size we know 2846 // (so we can send chunks of known sizes in BPF) 2847 // 2. We can have multiple cpus - need percpu array 2848 // 3. We have to use perf submit and not maps as data 2849 // can be overridden if userspace doesn't consume 2850 // it fast enough 2851 2852 int i = 0; 2853 unsigned int chunk_size; 2854 u32 zero = 0; 2855 2856 event_data_t *event = bpf_map_lookup_elem(&event_data_map, &zero); 2857 if (!event || (event->args_buf.offset > ARGS_BUF_SIZE - sizeof(bin_args_t))) 2858 return 0; 2859 2860 bin_args_t *bin_args = (bin_args_t *) &(event->args_buf.args[event->args_buf.offset]); 2861 2862 if (bin_args->full_size <= 0) { 2863 // If there are more vector elements, continue to the next one 2864 bin_args->iov_idx++; 2865 if (bin_args->iov_idx < bin_args->iov_len) { 2866 // Handle the rest of write recursively 2867 bin_args->start_off += bin_args->full_size; 2868 struct iovec io_vec; 2869 bpf_probe_read(&io_vec, sizeof(struct iovec), &bin_args->vec[bin_args->iov_idx]); 2870 bin_args->ptr = io_vec.iov_base; 2871 bin_args->full_size = io_vec.iov_len; 2872 bpf_tail_call(ctx, prog_array, tail_call); 2873 } 2874 return 0; 2875 } 2876 2877 buf_t *file_buf_p = get_buf(FILE_BUF_IDX); 2878 if (file_buf_p == NULL) 2879 return 0; 2880 2881 #define F_SEND_TYPE 0 2882 #define F_CGROUP_ID (F_SEND_TYPE + sizeof(u8)) 2883 #define F_META_OFF (F_CGROUP_ID + sizeof(u64)) 2884 #define F_SZ_OFF (F_META_OFF + SEND_META_SIZE) 2885 #define F_POS_OFF (F_SZ_OFF + sizeof(unsigned int)) 2886 #define F_CHUNK_OFF (F_POS_OFF + sizeof(off_t)) 2887 #define F_CHUNK_SIZE (MAX_PERCPU_BUFSIZE >> 1) 2888 2889 bpf_probe_read_kernel((void **) &(file_buf_p->buf[F_SEND_TYPE]), sizeof(u8), &bin_args->type); 2890 2891 u64 cgroup_id = event->context.task.cgroup_id; 2892 bpf_probe_read_kernel((void **) &(file_buf_p->buf[F_CGROUP_ID]), sizeof(u64), &cgroup_id); 2893 2894 // Save metadata to be used in filename 2895 bpf_probe_read_kernel( 2896 (void **) &(file_buf_p->buf[F_META_OFF]), SEND_META_SIZE, bin_args->metadata); 2897 2898 // Save number of written bytes. 
Set this to CHUNK_SIZE for full chunks 2899 chunk_size = F_CHUNK_SIZE; 2900 bpf_probe_read_kernel( 2901 (void **) &(file_buf_p->buf[F_SZ_OFF]), sizeof(unsigned int), &chunk_size); 2902 2903 unsigned int full_chunk_num = bin_args->full_size / F_CHUNK_SIZE; 2904 void *data = file_buf_p->buf; 2905 2906 // Handle full chunks in loop 2907 #pragma unroll 2908 for (i = 0; i < MAX_BIN_CHUNKS; i++) { 2909 // Dummy instruction, as break instruction can't be first with unroll optimization 2910 chunk_size = F_CHUNK_SIZE; 2911 2912 if (i == full_chunk_num) 2913 break; 2914 2915 // Save binary chunk and file position of write 2916 bpf_probe_read_kernel( 2917 (void **) &(file_buf_p->buf[F_POS_OFF]), sizeof(off_t), &bin_args->start_off); 2918 bpf_probe_read_kernel( 2919 (void **) &(file_buf_p->buf[F_CHUNK_OFF]), F_CHUNK_SIZE, bin_args->ptr); 2920 bin_args->ptr += F_CHUNK_SIZE; 2921 bin_args->start_off += F_CHUNK_SIZE; 2922 2923 bpf_perf_event_output( 2924 ctx, &file_writes, BPF_F_CURRENT_CPU, data, F_CHUNK_OFF + F_CHUNK_SIZE); 2925 } 2926 2927 chunk_size = bin_args->full_size - i * F_CHUNK_SIZE; 2928 2929 if (chunk_size > F_CHUNK_SIZE) { 2930 // Handle the rest of write recursively 2931 bin_args->full_size = chunk_size; 2932 bpf_tail_call(ctx, prog_array, tail_call); 2933 return 0; 2934 } 2935 2936 if (chunk_size) { 2937 // Save last chunk 2938 chunk_size = chunk_size & ((MAX_PERCPU_BUFSIZE >> 1) - 1); 2939 bpf_probe_read_kernel((void **) &(file_buf_p->buf[F_CHUNK_OFF]), chunk_size, bin_args->ptr); 2940 bpf_probe_read_kernel( 2941 (void **) &(file_buf_p->buf[F_SZ_OFF]), sizeof(unsigned int), &chunk_size); 2942 bpf_probe_read_kernel( 2943 (void **) &(file_buf_p->buf[F_POS_OFF]), sizeof(off_t), &bin_args->start_off); 2944 2945 // Satisfy validator by setting buffer bounds 2946 int size = (F_CHUNK_OFF + chunk_size) & (MAX_PERCPU_BUFSIZE - 1); 2947 bpf_perf_event_output(ctx, &file_writes, BPF_F_CURRENT_CPU, data, size); 2948 } 2949 2950 // We finished writing an element of the vector - continue to next element 2951 bin_args->iov_idx++; 2952 if (bin_args->iov_idx < bin_args->iov_len) { 2953 // Handle the rest of write recursively 2954 bin_args->start_off += bin_args->full_size; 2955 struct iovec io_vec; 2956 bpf_probe_read(&io_vec, sizeof(struct iovec), &bin_args->vec[bin_args->iov_idx]); 2957 bin_args->ptr = io_vec.iov_base; 2958 bin_args->full_size = io_vec.iov_len; 2959 bpf_tail_call(ctx, prog_array, tail_call); 2960 } 2961 2962 return 0; 2963 } 2964 2965 SEC("kprobe/send_bin") 2966 int BPF_KPROBE(send_bin) 2967 { 2968 return send_bin_helper(ctx, &prog_array, TAIL_SEND_BIN); 2969 } 2970 2971 SEC("raw_tracepoint/send_bin_tp") 2972 int send_bin_tp(void *ctx) 2973 { 2974 return send_bin_helper(ctx, &prog_array_tp, TAIL_SEND_BIN_TP); 2975 } 2976 2977 statfunc bool should_submit_io_event(u32 event_id, program_data_t *p) 2978 { 2979 return ((event_id == VFS_READ || event_id == VFS_READV || event_id == VFS_WRITE || 2980 event_id == VFS_WRITEV || event_id == __KERNEL_WRITE) && 2981 should_submit(event_id, p->event)); 2982 } 2983 2984 statfunc int is_elf(io_data_t io_data, u8 header[FILE_MAGIC_HDR_SIZE]) 2985 { 2986 // ELF binaries start with a 4 byte long header 2987 if (io_data.len < 4) { 2988 return false; 2989 } 2990 2991 return header[0] == 0x7F && header[1] == 'E' && header[2] == 'L' && header[3] == 'F'; 2992 } 2993 2994 /** do_file_io_operation - generic file IO (read and write) event creator. 2995 * 2996 * @ctx: the state of the registers prior the hook. 2997 * @event_id: the ID of the event to be created. 
2998 * @tail_call_id: the ID of the tail call to be called before function return. 2999 * @is_read: true if the operation is read. False if write. 3000 * @is_buf: true if the non-file side of the operation is a buffer. False if io_vector. 3001 */ 3002 statfunc int 3003 do_file_io_operation(struct pt_regs *ctx, u32 event_id, u32 tail_call_id, bool is_read, bool is_buf) 3004 { 3005 args_t saved_args; 3006 if (load_args(&saved_args, event_id) != 0) { 3007 // missed entry or not traced 3008 return 0; 3009 } 3010 // We shouldn't call del_args(event_id) here as the arguments are also used by the tail call 3011 3012 program_data_t p = {}; 3013 if (!init_program_data(&p, ctx)) { 3014 goto out; 3015 } 3016 3017 if (!should_trace(&p)) { 3018 goto out; 3019 } 3020 3021 if (!should_submit_io_event(event_id, &p)) { 3022 goto tail; 3023 } 3024 3025 loff_t start_pos; 3026 io_data_t io_data; 3027 file_info_t file_info; 3028 3029 struct file *file = (struct file *) saved_args.args[0]; 3030 file_info.pathname_p = get_path_str_cached(file); 3031 3032 io_data.is_buf = is_buf; 3033 io_data.ptr = (void *) saved_args.args[1]; 3034 io_data.len = (unsigned long) saved_args.args[2]; 3035 loff_t *pos = (loff_t *) saved_args.args[3]; 3036 3037 // Extract device id, inode number, and pos (offset) 3038 file_info.id.device = get_dev_from_file(file); 3039 file_info.id.inode = get_inode_nr_from_file(file); 3040 bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos); 3041 3042 u32 io_bytes_amount = PT_REGS_RC(ctx); 3043 3044 // Calculate write start offset 3045 if (start_pos != 0) 3046 start_pos -= io_bytes_amount; 3047 3048 save_str_to_buf(&p.event->args_buf, file_info.pathname_p, 0); 3049 save_to_submit_buf(&p.event->args_buf, &file_info.id.device, sizeof(dev_t), 1); 3050 save_to_submit_buf(&p.event->args_buf, &file_info.id.inode, sizeof(unsigned long), 2); 3051 save_to_submit_buf(&p.event->args_buf, &io_data.len, sizeof(unsigned long), 3); 3052 save_to_submit_buf(&p.event->args_buf, &start_pos, sizeof(off_t), 4); 3053 3054 // Submit io event 3055 events_perf_submit(&p, event_id, PT_REGS_RC(ctx)); 3056 3057 tail: 3058 bpf_tail_call(ctx, &prog_array, tail_call_id); 3059 out: 3060 del_args(event_id); 3061 3062 return 0; 3063 } 3064 3065 statfunc void 3066 extract_vfs_ret_io_data(struct pt_regs *ctx, args_t *saved_args, io_data_t *io_data, bool is_buf) 3067 { 3068 io_data->is_buf = is_buf; 3069 if (is_buf) { 3070 io_data->ptr = (void *) saved_args->args[1]; 3071 io_data->len = (size_t) PT_REGS_RC(ctx); 3072 } else { 3073 io_data->ptr = (struct iovec *) saved_args->args[1]; 3074 io_data->len = saved_args->args[2]; 3075 } 3076 } 3077 3078 // Filter capture of file writes according to path prefix, type and fd. 3079 statfunc bool 3080 filter_file_write_capture(program_data_t *p, struct file *file, io_data_t io_data, off_t start_pos) 3081 { 3082 return filter_file_path(p->ctx, &file_write_path_filter, file) || 3083 filter_file_type(p->ctx, 3084 &file_type_filter, 3085 CAPTURE_WRITE_TYPE_FILTER_IDX, 3086 file, 3087 io_data, 3088 start_pos) || 3089 filter_file_fd(p->ctx, &file_type_filter, CAPTURE_WRITE_TYPE_FILTER_IDX, file); 3090 } 3091 3092 // Capture file write 3093 // Will only capture if: 3094 // 1. File write capture was configured 3095 // 2. 
File matches the filters given 3096 statfunc int capture_file_write(struct pt_regs *ctx, u32 event_id, bool is_buf) 3097 { 3098 args_t saved_args; 3099 io_data_t io_data; 3100 3101 if (load_args(&saved_args, event_id) != 0) 3102 return 0; 3103 del_args(event_id); 3104 3105 program_data_t p = {}; 3106 if (!init_program_data(&p, ctx)) 3107 return 0; 3108 3109 if ((p.config->options & OPT_CAPTURE_FILES_WRITE) == 0) 3110 return 0; 3111 3112 extract_vfs_ret_io_data(ctx, &saved_args, &io_data, is_buf); 3113 struct file *file = (struct file *) saved_args.args[0]; 3114 loff_t *pos = (loff_t *) saved_args.args[3]; 3115 size_t written_bytes = PT_REGS_RC(ctx); 3116 3117 off_t start_pos; 3118 bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos); 3119 // Calculate write start offset 3120 if (start_pos != 0) 3121 start_pos -= written_bytes; 3122 3123 if (filter_file_write_capture(&p, file, io_data, start_pos)) { 3124 // There is a filter, but no match 3125 return 0; 3126 } 3127 // No filter was given, or filter match - continue 3128 3129 // Because we don't pass the file path in the capture map, we can't do path checks in user mode. 3130 // We don't want to pass the PID for most file writes, because we want to save writes according 3131 // to the inode-device only. In the case of writes to /dev/null, we want to pass the PID because 3132 // otherwise the capture will overwrite itself. 3133 int pid = 0; 3134 void *path_buf = get_path_str_cached(file); 3135 if (path_buf != NULL && has_prefix("/dev/null", (char *) path_buf, 10)) { 3136 pid = p.event->context.task.pid; 3137 } 3138 3139 bin_args_t bin_args = {}; 3140 fill_vfs_file_bin_args(SEND_VFS_WRITE, file, pos, io_data, PT_REGS_RC(ctx), pid, &bin_args); 3141 3142 // Send file data 3143 tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN); 3144 return 0; 3145 } 3146 3147 // Filter capture of file reads according to path prefix, type and fd. 
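// As with the write-side filter above, a true return value means a capture filter is
// configured and this file does not match it, so the caller skips the capture.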
3148 statfunc bool 3149 filter_file_read_capture(program_data_t *p, struct file *file, io_data_t io_data, off_t start_pos) 3150 { 3151 return filter_file_path(p->ctx, &file_read_path_filter, file) || 3152 filter_file_type( 3153 p->ctx, &file_type_filter, CAPTURE_READ_TYPE_FILTER_IDX, file, io_data, start_pos) || 3154 filter_file_fd(p->ctx, &file_type_filter, CAPTURE_READ_TYPE_FILTER_IDX, file); 3155 } 3156 3157 statfunc int capture_file_read(struct pt_regs *ctx, u32 event_id, bool is_buf) 3158 { 3159 args_t saved_args; 3160 io_data_t io_data; 3161 3162 if (load_args(&saved_args, event_id) != 0) 3163 return 0; 3164 del_args(event_id); 3165 3166 program_data_t p = {}; 3167 if (!init_program_data(&p, ctx)) 3168 return 0; 3169 3170 if ((p.config->options & OPT_CAPTURE_FILES_READ) == 0) 3171 return 0; 3172 3173 extract_vfs_ret_io_data(ctx, &saved_args, &io_data, is_buf); 3174 struct file *file = (struct file *) saved_args.args[0]; 3175 loff_t *pos = (loff_t *) saved_args.args[3]; 3176 size_t read_bytes = PT_REGS_RC(ctx); 3177 3178 off_t start_pos; 3179 bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos); 3180 // Calculate write start offset 3181 if (start_pos != 0) 3182 start_pos -= read_bytes; 3183 3184 if (filter_file_read_capture(&p, file, io_data, start_pos)) { 3185 // There is a filter, but no match 3186 return 0; 3187 } 3188 // No filter was given, or filter match - continue 3189 3190 bin_args_t bin_args = {}; 3191 u64 id = bpf_get_current_pid_tgid(); 3192 fill_vfs_file_bin_args(SEND_VFS_READ, file, pos, io_data, PT_REGS_RC(ctx), 0, &bin_args); 3193 3194 // Send file data 3195 tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN); 3196 return 0; 3197 } 3198 3199 SEC("kprobe/vfs_write") 3200 TRACE_ENT_FUNC(vfs_write, VFS_WRITE); 3201 3202 SEC("kretprobe/vfs_write") 3203 int BPF_KPROBE(trace_ret_vfs_write) 3204 { 3205 return do_file_io_operation(ctx, VFS_WRITE, TAIL_VFS_WRITE, false, true); 3206 } 3207 3208 SEC("kretprobe/vfs_write_tail") 3209 int BPF_KPROBE(trace_ret_vfs_write_tail) 3210 { 3211 return capture_file_write(ctx, VFS_WRITE, true); 3212 } 3213 3214 SEC("kprobe/vfs_writev") 3215 TRACE_ENT_FUNC(vfs_writev, VFS_WRITEV); 3216 3217 SEC("kretprobe/vfs_writev") 3218 int BPF_KPROBE(trace_ret_vfs_writev) 3219 { 3220 return do_file_io_operation(ctx, VFS_WRITEV, TAIL_VFS_WRITEV, false, false); 3221 } 3222 3223 SEC("kretprobe/vfs_writev_tail") 3224 int BPF_KPROBE(trace_ret_vfs_writev_tail) 3225 { 3226 return capture_file_write(ctx, VFS_WRITEV, false); 3227 } 3228 3229 SEC("kprobe/__kernel_write") 3230 TRACE_ENT_FUNC(kernel_write, __KERNEL_WRITE); 3231 3232 SEC("kretprobe/__kernel_write") 3233 int BPF_KPROBE(trace_ret_kernel_write) 3234 { 3235 return do_file_io_operation(ctx, __KERNEL_WRITE, TAIL_KERNEL_WRITE, false, true); 3236 } 3237 3238 SEC("kretprobe/__kernel_write_tail") 3239 int BPF_KPROBE(trace_ret_kernel_write_tail) 3240 { 3241 return capture_file_write(ctx, __KERNEL_WRITE, true); 3242 } 3243 3244 SEC("kprobe/vfs_read") 3245 TRACE_ENT_FUNC(vfs_read, VFS_READ); 3246 3247 SEC("kretprobe/vfs_read") 3248 int BPF_KPROBE(trace_ret_vfs_read) 3249 { 3250 return do_file_io_operation(ctx, VFS_READ, TAIL_VFS_READ, true, true); 3251 } 3252 3253 SEC("kretprobe/vfs_read_tail") 3254 int BPF_KPROBE(trace_ret_vfs_read_tail) 3255 { 3256 return capture_file_read(ctx, VFS_READ, true); 3257 } 3258 3259 SEC("kprobe/vfs_readv") 3260 TRACE_ENT_FUNC(vfs_readv, VFS_READV); 3261 3262 SEC("kretprobe/vfs_readv") 3263 int BPF_KPROBE(trace_ret_vfs_readv) 3264 { 3265 return do_file_io_operation(ctx, VFS_READV, 
TAIL_VFS_READV, true, false); 3266 } 3267 3268 SEC("kretprobe/vfs_readv_tail") 3269 int BPF_KPROBE(trace_ret_vfs_readv_tail) 3270 { 3271 return capture_file_read(ctx, VFS_READV, false); 3272 } 3273 3274 statfunc int do_vfs_write_magic_enter(struct pt_regs *ctx) 3275 { 3276 loff_t start_pos; 3277 loff_t *pos = (loff_t *) PT_REGS_PARM4(ctx); 3278 bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos); 3279 if (start_pos != 0) { 3280 return 0; 3281 } 3282 struct file *file = (struct file *) PT_REGS_PARM1(ctx); 3283 unsigned short i_mode = get_inode_mode_from_file(file); 3284 if ((i_mode & S_IFMT) != S_IFREG) { 3285 return 0; 3286 } 3287 3288 args_t args = {}; 3289 args.args[0] = PT_REGS_PARM1(ctx); 3290 args.args[1] = PT_REGS_PARM2(ctx); 3291 args.args[2] = PT_REGS_PARM3(ctx); 3292 args.args[3] = PT_REGS_PARM4(ctx); 3293 args.args[4] = PT_REGS_PARM5(ctx); 3294 args.args[5] = PT_REGS_PARM6(ctx); 3295 3296 return save_args(&args, MAGIC_WRITE); 3297 } 3298 3299 statfunc int do_vfs_write_magic_return(struct pt_regs *ctx, bool is_buf) 3300 { 3301 args_t saved_args; 3302 if (load_args(&saved_args, MAGIC_WRITE) != 0) { 3303 // missed entry or not traced 3304 return 0; 3305 } 3306 del_args(MAGIC_WRITE); 3307 3308 program_data_t p = {}; 3309 if (!init_program_data(&p, ctx)) 3310 return 0; 3311 3312 if (!should_trace(&p)) { 3313 return 0; 3314 } 3315 3316 if (!should_submit(MAGIC_WRITE, p.event)) { 3317 return 0; 3318 } 3319 3320 u32 bytes_written = PT_REGS_RC(ctx); 3321 if (bytes_written == 0) { 3322 return 0; 3323 } 3324 3325 io_data_t io_data; 3326 file_info_t file_info; 3327 3328 struct file *file = (struct file *) saved_args.args[0]; 3329 file_info.pathname_p = get_path_str_cached(file); 3330 3331 io_data.is_buf = is_buf; 3332 io_data.ptr = (void *) saved_args.args[1]; 3333 io_data.len = (unsigned long) saved_args.args[2]; 3334 3335 // Extract device id, inode number, and pos (offset) 3336 file_info.id.device = get_dev_from_file(file); 3337 file_info.id.inode = get_inode_nr_from_file(file); 3338 3339 u32 header_bytes = FILE_MAGIC_HDR_SIZE; 3340 if (header_bytes > bytes_written) 3341 header_bytes = bytes_written; 3342 3343 u8 header[FILE_MAGIC_HDR_SIZE]; 3344 __builtin_memset(&header, 0, sizeof(header)); 3345 3346 save_str_to_buf(&(p.event->args_buf), file_info.pathname_p, 0); 3347 3348 fill_file_header(header, io_data); 3349 3350 if (!is_elf(io_data, header)) { 3351 return 0; 3352 } 3353 3354 save_bytes_to_buf(&(p.event->args_buf), header, header_bytes, 1); 3355 save_to_submit_buf(&(p.event->args_buf), &file_info.id.device, sizeof(dev_t), 2); 3356 save_to_submit_buf(&(p.event->args_buf), &file_info.id.inode, sizeof(unsigned long), 3); 3357 3358 // Submit magic_write event 3359 return events_perf_submit(&p, MAGIC_WRITE, bytes_written); 3360 } 3361 3362 SEC("kprobe/vfs_write") 3363 int BPF_KPROBE(vfs_write_magic_enter) 3364 { 3365 return do_vfs_write_magic_enter(ctx); 3366 } 3367 3368 SEC("kprobe/vfs_writev") 3369 int BPF_KPROBE(vfs_writev_magic_enter) 3370 { 3371 return do_vfs_write_magic_enter(ctx); 3372 } 3373 3374 SEC("kprobe/__kernel_write") 3375 int BPF_KPROBE(kernel_write_magic_enter) 3376 { 3377 return do_vfs_write_magic_enter(ctx); 3378 } 3379 3380 SEC("kretprobe/vfs_write") 3381 int BPF_KPROBE(vfs_write_magic_return) 3382 { 3383 return do_vfs_write_magic_return(ctx, true); 3384 } 3385 3386 SEC("kretprobe/vfs_writev") 3387 int BPF_KPROBE(vfs_writev_magic_return) 3388 { 3389 return do_vfs_write_magic_return(ctx, false); 3390 } 3391 3392 SEC("kretprobe/__kernel_write") 3393 int 
BPF_KPROBE(kernel_write_magic_return) 3394 { 3395 return do_vfs_write_magic_return(ctx, true); 3396 } 3397 // Used macro because of problem with verifier in NONCORE kinetic519 3398 #define submit_mem_prot_alert_event(event, alert, addr, len, prot, previous_prot, file_info) \ 3399 { \ 3400 save_to_submit_buf(event, &alert, sizeof(u32), 0); \ 3401 save_to_submit_buf(event, &addr, sizeof(void *), 1); \ 3402 save_to_submit_buf(event, &len, sizeof(size_t), 2); \ 3403 save_to_submit_buf(event, &prot, sizeof(int), 3); \ 3404 save_to_submit_buf(event, &previous_prot, sizeof(int), 4); \ 3405 if (file_info.pathname_p != NULL) { \ 3406 save_str_to_buf(event, file_info.pathname_p, 5); \ 3407 save_to_submit_buf(event, &file_info.id.device, sizeof(dev_t), 6); \ 3408 save_to_submit_buf(event, &file_info.id.inode, sizeof(unsigned long), 7); \ 3409 save_to_submit_buf(event, &file_info.id.ctime, sizeof(u64), 8); \ 3410 } \ 3411 events_perf_submit(&p, MEM_PROT_ALERT, 0); \ 3412 } 3413 3414 SEC("kprobe/security_mmap_addr") 3415 int BPF_KPROBE(trace_mmap_alert) 3416 { 3417 program_data_t p = {}; 3418 if (!init_program_data(&p, ctx)) 3419 return 0; 3420 3421 if (!should_trace(&p)) 3422 return 0; 3423 3424 // Load the arguments given to the mmap syscall (which eventually invokes this function) 3425 syscall_data_t *sys = &p.task_info->syscall_data; 3426 if (!p.task_info->syscall_traced || sys->id != SYSCALL_MMAP) 3427 return 0; 3428 3429 int prot = sys->args.args[2]; 3430 3431 if ((prot & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC) && 3432 should_submit(MEM_PROT_ALERT, p.event)) { 3433 u32 alert = ALERT_MMAP_W_X; 3434 int fd = sys->args.args[4]; 3435 void *addr = (void *) sys->args.args[0]; 3436 size_t len = sys->args.args[1]; 3437 int prev_prot = 0; 3438 file_info_t file_info = {.pathname_p = NULL}; 3439 if (fd >= 0) { 3440 struct file *file = get_struct_file_from_fd(fd); 3441 file_info = get_file_info(file); 3442 } 3443 submit_mem_prot_alert_event( 3444 &p.event->args_buf, alert, addr, len, prot, prev_prot, file_info); 3445 } 3446 3447 return 0; 3448 } 3449 3450 SEC("kprobe/do_mmap") 3451 TRACE_ENT_FUNC(do_mmap, DO_MMAP) 3452 3453 SEC("kretprobe/do_mmap") 3454 int BPF_KPROBE(trace_ret_do_mmap) 3455 { 3456 args_t saved_args; 3457 if (load_args(&saved_args, DO_MMAP) != 0) { 3458 // missed entry or not traced 3459 return 0; 3460 } 3461 3462 program_data_t p = {}; 3463 if (!init_program_data(&p, ctx)) 3464 return 0; 3465 3466 if (!should_trace(&p)) 3467 return 0; 3468 3469 if (!should_submit(DO_MMAP, p.event)) { 3470 return 0; 3471 } 3472 3473 dev_t s_dev; 3474 unsigned long inode_nr; 3475 void *file_path; 3476 u64 ctime; 3477 unsigned int flags; 3478 3479 struct file *file = (struct file *) saved_args.args[0]; 3480 if (file != NULL) { 3481 s_dev = get_dev_from_file(file); 3482 inode_nr = get_inode_nr_from_file(file); 3483 file_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 3484 ctime = get_ctime_nanosec_from_file(file); 3485 } 3486 unsigned long len = (unsigned long) saved_args.args[2]; 3487 unsigned long prot = (unsigned long) saved_args.args[3]; 3488 unsigned long mmap_flags = (unsigned long) saved_args.args[4]; 3489 unsigned long pgoff = (unsigned long) saved_args.args[5]; 3490 unsigned long addr = (unsigned long) PT_REGS_RC(ctx); 3491 3492 save_to_submit_buf(&p.event->args_buf, &addr, sizeof(void *), 0); 3493 if (file != NULL) { 3494 save_str_to_buf(&p.event->args_buf, file_path, 1); 3495 save_to_submit_buf(&p.event->args_buf, &flags, sizeof(unsigned int), 2); 3496 
save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 3); 3497 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 4); 3498 save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 5); 3499 } 3500 save_to_submit_buf(&p.event->args_buf, &pgoff, sizeof(unsigned long), 6); 3501 save_to_submit_buf(&p.event->args_buf, &len, sizeof(unsigned long), 7); 3502 save_to_submit_buf(&p.event->args_buf, &prot, sizeof(unsigned long), 8); 3503 save_to_submit_buf(&p.event->args_buf, &mmap_flags, sizeof(unsigned long), 9); 3504 3505 return events_perf_submit(&p, DO_MMAP, 0); 3506 } 3507 3508 SEC("kprobe/security_mmap_file") 3509 int BPF_KPROBE(trace_security_mmap_file) 3510 { 3511 program_data_t p = {}; 3512 if (!init_program_data(&p, ctx)) 3513 return 0; 3514 3515 if (!should_trace(&p)) 3516 return 0; 3517 3518 bool submit_sec_mmap_file = should_submit(SECURITY_MMAP_FILE, p.event); 3519 bool submit_shared_object_loaded = should_submit(SHARED_OBJECT_LOADED, p.event); 3520 3521 if (!submit_sec_mmap_file && !submit_shared_object_loaded) 3522 return 0; 3523 3524 struct file *file = (struct file *) PT_REGS_PARM1(ctx); 3525 if (file == 0) 3526 return 0; 3527 dev_t s_dev = get_dev_from_file(file); 3528 unsigned long inode_nr = get_inode_nr_from_file(file); 3529 void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 3530 u64 ctime = get_ctime_nanosec_from_file(file); 3531 unsigned long prot = (unsigned long) PT_REGS_PARM2(ctx); 3532 unsigned long mmap_flags = (unsigned long) PT_REGS_PARM3(ctx); 3533 3534 save_str_to_buf(&p.event->args_buf, file_path, 0); 3535 save_to_submit_buf(&p.event->args_buf, 3536 (void *) __builtin_preserve_access_index(&file->f_flags), 3537 sizeof(int), 3538 1); 3539 save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 2); 3540 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 3); 3541 save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4); 3542 3543 if (submit_shared_object_loaded) { 3544 if ((prot & VM_EXEC) == VM_EXEC && p.event->context.syscall == SYSCALL_MMAP) { 3545 events_perf_submit(&p, SHARED_OBJECT_LOADED, 0); 3546 } 3547 } 3548 3549 if (submit_sec_mmap_file) { 3550 save_to_submit_buf(&p.event->args_buf, &prot, sizeof(unsigned long), 5); 3551 save_to_submit_buf(&p.event->args_buf, &mmap_flags, sizeof(unsigned long), 6); 3552 return events_perf_submit(&p, SECURITY_MMAP_FILE, 0); 3553 } 3554 3555 return 0; 3556 } 3557 3558 SEC("kprobe/security_file_mprotect") 3559 int BPF_KPROBE(trace_security_file_mprotect) 3560 { 3561 bin_args_t bin_args = {}; 3562 3563 program_data_t p = {}; 3564 if (!init_program_data(&p, ctx)) 3565 return 0; 3566 3567 if (!should_trace(&p)) 3568 return 0; 3569 3570 // Load the arguments given to the mprotect syscall (which eventually invokes this function) 3571 syscall_data_t *sys = &p.task_info->syscall_data; 3572 if (!p.task_info->syscall_traced || 3573 (sys->id != SYSCALL_MPROTECT && sys->id != SYSCALL_PKEY_MPROTECT)) 3574 return 0; 3575 3576 int should_submit_mprotect = should_submit(SECURITY_FILE_MPROTECT, p.event); 3577 int should_submit_mem_prot_alert = should_submit(MEM_PROT_ALERT, p.event); 3578 3579 if (!should_submit_mprotect && !should_submit_mem_prot_alert) { 3580 return 0; 3581 } 3582 3583 struct vm_area_struct *vma = (struct vm_area_struct *) PT_REGS_PARM1(ctx); 3584 unsigned long reqprot = PT_REGS_PARM2(ctx); 3585 unsigned long prev_prot = get_vma_flags(vma); 3586 3587 struct file *file = (struct file *) BPF_CORE_READ(vma, vm_file); 3588 file_info_t 
file_info = get_file_info(file); 3589 3590 if (should_submit_mprotect) { 3591 void *addr = (void *) sys->args.args[0]; 3592 size_t len = sys->args.args[1]; 3593 3594 save_str_to_buf(&p.event->args_buf, file_info.pathname_p, 0); 3595 save_to_submit_buf(&p.event->args_buf, &reqprot, sizeof(int), 1); 3596 save_to_submit_buf(&p.event->args_buf, &file_info.id.ctime, sizeof(u64), 2); 3597 save_to_submit_buf(&p.event->args_buf, &prev_prot, sizeof(int), 3); 3598 save_to_submit_buf(&p.event->args_buf, &addr, sizeof(void *), 4); 3599 save_to_submit_buf(&p.event->args_buf, &len, sizeof(size_t), 5); 3600 3601 if (sys->id == SYSCALL_PKEY_MPROTECT) { 3602 int pkey = sys->args.args[3]; 3603 save_to_submit_buf(&p.event->args_buf, &pkey, sizeof(int), 6); 3604 } 3605 3606 events_perf_submit(&p, SECURITY_FILE_MPROTECT, 0); 3607 } 3608 3609 if (should_submit_mem_prot_alert) { 3610 void *addr = (void *) sys->args.args[0]; 3611 size_t len = sys->args.args[1]; 3612 3613 if (addr <= 0) 3614 return 0; 3615 3616 // If length is 0, the current page permissions are changed 3617 if (len == 0) 3618 len = PAGE_SIZE; 3619 3620 u32 alert; 3621 bool should_alert = false; 3622 bool should_extract_code = false; 3623 3624 if ((!(prev_prot & VM_EXEC)) && (reqprot & VM_EXEC)) { 3625 alert = ALERT_MPROT_X_ADD; 3626 should_alert = true; 3627 } 3628 3629 if ((prev_prot & VM_EXEC) && !(prev_prot & VM_WRITE) && 3630 ((reqprot & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC))) { 3631 alert = ALERT_MPROT_W_ADD; 3632 should_alert = true; 3633 } 3634 3635 if ((prev_prot & VM_WRITE) && (reqprot & VM_EXEC) && !(reqprot & VM_WRITE)) { 3636 alert = ALERT_MPROT_W_REM; 3637 should_alert = true; 3638 3639 if (p.config->options & OPT_EXTRACT_DYN_CODE) { 3640 should_extract_code = true; 3641 } 3642 } 3643 if (should_alert) { 3644 reset_event_args(&p); 3645 submit_mem_prot_alert_event( 3646 &p.event->args_buf, alert, addr, len, reqprot, prev_prot, file_info); 3647 } 3648 if (should_extract_code) { 3649 u32 pid = p.event->context.task.host_pid; 3650 bin_args.type = SEND_MPROTECT; 3651 bpf_probe_read(bin_args.metadata, sizeof(u64), &p.event->context.ts); 3652 bpf_probe_read(&bin_args.metadata[8], 4, &pid); 3653 bin_args.ptr = (char *) addr; 3654 bin_args.start_off = 0; 3655 bin_args.full_size = len; 3656 3657 tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN); 3658 } 3659 } 3660 3661 return 0; 3662 } 3663 3664 SEC("raw_tracepoint/sys_init_module") 3665 int syscall__init_module(void *ctx) 3666 { 3667 program_data_t p = {}; 3668 if (!init_program_data(&p, ctx)) 3669 return 0; 3670 3671 syscall_data_t *sys = &p.task_info->syscall_data; 3672 if (!p.task_info->syscall_traced) 3673 return -1; 3674 3675 bin_args_t bin_args = {}; 3676 3677 u32 pid = p.event->context.task.host_pid; 3678 u64 dummy = 0; 3679 void *addr = (void *) sys->args.args[0]; 3680 unsigned long len = (unsigned long) sys->args.args[1]; 3681 3682 if (p.config->options & OPT_CAPTURE_MODULES) { 3683 bin_args.type = SEND_KERNEL_MODULE; 3684 bpf_probe_read_kernel(bin_args.metadata, 4, &dummy); 3685 bpf_probe_read_kernel(&bin_args.metadata[4], 8, &dummy); 3686 bpf_probe_read_kernel(&bin_args.metadata[12], 4, &pid); 3687 bpf_probe_read_kernel(&bin_args.metadata[16], 8, &len); 3688 bin_args.ptr = (char *) addr; 3689 bin_args.start_off = 0; 3690 bin_args.full_size = (unsigned int) len; 3691 3692 tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN_TP); 3693 } 3694 return 0; 3695 } 3696 3697 statfunc int do_check_bpf_link(program_data_t *p, union bpf_attr *attr, int cmd) 3698 { 3699 if (cmd == 
BPF_LINK_CREATE) { 3700 u32 prog_fd = BPF_CORE_READ(attr, link_create.prog_fd); 3701 u32 perf_fd = BPF_CORE_READ(attr, link_create.target_fd); 3702 3703 struct file *bpf_prog_file = get_struct_file_from_fd(prog_fd); 3704 struct file *perf_event_file = get_struct_file_from_fd(perf_fd); 3705 3706 send_bpf_perf_attach(p, bpf_prog_file, perf_event_file); 3707 } 3708 3709 return 0; 3710 } 3711 3712 statfunc int check_bpf_link(program_data_t *p, union bpf_attr *attr, int cmd) 3713 { 3714 // BPF_LINK_CREATE command was only introduced in kernel 5.7. 3715 // nothing to check for kernels < 5.7. 3716 3717 if (bpf_core_field_exists(attr->link_create)) { 3718 do_check_bpf_link(p, attr, cmd); 3719 } 3720 3721 return 0; 3722 } 3723 3724 // TODO: This fails on 5.4 kernel with error: 3725 // loading ebpf module: field TraceSecurityBpf: program trace_security_bpf: l 3726 // oad program: permission denied: 1595: (73) *(u8 *)(r10 -120) = r1: ; return (const str 3727 // (truncated, 951 line(s) omitted) 3728 SEC("kprobe/security_bpf") 3729 int BPF_KPROBE(trace_security_bpf) 3730 { 3731 program_data_t p = {}; 3732 if (!init_program_data(&p, ctx)) 3733 return 0; 3734 3735 if (!should_trace(&p)) 3736 return 0; 3737 3738 int cmd = (int) PT_REGS_PARM1(ctx); 3739 3740 if (should_submit(SECURITY_BPF, p.event)) { 3741 // 1st argument == cmd (int) 3742 save_to_submit_buf(&p.event->args_buf, (void *) &cmd, sizeof(int), 0); 3743 events_perf_submit(&p, SECURITY_BPF, 0); 3744 } 3745 union bpf_attr *attr = (union bpf_attr *) PT_REGS_PARM2(ctx); 3746 3747 reset_event_args(&p); 3748 check_bpf_link(&p, attr, cmd); 3749 3750 // Capture BPF object loaded 3751 if (cmd == BPF_PROG_LOAD && p.config->options & OPT_CAPTURE_BPF) { 3752 bin_args_t bin_args = {}; 3753 u32 pid = p.task_info->context.host_pid; 3754 3755 u32 insn_cnt = get_attr_insn_cnt(attr); 3756 const struct bpf_insn *insns = get_attr_insns(attr); 3757 unsigned int insn_size = (unsigned int) (sizeof(struct bpf_insn) * insn_cnt); 3758 3759 bin_args.type = SEND_BPF_OBJECT; 3760 char prog_name[16] = {0}; 3761 long sz = bpf_probe_read_kernel_str(prog_name, 16, attr->prog_name); 3762 if (sz > 0) { 3763 sz = bpf_probe_read_kernel_str(bin_args.metadata, 16, prog_name); 3764 } 3765 3766 u32 rand = bpf_get_prandom_u32(); 3767 bpf_probe_read_kernel(&bin_args.metadata[16], 4, &rand); 3768 bpf_probe_read_kernel(&bin_args.metadata[20], 4, &pid); 3769 bpf_probe_read_kernel(&bin_args.metadata[24], 4, &insn_size); 3770 bin_args.ptr = (char *) insns; 3771 bin_args.start_off = 0; 3772 bin_args.full_size = insn_size; 3773 3774 tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN); 3775 } 3776 return 0; 3777 } 3778 3779 // arm_kprobe can't be hooked in arm64 architecture, use enable logic instead 3780 3781 statfunc int arm_kprobe_handler(struct pt_regs *ctx) 3782 { 3783 args_t saved_args; 3784 if (load_args(&saved_args, KPROBE_ATTACH) != 0) { 3785 return 0; 3786 } 3787 del_args(KPROBE_ATTACH); 3788 3789 program_data_t p = {}; 3790 if (!init_program_data(&p, ctx)) 3791 return 0; 3792 3793 if (!should_trace(&p)) 3794 return 0; 3795 3796 struct kprobe *kp = (struct kprobe *) saved_args.args[0]; 3797 unsigned int retcode = PT_REGS_RC(ctx); 3798 3799 if (retcode) 3800 return 0; // register_kprobe() failed 3801 3802 char *symbol_name = (char *) BPF_CORE_READ(kp, symbol_name); 3803 u64 pre_handler = (u64) BPF_CORE_READ(kp, pre_handler); 3804 u64 post_handler = (u64) BPF_CORE_READ(kp, post_handler); 3805 3806 save_str_to_buf(&p.event->args_buf, (void *) symbol_name, 0); 3807 
save_to_submit_buf(&p.event->args_buf, (void *) &pre_handler, sizeof(u64), 1); 3808 save_to_submit_buf(&p.event->args_buf, (void *) &post_handler, sizeof(u64), 2); 3809 3810 return events_perf_submit(&p, KPROBE_ATTACH, 0); 3811 } 3812 3813 // register_kprobe and enable_kprobe have same execution path, and both call 3814 // arm_kprobe, which is the function we are interested in. Nevertheless, there 3815 // is also another function, register_aggr_kprobes, that might be able to call 3816 // arm_kprobe so, instead of hooking into enable_kprobe, we hook into 3817 // register_kprobe covering all execution paths. 3818 3819 SEC("kprobe/register_kprobe") 3820 TRACE_ENT_FUNC(register_kprobe, KPROBE_ATTACH); 3821 3822 SEC("kretprobe/register_kprobe") 3823 int BPF_KPROBE(trace_ret_register_kprobe) 3824 { 3825 return arm_kprobe_handler(ctx); 3826 } 3827 3828 SEC("kprobe/security_bpf_map") 3829 int BPF_KPROBE(trace_security_bpf_map) 3830 { 3831 program_data_t p = {}; 3832 if (!init_program_data(&p, ctx)) 3833 return 0; 3834 3835 if (!should_trace(&p)) 3836 return 0; 3837 3838 if (!should_submit(SECURITY_BPF_MAP, p.event)) 3839 return 0; 3840 3841 struct bpf_map *map = (struct bpf_map *) PT_REGS_PARM1(ctx); 3842 3843 // 1st argument == map_id (u32) 3844 save_to_submit_buf( 3845 &p.event->args_buf, (void *) __builtin_preserve_access_index(&map->id), sizeof(int), 0); 3846 // 2nd argument == map_name (const char *) 3847 save_str_to_buf(&p.event->args_buf, (void *) __builtin_preserve_access_index(&map->name), 1); 3848 3849 return events_perf_submit(&p, SECURITY_BPF_MAP, 0); 3850 } 3851 3852 SEC("kprobe/security_bpf_prog") 3853 int BPF_KPROBE(trace_security_bpf_prog) 3854 { 3855 program_data_t p = {}; 3856 if (!init_program_data(&p, ctx)) 3857 return 0; 3858 3859 if (!should_trace(&p)) 3860 return 0; 3861 3862 struct bpf_prog *prog = (struct bpf_prog *) PT_REGS_PARM1(ctx); 3863 struct bpf_prog_aux *prog_aux = BPF_CORE_READ(prog, aux); 3864 u32 prog_id = BPF_CORE_READ(prog_aux, id); 3865 3866 // In some systems, the 'check_map_func_compatibility' and 'check_helper_call' symbols are not 3867 // available. For these cases, the temporary map 'bpf_attach_tmp_map' will not hold any 3868 // information about the used helpers in the prog. 
nevertheless, we always want to output the 3869 // 'bpf_attach' event to the user, so using zero values 3870 bpf_used_helpers_t val = {0}; 3871 3872 // if there is a value, use it 3873 bpf_used_helpers_t *existing_val; 3874 existing_val = bpf_map_lookup_elem(&bpf_attach_tmp_map, &p.event->context.task.host_tid); 3875 if (existing_val != NULL) { 3876 __builtin_memcpy(&val.helpers, &existing_val->helpers, sizeof(bpf_used_helpers_t)); 3877 } 3878 3879 bpf_map_delete_elem(&bpf_attach_tmp_map, &p.event->context.task.host_tid); 3880 3881 if (should_submit(BPF_ATTACH, p.event)) { 3882 bpf_map_update_elem(&bpf_attach_map, &prog_id, &val, BPF_ANY); 3883 } 3884 3885 if (!should_submit(SECURITY_BPF_PROG, p.event)) { 3886 return 0; 3887 } 3888 3889 bool is_load = false; 3890 void **aux_ptr = bpf_map_lookup_elem(&bpf_prog_load_map, &p.event->context.task.host_tid); 3891 if (aux_ptr != NULL) { 3892 if (*aux_ptr == (void *) prog_aux) { 3893 is_load = true; 3894 } 3895 3896 bpf_map_delete_elem(&bpf_prog_load_map, &p.event->context.task.host_tid); 3897 } 3898 3899 int prog_type = BPF_CORE_READ(prog, type); 3900 3901 char prog_name[BPF_OBJ_NAME_LEN]; 3902 bpf_probe_read_kernel_str(&prog_name, BPF_OBJ_NAME_LEN, prog_aux->name); 3903 3904 save_to_submit_buf(&p.event->args_buf, &prog_type, sizeof(int), 0); 3905 save_str_to_buf(&p.event->args_buf, (void *) &prog_name, 1); 3906 save_u64_arr_to_buf(&p.event->args_buf, (const u64 *) val.helpers, 4, 2); 3907 save_to_submit_buf(&p.event->args_buf, &prog_id, sizeof(u32), 3); 3908 save_to_submit_buf(&p.event->args_buf, &is_load, sizeof(bool), 4); 3909 3910 events_perf_submit(&p, SECURITY_BPF_PROG, 0); 3911 3912 return 0; 3913 } 3914 3915 SEC("kprobe/bpf_check") 3916 int BPF_KPROBE(trace_bpf_check) 3917 { 3918 program_data_t p = {}; 3919 if (!init_program_data(&p, ctx)) 3920 return 0; 3921 3922 if (!should_trace(&p)) 3923 return 0; 3924 3925 // this probe is triggered when a bpf program is loaded. 3926 // we save the aux pointer to be used in security_bpf_prog, to indicate this prog is being 3927 // loaded - security_bpf_prog is triggered not only on prog load. 3928 3929 if (!should_submit(SECURITY_BPF_PROG, p.event)) 3930 return 0; 3931 3932 struct bpf_prog **prog; 3933 struct bpf_prog *prog_ptr; 3934 struct bpf_prog_aux *prog_aux; 3935 3936 prog = (struct bpf_prog **) PT_REGS_PARM1(ctx); 3937 bpf_core_read(&prog_ptr, sizeof(void *), prog); 3938 prog_aux = BPF_CORE_READ(prog_ptr, aux); 3939 3940 bpf_map_update_elem(&bpf_prog_load_map, &p.event->context.task.host_tid, &prog_aux, BPF_ANY); 3941 3942 return 0; 3943 } 3944 3945 // Save in the temporary map 'bpf_attach_tmp_map' whether bpf_probe_write_user and 3946 // bpf_override_return are used in the bpf program. Get this information in the verifier phase of 3947 // the bpf program load lifecycle, before a prog_id is set for the bpf program. Save this 3948 // information in a temporary map which includes the host_tid as key instead of the prog_id. 3949 // 3950 // Later on, in security_bpf_prog, save this information in the stable map 'bpf_attach_map', which 3951 // contains the prog_id in its key. 
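// Illustrative sketch only (not part of the original probe logic and not called
// by any program here): handle_bpf_helper_func_id() below records helper usage
// as one bit per func_id, which is equivalent to
//     val.helpers[func_id / 64] |= (1ULL << (func_id % 64));
// so, for example, func_id 130 sets bit 2 of helpers[2]. Assuming that
// SIZE_OF_HELPER_ELEM is the bit width of one helpers[] element (64) and
// NUM_OF_HELPERS_ELEMS is the number of elements, a reader of the bitmap could
// test a single helper id back out like this:
statfunc bool example_bpf_helper_id_is_set(bpf_used_helpers_t *val, int func_id)
{
    // ids outside the tracked bitmap are never recorded
    if (func_id < 0 || func_id >= NUM_OF_HELPERS_ELEMS * SIZE_OF_HELPER_ELEM)
        return false;

    return (val->helpers[func_id / SIZE_OF_HELPER_ELEM] &
            (1ULL << (func_id % SIZE_OF_HELPER_ELEM))) != 0;
}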
3952
3953 statfunc int handle_bpf_helper_func_id(u32 host_tid, int func_id)
3954 {
3955 bpf_used_helpers_t val = {0};
3956
3957 // we want to keep the existing value in the map and just update it with the current func_id
3958 bpf_used_helpers_t *existing_val = bpf_map_lookup_elem(&bpf_attach_tmp_map, &host_tid);
3959 if (existing_val != NULL) {
3960 __builtin_memcpy(&val.helpers, &existing_val->helpers, sizeof(bpf_used_helpers_t));
3961 }
3962
3963 // calculate where to encode usage of this func_id in bpf_used_helpers_t.
3964 // this method is used in order to stay in bounds of the helpers array and pass verifier checks.
3965 // it is equivalent to:
3966 // val.helpers[func_id / 64] |= (1ULL << (func_id % 64));
3967 // which the verifier doesn't like.
3968 int arr_num;
3969 int arr_idx = func_id;
3970
3971 #pragma unroll
3972 for (int i = 0; i < NUM_OF_HELPERS_ELEMS; i++) {
3973 arr_num = i;
3974 if (arr_idx - SIZE_OF_HELPER_ELEM >= 0) {
3975 arr_idx = arr_idx - SIZE_OF_HELPER_ELEM;
3976 } else {
3977 break;
3978 }
3979 }
3980 if (arr_idx >= SIZE_OF_HELPER_ELEM) {
3981 // unsupported func_id
3982 return 0;
3983 }
3984
3985 val.helpers[arr_num] |= (1ULL << (arr_idx));
3986
3987 // update the map with the current func_id
3988 bpf_map_update_elem(&bpf_attach_tmp_map, &host_tid, &val, BPF_ANY);
3989
3990 return 0;
3991 }
3992
3993 SEC("kprobe/check_map_func_compatibility")
3994 int BPF_KPROBE(trace_check_map_func_compatibility)
3995 {
3996 program_data_t p = {};
3997 if (!init_program_data(&p, ctx))
3998 return 0;
3999
4000 if (!should_trace(&p))
4001 return 0;
4002
4003 int func_id = (int) PT_REGS_PARM3(ctx);
4004
4005 return handle_bpf_helper_func_id(p.event->context.task.host_tid, func_id);
4006 }
4007
4008 SEC("kprobe/check_helper_call")
4009 int BPF_KPROBE(trace_check_helper_call)
4010 {
4011 program_data_t p = {};
4012 if (!init_program_data(&p, ctx))
4013 return 0;
4014
4015 if (!should_trace(&p))
4016 return 0;
4017
4018 int func_id;
4019
4020 if (!bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_for_each_map_elem)) {
4021 // if BPF_FUNC_for_each_map_elem doesn't exist under bpf_func_id - kernel version < 5.13
4022 func_id = (int) PT_REGS_PARM2(ctx);
4023 } else {
4024 struct bpf_insn *insn = (struct bpf_insn *) PT_REGS_PARM2(ctx);
4025 func_id = BPF_CORE_READ(insn, imm);
4026 }
4027
4028 return handle_bpf_helper_func_id(p.event->context.task.host_tid, func_id);
4029 }
4030
4031 SEC("kprobe/security_kernel_read_file")
4032 int BPF_KPROBE(trace_security_kernel_read_file)
4033 {
4034 program_data_t p = {};
4035 if (!init_program_data(&p, ctx))
4036 return 0;
4037
4038 if (!should_trace(&p))
4039 return 0;
4040
4041 if (!should_submit(SECURITY_KERNEL_READ_FILE, p.event))
4042 return 0;
4043
4044 struct file *file = (struct file *) PT_REGS_PARM1(ctx);
4045 dev_t s_dev = get_dev_from_file(file);
4046 unsigned long inode_nr = get_inode_nr_from_file(file);
4047 enum kernel_read_file_id type_id = (enum kernel_read_file_id) PT_REGS_PARM2(ctx);
4048 void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
4049 u64 ctime = get_ctime_nanosec_from_file(file);
4050
4051 save_str_to_buf(&p.event->args_buf, file_path, 0);
4052 save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 1);
4053 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 2);
4054 save_to_submit_buf(&p.event->args_buf, &type_id, sizeof(int), 3);
4055 save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4);
4056
4057 return events_perf_submit(&p, SECURITY_KERNEL_READ_FILE, 0);
4058 } 4059 4060 SEC("kprobe/security_kernel_post_read_file") 4061 int BPF_KPROBE(trace_security_kernel_post_read_file) 4062 { 4063 program_data_t p = {}; 4064 if (!init_program_data(&p, ctx)) 4065 return 0; 4066 4067 if (!should_trace(&p)) 4068 return 0; 4069 4070 struct file *file = (struct file *) PT_REGS_PARM1(ctx); 4071 char *buf = (char *) PT_REGS_PARM2(ctx); 4072 loff_t size = (loff_t) PT_REGS_PARM3(ctx); 4073 enum kernel_read_file_id type_id = (enum kernel_read_file_id) PT_REGS_PARM4(ctx); 4074 4075 // Send event if chosen 4076 if (should_submit(SECURITY_POST_READ_FILE, p.event)) { 4077 void *file_path = get_path_str(&file->f_path); 4078 save_str_to_buf(&p.event->args_buf, file_path, 0); 4079 save_to_submit_buf(&p.event->args_buf, &size, sizeof(loff_t), 1); 4080 save_to_submit_buf(&p.event->args_buf, &type_id, sizeof(int), 2); 4081 events_perf_submit(&p, SECURITY_POST_READ_FILE, 0); 4082 } 4083 4084 if (p.config->options & OPT_CAPTURE_MODULES) { 4085 // Do not extract files greater than 4GB 4086 if (size >= (u64) 1 << 32) { 4087 return 0; 4088 } 4089 // Extract device id, inode number for file name 4090 dev_t s_dev = get_dev_from_file(file); 4091 unsigned long inode_nr = get_inode_nr_from_file(file); 4092 bin_args_t bin_args = {}; 4093 u32 pid = p.event->context.task.host_pid; 4094 4095 bin_args.type = SEND_KERNEL_MODULE; 4096 bpf_probe_read_kernel(bin_args.metadata, 4, &s_dev); 4097 bpf_probe_read_kernel(&bin_args.metadata[4], 8, &inode_nr); 4098 bpf_probe_read_kernel(&bin_args.metadata[12], 4, &pid); 4099 bpf_probe_read_kernel(&bin_args.metadata[16], 4, &size); 4100 bin_args.start_off = 0; 4101 bin_args.ptr = buf; 4102 bin_args.full_size = size; 4103 4104 tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN); 4105 } 4106 4107 return 0; 4108 } 4109 4110 SEC("kprobe/security_inode_mknod") 4111 int BPF_KPROBE(trace_security_inode_mknod) 4112 { 4113 program_data_t p = {}; 4114 if (!init_program_data(&p, ctx)) 4115 return 0; 4116 4117 if (!should_trace(&p)) 4118 return 0; 4119 4120 if (!should_submit(SECURITY_INODE_MKNOD, p.event)) 4121 return 0; 4122 4123 struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx); 4124 unsigned short mode = (unsigned short) PT_REGS_PARM3(ctx); 4125 unsigned int dev = (unsigned int) PT_REGS_PARM4(ctx); 4126 void *dentry_path = get_dentry_path_str(dentry); 4127 4128 save_str_to_buf(&p.event->args_buf, dentry_path, 0); 4129 save_to_submit_buf(&p.event->args_buf, &mode, sizeof(unsigned short), 1); 4130 save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2); 4131 4132 return events_perf_submit(&p, SECURITY_INODE_MKNOD, 0); 4133 } 4134 4135 SEC("kprobe/device_add") 4136 int BPF_KPROBE(trace_device_add) 4137 { 4138 program_data_t p = {}; 4139 if (!init_program_data(&p, ctx)) 4140 return 0; 4141 4142 if (!should_trace(&p)) 4143 return 0; 4144 4145 if (!should_submit(DEVICE_ADD, p.event)) 4146 return 0; 4147 4148 struct device *dev = (struct device *) PT_REGS_PARM1(ctx); 4149 const char *name = get_device_name(dev); 4150 4151 struct device *parent_dev = BPF_CORE_READ(dev, parent); 4152 const char *parent_name = get_device_name(parent_dev); 4153 4154 save_str_to_buf(&p.event->args_buf, (void *) name, 0); 4155 save_str_to_buf(&p.event->args_buf, (void *) parent_name, 1); 4156 4157 return events_perf_submit(&p, DEVICE_ADD, 0); 4158 } 4159 4160 SEC("kprobe/__register_chrdev") 4161 TRACE_ENT_FUNC(__register_chrdev, REGISTER_CHRDEV); 4162 4163 SEC("kretprobe/__register_chrdev") 4164 int BPF_KPROBE(trace_ret__register_chrdev) 4165 { 4166 args_t saved_args; 
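// the matching entry kprobe (TRACE_ENT_FUNC above) saved __register_chrdev's
// arguments; load them here so they can be paired with the return value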
4167 if (load_args(&saved_args, REGISTER_CHRDEV) != 0) { 4168 // missed entry or not traced 4169 return 0; 4170 } 4171 del_args(REGISTER_CHRDEV); 4172 4173 program_data_t p = {}; 4174 if (!init_program_data(&p, ctx)) 4175 return 0; 4176 4177 if (!should_trace(&p)) 4178 return 0; 4179 4180 if (!should_submit(REGISTER_CHRDEV, p.event)) 4181 return 0; 4182 4183 unsigned int major_number = (unsigned int) saved_args.args[0]; 4184 unsigned int returned_major = PT_REGS_RC(ctx); 4185 4186 // sets the returned major to the requested one in case of a successful registration 4187 if (major_number > 0 && returned_major == 0) { 4188 returned_major = major_number; 4189 } 4190 4191 char *char_device_name = (char *) saved_args.args[3]; 4192 struct file_operations *char_device_fops = (struct file_operations *) saved_args.args[4]; 4193 4194 save_to_submit_buf(&p.event->args_buf, &major_number, sizeof(unsigned int), 0); 4195 save_to_submit_buf(&p.event->args_buf, &returned_major, sizeof(unsigned int), 1); 4196 save_str_to_buf(&p.event->args_buf, char_device_name, 2); 4197 save_to_submit_buf(&p.event->args_buf, &char_device_fops, sizeof(void *), 3); 4198 4199 return events_perf_submit(&p, REGISTER_CHRDEV, 0); 4200 } 4201 4202 statfunc struct pipe_buffer *get_last_write_pipe_buffer(struct pipe_inode_info *pipe) 4203 { 4204 // Extract the last page buffer used in the pipe for write 4205 struct pipe_buffer *bufs = BPF_CORE_READ(pipe, bufs); 4206 unsigned int curbuf; 4207 4208 struct pipe_inode_info___v54 *legacy_pipe = (struct pipe_inode_info___v54 *) pipe; 4209 if (bpf_core_field_exists(legacy_pipe->nrbufs)) { 4210 unsigned int nrbufs = BPF_CORE_READ(legacy_pipe, nrbufs); 4211 if (nrbufs > 0) { 4212 nrbufs--; 4213 } 4214 curbuf = (BPF_CORE_READ(legacy_pipe, curbuf) + nrbufs) & 4215 (BPF_CORE_READ(legacy_pipe, buffers) - 1); 4216 } else { 4217 int head = BPF_CORE_READ(pipe, head); 4218 int ring_size = BPF_CORE_READ(pipe, ring_size); 4219 curbuf = (head - 1) & (ring_size - 1); 4220 } 4221 4222 struct pipe_buffer *current_buffer = get_node_addr(bufs, curbuf); 4223 return current_buffer; 4224 } 4225 4226 SEC("kprobe/do_splice") 4227 TRACE_ENT_FUNC(do_splice, DIRTY_PIPE_SPLICE); 4228 4229 SEC("kretprobe/do_splice") 4230 int BPF_KPROBE(trace_ret_do_splice) 4231 { 4232 // The Dirty Pipe vulnerability exist in the kernel since version 5.8, so 4233 // there is not use to do logic if version is too old. In non-CORE, it will 4234 // even mean using defines which are not available in the kernel headers, 4235 // which will cause bugs. 4236 4237 // Check if field of struct exist to determine kernel version - some fields 4238 // change between versions. In version 5.8 of the kernel, the field 4239 // "high_zoneidx" changed its name to "highest_zoneidx". 
This means that the 4240 // existence of the field "high_zoneidx" can indicate that the kernel 4241 // version is lower than v5.8 4242 4243 struct alloc_context *check_508; 4244 if (bpf_core_field_exists(check_508->high_zoneidx)) { 4245 del_args(DIRTY_PIPE_SPLICE); 4246 return 0; 4247 } 4248 4249 args_t saved_args; 4250 if (load_args(&saved_args, DIRTY_PIPE_SPLICE) != 0) { 4251 // missed entry or not traced 4252 return 0; 4253 } 4254 del_args(DIRTY_PIPE_SPLICE); 4255 4256 program_data_t p = {}; 4257 if (!init_program_data(&p, ctx)) 4258 return 0; 4259 4260 if (!should_trace(&p)) 4261 return 0; 4262 4263 if (!should_submit(DIRTY_PIPE_SPLICE, p.event)) 4264 return 0; 4265 4266 // Catch only successful splice 4267 if ((int) PT_REGS_RC(ctx) <= 0) { 4268 return 0; 4269 } 4270 4271 struct file *out_file = (struct file *) saved_args.args[2]; 4272 struct pipe_inode_info *out_pipe = get_file_pipe_info(out_file); 4273 // Check that output is a pipe 4274 if (!out_pipe) { 4275 return 0; 4276 } 4277 4278 // dirty_pipe_splice is a splice to a pipe which results that the last page copied could be 4279 // modified (the PIPE_BUF_CAN_MERGE flag is on in the pipe_buffer struct). 4280 struct pipe_buffer *last_write_page_buffer = get_last_write_pipe_buffer(out_pipe); 4281 unsigned int out_pipe_last_buffer_flags = BPF_CORE_READ(last_write_page_buffer, flags); 4282 if ((out_pipe_last_buffer_flags & PIPE_BUF_FLAG_CAN_MERGE) == 0) { 4283 return 0; 4284 } 4285 4286 struct file *in_file = (struct file *) saved_args.args[0]; 4287 struct inode *in_inode = BPF_CORE_READ(in_file, f_inode); 4288 u64 in_inode_number = BPF_CORE_READ(in_inode, i_ino); 4289 unsigned short in_file_type = BPF_CORE_READ(in_inode, i_mode) & S_IFMT; 4290 void *in_file_path = get_path_str(__builtin_preserve_access_index(&in_file->f_path)); 4291 size_t write_len = (size_t) saved_args.args[4]; 4292 4293 loff_t *off_in_addr = (loff_t *) saved_args.args[1]; 4294 // In kernel v5.10 the pointer passed was no longer of the user, so flexibility is needed to 4295 // read it 4296 loff_t off_in; 4297 4298 // 4299 // Check if field of struct exist to determine kernel version - some fields change between 4300 // versions. Field 'data' of struct 'public_key_signature' was introduced between v5.9 and 4301 // v5.10, so its existence might be used to determine whether the current version is older than 4302 // 5.9 or newer than 5.10. 
4303 // 4304 // https://lore.kernel.org/stable/20210821203108.215937-1-rafaeldtinoco@gmail.com/ 4305 // 4306 struct public_key_signature *check; 4307 4308 if (!bpf_core_field_exists(check->data)) // version < v5.10 4309 bpf_core_read_user(&off_in, sizeof(off_in), off_in_addr); 4310 4311 else // version >= v5.10 4312 bpf_core_read(&off_in, sizeof(off_in), off_in_addr); 4313 4314 struct inode *out_inode = BPF_CORE_READ(out_file, f_inode); 4315 u64 out_inode_number = BPF_CORE_READ(out_inode, i_ino); 4316 4317 // Only last page written to pipe is vulnerable from the end of written data 4318 loff_t next_exposed_data_offset_in_out_pipe_last_page = 4319 BPF_CORE_READ(last_write_page_buffer, offset) + BPF_CORE_READ(last_write_page_buffer, len); 4320 size_t in_file_size = BPF_CORE_READ(in_inode, i_size); 4321 size_t exposed_data_len = (PAGE_SIZE - 1) - next_exposed_data_offset_in_out_pipe_last_page; 4322 loff_t current_file_offset = off_in + write_len; 4323 if (current_file_offset + exposed_data_len > in_file_size) { 4324 exposed_data_len = in_file_size - current_file_offset - 1; 4325 } 4326 4327 save_to_submit_buf(&p.event->args_buf, &in_inode_number, sizeof(u64), 0); 4328 save_to_submit_buf(&p.event->args_buf, &in_file_type, sizeof(unsigned short), 1); 4329 save_str_to_buf(&p.event->args_buf, in_file_path, 2); 4330 save_to_submit_buf(&p.event->args_buf, ¤t_file_offset, sizeof(loff_t), 3); 4331 save_to_submit_buf(&p.event->args_buf, &exposed_data_len, sizeof(size_t), 4); 4332 save_to_submit_buf(&p.event->args_buf, &out_inode_number, sizeof(u64), 5); 4333 save_to_submit_buf(&p.event->args_buf, &out_pipe_last_buffer_flags, sizeof(unsigned int), 6); 4334 4335 return events_perf_submit(&p, DIRTY_PIPE_SPLICE, 0); 4336 } 4337 4338 SEC("raw_tracepoint/module_load") 4339 int tracepoint__module__module_load(struct bpf_raw_tracepoint_args *ctx) 4340 { 4341 program_data_t p = {}; 4342 if (!init_program_data(&p, ctx)) 4343 return 0; 4344 4345 if (!should_trace(&p)) 4346 return 0; 4347 4348 bool should_submit_module_load = should_submit(MODULE_LOAD, p.event); 4349 bool should_submit_hidden_module = should_submit(HIDDEN_KERNEL_MODULE_SEEKER, p.event); 4350 if (!(should_submit_module_load || should_submit_hidden_module)) 4351 return 0; 4352 4353 struct module *mod = (struct module *) ctx->args[0]; 4354 4355 if (should_submit_hidden_module) { 4356 u64 insert_time = bpf_ktime_get_ns(); 4357 kernel_new_mod_t new_mod = {.insert_time = insert_time}; 4358 u64 mod_addr = (u64) mod; 4359 // new_module_map - must be after the module is added to modules list, 4360 // otherwise there's a risk for race condition 4361 bpf_map_update_elem(&new_module_map, &mod_addr, &new_mod, BPF_ANY); 4362 4363 last_module_insert_time = insert_time; 4364 4365 if (!should_submit_module_load) 4366 return 0; 4367 } 4368 4369 const char *version = BPF_CORE_READ(mod, version); 4370 const char *srcversion = BPF_CORE_READ(mod, srcversion); 4371 save_str_to_buf(&p.event->args_buf, &mod->name, 0); 4372 save_str_to_buf(&p.event->args_buf, (void *) version, 1); 4373 save_str_to_buf(&p.event->args_buf, (void *) srcversion, 2); 4374 4375 return events_perf_submit(&p, MODULE_LOAD, 0); 4376 } 4377 4378 SEC("raw_tracepoint/module_free") 4379 int tracepoint__module__module_free(struct bpf_raw_tracepoint_args *ctx) 4380 { 4381 program_data_t p = {}; 4382 if (!init_program_data(&p, ctx)) 4383 return 0; 4384 4385 if (!should_trace(&p)) 4386 return 0; 4387 4388 bool should_submit_module_free = should_submit(MODULE_FREE, p.event); 4389 bool 
should_submit_hidden_module = should_submit(HIDDEN_KERNEL_MODULE_SEEKER, p.event); 4390 if (!(should_submit_module_free || should_submit_hidden_module)) 4391 return 0; 4392 4393 struct module *mod = (struct module *) ctx->args[0]; 4394 if (should_submit_hidden_module) { 4395 u64 mod_addr = (u64) mod; 4396 // We must delete before the actual deletion from modules list occurs, otherwise there's a 4397 // risk of race condition 4398 bpf_map_delete_elem(&new_module_map, &mod_addr); 4399 4400 kernel_deleted_mod_t deleted_mod = {.deleted_time = bpf_ktime_get_ns()}; 4401 bpf_map_update_elem(&recent_deleted_module_map, &mod_addr, &deleted_mod, BPF_ANY); 4402 4403 if (!should_submit_module_free) 4404 return 0; 4405 } 4406 4407 const char *version = BPF_CORE_READ(mod, version); 4408 const char *srcversion = BPF_CORE_READ(mod, srcversion); 4409 save_str_to_buf(&p.event->args_buf, &mod->name, 0); 4410 save_str_to_buf(&p.event->args_buf, (void *) version, 1); 4411 save_str_to_buf(&p.event->args_buf, (void *) srcversion, 2); 4412 4413 return events_perf_submit(&p, MODULE_FREE, 0); 4414 } 4415 4416 SEC("kprobe/do_init_module") 4417 TRACE_ENT_FUNC(do_init_module, DO_INIT_MODULE); 4418 4419 SEC("kretprobe/do_init_module") 4420 int BPF_KPROBE(trace_ret_do_init_module) 4421 { 4422 args_t saved_args; 4423 if (load_args(&saved_args, DO_INIT_MODULE) != 0) { 4424 // missed entry or not traced 4425 return 0; 4426 } 4427 del_args(DO_INIT_MODULE); 4428 4429 program_data_t p = {}; 4430 if (!init_program_data(&p, ctx)) 4431 return 0; 4432 4433 if (!should_trace(&p)) 4434 return 0; 4435 4436 bool should_submit_do_init_module = should_submit(DO_INIT_MODULE, p.event); 4437 bool should_submit_hidden_module = should_submit(HIDDEN_KERNEL_MODULE_SEEKER, p.event); 4438 if (!(should_submit_do_init_module || should_submit_hidden_module)) 4439 return 0; 4440 4441 struct module *mod = (struct module *) saved_args.args[0]; 4442 4443 // trigger the lkm seeker 4444 if (should_submit_hidden_module) { 4445 u64 addr = (u64) mod; 4446 u32 flags = FULL_SCAN; 4447 lkm_seeker_send_to_userspace((struct module *) addr, &flags, &p); 4448 reset_event_args(&p); // Do not corrupt the buffer for the do_init_module event 4449 if (!should_submit_do_init_module) 4450 return 0; 4451 } 4452 4453 // save strings to buf 4454 const char *version = BPF_CORE_READ(mod, version); 4455 const char *srcversion = BPF_CORE_READ(mod, srcversion); 4456 save_str_to_buf(&p.event->args_buf, &mod->name, 0); 4457 save_str_to_buf(&p.event->args_buf, (void *) version, 1); 4458 save_str_to_buf(&p.event->args_buf, (void *) srcversion, 2); 4459 4460 int ret_val = PT_REGS_RC(ctx); 4461 return events_perf_submit(&p, DO_INIT_MODULE, ret_val); 4462 } 4463 4464 // clang-format off 4465 4466 SEC("kprobe/load_elf_phdrs") 4467 int BPF_KPROBE(trace_load_elf_phdrs) 4468 { 4469 program_data_t p = {}; 4470 if (!init_program_data(&p, ctx)) 4471 return 0; 4472 4473 if (!should_trace((&p))) 4474 return 0; 4475 4476 proc_info_t *proc_info = p.proc_info; 4477 4478 struct file *loaded_elf = (struct file *) PT_REGS_PARM2(ctx); 4479 const char *elf_pathname = (char *) get_path_str(__builtin_preserve_access_index(&loaded_elf->f_path)); 4480 4481 // The interpreter field will be updated for any loading of an elf, both for the binary and for 4482 // the interpreter. Because the interpreter is loaded only after the executed elf is loaded, the 4483 // value of the executed binary should be overridden by the interpreter. 
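// (For example, for a dynamically linked binary this probe typically fires first
// for the executable itself and then for its ELF interpreter - e.g. the runtime
// loader /lib64/ld-linux-x86-64.so.2 - so the interpreter's values are the ones
// that remain recorded here.)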
4484 4485 size_t sz = sizeof(proc_info->interpreter.pathname); 4486 bpf_probe_read_kernel_str(proc_info->interpreter.pathname, sz, elf_pathname); 4487 proc_info->interpreter.id.device = get_dev_from_file(loaded_elf); 4488 proc_info->interpreter.id.inode = get_inode_nr_from_file(loaded_elf); 4489 proc_info->interpreter.id.ctime = get_ctime_nanosec_from_file(loaded_elf); 4490 4491 if (should_submit(LOAD_ELF_PHDRS, p.event)) { 4492 save_str_to_buf(&p.event->args_buf, (void *) elf_pathname, 0); 4493 save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 1); 4494 save_to_submit_buf( 4495 &p.event->args_buf, &proc_info->interpreter.id.inode, sizeof(unsigned long), 2); 4496 4497 events_perf_submit(&p, LOAD_ELF_PHDRS, 0); 4498 } 4499 4500 return 0; 4501 } 4502 4503 // clang-format on 4504 4505 SEC("kprobe/security_file_permission") 4506 int BPF_KPROBE(trace_security_file_permission) 4507 { 4508 struct file *file = (struct file *) PT_REGS_PARM1(ctx); 4509 if (file == NULL) 4510 return 0; 4511 struct inode *f_inode = get_inode_from_file(file); 4512 struct super_block *i_sb = get_super_block_from_inode(f_inode); 4513 unsigned long s_magic = get_s_magic_from_super_block(i_sb); 4514 4515 // Only check procfs entries 4516 if (s_magic != PROC_SUPER_MAGIC) { 4517 return 0; 4518 } 4519 4520 program_data_t p = {}; 4521 if (!init_program_data(&p, ctx)) 4522 return 0; 4523 4524 if (!should_trace(&p)) 4525 return 0; 4526 4527 if (!should_submit(HOOKED_PROC_FOPS, p.event)) 4528 return 0; 4529 4530 struct file_operations *fops = (struct file_operations *) BPF_CORE_READ(f_inode, i_fop); 4531 if (fops == NULL) 4532 return 0; 4533 4534 unsigned long iterate_addr = 0; 4535 unsigned long iterate_shared_addr = (unsigned long) BPF_CORE_READ(fops, iterate_shared); 4536 4537 // iterate() removed by commit 3e3271549670 at v6.5-rc4 4538 if (bpf_core_field_exists(fops->iterate)) 4539 iterate_addr = (unsigned long) BPF_CORE_READ(fops, iterate); 4540 4541 if (iterate_addr == 0 && iterate_shared_addr == 0) 4542 return 0; 4543 4544 // get text segment bounds 4545 void *stext_addr = get_stext_addr(); 4546 if (unlikely(stext_addr == NULL)) 4547 return 0; 4548 void *etext_addr = get_etext_addr(); 4549 if (unlikely(etext_addr == NULL)) 4550 return 0; 4551 4552 // mark as 0 if in bounds 4553 if (iterate_shared_addr >= (u64) stext_addr && iterate_shared_addr < (u64) etext_addr) 4554 iterate_shared_addr = 0; 4555 if (iterate_addr >= (u64) stext_addr && iterate_addr < (u64) etext_addr) 4556 iterate_addr = 0; 4557 4558 // now check again, if both are in text bounds, return 4559 if (iterate_addr == 0 && iterate_shared_addr == 0) 4560 return 0; 4561 4562 unsigned long fops_addresses[2] = {iterate_shared_addr, iterate_addr}; 4563 4564 save_u64_arr_to_buf(&p.event->args_buf, (const u64 *) fops_addresses, 2, 0); 4565 events_perf_submit(&p, HOOKED_PROC_FOPS, 0); 4566 return 0; 4567 } 4568 4569 SEC("raw_tracepoint/task_rename") 4570 int tracepoint__task__task_rename(struct bpf_raw_tracepoint_args *ctx) 4571 { 4572 program_data_t p = {}; 4573 if (!init_program_data(&p, ctx)) 4574 return 0; 4575 4576 if (!should_trace((&p))) 4577 return 0; 4578 4579 if (!should_submit(TASK_RENAME, p.event)) 4580 return 0; 4581 4582 struct task_struct *tsk = (struct task_struct *) ctx->args[0]; 4583 char old_name[TASK_COMM_LEN]; 4584 bpf_probe_read_kernel_str(&old_name, TASK_COMM_LEN, tsk->comm); 4585 const char *new_name = (const char *) ctx->args[1]; 4586 4587 save_str_to_buf(&p.event->args_buf, (void *) old_name, 0); 4588 
save_str_to_buf(&p.event->args_buf, (void *) new_name, 1); 4589 4590 return events_perf_submit(&p, TASK_RENAME, 0); 4591 } 4592 4593 SEC("kprobe/security_inode_rename") 4594 int BPF_KPROBE(trace_security_inode_rename) 4595 { 4596 program_data_t p = {}; 4597 if (!init_program_data(&p, ctx)) 4598 return 0; 4599 4600 if (!should_trace(&p)) 4601 return 0; 4602 4603 if (!should_submit(SECURITY_INODE_RENAME, p.event)) 4604 return 0; 4605 4606 struct dentry *old_dentry = (struct dentry *) PT_REGS_PARM2(ctx); 4607 struct dentry *new_dentry = (struct dentry *) PT_REGS_PARM4(ctx); 4608 4609 void *old_dentry_path = get_dentry_path_str(old_dentry); 4610 save_str_to_buf(&p.event->args_buf, old_dentry_path, 0); 4611 void *new_dentry_path = get_dentry_path_str(new_dentry); 4612 save_str_to_buf(&p.event->args_buf, new_dentry_path, 1); 4613 return events_perf_submit(&p, SECURITY_INODE_RENAME, 0); 4614 } 4615 4616 SEC("kprobe/kallsyms_lookup_name") 4617 TRACE_ENT_FUNC(kallsyms_lookup_name, KALLSYMS_LOOKUP_NAME); 4618 4619 SEC("kretprobe/kallsyms_lookup_name") 4620 int BPF_KPROBE(trace_ret_kallsyms_lookup_name) 4621 { 4622 args_t saved_args; 4623 if (load_args(&saved_args, KALLSYMS_LOOKUP_NAME) != 0) 4624 return 0; 4625 del_args(KALLSYMS_LOOKUP_NAME); 4626 4627 program_data_t p = {}; 4628 if (!init_program_data(&p, ctx)) 4629 return 0; 4630 4631 if (!should_trace(&p)) 4632 return 0; 4633 4634 if (!should_submit(KALLSYMS_LOOKUP_NAME, p.event)) 4635 return 0; 4636 4637 char *name = (char *) saved_args.args[0]; 4638 unsigned long address = PT_REGS_RC(ctx); 4639 4640 save_str_to_buf(&p.event->args_buf, name, 0); 4641 save_to_submit_buf(&p.event->args_buf, &address, sizeof(unsigned long), 1); 4642 4643 return events_perf_submit(&p, KALLSYMS_LOOKUP_NAME, 0); 4644 } 4645 4646 enum signal_handling_method_e { 4647 SIG_DFL, 4648 SIG_IGN, 4649 SIG_HND = 2 // Doesn't exist in the kernel, but signifies that the method is through 4650 // user-defined handler 4651 }; 4652 4653 SEC("kprobe/do_sigaction") 4654 int BPF_KPROBE(trace_do_sigaction) 4655 { 4656 program_data_t p = {}; 4657 if (!init_program_data(&p, ctx)) 4658 return 0; 4659 4660 if (!should_trace(&p)) 4661 return 0; 4662 4663 if (!should_submit(DO_SIGACTION, p.event)) 4664 return 0; 4665 4666 // Initialize all relevant arguments values 4667 int sig = (int) PT_REGS_PARM1(ctx); 4668 u8 old_handle_method = 0, new_handle_method = 0; 4669 unsigned long new_sa_flags, old_sa_flags; 4670 void *new_sa_handler, *old_sa_handler; 4671 unsigned long new_sa_mask, old_sa_mask; 4672 4673 // Extract old signal handler values 4674 struct task_struct *task = p.task; 4675 struct sighand_struct *sighand = BPF_CORE_READ(task, sighand); 4676 struct k_sigaction *sig_actions = &(sighand->action[0]); 4677 if (sig > 0 && sig < _NSIG) { 4678 struct k_sigaction *old_act = get_node_addr(sig_actions, sig - 1); 4679 old_sa_flags = BPF_CORE_READ(old_act, sa.sa_flags); 4680 // In 64-bit system there is only 1 node in the mask array 4681 old_sa_mask = BPF_CORE_READ(old_act, sa.sa_mask.sig[0]); 4682 old_sa_handler = BPF_CORE_READ(old_act, sa.sa_handler); 4683 if (old_sa_handler >= (void *) SIG_HND) 4684 old_handle_method = SIG_HND; 4685 else { 4686 old_handle_method = (u8) (old_sa_handler && 0xFF); 4687 old_sa_handler = NULL; 4688 } 4689 } 4690 4691 // Check if a pointer for storing old signal handler is given 4692 struct k_sigaction *recv_old_act = (struct k_sigaction *) PT_REGS_PARM3(ctx); 4693 bool old_act_initialized = recv_old_act != NULL; 4694 4695 // Extract new signal handler values if 
initialized 4696 struct k_sigaction *new_act = (struct k_sigaction *) PT_REGS_PARM2(ctx); 4697 bool new_act_initialized = new_act != NULL; 4698 if (new_act_initialized) { 4699 struct sigaction *new_sigaction = &new_act->sa; 4700 new_sa_flags = BPF_CORE_READ(new_sigaction, sa_flags); 4701 // In 64-bit system there is only 1 node in the mask array 4702 new_sa_mask = BPF_CORE_READ(new_sigaction, sa_mask.sig[0]); 4703 new_sa_handler = BPF_CORE_READ(new_sigaction, sa_handler); 4704 if (new_sa_handler >= (void *) SIG_HND) 4705 new_handle_method = SIG_HND; 4706 else { 4707 new_handle_method = (u8) (new_sa_handler && 0xFF); 4708 new_sa_handler = NULL; 4709 } 4710 } 4711 4712 save_to_submit_buf(&p.event->args_buf, &sig, sizeof(int), 0); 4713 save_to_submit_buf(&p.event->args_buf, &new_act_initialized, sizeof(bool), 1); 4714 if (new_act_initialized) { 4715 save_to_submit_buf(&p.event->args_buf, &new_sa_flags, sizeof(unsigned long), 2); 4716 save_to_submit_buf(&p.event->args_buf, &new_sa_mask, sizeof(unsigned long), 3); 4717 save_to_submit_buf(&p.event->args_buf, &new_handle_method, sizeof(u8), 4); 4718 save_to_submit_buf(&p.event->args_buf, &new_sa_handler, sizeof(void *), 5); 4719 } 4720 save_to_submit_buf(&p.event->args_buf, &old_act_initialized, sizeof(bool), 6); 4721 save_to_submit_buf(&p.event->args_buf, &old_sa_flags, sizeof(unsigned long), 7); 4722 save_to_submit_buf(&p.event->args_buf, &old_sa_mask, sizeof(unsigned long), 8); 4723 save_to_submit_buf(&p.event->args_buf, &old_handle_method, sizeof(u8), 9); 4724 save_to_submit_buf(&p.event->args_buf, &old_sa_handler, sizeof(void *), 10); 4725 4726 return events_perf_submit(&p, DO_SIGACTION, 0); 4727 } 4728 4729 statfunc int common_utimes(struct pt_regs *ctx) 4730 { 4731 program_data_t p = {}; 4732 if (!init_program_data(&p, ctx)) 4733 return 0; 4734 4735 if (!should_trace(&p)) 4736 return 0; 4737 4738 if (!should_submit(VFS_UTIMES, p.event)) 4739 return 0; 4740 4741 struct path *path = (struct path *) PT_REGS_PARM1(ctx); 4742 struct timespec64 *times = (struct timespec64 *) PT_REGS_PARM2(ctx); 4743 4744 void *path_str = get_path_str(path); 4745 4746 struct dentry *dentry = BPF_CORE_READ(path, dentry); 4747 u64 inode_nr = get_inode_nr_from_dentry(dentry); 4748 dev_t dev = get_dev_from_dentry(dentry); 4749 4750 u64 atime = get_time_nanosec_timespec(times); 4751 u64 mtime = get_time_nanosec_timespec(×[1]); 4752 4753 save_str_to_buf(&p.event->args_buf, path_str, 0); 4754 save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 1); 4755 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(u64), 2); 4756 save_to_submit_buf(&p.event->args_buf, &atime, sizeof(u64), 3); 4757 save_to_submit_buf(&p.event->args_buf, &mtime, sizeof(u64), 4); 4758 4759 return events_perf_submit(&p, VFS_UTIMES, 0); 4760 } 4761 4762 SEC("kprobe/vfs_utimes") 4763 int BPF_KPROBE(trace_vfs_utimes) 4764 { 4765 return common_utimes(ctx); 4766 } 4767 4768 SEC("kprobe/utimes_common") 4769 int BPF_KPROBE(trace_utimes_common) 4770 { 4771 return common_utimes(ctx); 4772 } 4773 4774 SEC("kprobe/do_truncate") 4775 int BPF_KPROBE(trace_do_truncate) 4776 { 4777 program_data_t p = {}; 4778 if (!init_program_data(&p, ctx)) 4779 return 0; 4780 4781 if (!should_trace(&p)) 4782 return 0; 4783 4784 if (!should_submit(DO_TRUNCATE, p.event)) 4785 return 0; 4786 4787 struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx); 4788 u64 length = (long) PT_REGS_PARM3(ctx); 4789 4790 void *dentry_path = get_dentry_path_str(dentry); 4791 unsigned long inode_nr = get_inode_nr_from_dentry(dentry); 
4792 dev_t dev = get_dev_from_dentry(dentry); 4793 4794 save_str_to_buf(&p.event->args_buf, dentry_path, 0); 4795 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 1); 4796 save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2); 4797 save_to_submit_buf(&p.event->args_buf, &length, sizeof(u64), 3); 4798 4799 return events_perf_submit(&p, DO_TRUNCATE, 0); 4800 } 4801 4802 SEC("kprobe/fd_install") 4803 int BPF_KPROBE(trace_fd_install) 4804 { 4805 program_data_t p = {}; 4806 if (!init_program_data(&p, ctx)) 4807 return 0; 4808 4809 if (!should_trace(&p)) 4810 return 0; 4811 4812 struct file *file = (struct file *) PT_REGS_PARM2(ctx); 4813 4814 // check if regular file. otherwise don't save the file_mod_key_t in file_modification_map. 4815 unsigned short file_mode = get_inode_mode_from_file(file); 4816 if ((file_mode & S_IFMT) != S_IFREG) { 4817 return 0; 4818 } 4819 4820 file_info_t file_info = get_file_info(file); 4821 4822 file_mod_key_t file_mod_key = { 4823 p.task_info->context.host_pid, file_info.id.device, file_info.id.inode}; 4824 int op = FILE_MODIFICATION_SUBMIT; 4825 4826 bpf_map_update_elem(&file_modification_map, &file_mod_key, &op, BPF_ANY); 4827 4828 return 0; 4829 } 4830 4831 SEC("kprobe/filp_close") 4832 int BPF_KPROBE(trace_filp_close) 4833 { 4834 program_data_t p = {}; 4835 if (!init_program_data(&p, ctx)) 4836 return 0; 4837 4838 if (!should_trace(&p)) 4839 return 0; 4840 4841 struct file *file = (struct file *) PT_REGS_PARM1(ctx); 4842 file_info_t file_info = get_file_info(file); 4843 4844 file_mod_key_t file_mod_key = { 4845 p.task_info->context.host_pid, file_info.id.device, file_info.id.inode}; 4846 4847 bpf_map_delete_elem(&file_modification_map, &file_mod_key); 4848 4849 return 0; 4850 } 4851 4852 statfunc int common_file_modification_ent(struct pt_regs *ctx) 4853 { 4854 struct file *file = (struct file *) PT_REGS_PARM1(ctx); 4855 4856 // check if regular file. otherwise don't output the event. 4857 unsigned short file_mode = get_inode_mode_from_file(file); 4858 if ((file_mode & S_IFMT) != S_IFREG) { 4859 return 0; 4860 } 4861 4862 u64 ctime = get_ctime_nanosec_from_file(file); 4863 4864 args_t args = {}; 4865 args.args[0] = (unsigned long) file; 4866 args.args[1] = ctime; 4867 save_args(&args, FILE_MODIFICATION); 4868 4869 return 0; 4870 } 4871 4872 statfunc int common_file_modification_ret(struct pt_regs *ctx) 4873 { 4874 args_t saved_args; 4875 if (load_args(&saved_args, FILE_MODIFICATION) != 0) 4876 return 0; 4877 del_args(FILE_MODIFICATION); 4878 4879 program_data_t p = {}; 4880 if (!init_program_data(&p, ctx)) 4881 return 0; 4882 4883 if (!should_trace(&p)) 4884 return 0; 4885 4886 if (!should_submit(FILE_MODIFICATION, p.event)) 4887 return 0; 4888 4889 struct file *file = (struct file *) saved_args.args[0]; 4890 u64 old_ctime = saved_args.args[1]; 4891 4892 file_info_t file_info = get_file_info(file); 4893 4894 file_mod_key_t file_mod_key = { 4895 p.task_info->context.host_pid, file_info.id.device, file_info.id.inode}; 4896 4897 int *op = bpf_map_lookup_elem(&file_modification_map, &file_mod_key); 4898 if (op == NULL || *op == FILE_MODIFICATION_SUBMIT) { 4899 // we should submit the event once and mark as done. 4900 int op = FILE_MODIFICATION_DONE; 4901 bpf_map_update_elem(&file_modification_map, &file_mod_key, &op, BPF_ANY); 4902 } else { 4903 // no need to submit. return. 
4904 return 0; 4905 } 4906 4907 save_str_to_buf(&p.event->args_buf, file_info.pathname_p, 0); 4908 save_to_submit_buf(&p.event->args_buf, &file_info.id.device, sizeof(dev_t), 1); 4909 save_to_submit_buf(&p.event->args_buf, &file_info.id.inode, sizeof(unsigned long), 2); 4910 save_to_submit_buf(&p.event->args_buf, &old_ctime, sizeof(u64), 3); 4911 save_to_submit_buf(&p.event->args_buf, &file_info.id.ctime, sizeof(u64), 4); 4912 4913 events_perf_submit(&p, FILE_MODIFICATION, 0); 4914 4915 return 0; 4916 } 4917 4918 SEC("kprobe/file_update_time") 4919 int BPF_KPROBE(trace_file_update_time) 4920 { 4921 return common_file_modification_ent(ctx); 4922 } 4923 4924 SEC("kretprobe/file_update_time") 4925 int BPF_KPROBE(trace_ret_file_update_time) 4926 { 4927 return common_file_modification_ret(ctx); 4928 } 4929 4930 SEC("kprobe/file_modified") 4931 int BPF_KPROBE(trace_file_modified) 4932 { 4933 /* 4934 * we want this probe to run only on kernel versions >= 6. 4935 * this is because on older kernels the file_modified() function calls the file_update_time() 4936 * function. in those cases, we don't need this probe active. 4937 */ 4938 if (bpf_core_field_exists(((struct file *) 0)->f_iocb_flags)) { 4939 /* kernel version >= 6 */ 4940 return common_file_modification_ent(ctx); 4941 } 4942 4943 return 0; 4944 } 4945 4946 SEC("kretprobe/file_modified") 4947 int BPF_KPROBE(trace_ret_file_modified) 4948 { 4949 /* 4950 * we want this probe to run only on kernel versions >= 6. 4951 * this is because on older kernels the file_modified() function calls the file_update_time() 4952 * function. in those cases, we don't need this probe active. 4953 */ 4954 if (bpf_core_field_exists(((struct file *) 0)->f_iocb_flags)) { 4955 /* kernel version >= 6 */ 4956 return common_file_modification_ret(ctx); 4957 } 4958 4959 return 0; 4960 } 4961 4962 SEC("kprobe/inotify_find_inode") 4963 TRACE_ENT_FUNC(inotify_find_inode, INOTIFY_WATCH); 4964 4965 SEC("kretprobe/inotify_find_inode") 4966 int BPF_KPROBE(trace_ret_inotify_find_inode) 4967 { 4968 args_t saved_args; 4969 if (load_args(&saved_args, INOTIFY_WATCH) != 0) 4970 return 0; 4971 del_args(INOTIFY_WATCH); 4972 4973 program_data_t p = {}; 4974 if (!init_program_data(&p, ctx)) 4975 return 0; 4976 4977 if (!should_trace(&p)) 4978 return 0; 4979 4980 if (!should_submit(INOTIFY_WATCH, p.event)) 4981 return 0; 4982 4983 struct path *path = (struct path *) saved_args.args[1]; 4984 4985 void *path_str = get_path_str(path); 4986 4987 struct dentry *dentry = BPF_CORE_READ(path, dentry); 4988 u64 inode_nr = get_inode_nr_from_dentry(dentry); 4989 dev_t dev = get_dev_from_dentry(dentry); 4990 4991 save_str_to_buf(&p.event->args_buf, path_str, 0); 4992 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 1); 4993 save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2); 4994 4995 return events_perf_submit(&p, INOTIFY_WATCH, 0); 4996 } 4997 4998 SEC("kprobe/exec_binprm") 4999 TRACE_ENT_FUNC(exec_binprm, EXEC_BINPRM); 5000 5001 SEC("kretprobe/exec_binprm") 5002 int BPF_KPROBE(trace_ret_exec_binprm) 5003 { 5004 args_t saved_args; 5005 if (load_args(&saved_args, EXEC_BINPRM) != 0) { 5006 // missed entry or not traced 5007 return 0; 5008 } 5009 del_args(EXEC_BINPRM); 5010 5011 program_data_t p = {}; 5012 if (!init_program_data(&p, ctx)) 5013 return 0; 5014 5015 if (!should_trace(&p)) 5016 return 0; 5017 5018 if (!should_submit(PROCESS_EXECUTION_FAILED, p.event)) 5019 return 0; 5020 5021 int ret_val = PT_REGS_RC(ctx); 5022 if (ret_val == 0) 5023 return 0; // not 
interested of successful execution - for that we have sched_process_exec 5024 5025 struct linux_binprm *bprm = (struct linux_binprm *) saved_args.args[0]; 5026 if (bprm == NULL) { 5027 return -1; 5028 } 5029 5030 struct file *file = get_file_ptr_from_bprm(bprm); 5031 5032 const char *path = get_binprm_filename(bprm); 5033 save_str_to_buf(&p.event->args_buf, (void *) path, 0); 5034 5035 void *binary_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 5036 save_str_to_buf(&p.event->args_buf, binary_path, 1); 5037 5038 dev_t binary_device_id = get_dev_from_file(file); 5039 save_to_submit_buf(&p.event->args_buf, &binary_device_id, sizeof(dev_t), 2); 5040 5041 unsigned long binary_inode_number = get_inode_nr_from_file(file); 5042 save_to_submit_buf(&p.event->args_buf, &binary_inode_number, sizeof(unsigned long), 3); 5043 5044 u64 binary_ctime = get_ctime_nanosec_from_file(file); 5045 save_to_submit_buf(&p.event->args_buf, &binary_ctime, sizeof(u64), 4); 5046 5047 umode_t binary_inode_mode = get_inode_mode_from_file(file); 5048 save_to_submit_buf(&p.event->args_buf, &binary_inode_mode, sizeof(umode_t), 5); 5049 5050 const char *interpreter_path = get_binprm_interp(bprm); 5051 save_str_to_buf(&p.event->args_buf, (void *) interpreter_path, 6); 5052 5053 bpf_tail_call(ctx, &prog_array, TAIL_EXEC_BINPRM1); 5054 return -1; 5055 } 5056 5057 SEC("kretprobe/trace_ret_exec_binprm1") 5058 int BPF_KPROBE(trace_ret_exec_binprm1) 5059 { 5060 program_data_t p = {}; 5061 if (!init_tailcall_program_data(&p, ctx)) 5062 return -1; 5063 5064 struct task_struct *task = (struct task_struct *) bpf_get_current_task(); 5065 struct file *stdin_file = get_struct_file_from_fd(0); 5066 5067 unsigned short stdin_type = get_inode_mode_from_file(stdin_file) & S_IFMT; 5068 save_to_submit_buf(&p.event->args_buf, &stdin_type, sizeof(unsigned short), 7); 5069 5070 void *stdin_path = get_path_str(__builtin_preserve_access_index(&stdin_file->f_path)); 5071 save_str_to_buf(&p.event->args_buf, stdin_path, 8); 5072 5073 int kernel_invoked = (get_task_parent_flags(task) & PF_KTHREAD) ? 
1 : 0; 5074 save_to_submit_buf(&p.event->args_buf, &kernel_invoked, sizeof(int), 9); 5075 5076 bpf_tail_call(ctx, &prog_array, TAIL_EXEC_BINPRM2); 5077 return -1; 5078 } 5079 5080 SEC("kretprobe/trace_ret_exec_binprm2") 5081 int BPF_KPROBE(trace_ret_exec_binprm2) 5082 { 5083 program_data_t p = {}; 5084 if (!init_tailcall_program_data(&p, ctx)) 5085 return -1; 5086 5087 syscall_data_t *sys = &p.task_info->syscall_data; 5088 save_str_arr_to_buf( 5089 &p.event->args_buf, (const char *const *) sys->args.args[1], 10); // userspace argv 5090 5091 if (p.config->options & OPT_EXEC_ENV) { 5092 save_str_arr_to_buf( 5093 &p.event->args_buf, (const char *const *) sys->args.args[2], 11); // userspace envp 5094 } 5095 5096 int ret = PT_REGS_RC(ctx); // needs to be int 5097 5098 return events_perf_submit(&p, PROCESS_EXECUTION_FAILED, ret); 5099 } 5100 5101 SEC("kprobe/security_path_notify") 5102 int BPF_KPROBE(trace_security_path_notify) 5103 { 5104 program_data_t p = {}; 5105 if (!init_program_data(&p, ctx)) 5106 return 0; 5107 5108 if (!should_trace(&p)) 5109 return 0; 5110 5111 if (!should_submit(SECURITY_PATH_NOTIFY, p.event)) 5112 return 0; 5113 5114 struct path *path = (struct path *) PT_REGS_PARM1(ctx); 5115 void *path_str = get_path_str(path); 5116 struct dentry *dentry = BPF_CORE_READ(path, dentry); 5117 u64 inode_nr = get_inode_nr_from_dentry(dentry); 5118 dev_t dev = get_dev_from_dentry(dentry); 5119 5120 u64 mask = PT_REGS_PARM2(ctx); 5121 unsigned int obj_type = PT_REGS_PARM3(ctx); 5122 5123 save_str_to_buf(&p.event->args_buf, path_str, 0); 5124 save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 1); 5125 save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2); 5126 save_to_submit_buf(&p.event->args_buf, &mask, sizeof(u64), 3); 5127 save_to_submit_buf(&p.event->args_buf, &obj_type, sizeof(unsigned int), 4); 5128 5129 return events_perf_submit(&p, SECURITY_PATH_NOTIFY, 0); 5130 } 5131 5132 // clang-format off 5133 5134 // Network Packets (works from ~5.2 and beyond) 5135 5136 // To track ingress/egress traffic we always need to link a flow to its related 5137 // task (particularly when hooking ingress skb bpf programs, where the current 5138 // task is typically a kernel thread). 5139 5140 // In older kernels, managing cgroup skb programs can be more difficult due to 5141 // the lack of bpf helpers and buggy/incomplete verifier. To deal with this, 5142 // this approach uses a technique of kprobing the function responsible for 5143 // calling the cgroup/skb programs. 5144 5145 // Tracee utilizes a technique of kprobing the function responsible for calling 5146 // the cgroup/skb programs in order to perform the tasks which cgroup skb 5147 // programs would usually accomplish. Through this method, all the data needed 5148 // by the cgroup/skb programs is already stored in a map. 5149 5150 // Unfortunately this approach has some cons: the kprobe to cgroup/skb execution 5151 // flow does not have preemption disabled, so the map used in between all the 5152 // hooks need to use as a key something that is available to all the hooks 5153 // context (the packet contents themselves: e.g. L3 header fields). 5154 5155 // At the end, the logic is simple: every time a socket is created an inode is 5156 // also created. The task owning the socket is indexed by the socket inode so 5157 // everytime this socket is used we know which task it belongs to (specially 5158 // during ingress hook, executed from the softirq context within a kthread). 
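// ---------------------------------------------------------------------------
// Illustrative aside (not part of the eBPF program above): a minimal userspace
// sketch of the "socket inode => owning task" index described in the comments
// above. The real implementation uses the BPF hash map `inodemap` (declared in
// maps.h); the table, struct and function names below are hypothetical and only
// model the idea that a packet can be attributed to a task by first resolving
// the inode of the socket it belongs to, even when the packet is processed by
// a kernel thread.
#include <stdint.h>
#include <stdio.h>

struct owner_entry {
    uint64_t inode;    // socket inode number (key)
    uint32_t host_pid; // task that created/used the socket (value)
};

#define OWNER_SLOTS 64
static struct owner_entry owner_table[OWNER_SLOTS];

// Called when a socket is created/used by a task (kprobe side in the real code).
static void record_socket_owner(uint64_t inode, uint32_t host_pid)
{
    struct owner_entry *e = &owner_table[inode % OWNER_SLOTS];
    e->inode = inode;
    e->host_pid = host_pid;
}

// Called when a packet is seen (cgroup/skb side in the real code, possibly from
// softirq context inside a kthread, where "current task" is meaningless).
static uint32_t lookup_owner_by_inode(uint64_t inode)
{
    struct owner_entry *e = &owner_table[inode % OWNER_SLOTS];
    return (e->inode == inode) ? e->host_pid : 0;
}

int main(void)
{
    record_socket_owner(123456, 4242);                               // socket() path
    printf("pid=%u\n", (unsigned) lookup_owner_by_inode(123456));    // ingress path -> 4242
    return 0;
}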
5159 5160 // 5161 // network helper functions 5162 // 5163 5164 statfunc bool is_family_supported(struct socket *sock) 5165 { 5166 struct sock *sk = (void *) BPF_CORE_READ(sock, sk); 5167 struct sock_common *common = (void *) sk; 5168 u8 family = BPF_CORE_READ(common, skc_family); 5169 5170 switch (family) { 5171 case PF_INET: 5172 case PF_INET6: 5173 break; 5174 // case PF_UNSPEC: 5175 // case PF_LOCAL: // PF_UNIX or PF_FILE 5176 // case PF_NETLINK: 5177 // case PF_VSOCK: 5178 // case PF_XDP: 5179 // case PF_BRIDGE: 5180 // case PF_PACKET: 5181 // case PF_MPLS: 5182 // case PF_BLUETOOTH: 5183 // case PF_IB: 5184 // ... 5185 default: 5186 return 0; // not supported 5187 } 5188 5189 return 1; // supported 5190 } 5191 5192 statfunc bool is_socket_supported(struct socket *sock) 5193 { 5194 struct sock *sk = (void *) BPF_CORE_READ(sock, sk); 5195 u16 protocol = get_sock_protocol(sk); 5196 switch (protocol) { 5197 // case IPPROTO_IPIP: 5198 // case IPPROTO_DCCP: 5199 // case IPPROTO_SCTP: 5200 // case IPPROTO_UDPLITE: 5201 case IPPROTO_IP: 5202 case IPPROTO_IPV6: 5203 case IPPROTO_TCP: 5204 case IPPROTO_UDP: 5205 case IPPROTO_ICMP: 5206 case IPPROTO_ICMPV6: 5207 break; 5208 default: 5209 return 0; // not supported 5210 } 5211 5212 return 1; // supported 5213 } 5214 5215 // 5216 // Support functions for network code 5217 // 5218 5219 statfunc u64 sizeof_net_event_context_t(void) 5220 { 5221 return sizeof(net_event_context_t) - sizeof(net_event_contextmd_t); 5222 } 5223 5224 statfunc void set_net_task_context(program_data_t *p, net_task_context_t *netctx) 5225 { 5226 netctx->task = p->task; 5227 netctx->matched_policies = p->event->context.matched_policies; 5228 netctx->syscall = p->event->context.syscall; 5229 __builtin_memset(&netctx->taskctx, 0, sizeof(task_context_t)); 5230 __builtin_memcpy(&netctx->taskctx, &p->event->context.task, sizeof(task_context_t)); 5231 5232 // Normally this will be set filled inside events_perf_submit but for some events like set_socket_state we 5233 // want to prefill full network context. 5234 init_task_context(&netctx->taskctx, p->task, p->config->options); 5235 } 5236 5237 statfunc enum event_id_e net_packet_to_net_event(net_packet_t packet_type) 5238 { 5239 switch (packet_type) { 5240 case CAP_NET_PACKET: 5241 return NET_PACKET_CAP_BASE; 5242 // Packets 5243 case SUB_NET_PACKET_IP: 5244 return NET_PACKET_IP; 5245 case SUB_NET_PACKET_TCP: 5246 return NET_PACKET_TCP; 5247 case SUB_NET_PACKET_UDP: 5248 return NET_PACKET_UDP; 5249 case SUB_NET_PACKET_ICMP: 5250 return NET_PACKET_ICMP; 5251 case SUB_NET_PACKET_ICMPV6: 5252 return NET_PACKET_ICMPV6; 5253 case SUB_NET_PACKET_DNS: 5254 return NET_PACKET_DNS; 5255 case SUB_NET_PACKET_HTTP: 5256 return NET_PACKET_HTTP; 5257 case SUB_NET_PACKET_SOCKS5: 5258 return NET_PACKET_SOCKS5; 5259 }; 5260 return MAX_EVENT_ID; 5261 } 5262 5263 // The address of &neteventctx->eventctx will be aligned as eventctx is the 5264 // first member of that packed struct. This is a false positive as we do need 5265 // the neteventctx struct to be all packed. 5266 #pragma clang diagnostic push 5267 #pragma clang diagnostic ignored "-Waddress-of-packed-member" 5268 5269 // Return if a network event should to be sumitted: if any of the policies 5270 // matched, submit the network event. This means that if any of the policies 5271 // need a network event, kernel can submit the network base event and let 5272 // userland deal with it (derived events will match the appropriate policies). 
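// ---------------------------------------------------------------------------
// Illustrative aside: the submission check defined below boils down to a
// bitwise AND between two 64-bit policy bitmaps. A standalone sketch with
// made-up bit values (the real bitmaps come from events_map and from the
// event context's matched_policies field):
#include <stdint.h>
#include <stdio.h>

static uint64_t policies_want_event(uint64_t submit_for_policies, uint64_t matched_policies)
{
    // A non-zero result means at least one matched policy wants this event.
    return submit_for_policies & matched_policies;
}

int main(void)
{
    uint64_t configured = 0x6ULL; // event enabled for policies 2 and 3 (bits 1 and 2)
    uint64_t matched    = 0x4ULL; // this flow matched policy 3 only
    printf("submit? %s\n", policies_want_event(configured, matched) ? "yes" : "no"); // yes
    return 0;
}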
5273 statfunc u64 should_submit_net_event(net_event_context_t *neteventctx, 5274 net_packet_t packet_type) 5275 { 5276 // TODO after v0.15.0: After some testing, the caching is never used, as the net context is 5277 // always a new one (created by the cgroup/skb program caller, AND there is a single map check 5278 // for each protocol, each protocol check for submission. Go back to changes made by commit 5279 // #4e9bb610049 ("network: ebpf: lazy submit checks for net events"), but still using enum and 5280 // better code (will improve the callers syntax as well). 5281 enum event_id_e evt_id = net_packet_to_net_event(packet_type); 5282 5283 event_config_t *evt_config = bpf_map_lookup_elem(&events_map, &evt_id); 5284 if (evt_config == NULL) 5285 return 0; 5286 5287 return evt_config->submit_for_policies & neteventctx->eventctx.matched_policies; 5288 } 5289 5290 #pragma clang diagnostic pop // -Waddress-of-packed-member 5291 5292 // Return if a network flow event should be submitted. 5293 statfunc bool should_submit_flow_event(net_event_context_t *neteventctx) 5294 { 5295 switch (neteventctx->md.should_flow) { 5296 case 0: 5297 break; 5298 case 1: 5299 return true; 5300 case 2: 5301 return false; 5302 } 5303 5304 u32 evt_id = NET_FLOW_BASE; 5305 5306 // Again, if any policy matched, submit the flow base event so other flow 5307 // events can be derived in userland and their policies matched in userland. 5308 event_config_t *evt_config = bpf_map_lookup_elem(&events_map, &evt_id); 5309 if (evt_config == NULL) 5310 return 0; 5311 5312 u64 should = evt_config->submit_for_policies & neteventctx->eventctx.matched_policies; 5313 5314 // Cache the result so next time we don't need to check again. 5315 if (should) 5316 neteventctx->md.should_flow = 1; // cache result: submit flow events 5317 else 5318 neteventctx->md.should_flow = 2; // cache result: don't submit flow events 5319 5320 return should ? true : false; 5321 } 5322 5323 // Return if a network capture event should be submitted. 5324 statfunc u64 should_capture_net_event(net_event_context_t *neteventctx, net_packet_t packet_type) 5325 { 5326 if (neteventctx->md.captured) // already captured 5327 return 0; 5328 5329 return should_submit_net_event(neteventctx, packet_type); 5330 } 5331 5332 // 5333 // Protocol parsing functions 5334 // 5335 5336 #define CGROUP_SKB_HANDLE_FUNCTION(name) \ 5337 statfunc u32 cgroup_skb_handle_##name( \ 5338 struct __sk_buff *ctx, \ 5339 net_event_context_t *neteventctx, \ 5340 nethdrs *nethdrs, \ 5341 bool ingress \ 5342 ) 5343 5344 CGROUP_SKB_HANDLE_FUNCTION(family); 5345 CGROUP_SKB_HANDLE_FUNCTION(proto); 5346 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp); 5347 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_dns); 5348 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_http); 5349 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_socks5); 5350 CGROUP_SKB_HANDLE_FUNCTION(proto_udp); 5351 CGROUP_SKB_HANDLE_FUNCTION(proto_udp_dns); 5352 CGROUP_SKB_HANDLE_FUNCTION(proto_icmp); 5353 CGROUP_SKB_HANDLE_FUNCTION(proto_icmpv6); 5354 CGROUP_SKB_HANDLE_FUNCTION(proto_socks5); 5355 5356 #define CGROUP_SKB_HANDLE(name) cgroup_skb_handle_##name(ctx, neteventctx, nethdrs, ingress); 5357 5358 // 5359 // Network submission functions 5360 // 5361 5362 // Submit a network event (packet, capture, flow) to userland. 5363 statfunc u32 cgroup_skb_submit(void *map, struct __sk_buff *ctx, 5364 net_event_context_t *neteventctx, 5365 u32 event_type, u32 size) 5366 { 5367 size = size > FULL ? 
FULL : size; 5368 switch (size) { 5369 case HEADERS: // submit only headers 5370 size = neteventctx->md.header_size; 5371 break; 5372 case FULL: // submit full packet 5373 size = ctx->len; 5374 break; 5375 default: // submit size bytes 5376 size += neteventctx->md.header_size; 5377 size = size > ctx->len ? ctx->len : size; 5378 break; 5379 } 5380 5381 // Flag eBPF subsystem to use current CPU and copy size bytes of payload. 5382 u64 flags = BPF_F_CURRENT_CPU | (u64) size << 32; 5383 neteventctx->bytes = size; 5384 5385 // Set the event type before submitting event. 5386 neteventctx->eventctx.eventid = event_type; 5387 5388 // Submit the event. 5389 return bpf_perf_event_output(ctx, map, flags, neteventctx, sizeof_net_event_context_t()); 5390 } 5391 5392 // Submit a network event. 5393 #define cgroup_skb_submit_event(a, b, c, d) cgroup_skb_submit(&events, a, b, c, d) 5394 5395 // Check if a flag is set in the retval. 5396 #define retval_hasflag(flag) (neteventctx->eventctx.retval & flag) == flag 5397 5398 statfunc void update_flow_stats(struct __sk_buff *skb, netflowvalue_t *val, bool ingress) { 5399 if (ingress) { 5400 val->rx_bytes += skb->len; 5401 val->rx_packets += 1; 5402 } else { 5403 val->tx_bytes += skb->len; 5404 val->tx_packets += 1; 5405 } 5406 } 5407 5408 statfunc void reset_flow_stats(netflowvalue_t *val) { 5409 5410 val->tx_bytes = 0; 5411 val->rx_bytes = 0; 5412 val->tx_packets = 0; 5413 val->rx_packets = 0; 5414 } 5415 5416 statfunc u32 submit_netflow_event(struct __sk_buff *ctx, net_event_context_t *neteventctx, netflowvalue_t *netflowvalptr) { 5417 event_data_t *e = init_netflows_event_data(); 5418 if (unlikely(e == NULL)) 5419 return 0; 5420 5421 __builtin_memcpy(&e->context.task, &neteventctx->eventctx.task, sizeof(task_context_t)); 5422 e->context.retval = neteventctx->eventctx.retval; 5423 5424 save_to_submit_buf_kernel(&e->args_buf, (void *) &neteventctx->md.flow.proto, sizeof(u8), 0); 5425 save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->direction, sizeof(u8), 1); 5426 save_to_submit_buf_kernel(&e->args_buf, (void *) &neteventctx->md.flow.tuple, sizeof(tuple_t), 2); 5427 save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->tx_bytes, sizeof(u64), 3); 5428 save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->rx_bytes, sizeof(u64), 4); 5429 save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->tx_packets, sizeof(u64), 5); 5430 save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->rx_packets, sizeof(u64), 6); 5431 net_events_perf_submit(ctx, NET_FLOW_BASE, e); 5432 return 0; 5433 } 5434 5435 // Keep track of a flow event if they are enabled and if any policy matched. 5436 // Submit the flow base event so userland can derive the flow events. 5437 statfunc u32 cgroup_skb_handle_flow(struct __sk_buff *ctx, 5438 net_event_context_t *neteventctx, 5439 u32 event_type, u32 size, u32 flow) 5440 { 5441 netflowvalue_t *netflowvalptr, netflowvalue = { 5442 .last_update = bpf_ktime_get_ns(), 5443 .direction = flow_unknown, 5444 }; 5445 5446 // Set the current netctx task as the flow task. 5447 neteventctx->md.flow.host_pid = neteventctx->eventctx.task.host_pid; 5448 5449 // Set the flow event type in retval. 5450 neteventctx->eventctx.retval |= flow; 5451 5452 // Check if the current packet source is the flow initiator. 5453 bool is_initiator = 0; 5454 bool ingress = 0; 5455 5456 switch (flow) { 5457 // 1) TCP connection is being established. 
5458 case flow_tcp_begin: 5459 // Ingress: Remote (src) is sending SYN+ACK: this host (dst) is the initiator. 5460 if (retval_hasflag(packet_ingress)) { 5461 netflowvalue.direction = flow_outgoing; 5462 ingress = 1; 5463 } 5464 5465 // Egress: Host (src) is sending SYN+ACK: remote (dst) host is the initiator. 5466 if (retval_hasflag(packet_egress)) 5467 netflowvalue.direction = flow_incoming; 5468 5469 // Invert src/dst: The flowmap src should always be set to flow initiator. 5470 neteventctx->md.flow = invert_netflow(neteventctx->md.flow); 5471 update_flow_stats(ctx, &netflowvalue, ingress); 5472 5473 // Update the flow map. 5474 bpf_map_update_elem(&netflowmap, &neteventctx->md.flow, &netflowvalue, BPF_NOEXIST); 5475 5476 return submit_netflow_event(ctx, neteventctx, &netflowvalue); 5477 // 2) TCP Flow sample with current statistics. 5478 case flow_tcp_sample: 5479 netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow); 5480 if (!netflowvalptr) { 5481 neteventctx->md.flow = invert_netflow(neteventctx->md.flow); 5482 netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow); 5483 if (!netflowvalptr) { 5484 return 0; 5485 } 5486 } 5487 5488 update_flow_stats(ctx, netflowvalptr, retval_hasflag(packet_ingress)); 5489 5490 u64 now = bpf_ktime_get_ns(); 5491 u64 last_submit_seconds = (now - netflowvalptr->last_update) / 1000000000; 5492 // Check if it's time to submit flow sample. 5493 if (last_submit_seconds >= global_config.flow_sample_submit_interval_seconds) { 5494 netflowvalptr->last_update = now; 5495 submit_netflow_event(ctx, neteventctx, netflowvalptr); 5496 reset_flow_stats(netflowvalptr); 5497 return 0; 5498 } 5499 5500 // Flow sample should not be submitted yet, exit. 5501 return 0; 5502 5503 // 3) TCP connection is being closed/terminated. 5504 case flow_tcp_end: 5505 // Any side can close the connection (FIN, RST, etc). Need heuristics. 5506 5507 // Attempt 01: Try to find the flow using current src/dst. 5508 for (int n = 0; n < 3; n++) { 5509 netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow); 5510 if (!netflowvalptr) 5511 continue; 5512 } 5513 5514 // FIN could be sent by either side, by both, or by none (RST). Need heuristics. 5515 if (!netflowvalptr) { 5516 // Attempt 02: Maybe this packet src wasn't the flow initiator, invert src/dst. 5517 neteventctx->md.flow = invert_netflow(neteventctx->md.flow); 5518 5519 for (int n = 0; n < 3; n++) { 5520 netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow); 5521 if (!netflowvalptr) 5522 continue; 5523 } 5524 5525 // After first FIN packet is processed the flow is deleted, so the second 5526 // FIN packet, if ever processed, will not find the flow in the map, and 5527 // that is ok. 5528 if (!netflowvalptr) 5529 return 0; 5530 5531 // Flow was found using inverted src/dst: current pkt dst was the flow initiator. 5532 is_initiator = 0; 5533 5534 } else { 5535 // Flow was found using current src/dst: current pkt src was the flow initiator. 5536 is_initiator = 1; 5537 } 5538 5539 // Inform userland the flow being terminated started by current packet src. 5540 // This is important so userland knows how to report flow termination correctly. 5541 if (is_initiator) 5542 neteventctx->eventctx.retval |= flow_src_initiator; 5543 5544 update_flow_stats(ctx, netflowvalptr, retval_hasflag(packet_ingress)); 5545 submit_netflow_event(ctx, neteventctx, netflowvalptr); 5546 5547 // Delete the flow from the map (make sure to delete both sides). 
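// ---------------------------------------------------------------------------
// Illustrative aside on the flow_tcp_sample branch above: the decision to emit
// a sample is simply "nanoseconds since the last submit, converted to seconds,
// is at least the configured interval". Standalone sketch (the helper name is
// hypothetical; the real interval lives in
// global_config.flow_sample_submit_interval_seconds):
#include <stdbool.h>
#include <stdint.h>

static bool sample_due(uint64_t now_ns, uint64_t last_update_ns, uint64_t interval_s)
{
    uint64_t elapsed_s = (now_ns - last_update_ns) / 1000000000ULL;
    return elapsed_s >= interval_s;
}
// e.g. with a 30s interval: due after 35s of traffic, not yet after 29s.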
5548 bpf_map_delete_elem(&netflowmap, &neteventctx->md.flow); 5549 neteventctx->md.flow = invert_netflow(neteventctx->md.flow); 5550 bpf_map_delete_elem(&netflowmap, &neteventctx->md.flow); 5551 5552 return 0; 5553 5554 // 3) TODO: UDP flow is considered started when the first packet is sent. 5555 // case flow_udp_begin: 5556 // 5557 // 4) TODO: UDP flow is considered terminated when socket is closed. 5558 // case flow_udp_end: 5559 // 5560 default: 5561 return 0; 5562 }; 5563 5564 return 0; 5565 }; 5566 5567 // Check if capture event should be submitted, cache the result and submit. 5568 #define cgroup_skb_capture() \ 5569 { \ 5570 if (should_submit_net_event(neteventctx, CAP_NET_PACKET)) { \ 5571 if (neteventctx->md.captured == 0) { \ 5572 cgroup_skb_capture_event(ctx, neteventctx, NET_CAPTURE_BASE); \ 5573 neteventctx->md.captured = 1; \ 5574 } \ 5575 } \ 5576 } 5577 5578 // Check if packet should be captured and submit the capture base event. 5579 statfunc u32 cgroup_skb_capture_event(struct __sk_buff *ctx, 5580 net_event_context_t *neteventctx, 5581 u32 event_type) 5582 { 5583 int zero = 0; 5584 5585 // Pick the network config map to know the requested capture length. 5586 netconfig_entry_t *nc = bpf_map_lookup_elem(&netconfig_map, &zero); 5587 if (nc == NULL) 5588 return 0; 5589 5590 // Submit the capture base event. 5591 return cgroup_skb_submit(&net_cap_events, ctx, neteventctx, event_type, nc->capture_length); 5592 } 5593 5594 // 5595 // Socket creation and socket <=> task context updates 5596 // 5597 5598 // Used to create a file descriptor for a socket. After a file descriptor is 5599 // created, it can be associated with the file operations of the socket, this 5600 // allows a socket to be used with the standard file operations (read, write, 5601 // etc). By having a file descriptor, kernel can keep track of the socket state, 5602 // and also the inode associated to the socket (which is used to link the socket 5603 // to a task). 5604 SEC("kprobe/sock_alloc_file") 5605 int BPF_KPROBE(trace_sock_alloc_file) 5606 { 5607 // runs every time a socket is created (entry) 5608 5609 struct socket *sock = (void *) PT_REGS_PARM1(ctx); 5610 5611 if (!is_family_supported(sock)) 5612 return 0; 5613 5614 if (!is_socket_supported(sock)) 5615 return 0; 5616 5617 struct entry entry = {0}; 5618 5619 // save args for retprobe 5620 entry.args[0] = PT_REGS_PARM1(ctx); // struct socket *sock 5621 entry.args[1] = PT_REGS_PARM2(ctx); // int flags 5622 entry.args[2] = PT_REGS_PARM2(ctx); // char *dname 5623 5624 // prepare for kretprobe using entrymap 5625 u32 host_tid = bpf_get_current_pid_tgid(); 5626 bpf_map_update_elem(&entrymap, &host_tid, &entry, BPF_ANY); 5627 5628 return 0; 5629 } 5630 5631 // Ditto. 
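// ---------------------------------------------------------------------------
// Illustrative aside: the entry/return probe pair around this point follows a
// common kprobe pattern - the entry probe stores its arguments in a map keyed
// by the host thread id, and the return probe consumes (and deletes) that
// entry. A minimal userspace model of the same save/consume contract (the
// array and names are hypothetical; the real code uses the BPF map `entrymap`):
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct saved_args { bool used; uint64_t args[3]; };

#define MAX_TIDS 1024
static struct saved_args entries[MAX_TIDS];

static void on_entry(uint32_t tid, uint64_t a0, uint64_t a1, uint64_t a2)
{
    struct saved_args *e = &entries[tid % MAX_TIDS];
    e->used = true;
    e->args[0] = a0; e->args[1] = a1; e->args[2] = a2;
}

static bool on_return(uint32_t tid, struct saved_args *out)
{
    struct saved_args *e = &entries[tid % MAX_TIDS];
    if (!e->used)
        return false;          // missed entry: nothing to correlate
    *out = *e;
    memset(e, 0, sizeof(*e));  // the bpf_map_delete_elem() equivalent
    return true;
}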
5632 SEC("kretprobe/sock_alloc_file") 5633 int BPF_KRETPROBE(trace_ret_sock_alloc_file) 5634 { 5635 // runs every time a socket is created (return) 5636 5637 program_data_t p = {}; 5638 if (!init_program_data(&p, ctx)) 5639 return 0; 5640 5641 if (!should_trace(&p)) 5642 return 0; 5643 5644 // pick from entry from entrymap 5645 u32 host_tid = p.event->context.task.host_tid; 5646 struct entry *entry = bpf_map_lookup_elem(&entrymap, &host_tid); 5647 if (!entry) // no entry == no tracing 5648 return 0; 5649 5650 // pick args from entry point's entry 5651 // struct socket *sock = (void *) entry->args[0]; 5652 // int flags = entry->args[1]; 5653 // char *dname = (void *) entry->args[2]; 5654 struct file *sock_file = (void *) PT_REGS_RC(ctx); 5655 5656 // cleanup entrymap 5657 bpf_map_delete_elem(&entrymap, &host_tid); 5658 5659 if (!sock_file) 5660 return 0; // socket() failed ? 5661 5662 u64 inode = BPF_CORE_READ(sock_file, f_inode, i_ino); 5663 if (inode == 0) 5664 return 0; 5665 5666 // save context to further create an event when no context exists 5667 net_task_context_t netctx = {0}; 5668 set_net_task_context(&p, &netctx); 5669 5670 // update inodemap correlating inode <=> task 5671 bpf_map_update_elem(&inodemap, &inode, &netctx, BPF_ANY); 5672 5673 return 0; 5674 } 5675 5676 SEC("kprobe/security_sk_clone") 5677 int BPF_KPROBE(trace_security_sk_clone) 5678 { 5679 // When a "sock" is cloned because of a SYN packet, a new "sock" is created 5680 // and the return value is the new "sock" (not the original one). 5681 // 5682 // There is a problem though, the "sock" does not contain a valid "socket" 5683 // associated to it yet (sk_socket is NULL as this is running with SoftIRQ 5684 // context). Without a "socket" we also don't have a "file" associated to 5685 // it, nor an inode associated to that file. This is the way tracee links 5686 // a network flow (packets) to a task. 5687 // 5688 // The only way we can relate this new "sock", just cloned by a kernel 5689 // thread, to a task, is through the existence of the old "sock" struct, 5690 // describing the listening socket (one accept() was called for). 5691 // 5692 // Then, by knowing the old "sock" (with an existing socket, an existing 5693 // file, an existing inode), we're able to link this new "sock" to the task 5694 // we're tracing for the old "sock". 5695 // 5696 // In bullets: 5697 // 5698 // - tracing a process that has a socket listening for connections. 5699 // - it receives a SYN packet and a new socket can be created (accept). 5700 // - a sock (socket descriptor) is created for the socket to be created. 5701 // - no socket/inode exists yet (sock->sk_socket is NULL). 5702 // - accept() traces are too late for initial pkts (socked does not exist). 5703 // - by linking old "sock" to the new "sock" we can relate the task. 5704 // - some of the initial packets, sometimes with big length, are traced now. 
5705 // 5706 // More at: https://github.com/aquasecurity/tracee/issues/2739 5707 5708 struct sock *osock = (void *) PT_REGS_PARM1(ctx); 5709 struct sock *nsock = (void *) PT_REGS_PARM2(ctx); 5710 5711 struct socket *osocket = BPF_CORE_READ(osock, sk_socket); 5712 if (!osocket) 5713 return 0; 5714 5715 // obtain old socket inode 5716 u64 inode = BPF_CORE_READ(osocket, file, f_inode, i_ino); 5717 if (inode == 0) 5718 return 0; 5719 5720 // check if old socket family is supported 5721 if (!is_family_supported(osocket)) 5722 return 0; 5723 5724 // if the original socket isn't linked to a task, then the newly cloned 5725 // socket won't need to be linked as well: return in that case 5726 5727 net_task_context_t *netctx = bpf_map_lookup_elem(&inodemap, &inode); 5728 if (!netctx) { 5729 return 0; // e.g. task isn't being traced 5730 } 5731 5732 u64 nsockptr = (u64)(void *) nsock; 5733 5734 // link the new "sock" to the old inode, so it can be linked to a task later 5735 5736 bpf_map_update_elem(&sockmap, &nsockptr, &inode, BPF_ANY); 5737 5738 return 0; 5739 } 5740 5741 // Associate a socket to a task. This is done by linking the socket inode to the 5742 // task context (inside netctx). This is done when a socket is created, and also 5743 // when a socket is cloned (e.g. when a SYN packet is received and a new socket 5744 // is created). 5745 statfunc u32 update_net_inodemap(struct socket *sock, program_data_t *p) 5746 { 5747 struct file *sock_file = BPF_CORE_READ(sock, file); 5748 if (!sock_file) 5749 return 0; 5750 5751 u64 inode = BPF_CORE_READ(sock_file, f_inode, i_ino); 5752 if (inode == 0) 5753 return 0; 5754 5755 // save updated context to the inode map (inode <=> task ctx relation) 5756 net_task_context_t netctx = {0}; 5757 set_net_task_context(p, &netctx); 5758 5759 bpf_map_update_elem(&inodemap, &inode, &netctx, BPF_ANY); 5760 5761 return 0; 5762 } 5763 5764 // Called by recv system calls (e.g. recvmsg, recvfrom, recv, ...), or when data 5765 // arrives at the network stack and is destined for a socket, or during socket 5766 // buffer management when kernel is copying data from the network buffer to the 5767 // socket buffer. 5768 SEC("kprobe/security_socket_recvmsg") 5769 int BPF_KPROBE(trace_security_socket_recvmsg) 5770 { 5771 struct socket *sock = (void *) PT_REGS_PARM1(ctx); 5772 if (sock == NULL) 5773 return 0; 5774 if (!is_family_supported(sock)) 5775 return 0; 5776 if (!is_socket_supported(sock)) 5777 return 0; 5778 5779 program_data_t p = {}; 5780 if (!init_program_data(&p, ctx)) 5781 return 0; 5782 5783 if (!should_trace(&p)) 5784 return 0; 5785 5786 return update_net_inodemap(sock, &p); 5787 } 5788 5789 // Called by send system calls (e.g. sendmsg, sendto, send, ...), or when data 5790 // is queued for transmission by the network stack, or during socket buffer 5791 // management when kernel is copying data from the socket buffer to the network 5792 // buffer. 
5793 SEC("kprobe/security_socket_sendmsg") 5794 int BPF_KPROBE(trace_security_socket_sendmsg) 5795 { 5796 struct socket *sock = (void *) PT_REGS_PARM1(ctx); 5797 if (sock == NULL) 5798 return 0; 5799 if (!is_family_supported(sock)) 5800 return 0; 5801 if (!is_socket_supported(sock)) 5802 return 0; 5803 5804 program_data_t p = {}; 5805 if (!init_program_data(&p, ctx)) 5806 return 0; 5807 5808 if (!should_trace(&p)) 5809 return 0; 5810 5811 return update_net_inodemap(sock, &p); 5812 } 5813 5814 // 5815 // Socket Ingress/Egress eBPF program loader (right before and right after eBPF) 5816 // 5817 5818 SEC("kprobe/__cgroup_bpf_run_filter_skb") 5819 int BPF_KPROBE(cgroup_bpf_run_filter_skb) 5820 { 5821 // runs BEFORE the CGROUP/SKB eBPF program 5822 5823 void *cgrpctxmap = NULL; 5824 5825 struct sock *sk = (void *) PT_REGS_PARM1(ctx); 5826 struct sk_buff *skb = (void *) PT_REGS_PARM2(ctx); 5827 int type = PT_REGS_PARM3(ctx); 5828 5829 if (!sk || !skb) 5830 return 0; 5831 5832 s64 packet_dir_flag; // used later to set packet direction flag 5833 switch (type) { 5834 case BPF_CGROUP_INET_INGRESS: 5835 cgrpctxmap = &cgrpctxmap_in; 5836 packet_dir_flag = packet_ingress; 5837 break; 5838 case BPF_CGROUP_INET_EGRESS: 5839 cgrpctxmap = &cgrpctxmap_eg; 5840 packet_dir_flag = packet_egress; 5841 break; 5842 default: 5843 return 0; // other attachment type, return fast 5844 } 5845 5846 struct sock_common *common = (void *) sk; 5847 u8 family = BPF_CORE_READ(common, skc_family); 5848 5849 switch (family) { 5850 case PF_INET: 5851 case PF_INET6: 5852 break; 5853 default: 5854 return 1; // return fast for unsupported socket families 5855 } 5856 5857 // 5858 // EVENT CONTEXT (from current task, might be a kernel context/thread) 5859 // 5860 5861 u32 zero = 0; 5862 event_data_t *e = bpf_map_lookup_elem(&net_heap_event, &zero); 5863 if (unlikely(e == NULL)) 5864 return 0; 5865 5866 program_data_t p = {}; 5867 p.scratch_idx = 1; 5868 p.event = e; 5869 if (!init_program_data(&p, ctx)) 5870 return 0; 5871 5872 bool mightbecloned = false; // cloned sock structs come from accept() 5873 5874 // obtain the socket inode using current "sock" structure 5875 5876 u64 inode = BPF_CORE_READ(sk, sk_socket, file, f_inode, i_ino); 5877 if (inode == 0) 5878 mightbecloned = true; // kernel threads might have zero inode 5879 5880 struct net_task_context *netctx; 5881 5882 // obtain the task ctx using the obtained socket inode 5883 5884 if (!mightbecloned) { 5885 // pick network context from the inodemap (inode <=> task) 5886 netctx = bpf_map_lookup_elem(&inodemap, &inode); 5887 if (!netctx) 5888 mightbecloned = true; // e.g. task isn't being traced 5889 } 5890 5891 // If inode is zero, or task context couldn't be found, try to find it using 5892 // the "sock" pointer from sockmap (this sock struct might be new, just 5893 // cloned, and a socket might not exist yet, but the sockmap is likely to 5894 // have the entry). Check trace_security_sk_clone() for more details. 
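// ---------------------------------------------------------------------------
// Illustrative aside: the lookup performed below has two steps - try the socket
// inode directly (inodemap), and only if that fails fall back to the cloned
// "sock" pointer (sockmap) to recover the listening socket's inode. Standalone
// sketch of that decision flow; the toy tables and helper names are made up
// stand-ins for bpf_map_lookup_elem() on inodemap/sockmap:
#include <stdint.h>
#include <stdio.h>

static uint64_t inodemap_keys[4]  = { 111, 0, 0, 0 };
static uint32_t inodemap_tasks[4] = { 4242, 0, 0, 0 };   // inode -> host pid
static uint64_t sockmap_keys[4]   = { 0xbeef, 0, 0, 0 };
static uint64_t sockmap_inodes[4] = { 111, 0, 0, 0 };    // sock ptr -> old inode

static uint32_t inodemap_lookup(uint64_t inode)
{
    for (int i = 0; i < 4; i++)
        if (inodemap_keys[i] == inode)
            return inodemap_tasks[i];
    return 0;
}

static uint64_t sockmap_lookup(uint64_t sk_ptr)
{
    for (int i = 0; i < 4; i++)
        if (sockmap_keys[i] == sk_ptr)
            return sockmap_inodes[i];
    return 0;
}

// Mirrors the decision flow: inode first, cloned-sock fallback second.
static uint32_t resolve_task(uint64_t inode, uint64_t sk_ptr)
{
    uint32_t pid = inode ? inodemap_lookup(inode) : 0;
    if (pid == 0) {
        uint64_t old_inode = sockmap_lookup(sk_ptr);
        if (old_inode == 0)
            return 0;                     // not traced
        pid = inodemap_lookup(old_inode); // inherit the listener's task
    }
    return pid;
}

int main(void)
{
    printf("direct:   %u\n", (unsigned) resolve_task(111, 0));    // 4242
    printf("fallback: %u\n", (unsigned) resolve_task(0, 0xbeef)); // 4242 via old inode
    return 0;
}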
5895 5896 if (mightbecloned) { 5897 // pick network context from the sockmap (new sockptr <=> old inode <=> task) 5898 u64 skptr = (u64) (void *) sk; 5899 u64 *o = bpf_map_lookup_elem(&sockmap, &skptr); 5900 if (o == 0) 5901 return 0; 5902 u64 oinode = *o; 5903 5904 // with the old inode, find the netctx for the task 5905 netctx = bpf_map_lookup_elem(&inodemap, &oinode); 5906 if (!netctx) 5907 return 0; // old inode wasn't being traced as well 5908 5909 // update inodemap w/ new inode <=> task context (faster path next time) 5910 bpf_map_update_elem(&inodemap, &oinode, netctx, BPF_ANY); 5911 } 5912 5913 // CHECK: should_submit_net_event() for more info 5914 #pragma clang diagnostic push 5915 #pragma clang diagnostic ignored "-Waddress-of-packed-member" 5916 5917 // 5918 // PREPARE SKG PROGRAM EVENT CONTEXT (cgrpctxmap value) 5919 // 5920 5921 // Prepare [event_context_t][args1,arg2,arg3...] to be sent by cgroup/skb 5922 // program. The [...] part of the event can't use existing per-cpu submit 5923 // buffer helpers because the time in between this kprobe fires and the 5924 // cgroup/skb program runs might be suffer a preemption. 5925 5926 net_event_context_t neteventctx = {0}; // to be sent by cgroup/skb program 5927 event_context_t *eventctx = &neteventctx.eventctx; 5928 5929 #pragma clang diagnostic pop 5930 5931 // copy orig task ctx (from the netctx) to event ctx and build the rest 5932 __builtin_memcpy(&eventctx->task, &netctx->taskctx, sizeof(task_context_t)); 5933 eventctx->ts = p.event->context.ts; // copy timestamp from current ctx 5934 neteventctx.argnum = 1; // 1 argument (add more if needed) 5935 eventctx->eventid = NET_PACKET_IP; // will be changed in skb program 5936 eventctx->stack_id = 0; // no stack trace 5937 eventctx->processor_id = p.event->context.processor_id; // copy from current ctx 5938 eventctx->matched_policies = netctx->matched_policies; // pick matched_policies from net ctx 5939 eventctx->syscall = NO_SYSCALL; // ingress has no orig syscall 5940 if (type == BPF_CGROUP_INET_EGRESS) 5941 eventctx->syscall = netctx->syscall; // egress does have an orig syscall 5942 5943 // 5944 // SKB PROGRAM CONTEXT INDEXER (cgrpctxmap key) 5945 // 5946 5947 u32 l3_size = 0; 5948 nethdrs hdrs = {0}, *nethdrs = &hdrs; 5949 5950 // inform userland about protocol family (for correct L3 header parsing)... 5951 switch (family) { 5952 case PF_INET: 5953 eventctx->retval |= family_ipv4; 5954 l3_size = get_type_size(struct iphdr); 5955 break; 5956 case PF_INET6: 5957 eventctx->retval |= family_ipv6; 5958 l3_size = get_type_size(struct ipv6hdr); 5959 break; 5960 default: 5961 return 1; 5962 } 5963 5964 // ... and packet direction(ingress/egress) ... 5965 eventctx->retval |= packet_dir_flag; 5966 // ... through event ctx ret val. 5967 5968 // Read packet headers from the skb. 5969 void *data_ptr = BPF_CORE_READ(skb, head) + BPF_CORE_READ(skb, network_header); 5970 bpf_core_read(nethdrs, l3_size, data_ptr); 5971 5972 // Prepare the inter-eBPF-program indexer. 5973 indexer_t indexer = {0}; 5974 indexer.ts = BPF_CORE_READ(skb, tstamp); 5975 5976 u8 proto = 0; 5977 5978 // Parse the packet layer 3 headers. 
5979 switch (family) { 5980 case PF_INET: 5981 if (nethdrs->iphdrs.iphdr.version != 4) // IPv4 5982 return 1; 5983 5984 if (nethdrs->iphdrs.iphdr.ihl > 5) { // re-read IP header if needed 5985 l3_size -= get_type_size(struct iphdr); 5986 l3_size += nethdrs->iphdrs.iphdr.ihl * 4; 5987 bpf_core_read(nethdrs, l3_size, data_ptr); 5988 } 5989 5990 proto = nethdrs->iphdrs.iphdr.protocol; 5991 switch (proto) { 5992 case IPPROTO_TCP: 5993 case IPPROTO_UDP: 5994 case IPPROTO_ICMP: 5995 break; 5996 default: 5997 return 1; // ignore other protocols 5998 } 5999 6000 // Update inter-eBPF-program indexer with IPv4 header items. 6001 indexer.ip_csum = nethdrs->iphdrs.iphdr.check; 6002 indexer.src.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.saddr; 6003 indexer.dst.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.daddr; 6004 break; 6005 6006 case PF_INET6: 6007 // TODO: dual-stack IP implementation unsupported for now 6008 // https://en.wikipedia.org/wiki/IPv6_transition_mechanism 6009 if (nethdrs->iphdrs.ipv6hdr.version != 6) // IPv6 6010 return 1; 6011 6012 proto = nethdrs->iphdrs.ipv6hdr.nexthdr; 6013 switch (proto) { 6014 case IPPROTO_TCP: 6015 case IPPROTO_UDP: 6016 case IPPROTO_ICMPV6: 6017 break; 6018 default: 6019 return 1; // ignore other protocols 6020 } 6021 6022 // Update inter-eBPF-program indexer with IPv6 header items. 6023 __builtin_memcpy(&indexer.src.in6_u, &nethdrs->iphdrs.ipv6hdr.saddr.in6_u, 4 * sizeof(u32)); 6024 __builtin_memcpy(&indexer.dst.in6_u, &nethdrs->iphdrs.ipv6hdr.daddr.in6_u, 4 * sizeof(u32)); 6025 break; 6026 6027 default: 6028 return 1; 6029 } 6030 6031 // 6032 // LINK CONTENT INDEXER TO EVENT CONTEXT 6033 // 6034 6035 neteventctx.bytes = 0; // event arg size: no payload by default (changed inside skb prog) 6036 6037 // TODO: log collisions 6038 bpf_map_update_elem(cgrpctxmap, &indexer, &neteventctx, BPF_NOEXIST); 6039 6040 return 0; 6041 } 6042 6043 // 6044 // SKB eBPF programs 6045 // 6046 6047 statfunc u32 cgroup_skb_generic(struct __sk_buff *ctx, void *cgrpctxmap) 6048 { 6049 // IMPORTANT: runs for EVERY packet of tasks belonging to root cgroup 6050 6051 switch (ctx->family) { 6052 case PF_INET: 6053 case PF_INET6: 6054 break; 6055 default: 6056 return 1; // PF_INET and PF_INET6 only 6057 } 6058 6059 // HANDLE SOCKET FAMILY 6060 6061 struct bpf_sock *sk = ctx->sk; 6062 if (!sk) 6063 return 1; 6064 6065 sk = bpf_sk_fullsock(sk); 6066 if (!sk) 6067 return 1; 6068 6069 nethdrs hdrs = {0}, *nethdrs = &hdrs; 6070 6071 void *dest; 6072 6073 u32 size = 0; 6074 u32 family = ctx->family; 6075 6076 switch (family) { 6077 case PF_INET: 6078 dest = &nethdrs->iphdrs.iphdr; 6079 size = get_type_size(struct iphdr); 6080 break; 6081 case PF_INET6: 6082 dest = &nethdrs->iphdrs.ipv6hdr; 6083 size = get_type_size(struct ipv6hdr); 6084 break; 6085 default: 6086 return 1; // verifier 6087 } 6088 6089 // load layer 3 headers (for cgrpctxmap key/indexer) 6090 6091 if (bpf_skb_load_bytes_relative(ctx, 0, dest, size, 1)) 6092 return 1; 6093 6094 // 6095 // IGNORE UNSUPPORTED PROTOCOLS, CREATE INDEXER TO OBTAIN EVENT 6096 // 6097 6098 indexer_t indexer = {0}; 6099 indexer.ts = ctx->tstamp; 6100 6101 u32 ihl = 0; 6102 switch (family) { 6103 case PF_INET: 6104 if (nethdrs->iphdrs.iphdr.version != 4) // IPv4 6105 return 1; 6106 6107 ihl = nethdrs->iphdrs.iphdr.ihl; 6108 if (ihl > 5) { // re-read IPv4 header if needed 6109 size -= get_type_size(struct iphdr); 6110 size += ihl * 4; 6111 bpf_skb_load_bytes_relative(ctx, 0, dest, size, 1); 6112 } 6113 6114 switch (nethdrs->iphdrs.iphdr.protocol) { 6115 case 
IPPROTO_TCP: 6116 case IPPROTO_UDP: 6117 case IPPROTO_ICMP: 6118 break; 6119 default: 6120 return 1; // unsupported proto 6121 } 6122 6123 // add IPv4 header items to indexer 6124 indexer.ip_csum = nethdrs->iphdrs.iphdr.check; 6125 indexer.src.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.saddr; 6126 indexer.dst.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.daddr; 6127 break; 6128 6129 case PF_INET6: 6130 // TODO: dual-stack IP implementation unsupported for now 6131 // https://en.wikipedia.org/wiki/IPv6_transition_mechanism 6132 if (nethdrs->iphdrs.ipv6hdr.version != 6) // IPv6 6133 return 1; 6134 6135 switch (nethdrs->iphdrs.ipv6hdr.nexthdr) { 6136 case IPPROTO_TCP: 6137 case IPPROTO_UDP: 6138 case IPPROTO_ICMPV6: 6139 break; 6140 default: 6141 return 1; // unsupported proto 6142 } 6143 6144 // add IPv6 header items to indexer 6145 __builtin_memcpy(&indexer.src.in6_u, &nethdrs->iphdrs.ipv6hdr.saddr.in6_u, 4 * sizeof(u32)); 6146 __builtin_memcpy(&indexer.dst.in6_u, &nethdrs->iphdrs.ipv6hdr.daddr.in6_u, 4 * sizeof(u32)); 6147 break; 6148 6149 default: 6150 return 1; // verifier 6151 } 6152 6153 net_event_context_t *neteventctx; 6154 neteventctx = bpf_map_lookup_elem(cgrpctxmap, &indexer); // obtain event context 6155 if (!neteventctx) { 6156 // 1. kthreads receiving ICMP and ICMPv6 (e.g dest unreach) 6157 // 2. tasks not being traced 6158 // 3. unknown (yet) sockets (need egress packet to link task and inode) 6159 // ... 6160 return 1; 6161 } 6162 6163 // Skip if cgroup is muted. 6164 u64 cgroup_id = neteventctx->eventctx.task.cgroup_id; 6165 if (bpf_map_lookup_elem(&ignored_cgroups_map, &cgroup_id)) { 6166 return 1; 6167 } 6168 6169 neteventctx->md.header_size = size; // add header size to offset 6170 6171 u32 ret = CGROUP_SKB_HANDLE(proto); 6172 6173 bpf_map_delete_elem(cgrpctxmap, &indexer); // cleanup 6174 6175 return ret; // important for network blocking 6176 } 6177 6178 SEC("cgroup_skb/ingress") 6179 int cgroup_skb_ingress(struct __sk_buff *ctx) 6180 { 6181 return cgroup_skb_generic(ctx, &cgrpctxmap_in); 6182 } 6183 6184 SEC("cgroup_skb/egress") 6185 int cgroup_skb_egress(struct __sk_buff *ctx) 6186 { 6187 return cgroup_skb_generic(ctx, &cgrpctxmap_eg); 6188 } 6189 6190 // 6191 // Network Protocol Events Logic 6192 // 6193 6194 // 6195 // SUPPORTED L3 NETWORK PROTOCOLS (ip, ipv6) HANDLERS 6196 // 6197 6198 CGROUP_SKB_HANDLE_FUNCTION(proto) 6199 { 6200 void *dest = NULL; 6201 u32 prev_hdr_size = neteventctx->md.header_size; 6202 u32 size = 0; 6203 u8 next_proto = 0; 6204 6205 // NOTE: might block IP and IPv6 here if needed (return 0) 6206 6207 switch (ctx->family) { 6208 case PF_INET: 6209 if (nethdrs->iphdrs.iphdr.version != 4) // IPv4 6210 return 1; 6211 6212 next_proto = nethdrs->iphdrs.iphdr.protocol; 6213 switch (next_proto) { 6214 case IPPROTO_TCP: 6215 dest = &nethdrs->protohdrs.tcphdr; 6216 size = get_type_size(struct tcphdr); 6217 break; 6218 case IPPROTO_UDP: 6219 dest = &nethdrs->protohdrs.udphdr; 6220 size = get_type_size(struct udphdr); 6221 break; 6222 case IPPROTO_ICMP: 6223 dest = &nethdrs->protohdrs.icmphdr; 6224 size = 0; // will be added later, last function 6225 break; 6226 default: 6227 return 1; // other protocols are not an error 6228 } 6229 6230 // Update the network flow map indexer with the packet headers. 
6231 neteventctx->md.flow.tuple.saddr.v4addr = nethdrs->iphdrs.iphdr.saddr; 6232 neteventctx->md.flow.tuple.daddr.v4addr = nethdrs->iphdrs.iphdr.daddr; 6233 neteventctx->md.flow.tuple.family = AF_INET; 6234 break; 6235 6236 case PF_INET6: 6237 // TODO: dual-stack IP implementation unsupported for now 6238 // https://en.wikipedia.org/wiki/IPv6_transition_mechanism 6239 if (nethdrs->iphdrs.ipv6hdr.version != 6) // IPv6 6240 return 1; 6241 6242 next_proto = nethdrs->iphdrs.ipv6hdr.nexthdr; 6243 switch (next_proto) { 6244 case IPPROTO_TCP: 6245 dest = &nethdrs->protohdrs.tcphdr; 6246 size = get_type_size(struct tcphdr); 6247 break; 6248 case IPPROTO_UDP: 6249 dest = &nethdrs->protohdrs.udphdr; 6250 size = get_type_size(struct udphdr); 6251 break; 6252 case IPPROTO_ICMPV6: 6253 dest = &nethdrs->protohdrs.icmp6hdr; 6254 size = 0; // will be added later, last function 6255 break; 6256 default: 6257 return 1; // other protocols are not an error 6258 } 6259 6260 // Update the network flow map indexer with the packet headers. 6261 __builtin_memcpy(&neteventctx->md.flow.tuple.saddr.v6addr, &nethdrs->iphdrs.ipv6hdr.saddr.in6_u, 4 * sizeof(u32)); 6262 __builtin_memcpy(&neteventctx->md.flow.tuple.daddr.v6addr, &nethdrs->iphdrs.ipv6hdr.daddr.in6_u, 4 * sizeof(u32)); 6263 break; 6264 6265 default: 6266 return 1; // verifier needs 6267 } 6268 6269 // Update the network flow map indexer with the packet headers. 6270 neteventctx->md.flow.proto = next_proto; 6271 6272 if (!dest) 6273 return 1; // satisfy verifier for clang-12 generated binaries 6274 6275 // fastpath: submit the IP base event 6276 6277 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_IP)) 6278 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_IP, HEADERS); 6279 6280 // fastpath: capture all packets if filtered pcap-option is not set 6281 6282 u32 zero = 0; 6283 netconfig_entry_t *nc = bpf_map_lookup_elem(&netconfig_map, &zero); 6284 if (nc == NULL) 6285 return 0; 6286 6287 if (!(nc->capture_options & NET_CAP_OPT_FILTERED)) 6288 cgroup_skb_capture(); // will avoid extra lookups further if not needed 6289 6290 // Update the network event context with payload size. 6291 neteventctx->md.header_size += size; 6292 6293 // Load the next protocol header. 6294 if (size) { 6295 if (bpf_skb_load_bytes_relative(ctx, prev_hdr_size, dest, size, BPF_HDR_START_NET)) 6296 return 1; 6297 } 6298 6299 // Call the next protocol handler. 6300 switch (next_proto) { 6301 case IPPROTO_TCP: 6302 return CGROUP_SKB_HANDLE(proto_tcp); 6303 case IPPROTO_UDP: 6304 return CGROUP_SKB_HANDLE(proto_udp); 6305 case IPPROTO_ICMP: 6306 return CGROUP_SKB_HANDLE(proto_icmp); 6307 case IPPROTO_ICMPV6: 6308 return CGROUP_SKB_HANDLE(proto_icmpv6); 6309 default: 6310 return 1; // verifier needs 6311 } 6312 6313 // TODO: If cmdline is tracing net_packet_ipv6 only, then the ipv4 packets 6314 // shouldn't be added to the pcap file. Filters will have to be 6315 // applied to the capture pipeline to obey derived events only 6316 // filters + capture. 
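// ---------------------------------------------------------------------------
// Illustrative aside: both the kprobe and the skb handlers above re-read the
// IPv4 header whenever the IHL field indicates that IP options are present
// (ihl > 5). The size arithmetic is simply "IHL counts 32-bit words", as
// sketched in this standalone helper (not part of the program):
#include <stdint.h>

// Returns the real IPv4 header length in bytes for a given IHL field value.
// 5 is the plain 20-byte header; larger values mean options follow it.
static uint32_t ipv4_header_bytes(uint8_t ihl)
{
    return (uint32_t) ihl * 4; // 4 bytes per 32-bit word: 5 -> 20, 15 -> 60
}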
6317 6318 // capture IPv4/IPv6 packets (filtered) 6319 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP)) 6320 cgroup_skb_capture(); 6321 6322 return 1; 6323 } 6324 6325 // 6326 // GUESS L7 NETWORK PROTOCOLS (http, dns, etc) 6327 // 6328 6329 // when guessing by src/dst ports, declare at network.h 6330 6331 // when guessing through l7 layer, here 6332 6333 statfunc int net_l7_is_http(struct __sk_buff *skb, u32 l7_off) 6334 { 6335 char http_min_str[http_min_len]; 6336 __builtin_memset((void *) &http_min_str, 0, sizeof(char) * http_min_len); 6337 6338 // load first http_min_len bytes from layer 7 in packet. 6339 if (bpf_skb_load_bytes(skb, l7_off, http_min_str, http_min_len) < 0) { 6340 return 0; // failed loading data into http_min_str - return. 6341 } 6342 6343 // check if HTTP response 6344 if (has_prefix("HTTP/", http_min_str, 6)) { 6345 return proto_http_resp; 6346 } 6347 6348 // check if HTTP request 6349 if (has_prefix("GET ", http_min_str, 5) || 6350 has_prefix("POST ", http_min_str, 6) || 6351 has_prefix("PUT ", http_min_str, 5) || 6352 has_prefix("DELETE ", http_min_str, 8) || 6353 has_prefix("HEAD ", http_min_str, 6)) { 6354 return proto_http_req; 6355 } 6356 6357 return 0; 6358 } 6359 6360 // clang-format on 6361 6362 #define SOCKS5_VERSION(buf) buf[0] 6363 #define SOCKS5_NUM_METHODS(buf) buf[1] 6364 #define SOCKS5_CMD(buf) buf[1] 6365 #define SOCKS5_RESERVED(buf) buf[2] 6366 #define SOCKS5_ADDR_TYPE(buf) buf[3] 6367 6368 // see https://datatracker.ietf.org/doc/html/rfc1928 for the definition of the socks5 protocol 6369 statfunc bool net_l7_is_socks5(struct __sk_buff *skb, u32 l7_off) 6370 { 6371 // we treat all messages from the default socks ports as potential sock messages and try to 6372 // parse them in userspace. 6373 if (skb->remote_port == TCP_PORT_SOCKS5) { 6374 return true; 6375 } 6376 6377 if (skb->local_port == TCP_PORT_SOCKS5) { 6378 return true; 6379 } 6380 6381 char buf[socks5_min_len]; 6382 __builtin_memset(&buf, 0, sizeof(buf)); 6383 6384 if (skb->len < l7_off) { 6385 return false; 6386 } 6387 6388 u32 payload_len = skb->len - l7_off; 6389 u32 read_len = payload_len; 6390 // inline bounds check to force compiler to use the register of size 6391 asm volatile("if %[size] < %[max_size] goto +1;\n" 6392 "%[size] = %[max_size];\n" 6393 : 6394 : [size] "r"(read_len), [max_size] "i"(socks5_min_len)); 6395 6396 // make the verifier happy to ensure that we read more than a single byte 6397 // the test is for 2, since we anyway exect at least 2 bytes to check for socks5 6398 asm goto("if %[size] < 2 goto %l[out]" ::[size] "r"(read_len)::out); 6399 6400 if (read_len < 2) { 6401 return false; 6402 } 6403 6404 // load first socks5_min_len bytes from layer 7 in packet. 6405 if (bpf_skb_load_bytes(skb, l7_off, buf, read_len) < 0) { 6406 return false; // failed loading data into http_min_str - return. 6407 } 6408 6409 if (SOCKS5_VERSION(buf) != 5) { 6410 return false; // all socks5 messages begin with the version (which is 5 for socks5) 6411 } 6412 6413 // this might be a bit of a leap of faith here, since the first server response only selects the 6414 // method used for auth. This requires more massaging in userspace. 6415 if (payload_len == 2) { 6416 return true; 6417 } 6418 6419 // the client starts by sending a message containing the number of methods for auth in the 6420 // second byte. 
Each of these methods are then listed in the following bytes, meaning that 6421 // if our message is the length of the number of messages + 2 (since starting after the second 6422 // byte), we should have ourselfs a client request. 6423 if (payload_len == (u32) SOCKS5_NUM_METHODS(buf) + 2) { 6424 return true; 6425 } 6426 6427 // we now access fields above the two 6428 if (read_len < socks5_min_len) { 6429 return false; 6430 } 6431 6432 // both request and response have the 3rd byte reserved and it needs to be set to 0x00 6433 if (SOCKS5_RESERVED(buf) != 0x00) { 6434 return false; 6435 } 6436 6437 if (SOCKS5_ADDR_TYPE(buf) == 0x01 // IPv4 address 6438 || SOCKS5_ADDR_TYPE(buf) == 0x03 // domain name 6439 || SOCKS5_ADDR_TYPE(buf) == 0x04) { // IPv6 address 6440 return true; 6441 } 6442 6443 out: 6444 return false; 6445 } 6446 // clang-format off 6447 6448 // 6449 // SUPPORTED L4 NETWORK PROTOCOL (tcp, udp, icmp) HANDLERS 6450 // 6451 6452 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp) 6453 { 6454 // check flag for dynamic header size (TCP: data offset flag) 6455 6456 if (nethdrs->protohdrs.tcphdr.doff > 5) { // offset flag set 6457 u32 doff = nethdrs->protohdrs.tcphdr.doff * (32 / 8); 6458 neteventctx->md.header_size -= get_type_size(struct tcphdr); 6459 neteventctx->md.header_size += doff; 6460 } 6461 6462 // Pick src/dst ports. 6463 u16 srcport = bpf_ntohs(nethdrs->protohdrs.tcphdr.source); 6464 u16 dstport = bpf_ntohs(nethdrs->protohdrs.tcphdr.dest); 6465 6466 // Update the network flow map indexer with the packet headers. 6467 neteventctx->md.flow.tuple.sport = srcport; 6468 neteventctx->md.flow.tuple.dport = dstport; 6469 6470 if (should_submit_flow_event(neteventctx)) { 6471 // Check if TCP flow needs to be submitted (only headers). 6472 bool is_rst = nethdrs->protohdrs.tcphdr.rst; 6473 bool is_syn = nethdrs->protohdrs.tcphdr.syn; 6474 bool is_ack = nethdrs->protohdrs.tcphdr.ack; 6475 bool is_fin = nethdrs->protohdrs.tcphdr.fin; 6476 6477 // Has TCP flow started ? 6478 if ((is_syn & is_ack)) 6479 cgroup_skb_handle_flow(ctx, neteventctx, NET_FLOW_BASE, HEADERS, flow_tcp_begin); 6480 6481 if (!is_syn && !is_fin && !is_rst) { 6482 cgroup_skb_handle_flow(ctx, neteventctx, NET_FLOW_BASE, HEADERS, flow_tcp_sample); 6483 } 6484 6485 // Has TCP flow ended ? 6486 if (is_fin || is_rst) 6487 cgroup_skb_handle_flow(ctx, neteventctx, NET_FLOW_BASE, HEADERS, flow_tcp_end); 6488 } 6489 6490 // Submit TCP base event if needed (only headers) 6491 6492 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_TCP)) 6493 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_TCP, HEADERS); 6494 6495 // Fastpath: return if no other L7 network events. 6496 6497 if (!should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS) && 6498 !should_submit_net_event(neteventctx, SUB_NET_PACKET_HTTP) && 6499 !should_submit_net_event(neteventctx, SUB_NET_PACKET_SOCKS5)) 6500 goto capture; 6501 6502 // Guess layer 7 protocols by src/dst ports ... 6503 6504 switch (srcport < dstport ? srcport : dstport) { 6505 case TCP_PORT_DNS: 6506 return CGROUP_SKB_HANDLE(proto_tcp_dns); 6507 case TCP_PORT_SOCKS5: 6508 return CGROUP_SKB_HANDLE(proto_tcp_socks5); 6509 } 6510 6511 // ... and by analyzing payload. 
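// ---------------------------------------------------------------------------
// Illustrative aside: the "analyze payload" step invoked right after this point
// boils down to cheap prefix/byte heuristics. A standalone, simplified sketch
// of the same checks net_l7_is_http() and net_l7_is_socks5() perform above
// (skb loading and verifier tricks omitted; function names are hypothetical):
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static bool looks_like_http(const char *buf, size_t len)
{
    static const char *prefixes[] = { "HTTP/", "GET ", "POST ", "PUT ", "DELETE ", "HEAD " };
    for (size_t i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
        size_t plen = strlen(prefixes[i]);
        if (len >= plen && memcmp(buf, prefixes[i], plen) == 0)
            return true;
    }
    return false;
}

static bool looks_like_socks5_greeting(const uint8_t *buf, size_t len)
{
    if (len < 2 || buf[0] != 5)           // VER byte must be 0x05
        return false;
    // Client greeting: VER, NMETHODS, then NMETHODS method bytes.
    return len == (size_t) buf[1] + 2;
}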
6512 6513 int http_proto = net_l7_is_http(ctx, neteventctx->md.header_size); 6514 if (http_proto) { 6515 neteventctx->eventctx.retval |= http_proto; 6516 return CGROUP_SKB_HANDLE(proto_tcp_http); 6517 } 6518 6519 int socks5_proto = net_l7_is_socks5(ctx, neteventctx->md.header_size); 6520 if (socks5_proto) { 6521 return CGROUP_SKB_HANDLE(proto_tcp_socks5); 6522 } 6523 6524 // ... continue with net_l7_is_protocol_xxx 6525 6526 capture: 6527 // Capture IP or TCP packets (filtered) 6528 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6529 should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP)) { 6530 cgroup_skb_capture(); 6531 } 6532 6533 return 1; // NOTE: might block TCP here if needed (return 0) 6534 } 6535 6536 CGROUP_SKB_HANDLE_FUNCTION(proto_udp) 6537 { 6538 // Submit UDP base event if needed (only headers). 6539 6540 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_UDP)) 6541 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_UDP, HEADERS); 6542 6543 // Fastpath: return if no other L7 network events. 6544 6545 if (!should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS) && 6546 !should_submit_net_event(neteventctx, SUB_NET_PACKET_HTTP)) 6547 goto capture; 6548 6549 // Guess layer 7 protocols ... 6550 6551 u16 source = bpf_ntohs(nethdrs->protohdrs.udphdr.source); 6552 u16 dest = bpf_ntohs(nethdrs->protohdrs.udphdr.dest); 6553 6554 // ... by src/dst ports 6555 6556 switch (source < dest ? source : dest) { 6557 case UDP_PORT_DNS: 6558 return CGROUP_SKB_HANDLE(proto_udp_dns); 6559 } 6560 6561 // ... by analyzing payload 6562 // ... 6563 6564 // ... continue with net_l7_is_protocol_xxx 6565 6566 capture: 6567 // Capture IP or UDP packets (filtered). 6568 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6569 should_capture_net_event(neteventctx, SUB_NET_PACKET_UDP)) { 6570 cgroup_skb_capture(); 6571 } 6572 6573 return 1; // NOTE: might block UDP here if needed (return 0) 6574 } 6575 6576 CGROUP_SKB_HANDLE_FUNCTION(proto_icmp) 6577 { 6578 // submit ICMP base event if needed (full packet) 6579 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_ICMP)) 6580 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_ICMP, FULL); 6581 6582 // capture ip or icmp packets (filtered) 6583 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6584 should_capture_net_event(neteventctx, SUB_NET_PACKET_ICMP)) { 6585 neteventctx->md.header_size = ctx->len; // full ICMP header 6586 cgroup_skb_capture(); 6587 } 6588 6589 return 1; // NOTE: might block ICMP here if needed (return 0) 6590 } 6591 6592 CGROUP_SKB_HANDLE_FUNCTION(proto_icmpv6) 6593 { 6594 // submit ICMPv6 base event if needed (full packet) 6595 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_ICMPV6)) 6596 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_ICMPV6, FULL); 6597 6598 // capture ip or icmpv6 packets (filtered) 6599 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6600 should_capture_net_event(neteventctx, SUB_NET_PACKET_ICMPV6)) { 6601 neteventctx->md.header_size = ctx->len; // full ICMPv6 header 6602 cgroup_skb_capture(); 6603 } 6604 6605 return 1; // NOTE: might block ICMPv6 here if needed (return 0) 6606 } 6607 6608 // 6609 // SUPPORTED L7 NETWORK PROTOCOL (dns) HANDLERS 6610 // 6611 6612 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_dns) 6613 { 6614 // submit DNS base event if needed (full packet) 6615 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS)) 6616 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_DNS, FULL); 6617 6618 // capture DNS-TCP, 
TCP or IP packets (filtered) 6619 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6620 should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP) || 6621 should_capture_net_event(neteventctx, SUB_NET_PACKET_DNS)) { 6622 neteventctx->md.header_size = ctx->len; // full dns header 6623 cgroup_skb_capture(); 6624 } 6625 6626 return 1; // NOTE: might block DNS here if needed (return 0) 6627 } 6628 6629 CGROUP_SKB_HANDLE_FUNCTION(proto_udp_dns) 6630 { 6631 // submit DNS base event if needed (full packet) 6632 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS)) 6633 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_DNS, FULL); 6634 6635 // capture DNS-UDP, UDP or IP packets (filtered) 6636 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6637 should_capture_net_event(neteventctx, SUB_NET_PACKET_UDP) || 6638 should_capture_net_event(neteventctx, SUB_NET_PACKET_DNS)) { 6639 neteventctx->md.header_size = ctx->len; // full dns header 6640 cgroup_skb_capture(); 6641 } 6642 6643 return 1; // NOTE: might block DNS here if needed (return 0) 6644 } 6645 6646 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_http) 6647 { 6648 // submit HTTP base event if needed (full packet) 6649 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_HTTP)) 6650 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_HTTP, FULL); 6651 6652 // capture HTTP-TCP, TCP or IP packets (filtered) 6653 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6654 should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP) || 6655 should_capture_net_event(neteventctx, SUB_NET_PACKET_HTTP)) { 6656 cgroup_skb_capture(); // http header is dyn, do not change header_size 6657 } 6658 6659 return 1; // NOTE: might block HTTP here if needed (return 0) 6660 } 6661 6662 CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_socks5) 6663 { 6664 u32 payload_len = ctx->len - neteventctx->md.header_size; 6665 6666 // submit SOCKS5 base event if needed (full packet) 6667 // we only care about packets that have a payload though 6668 if (should_submit_net_event(neteventctx, SUB_NET_PACKET_SOCKS5) && payload_len > 0) { 6669 cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_SOCKS5, FULL); 6670 } 6671 6672 // capture SOCKS5-TCP, TCP or IP packets (filtered) 6673 if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) || 6674 should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP) || 6675 should_capture_net_event(neteventctx, SUB_NET_PACKET_HTTP)) { 6676 neteventctx->md.header_size = ctx->len; // full socks5 packet 6677 cgroup_skb_capture(); 6678 } 6679 6680 return 1; // NOTE: might block SOCKS5 here if needed (return 0) 6681 } 6682 6683 // clang-format on 6684 6685 // 6686 // Control Plane Programs 6687 // 6688 // Control Plane programs are almost duplicate programs of select events which we send as direct 6689 // signals to tracee in a separate buffer. This is done to mitigate the consenquences of losing 6690 // these events in the main perf buffer. 
6691 // 6692 6693 // Containers Lifecyle 6694 6695 SEC("raw_tracepoint/cgroup_mkdir_signal") 6696 int cgroup_mkdir_signal(struct bpf_raw_tracepoint_args *ctx) 6697 { 6698 u32 zero = 0; 6699 config_entry_t *cfg = bpf_map_lookup_elem(&config_map, &zero); 6700 if (unlikely(cfg == NULL)) 6701 return 0; 6702 controlplane_signal_t *signal = init_controlplane_signal(); 6703 if (unlikely(signal == NULL)) 6704 return 0; 6705 6706 struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0]; 6707 char *path = (char *) ctx->args[1]; 6708 6709 u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp); 6710 u64 cgroup_id = get_cgroup_id(dst_cgrp); 6711 u32 cgroup_id_lsb = cgroup_id; 6712 6713 bool should_update = true; 6714 if ((cfg->options & OPT_CGROUP_V1) && (cfg->cgroup_v1_hid != hierarchy_id)) 6715 should_update = false; 6716 6717 if (should_update) { 6718 // Assume this is a new container. If not, userspace code will delete this entry 6719 u8 state = CONTAINER_CREATED; 6720 bpf_map_update_elem(&containers_map, &cgroup_id_lsb, &state, BPF_ANY); 6721 } 6722 6723 save_to_submit_buf(&signal->args_buf, &cgroup_id, sizeof(u64), 0); 6724 save_str_to_buf(&signal->args_buf, path, 1); 6725 save_to_submit_buf(&signal->args_buf, &hierarchy_id, sizeof(u32), 2); 6726 signal_perf_submit(ctx, signal, SIGNAL_CGROUP_MKDIR); 6727 6728 return 0; 6729 } 6730 6731 SEC("raw_tracepoint/cgroup_rmdir_signal") 6732 int cgroup_rmdir_signal(struct bpf_raw_tracepoint_args *ctx) 6733 { 6734 u32 zero = 0; 6735 config_entry_t *cfg = bpf_map_lookup_elem(&config_map, &zero); 6736 if (unlikely(cfg == NULL)) 6737 return 0; 6738 controlplane_signal_t *signal = init_controlplane_signal(); 6739 if (unlikely(signal == NULL)) 6740 return 0; 6741 6742 struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0]; 6743 char *path = (char *) ctx->args[1]; 6744 6745 u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp); 6746 u64 cgroup_id = get_cgroup_id(dst_cgrp); 6747 u32 cgroup_id_lsb = cgroup_id; 6748 6749 bool should_update = true; 6750 if ((cfg->options & OPT_CGROUP_V1) && (cfg->cgroup_v1_hid != hierarchy_id)) 6751 should_update = false; 6752 6753 if (should_update) 6754 bpf_map_delete_elem(&containers_map, &cgroup_id_lsb); 6755 6756 save_to_submit_buf(&signal->args_buf, &cgroup_id, sizeof(u64), 0); 6757 save_str_to_buf(&signal->args_buf, path, 1); 6758 save_to_submit_buf(&signal->args_buf, &hierarchy_id, sizeof(u32), 2); 6759 signal_perf_submit(ctx, signal, SIGNAL_CGROUP_RMDIR); 6760 6761 return 0; 6762 } 6763 6764 // Processes Lifecycle 6765 6766 // NOTE: sched_process_fork is called by kernel_clone(), which is executed during 6767 // clone() calls as well, not only fork(). This means that sched_process_fork() 6768 // is also able to pick the creation of LWPs through clone(). 6769 6770 SEC("raw_tracepoint/sched_process_fork") 6771 int sched_process_fork_signal(struct bpf_raw_tracepoint_args *ctx) 6772 { 6773 controlplane_signal_t *signal = init_controlplane_signal(); 6774 if (unlikely(signal == NULL)) 6775 return 0; 6776 6777 struct task_struct *parent = (struct task_struct *) ctx->args[0]; 6778 struct task_struct *child = (struct task_struct *) ctx->args[1]; 6779 struct task_struct *leader = get_leader_task(child); 6780 struct task_struct *up_parent = get_leader_task(get_parent_task(leader)); 6781 6782 // In the Linux kernel: 6783 // 6784 // Every task (a process or a thread) is represented by a `task_struct`: 6785 // 6786 // - `pid`: Inside the `task_struct`, there's a field called `pid`. 
This is a unique identifier 6787 // for every task, which can be thought of as the thread ID (TID) from a user space 6788 // perspective. Every task, whether it's the main thread of a process or an additional thread, 6789 // has a unique `pid`. 6790 // 6791 // - `tgid` (Thread Group ID): This field in the `task_struct` is used to group threads from the 6792 // same process. For the main thread of a process, the `tgid` is the same as its `pid`. For 6793 // other threads created by that process, the `tgid` matches the `pid` of the main thread. 6794 // 6795 // In userspace: 6796 // 6797 // - `getpid()` returns the TGID, effectively the traditional process ID. 6798 // - `gettid()` returns the PID (from the `task_struct`), effectively the thread ID. 6799 // 6800 // This design in the Linux kernel leads to a unified handling of processes and threads. In the 6801 // kernel's view, every thread is a task with potentially shared resources, but each has a 6802 // unique PID. In user space, the distinction is made where processes have a unique PID, and 6803 // threads within those processes have unique TIDs. 6804 6805 // Summary: 6806 // userland pid = kernel tgid 6807 // userland tgid = kernel pid 6808 6809 // The event timestamp, so process tree info can be changelog'ed. 6810 u64 timestamp = bpf_ktime_get_ns(); 6811 save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0); 6812 6813 // Parent information. 6814 u64 parent_start_time = get_task_start_time(parent); 6815 int parent_pid = get_task_host_tgid(parent); 6816 int parent_tid = get_task_host_pid(parent); 6817 int parent_ns_pid = get_task_ns_tgid(parent); 6818 int parent_ns_tid = get_task_ns_pid(parent); 6819 6820 // Child information. 6821 u64 child_start_time = get_task_start_time(child); 6822 int child_pid = get_task_host_tgid(child); 6823 int child_tid = get_task_host_pid(child); 6824 int child_ns_pid = get_task_ns_tgid(child); 6825 int child_ns_tid = get_task_ns_pid(child); 6826 6827 // Up Parent information: Go up in hierarchy until parent is process. 6828 u64 up_parent_start_time = get_task_start_time(up_parent); 6829 int up_parent_pid = get_task_host_tgid(up_parent); 6830 int up_parent_tid = get_task_host_pid(up_parent); 6831 int up_parent_ns_pid = get_task_ns_tgid(up_parent); 6832 int up_parent_ns_tid = get_task_ns_pid(up_parent); 6833 6834 // Leader information. 6835 u64 leader_start_time = get_task_start_time(leader); 6836 int leader_pid = get_task_host_tgid(leader); 6837 int leader_tid = get_task_host_pid(leader); 6838 int leader_ns_pid = get_task_ns_tgid(leader); 6839 int leader_ns_tid = get_task_ns_pid(leader); 6840 6841 // Parent (might be a thread or a process). 6842 save_to_submit_buf(&signal->args_buf, (void *) &parent_tid, sizeof(int), 1); 6843 save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_tid, sizeof(int), 2); 6844 save_to_submit_buf(&signal->args_buf, (void *) &parent_pid, sizeof(int), 3); 6845 save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_pid, sizeof(int), 4); 6846 save_to_submit_buf(&signal->args_buf, (void *) &parent_start_time, sizeof(u64), 5); 6847 6848 // Child (might be a thread or a process; the sched_process_fork tracepoint is also triggered by clone()).
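// Illustration (example values only, not taken from this codebase): for a process whose main
// thread has kernel pid/tgid 1234, a second thread created via pthread_create() gets kernel
// pid 1235 and tgid 1234. In userspace, getpid() returns 1234 in both threads, while gettid()
// returns 1234 and 1235 respectively. That is why both the host "tid" (kernel pid) and the
// host "pid" (kernel tgid) are collected for parent, child, leader and up-parent here.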
6849 save_to_submit_buf(&signal->args_buf, (void *) &child_tid, sizeof(int), 6); 6850 save_to_submit_buf(&signal->args_buf, (void *) &child_ns_tid, sizeof(int), 7); 6851 save_to_submit_buf(&signal->args_buf, (void *) &child_pid, sizeof(int), 8); 6852 save_to_submit_buf(&signal->args_buf, (void *) &child_ns_pid, sizeof(int), 9); 6853 save_to_submit_buf(&signal->args_buf, (void *) &child_start_time, sizeof(u64), 10); 6854 6855 // Up Parent: always a real process (might be the same as Parent if it is a real process). 6856 save_to_submit_buf(&signal->args_buf, (void *) &up_parent_tid, sizeof(int), 11); 6857 save_to_submit_buf(&signal->args_buf, (void *) &up_parent_ns_tid, sizeof(int), 12); 6858 save_to_submit_buf(&signal->args_buf, (void *) &up_parent_pid, sizeof(int), 13); 6859 save_to_submit_buf(&signal->args_buf, (void *) &up_parent_ns_pid, sizeof(int), 14); 6860 save_to_submit_buf(&signal->args_buf, (void *) &up_parent_start_time, sizeof(u64), 15); 6861 6862 // Leader: always a real process (might be the same as the Child if child is a real process). 6863 save_to_submit_buf(&signal->args_buf, (void *) &leader_tid, sizeof(int), 16); 6864 save_to_submit_buf(&signal->args_buf, (void *) &leader_ns_tid, sizeof(int), 17); 6865 save_to_submit_buf(&signal->args_buf, (void *) &leader_pid, sizeof(int), 18); 6866 save_to_submit_buf(&signal->args_buf, (void *) &leader_ns_pid, sizeof(int), 19); 6867 save_to_submit_buf(&signal->args_buf, (void *) &leader_start_time, sizeof(u64), 20); 6868 6869 signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_FORK); 6870 6871 return 0; 6872 } 6873 6874 // clang-format off 6875 6876 SEC("raw_tracepoint/sched_process_exec") 6877 int sched_process_exec_signal(struct bpf_raw_tracepoint_args *ctx) 6878 { 6879 controlplane_signal_t *signal = init_controlplane_signal(); 6880 if (unlikely(signal == NULL)) 6881 return 0; 6882 6883 // Hashes 6884 6885 struct task_struct *task = (struct task_struct *) ctx->args[0]; 6886 if (task == NULL) 6887 return -1; 6888 struct task_struct *leader = get_leader_task(task); 6889 struct task_struct *parent = get_leader_task(get_parent_task(leader)); 6890 6891 // The hash is always calculated with "task_struct->pid + start_time". 6892 u32 task_hash = hash_task_id(get_task_host_pid(task), get_task_start_time(task)); 6893 u32 parent_hash = hash_task_id(get_task_host_pid(parent), get_task_start_time(parent)); 6894 u32 leader_hash = hash_task_id(get_task_host_pid(leader), get_task_start_time(leader)); 6895 6896 // The event timestamp, so process tree info can be changelog'ed. 6897 u64 timestamp = bpf_ktime_get_ns(); 6898 save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0); 6899 6900 save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 1); 6901 save_to_submit_buf(&signal->args_buf, (void *) &parent_hash, sizeof(u32), 2); 6902 save_to_submit_buf(&signal->args_buf, (void *) &leader_hash, sizeof(u32), 3); 6903 6904 // Exec logic 6905 6906 struct linux_binprm *bprm = (struct linux_binprm *) ctx->args[2]; 6907 if (bprm == NULL) 6908 return -1; 6909 6910 // Pick the interpreter path from the proc_info map, which is set by the "load_elf_phdrs" kprobe program.
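// If this task has no proc_info entry yet (e.g. it started before tracing began), a fresh
// entry is initialized on the fly below; if even that fails, the event is dropped and a
// warning is logged.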
6911 u32 host_pid = get_task_host_tgid(task); 6912 proc_info_t *proc_info = bpf_map_lookup_elem(&proc_info_map, &host_pid); 6913 if (proc_info == NULL) { 6914 proc_info = init_proc_info(host_pid, 0); 6915 if (unlikely(proc_info == NULL)) { 6916 tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0); 6917 return 0; 6918 } 6919 } 6920 6921 struct file *file = get_file_ptr_from_bprm(bprm); 6922 void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path)); 6923 const char *filename = get_binprm_filename(bprm); 6924 dev_t s_dev = get_dev_from_file(file); 6925 unsigned long inode_nr = get_inode_nr_from_file(file); 6926 u64 ctime = get_ctime_nanosec_from_file(file); 6927 umode_t inode_mode = get_inode_mode_from_file(file); 6928 6929 save_str_to_buf(&signal->args_buf, (void *) filename, 4); // executable name 6930 save_str_to_buf(&signal->args_buf, file_path, 5); // executable path 6931 save_to_submit_buf(&signal->args_buf, &s_dev, sizeof(dev_t), 6); // device number 6932 save_to_submit_buf(&signal->args_buf, &inode_nr, sizeof(unsigned long), 7); // inode number 6933 save_to_submit_buf(&signal->args_buf, &ctime, sizeof(u64), 8); // creation time 6934 save_to_submit_buf(&signal->args_buf, &inode_mode, sizeof(umode_t), 9); // inode mode 6935 6936 // The proc_info interpreter field is set by "load_elf_phdrs" kprobe program. 6937 save_str_to_buf(&signal->args_buf, &proc_info->interpreter.pathname, 10); // interpreter path 6938 save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 11); // interpreter device number 6939 save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.inode, sizeof(u64), 12); // interpreter inode number 6940 save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.ctime, sizeof(u64), 13); // interpreter creation time 6941 6942 struct mm_struct *mm = get_mm_from_task(task); // bprm->mm is null here, but task->mm is not 6943 6944 unsigned long arg_start, arg_end; 6945 arg_start = get_arg_start_from_mm(mm); 6946 arg_end = get_arg_end_from_mm(mm); 6947 int argc = get_argc_from_bprm(bprm); 6948 6949 struct file *stdin_file = get_struct_file_from_fd(0); 6950 unsigned short stdin_type = get_inode_mode_from_file(stdin_file) & S_IFMT; 6951 void *stdin_path = get_path_str(__builtin_preserve_access_index(&stdin_file->f_path)); 6952 const char *interp = get_binprm_interp(bprm); 6953 6954 int invoked_from_kernel = 0; 6955 if (get_task_parent_flags(task) & PF_KTHREAD) 6956 invoked_from_kernel = 1; 6957 6958 save_args_str_arr_to_buf(&signal->args_buf, (void *) arg_start, (void *) arg_end, argc, 14); // argv 6959 save_str_to_buf(&signal->args_buf, (void *) interp, 15); // interp 6960 save_to_submit_buf(&signal->args_buf, &stdin_type, sizeof(unsigned short), 16); // stdin type 6961 save_str_to_buf(&signal->args_buf, stdin_path, 17); // stdin path 6962 save_to_submit_buf(&signal->args_buf, &invoked_from_kernel, sizeof(int), 18); // invoked from kernel ? 
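// All of the arguments saved above (indices 0-18) are flushed as a single
// SIGNAL_SCHED_PROCESS_EXEC record by the perf submit below.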
6963 6964 signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_EXEC); 6965 6966 return 0; 6967 } 6968 6969 // clang-format on 6970 6971 SEC("raw_tracepoint/sched_process_exit") 6972 int sched_process_exit_signal(struct bpf_raw_tracepoint_args *ctx) 6973 { 6974 controlplane_signal_t *signal = init_controlplane_signal(); 6975 if (unlikely(signal == NULL)) 6976 return 0; 6977 6978 // Hashes 6979 6980 struct task_struct *task = (struct task_struct *) bpf_get_current_task(); 6981 if (task == NULL) 6982 return -1; 6983 struct task_struct *leader = get_leader_task(task); 6984 struct task_struct *parent = get_leader_task(get_parent_task(leader)); 6985 6986 // The hash is always calculated with "task_struct->pid + start_time". 6987 u32 task_hash = hash_task_id(get_task_host_pid(task), get_task_start_time(task)); 6988 u32 parent_hash = hash_task_id(get_task_host_pid(parent), get_task_start_time(parent)); 6989 u32 leader_hash = hash_task_id(get_task_host_pid(leader), get_task_start_time(leader)); 6990 6991 // The event timestamp, so process tree info can be changelog'ed. 6992 u64 timestamp = bpf_ktime_get_ns(); 6993 save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0); 6994 6995 save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 1); 6996 save_to_submit_buf(&signal->args_buf, (void *) &parent_hash, sizeof(u32), 2); 6997 save_to_submit_buf(&signal->args_buf, (void *) &leader_hash, sizeof(u32), 3); 6998 6999 // Exit logic. 7000 7001 bool group_dead = false; 7002 struct signal_struct *s = BPF_CORE_READ(task, signal); 7003 atomic_t live = BPF_CORE_READ(s, live); 7004 7005 if (live.counter == 0) 7006 group_dead = true; 7007 7008 long exit_code = get_task_exit_code(task); 7009 7010 save_to_submit_buf(&signal->args_buf, (void *) &exit_code, sizeof(long), 4); 7011 save_to_submit_buf(&signal->args_buf, (void *) &group_dead, sizeof(bool), 5); 7012 7013 signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_EXIT); 7014 7015 return 0; 7016 } 7017 7018 // END OF Control Plane Programs 7019 7020 // TODO: Instead of returning the raw sock state, emit dedicated tcp_connect, tcp_listen and tcp_connect_error events. 7021 // That will allow subscribing only to the wanted events and make handling easier.
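// A minimal sketch of that TODO (hypothetical, not wired into anything below): classify the
// (old_state, new_state) transition into distinct event kinds instead of forwarding the raw
// states. The names sock_state_event_t and classify_sock_set_state are illustrative only and
// do not exist elsewhere in this codebase.
typedef enum {
    SOCK_STATE_EVENT_NONE = 0,
    SOCK_STATE_EVENT_TCP_LISTEN,        // TCP_CLOSE    -> TCP_LISTEN
    SOCK_STATE_EVENT_TCP_LISTEN_CLOSE,  // TCP_LISTEN   -> TCP_CLOSE
    SOCK_STATE_EVENT_TCP_CONNECT,       // TCP_SYN_SENT -> TCP_ESTABLISHED
    SOCK_STATE_EVENT_TCP_CONNECT_ERROR, // TCP_SYN_SENT -> TCP_CLOSE
} sock_state_event_t;

statfunc sock_state_event_t classify_sock_set_state(int old_state, int new_state)
{
    if (old_state == TCP_CLOSE && new_state == TCP_LISTEN)
        return SOCK_STATE_EVENT_TCP_LISTEN;
    if (old_state == TCP_LISTEN && new_state == TCP_CLOSE)
        return SOCK_STATE_EVENT_TCP_LISTEN_CLOSE;
    if (old_state == TCP_SYN_SENT && new_state == TCP_ESTABLISHED)
        return SOCK_STATE_EVENT_TCP_CONNECT;
    if (old_state == TCP_SYN_SENT && new_state == TCP_CLOSE)
        return SOCK_STATE_EVENT_TCP_CONNECT_ERROR;
    return SOCK_STATE_EVENT_NONE;
}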
7022 statfunc bool should_trace_sock_set_state(int old_state, int new_state) 7023 { 7024 if (old_state == TCP_CLOSE && new_state == TCP_LISTEN) { 7025 return true; 7026 } 7027 if (old_state == TCP_LISTEN && new_state == TCP_CLOSE) { 7028 return true; 7029 } 7030 if (old_state == TCP_SYN_SENT && new_state == TCP_ESTABLISHED) { 7031 return true; 7032 } 7033 if (old_state == TCP_SYN_SENT && new_state == TCP_CLOSE) { 7034 return true; 7035 } 7036 if (old_state == TCP_ESTABLISHED && 7037 (new_state == TCP_FIN_WAIT1 || new_state == TCP_CLOSE_WAIT)) { 7038 return false; 7039 } 7040 return false; 7041 } 7042 7043 // TP_PROTO(const struct sock *sk, const int oldstate, const int newstate) 7044 SEC("raw_tracepoint/inet_sock_set_state") 7045 int trace_inet_sock_set_state(struct bpf_raw_tracepoint_args *ctx) 7046 { 7047 struct sock *sk = (struct sock *) ctx->args[0]; 7048 int old_state = ctx->args[1]; 7049 int new_state = ctx->args[2]; 7050 7051 if (!should_trace_sock_set_state(old_state, new_state)) { 7052 return 0; 7053 } 7054 7055 bool mightbecloned = false; // cloned sock structs come from accept() 7056 u64 inode = BPF_CORE_READ(sk, sk_socket, file, f_inode, i_ino); 7057 if (inode == 0) 7058 mightbecloned = true; // kernel threads might have zero inode 7059 7060 struct net_task_context *netctx; 7061 if (!mightbecloned) { 7062 // pick network context from the inodemap (inode <=> task) 7063 netctx = bpf_map_lookup_elem(&inodemap, &inode); 7064 if (!netctx) 7065 mightbecloned = true; // e.g. task isn't being traced 7066 } 7067 if (mightbecloned) { 7068 // pick network context from the sockmap (new sockptr <=> old inode <=> task) 7069 u64 skptr = (u64) (void *) sk; 7070 u64 *o = bpf_map_lookup_elem(&sockmap, &skptr); 7071 if (o == 0) 7072 return 0; 7073 u64 oinode = *o; 7074 // with the old inode, find the netctx for the task 7075 netctx = bpf_map_lookup_elem(&inodemap, &oinode); 7076 if (!netctx) 7077 return 0; // old inode wasn't being traced as well 7078 } 7079 7080 u32 zero = 0; 7081 event_data_t *e = bpf_map_lookup_elem(&net_heap_event, &zero); 7082 if (unlikely(e == NULL)) 7083 return 0; 7084 7085 program_data_t p = {}; 7086 p.scratch_idx = 1; 7087 p.event = e; 7088 if (!init_program_data(&p, ctx)) 7089 return 0; 7090 __builtin_memcpy(&p.event->context.task, &netctx->taskctx, sizeof(task_context_t)); 7091 7092 tuple_t tuple = {}; 7093 fill_tuple(sk, &tuple); 7094 7095 save_to_submit_buf(&p.event->args_buf, (void *) &old_state, sizeof(u32), 0); 7096 save_to_submit_buf(&p.event->args_buf, (void *) &new_state, sizeof(u32), 1); 7097 save_to_submit_buf(&p.event->args_buf, &tuple, sizeof(tuple), 2); 7098 events_perf_submit(&p, SOCK_SET_STATE, 0); 7099 7100 return 0; 7101 } 7102 // clang-format on 7103 7104 SEC("raw_tracepoint/oom/mark_victim") 7105 int oom_mark_victim(struct bpf_raw_tracepoint_args *ctx) 7106 { 7107 __u32 pid = ctx->args[0]; 7108 7109 bpf_map_update_elem(&oom_info, &pid, &pid, BPF_ANY); 7110 7111 return 0; 7112 } 7113 7114 SEC("kprobe/tty_open") 7115 int BPF_KPROBE(tty_open, struct inode *inode, struct file *filep) 7116 { 7117 program_data_t p = {}; 7118 if (!init_program_data(&p, ctx)) { 7119 return 0; 7120 } 7121 7122 if (!should_trace((&p))) { 7123 return 0; 7124 } 7125 7126 if (!should_submit(TTY_OPEN, p.event)) { 7127 return 0; 7128 } 7129 7130 void *file_path = get_path_str(__builtin_preserve_access_index(&filep->f_path)); 7131 unsigned long ino = BPF_CORE_READ(inode, i_ino); 7132 dev_t dev = BPF_CORE_READ(inode, i_rdev); 7133 umode_t inode_mode = get_inode_mode_from_file(filep); 
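// i_rdev is the device number of the tty character device being opened, while i_ino and the
// inode mode describe the inode backing it (typically on devtmpfs).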
7134 7135 save_str_to_buf(&p.event->args_buf, file_path, 0); 7136 save_to_submit_buf(&p.event->args_buf, &ino, sizeof(ino), 1); 7137 save_to_submit_buf(&p.event->args_buf, &inode_mode, sizeof(inode_mode), 2); 7138 save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev), 3); 7139 7140 return events_perf_submit(&p, TTY_OPEN, 0); 7141 }
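// A minimal sketch (hypothetical helper, not referenced anywhere in this file) of how the pid
// recorded by oom_mark_victim above could be consumed, e.g. to tag a later process-exit event
// as an OOM kill. It assumes oom_info keys and values are the victim's host pid, matching the
// bpf_map_update_elem() call in oom_mark_victim.
statfunc bool was_oom_victim(u32 host_pid)
{
    u32 *found = bpf_map_lookup_elem(&oom_info, &host_pid);
    if (found == NULL)
        return false;
    // Drop the entry so the map does not accumulate stale pids.
    bpf_map_delete_elem(&oom_info, &host_pid);
    return true;
}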