github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/pkg/ebpftracer/c/tracee.bpf.c

     1  // +build ignore
     2  
     3  // Note: This file is licensed differently from the rest of the project
     4  // SPDX-License-Identifier: GPL-2.0
     5  // Copyright (C) Aqua Security inc.
     6  
     7  #include <vmlinux.h>
     8  #include <vmlinux_flavors.h>
     9  #include <vmlinux_missing.h>
    10  
    11  #undef container_of
    12  
    13  #include <bpf/bpf_core_read.h>
    14  #include <bpf/bpf_endian.h>
    15  #include <bpf/bpf_helpers.h>
    16  #include <bpf/bpf_tracing.h>
    17  #include <maps.h>
    18  #include <types.h>
    19  #include <capture_filtering.h>
    20  #include <tracee.h>
    21  
    22  #include <common/arch.h>
    23  #include <common/arguments.h>
    24  #include <common/binprm.h>
    25  #include <common/bpf_prog.h>
    26  #include <common/buffer.h>
    27  #include <common/capabilities.h>
    28  #include <common/cgroups.h>
    29  #include <common/common.h>
    30  #include <common/consts.h>
    31  #include <common/context.h>
    32  #include <common/filesystem.h>
    33  #include <common/filtering.h>
    34  #include <common/kconfig.h>
    35  #include <common/ksymbols.h>
    36  #include <common/logging.h>
    37  #include <common/memory.h>
    38  #include <common/network.h>
    39  #include <common/probes.h>
    40  #include <common/signal.h>
    41  #include <common/debug.h>
    42  #include <common/stats.h>
    43  
    44  char LICENSE[] SEC("license") = "GPL";
    45  
    46  extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig;
    47  
    48  // trace/events/syscalls.h: TP_PROTO(struct pt_regs *regs, long id)
    49  // initial entry for sys_enter syscall logic
    50  SEC("raw_tracepoint/sys_enter")
    51  int tracepoint__raw_syscalls__sys_enter(struct bpf_raw_tracepoint_args *ctx)
    52  {
    53      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
    54      int id = ctx->args[1];
    55      if (is_compat(task)) {
    56          // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
    57          u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id);
    58          if (id_64 == 0)
    59              return 0;
    60  
    61          id = *id_64;
    62      }
    63  
    64      int zero = 0;
    65      config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero);
    66      if (unlikely(config == NULL))
    67          return 0;
    68      u64 cgroup_id = 0;
    69      if (config->options & OPT_CGROUP_V1) {
    70          cgroup_id = get_cgroup_v1_subsys0_id(task);
    71      } else {
    72          cgroup_id = bpf_get_current_cgroup_id();
    73      }
    74      // Skip if cgroup is muted
    75      if (bpf_map_lookup_elem(&ignored_cgroups_map, &cgroup_id) != NULL) {
    76          return 0;
    77      }
    78      // Update containers syscall stats.
    79      update_syscall_stats(ctx, cgroup_id, id);
    80  
    81      // Continue to tail calls.
    82      bpf_tail_call(ctx, &sys_enter_init_tail, id);
    83      return 0;
    84  }
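
// A minimal userspace sketch (illustration only, not part of this BPF object) of
// how the sys_enter_init_tail prog array above could be wired with libbpf. The
// map and program names match this file; the helper itself is hypothetical.
#if 0
#include <bpf/libbpf.h>
#include <bpf/bpf.h>

static int wire_sys_enter_tail(struct bpf_object *obj, __u32 syscall_id)
{
    int map_fd = bpf_object__find_map_fd_by_name(obj, "sys_enter_init_tail");
    struct bpf_program *prog = bpf_object__find_program_by_name(obj, "sys_enter_init");
    if (map_fd < 0 || prog == NULL)
        return -1;

    // Prog array values are program fds; the kernel resolves them on tail call.
    int prog_fd = bpf_program__fd(prog);
    return bpf_map_update_elem(map_fd, &syscall_id, &prog_fd, BPF_ANY);
}
#endif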
    85  
    86  // initial tail call entry from sys_enter.
    87  // purpose is to save the syscall info of relevant syscalls through the task_info map.
    88  // can move to one of:
    89  // 1. sys_enter_submit, general event submit logic from sys_enter
    90  // 2. directly to syscall tail handler in sys_enter_tails
    91  SEC("raw_tracepoint/sys_enter_init")
    92  int sys_enter_init(struct bpf_raw_tracepoint_args *ctx)
    93  {
    94      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
    95  
    96      u64 pid_tgid = bpf_get_current_pid_tgid();
    97      u32 tid = pid_tgid;
    98      task_info_t *task_info = bpf_map_lookup_elem(&task_info_map, &tid);
    99      if (unlikely(task_info == NULL)) {
   100          task_info = init_task_info(tid, 0);
   101          if (unlikely(task_info == NULL)) {
   102              return 0;
   103          }
   104          int zero = 0;
   105          config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero);
   106          if (unlikely(config == NULL))
   107              return 0;
   108  
   109          init_task_context(&task_info->context, task, config->options);
   110      }
   111  
   112      syscall_data_t *sys = &(task_info->syscall_data);
   113      sys->id = ctx->args[1];
   114  
   115      if (LINUX_HAS_SYSCALL_WRAPPER) {
   116          struct pt_regs *regs = (struct pt_regs *) ctx->args[0];
   117  
   118          if (is_x86_compat(task)) {
   119  #if defined(bpf_target_x86)
   120              sys->args.args[0] = BPF_CORE_READ(regs, bx);
   121              sys->args.args[1] = BPF_CORE_READ(regs, cx);
   122              sys->args.args[2] = BPF_CORE_READ(regs, dx);
   123              sys->args.args[3] = BPF_CORE_READ(regs, si);
   124              sys->args.args[4] = BPF_CORE_READ(regs, di);
   125              sys->args.args[5] = BPF_CORE_READ(regs, bp);
   126  #endif // bpf_target_x86
   127          } else {
   128              sys->args.args[0] = PT_REGS_PARM1_CORE_SYSCALL(regs);
   129              sys->args.args[1] = PT_REGS_PARM2_CORE_SYSCALL(regs);
   130              sys->args.args[2] = PT_REGS_PARM3_CORE_SYSCALL(regs);
   131              sys->args.args[3] = PT_REGS_PARM4_CORE_SYSCALL(regs);
   132              sys->args.args[4] = PT_REGS_PARM5_CORE_SYSCALL(regs);
   133              sys->args.args[5] = PT_REGS_PARM6_CORE_SYSCALL(regs);
   134          }
   135      } else {
   136          bpf_probe_read(sys->args.args, 6 * sizeof(u64), (void *) ctx->args);
   137      }
   138  
   139      if (is_compat(task)) {
   140          // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
   141          u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &sys->id);
   142          if (id_64 == 0)
   143              return 0;
   144  
   145          sys->id = *id_64;
   146      }
   147  
   148      // exit, exit_group and rt_sigreturn syscalls don't return
   149      if (sys->id != SYSCALL_EXIT && sys->id != SYSCALL_EXIT_GROUP &&
   150          sys->id != SYSCALL_RT_SIGRETURN) {
   151          sys->ts = bpf_ktime_get_ns();
   152          task_info->syscall_traced = true;
   153      }
   154  
   155      // if id is irrelevant continue to next tail call
   156      bpf_tail_call(ctx, &sys_enter_submit_tail, sys->id);
   157  
   158      // call syscall handler, if exists
   159      bpf_tail_call(ctx, &sys_enter_tails, sys->id);
   160      return 0;
   161  }
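
// The 32->64 translation above relies on userspace pre-filling sys_32_to_64_map.
// A hedged sketch of that side (hypothetical helper; the example IDs are real:
// ia32 execve is 11, x86_64 execve is 59):
#if 0
static int map_compat_syscall(int sys_32_to_64_fd, __u32 id32, __u32 id64)
{
    return bpf_map_update_elem(sys_32_to_64_fd, &id32, &id64, BPF_ANY);
}
// usage: map_compat_syscall(fd, 11 /* ia32 execve */, 59 /* x86_64 execve */);
#endif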
   162  
   163  // submit tail call part of sys_enter.
   164  // events that are required for submission go through two steps here:
   165  // 1. parsing their FD filepath if requested as an option
   166  // 2. submitting the event if relevant
   167  // may move to the direct syscall handler in sys_enter_tails
   168  SEC("raw_tracepoint/sys_enter_submit")
   169  int sys_enter_submit(struct bpf_raw_tracepoint_args *ctx)
   170  {
   171      program_data_t p = {};
   172      if (!init_program_data(&p, ctx))
   173          return 0;
   174  
   175      if (!should_trace(&p))
   176          return 0;
   177  
   178      syscall_data_t *sys = &p.task_info->syscall_data;
   179  
   180      if (p.config->options & OPT_TRANSLATE_FD_FILEPATH && has_syscall_fd_arg(sys->id)) {
   181          // Process filepath related to fd argument
   182          uint fd_num = get_syscall_fd_num_from_arg(sys->id, &sys->args);
   183          struct file *file = get_struct_file_from_fd(fd_num);
   184  
   185          if (file) {
   186              u64 ts = sys->ts;
   187              fd_arg_path_t fd_arg_path = {};
   188              void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
   189  
   190              bpf_probe_read_kernel_str(&fd_arg_path.path, sizeof(fd_arg_path.path), file_path);
   191              bpf_map_update_elem(&fd_arg_path_map, &ts, &fd_arg_path, BPF_ANY);
   192          }
   193      }
   194  
   195      if (sys->id != SYSCALL_RT_SIGRETURN && !p.task_info->syscall_traced) {
   196          save_to_submit_buf(&p.event->args_buf, (void *) &(sys->args.args[0]), sizeof(int), 0);
   197          events_perf_submit(&p, sys->id, 0);
   198      }
   199  
   200      // call syscall handler, if exists
   201      bpf_tail_call(ctx, &sys_enter_tails, sys->id);
   202      return 0;
   203  }
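
// fd_arg_path_map above is keyed by the syscall entry timestamp (sys->ts), so a
// userspace consumer can join the stored path with the matching event by its ts.
// A hedged sketch (hypothetical helper; 256 is an assumed size of
// fd_arg_path_t.path - see types.h for the real value):
#if 0
static int lookup_fd_arg_path(int map_fd, __u64 event_ts, char out[256])
{
    return bpf_map_lookup_elem(map_fd, &event_ts, out); // 0 on hit, <0 on miss
}
#endif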
   204  
   205  // trace/events/syscalls.h: TP_PROTO(struct pt_regs *regs, long ret)
   206  // initial entry for sys_exit syscall logic
   207  SEC("raw_tracepoint/sys_exit")
   208  int tracepoint__raw_syscalls__sys_exit(struct bpf_raw_tracepoint_args *ctx)
   209  {
   210      struct pt_regs *regs = (struct pt_regs *) ctx->args[0];
   211      int id = get_syscall_id_from_regs(regs);
   212      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
   213      if (is_compat(task)) {
   214          // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
   215          u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id);
   216          if (id_64 == 0)
   217              return 0;
   218  
   219          id = *id_64;
   220      }
   221  
   222      // Skip if cgroup is muted.
   223      int zero = 0;
   224      config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero);
   225      if (unlikely(config == NULL))
   226          return 0;
   227      u64 cgroup_id = 0;
   228      if (config->options & OPT_CGROUP_V1) {
   229          cgroup_id = get_cgroup_v1_subsys0_id(task);
   230      } else {
   231          cgroup_id = bpf_get_current_cgroup_id();
   232      }
   233      if (bpf_map_lookup_elem(&ignored_cgroups_map, &cgroup_id) != NULL) {
   234          return 0;
   235      }
   236  
   237      bpf_tail_call(ctx, &sys_exit_init_tail, id);
   238      return 0;
   239  }
   240  
   241  // initial tail call entry from sys_exit.
   242  // purpose is to "confirm" the syscall data saved by marking it as complete (see
   243  // task_info->syscall_traced) and adding the return value to the syscall_info struct. can move to
   244  // one of:
   245  // 1. sys_exit, general event submit logic from sys_exit
   246  // 2. directly to syscall tail handler in sys_exit_tails
   247  SEC("raw_tracepoint/sys_exit_init")
   248  int sys_exit_init(struct bpf_raw_tracepoint_args *ctx)
   249  {
   250      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
   251  
   252      u64 pid_tgid = bpf_get_current_pid_tgid();
   253      u32 tid = pid_tgid;
   254      task_info_t *task_info = bpf_map_lookup_elem(&task_info_map, &tid);
   255      if (unlikely(task_info == NULL)) {
   256          task_info = init_task_info(tid, 0);
   257          if (unlikely(task_info == NULL))
   258              return 0;
   259  
   260          int zero = 0;
   261          config_entry_t *config = bpf_map_lookup_elem(&config_map, &zero);
   262          if (unlikely(config == NULL))
   263              return 0;
   264  
   265          init_task_context(&task_info->context, task, config->options);
   266      }
   267  
   268      // check if syscall is being traced and mark that it finished
   269      if (!task_info->syscall_traced)
   270          return 0;
   271      task_info->syscall_traced = false;
   272  
   273      syscall_data_t *sys = &task_info->syscall_data;
   274  
   275      long ret = ctx->args[1];
   276      struct pt_regs *regs = (struct pt_regs *) ctx->args[0];
   277      int id = get_syscall_id_from_regs(regs);
   278  
   279      if (is_compat(task)) {
   280          // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
   281          u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id);
   282          if (id_64 == 0)
   283              return 0;
   284  
   285          id = *id_64;
   286      }
   287  
   288      // Sanity check - we returned from the expected syscall this task was executing
   289      if (sys->id != id)
   290          return 0;
   291  
   292      sys->ret = ret;
   293  
   294      // move to submit tail call if needed
   295      bpf_tail_call(ctx, &sys_exit_submit_tail, id);
   296  
   297      // otherwise move to direct syscall handler
   298      bpf_tail_call(ctx, &sys_exit_tails, id);
   299      return 0;
   300  }
   301  
   302  // submit tail call part of sys_exit.
   303  // most syscall events are submitted at this point, and if not,
   304  // they are submitted through direct syscall handlers in sys_exit_tails
   305  SEC("raw_tracepoint/sys_exit_submit")
   306  int sys_exit_submit(struct bpf_raw_tracepoint_args *ctx)
   307  {
   308      program_data_t p = {};
   309      if (!init_program_data(&p, ctx))
   310          return 0;
   311  
   312      if (!should_trace(&p))
   313          return 0;
   314  
   315      syscall_data_t *sys = &p.task_info->syscall_data;
   316      long ret = ctx->args[1];
   317  
   318      if (!should_submit(sys->id, p.event))
   319          goto out;
   320  
   321      // We can't use saved args after execve syscall, as pointers are invalid.
   322      // To avoid showing execve event both on entry and exit, we only output failed execs.
   323      if ((sys->id == SYSCALL_EXECVE || sys->id == SYSCALL_EXECVEAT) && (ret == 0))
   324          goto out;
   325  
   326      save_args_to_submit_buf(p.event, &sys->args);
   327      p.event->context.ts = sys->ts;
   328      events_perf_submit(&p, sys->id, ret);
   329  
   330  out:
   331      bpf_tail_call(ctx, &sys_exit_tails, sys->id);
   332      return 0;
   333  }
   334  
   335  // Here are the direct hook points for sys_enter and sys_exit.
   336  // They are used not for submitting syscall events but for the enter and exit events themselves.
   337  // As such they are usually not attached, and will only be used if sys_enter or sys_exit events
   338  // are given as tracing arguments.
   339  
   340  // separate hook point for sys_enter event tracing
   341  SEC("raw_tracepoint/trace_sys_enter")
   342  int trace_sys_enter(struct bpf_raw_tracepoint_args *ctx)
   343  {
   344      program_data_t p = {};
   345      if (!init_program_data(&p, ctx))
   346          return 0;
   347  
   348      if (!should_trace(&p))
   349          return 0;
   350  
   351      if (!should_submit(RAW_SYS_ENTER, p.event))
   352          return 0;
   353  
   354      // always submit since this won't be attached otherwise
   355      int id = ctx->args[1];
   356      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
   357      if (is_compat(task)) {
   358          // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
   359          u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id);
   360          if (id_64 == 0)
   361              return 0;
   362  
   363          id = *id_64;
   364      }
   365      save_to_submit_buf(&p.event->args_buf, (void *) &id, sizeof(int), 0);
   366      events_perf_submit(&p, RAW_SYS_ENTER, 0);
   367      return 0;
   368  }
   369  
   370  // separate hook point for sys_exit event tracing
   371  SEC("raw_tracepoint/trace_sys_exit")
   372  int trace_sys_exit(struct bpf_raw_tracepoint_args *ctx)
   373  {
   374      program_data_t p = {};
   375      if (!init_program_data(&p, ctx))
   376          return 0;
   377  
   378      if (!should_trace(&p))
   379          return 0;
   380  
   381      if (!should_submit(RAW_SYS_EXIT, p.event))
   382          return 0;
   383  
   384      // always submit since this won't be attached otherwise
   385      struct pt_regs *regs = (struct pt_regs *) ctx->args[0];
   386      int id = get_syscall_id_from_regs(regs);
   387      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
   388      if (is_compat(task)) {
   389          // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
   390          u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &id);
   391          if (id_64 == 0)
   392              return 0;
   393  
   394          id = *id_64;
   395      }
   396      save_to_submit_buf(&p.event->args_buf, (void *) &id, sizeof(int), 0);
   397      events_perf_submit(&p, RAW_SYS_EXIT, 0);
   398      return 0;
   399  }
   400  
   401  SEC("raw_tracepoint/sys_execve")
   402  int syscall__execve(void *ctx)
   403  {
   404      program_data_t p = {};
   405      if (!init_tailcall_program_data(&p, ctx))
   406          return 0;
   407  
   408      if (!p.task_info->syscall_traced)
   409          return -1;
   410      syscall_data_t *sys = &p.task_info->syscall_data;
   411      p.event->context.ts = sys->ts;
   412  
   413      if (!should_submit(SYSCALL_EXECVE, p.event))
   414          return 0;
   415  
   416      reset_event_args(&p);
   417      save_str_to_buf(&p.event->args_buf, (void *) sys->args.args[0] /*filename*/, 0);
   418      save_str_arr_to_buf(&p.event->args_buf, (const char *const *) sys->args.args[1] /*argv*/, 1);
   419      if (p.config->options & OPT_EXEC_ENV) {
   420          save_str_arr_to_buf(
   421              &p.event->args_buf, (const char *const *) sys->args.args[2] /*envp*/, 2);
   422      }
   423  
   424      return events_perf_submit(&p, SYSCALL_EXECVE, 0);
   425  }
   426  
   427  SEC("raw_tracepoint/sys_execveat")
   428  int syscall__execveat(void *ctx)
   429  {
   430      program_data_t p = {};
   431      if (!init_tailcall_program_data(&p, ctx))
   432          return 0;
   433  
   434      if (!p.task_info->syscall_traced)
   435          return -1;
   436      syscall_data_t *sys = &p.task_info->syscall_data;
   437      p.event->context.ts = sys->ts;
   438  
   439      if (!should_submit(SYSCALL_EXECVEAT, p.event))
   440          return 0;
   441  
   442      reset_event_args(&p);
   443      save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0] /*dirfd*/, sizeof(int), 0);
   444      save_str_to_buf(&p.event->args_buf, (void *) sys->args.args[1] /*pathname*/, 1);
   445      save_str_arr_to_buf(&p.event->args_buf, (const char *const *) sys->args.args[2] /*argv*/, 2);
   446      if (p.config->options & OPT_EXEC_ENV) {
   447          save_str_arr_to_buf(
   448              &p.event->args_buf, (const char *const *) sys->args.args[3] /*envp*/, 3);
   449      }
   450      save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[4] /*flags*/, sizeof(int), 4);
   451  
   452      return events_perf_submit(&p, SYSCALL_EXECVEAT, 0);
   453  }
   454  
   455  statfunc int send_socket_dup(program_data_t *p, u64 oldfd, u64 newfd)
   456  {
   457      if (!should_submit(SOCKET_DUP, p->event)) {
   458          return 0;
   459      }
   460  
   461      if (!check_fd_type(oldfd, S_IFSOCK)) {
   462          return 0;
   463      }
   464  
   465      struct file *f = get_struct_file_from_fd(oldfd);
   466      if (f == NULL) {
   467          return -1;
   468      }
   469  
   470      // this is a socket - submit the SOCKET_DUP event
   471  
   472      reset_event_args(p);
   473      save_to_submit_buf(&(p->event->args_buf), &oldfd, sizeof(u32), 0);
   474      save_to_submit_buf(&(p->event->args_buf), &newfd, sizeof(u32), 1);
   475  
   476      // get the address
   477      struct socket *socket_from_file = (struct socket *) BPF_CORE_READ(f, private_data);
   478      if (socket_from_file == NULL) {
   479          return -1;
   480      }
   481  
   482      struct sock *sk = get_socket_sock(socket_from_file);
   483      u16 family = get_sock_family(sk);
   484      if ((family != AF_INET) && (family != AF_INET6) && (family != AF_UNIX)) {
   485          return 0;
   486      }
   487  
   488      if (family == AF_INET) {
   489          net_conn_v4_t net_details = {};
   490          struct sockaddr_in remote;
   491  
   492          get_network_details_from_sock_v4(sk, &net_details, 0);
   493          get_remote_sockaddr_in_from_network_details(&remote, &net_details, family);
   494  
   495          save_to_submit_buf(&(p->event->args_buf), &remote, sizeof(struct sockaddr_in), 2);
   496      } else if (family == AF_INET6) {
   497          net_conn_v6_t net_details = {};
   498          struct sockaddr_in6 remote;
   499  
   500          get_network_details_from_sock_v6(sk, &net_details, 0);
   501          get_remote_sockaddr_in6_from_network_details(&remote, &net_details, family);
   502  
   503          save_to_submit_buf(&(p->event->args_buf), &remote, sizeof(struct sockaddr_in6), 2);
   504      } else if (family == AF_UNIX) {
   505          struct unix_sock *unix_sk = (struct unix_sock *) sk;
   506          struct sockaddr_un sockaddr = get_unix_sock_addr(unix_sk);
   507  
   508          save_to_submit_buf(&(p->event->args_buf), &sockaddr, sizeof(struct sockaddr_un), 2);
   509      }
   510  
   511      return events_perf_submit(p, SOCKET_DUP, 0);
   512  }
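
// A minimal CO-RE sketch of what the get_network_details_* helpers above boil
// down to for IPv4 (illustration only - the real helpers live in
// common/network.h and handle more corner cases):
#if 0
statfunc void sketch_remote_sockaddr_v4(struct sock *sk, struct sockaddr_in *out)
{
    out->sin_family = AF_INET;
    out->sin_port = BPF_CORE_READ(sk, __sk_common.skc_dport); // already network byte order
    out->sin_addr.s_addr = BPF_CORE_READ(sk, __sk_common.skc_daddr);
}
#endif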
   513  
   514  SEC("raw_tracepoint/sys_dup")
   515  int sys_dup_exit_tail(void *ctx)
   516  {
   517      program_data_t p = {};
   518      if (!init_tailcall_program_data(&p, ctx))
   519          return 0;
   520  
   521      if (!should_trace(&p))
   522          return 0;
   523  
   524      syscall_data_t *sys = &p.task_info->syscall_data;
   525  
   526      if (sys->ret < 0) {
   527          // dup failed
   528          return 0;
   529      }
   530  
   531      if (sys->id == SYSCALL_DUP) {
   532          // args.args[0]: oldfd
   533          // retval: newfd
   534          send_socket_dup(&p, sys->args.args[0], sys->ret);
   535      } else if (sys->id == SYSCALL_DUP2 || sys->id == SYSCALL_DUP3) {
   536          // args.args[0]: oldfd
   537          // args.args[1]: newfd
   538          // retval: newfd (on success)
   539          send_socket_dup(&p, sys->args.args[0], sys->args.args[1]);
   540      }
   541  
   542      return 0;
   543  }
   544  
   545  // trace/events/sched.h: TP_PROTO(struct task_struct *parent, struct task_struct *child)
   546  SEC("raw_tracepoint/sched_process_fork")
   547  int tracepoint__sched__sched_process_fork(struct bpf_raw_tracepoint_args *ctx)
   548  {
   549      long ret = 0;
   550      program_data_t p = {};
   551      if (!init_program_data(&p, ctx))
   552          return 0;
   553  
   554      // NOTE: proc_info_map is updated before should_trace(), as the entries are needed elsewhere.
   555  
   556      struct task_struct *parent = (struct task_struct *) ctx->args[0];
   557      struct task_struct *child = (struct task_struct *) ctx->args[1];
   558  
   559      // Information needed before the event:
   560      int parent_pid = get_task_host_tgid(parent);
   561      u64 child_start_time = get_task_start_time(child);
   562      int child_pid = get_task_host_tgid(child);
   563      int child_tid = get_task_host_pid(child);
   564      int child_ns_pid = get_task_ns_tgid(child);
   565      int child_ns_tid = get_task_ns_pid(child);
   566  
   567      // Update the task_info map with the new task's info
   568  
   569      ret = bpf_map_update_elem(&task_info_map, &child_tid, p.task_info, BPF_ANY);
   570      if (ret < 0)
   571          tracee_log(ctx, BPF_LOG_LVL_DEBUG, BPF_LOG_ID_MAP_UPDATE_ELEM, ret);
   572      task_info_t *task = bpf_map_lookup_elem(&task_info_map, &child_tid);
   573      if (unlikely(task == NULL)) {
   574          // this should never happen - we just updated the map with this key
   575          tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0);
   576          return 0;
   577      }
   578  
   579      task->context.tid = child_ns_tid;
   580      task->context.host_tid = child_tid;
   581      task->context.start_time = child_start_time;
   582  
   583      // Update the proc_info_map with the new process's info (from parent)
   584  
   585      proc_info_t *c_proc_info = bpf_map_lookup_elem(&proc_info_map, &child_pid);
   586      if (c_proc_info == NULL) {
   587          // It is a new process (not another thread): add it to proc_info_map.
   588          proc_info_t *p_proc_info = bpf_map_lookup_elem(&proc_info_map, &parent_pid);
   589          if (unlikely(p_proc_info == NULL)) {
   590              // parent should exist in proc_info_map (init_program_data sets it)
   591              tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0);
   592              return 0;
   593          }
   594  
   595          // Copy the parent's proc_info to the child's entry.
   596          bpf_map_update_elem(&proc_info_map, &child_pid, p_proc_info, BPF_NOEXIST);
   597          c_proc_info = bpf_map_lookup_elem(&proc_info_map, &child_pid);
   598          if (unlikely(c_proc_info == NULL)) {
   599              tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0);
   600              return 0;
   601          }
   602  
   603          c_proc_info->follow_in_scopes = 0; // updated later if should_trace() passes (follow filter)
   604          c_proc_info->new_proc = true;      // started after tracee (new_pid filter)
   605      }
   606  
   607      // Update the process tree map (filter related) if the parent has an entry.
   608  
   609      policies_config_t *policies_cfg = &p.config->policies_config;
   610  
   611      if (policies_cfg->proc_tree_filter_enabled_scopes) {
   612          eq_t *tgid_filtered = bpf_map_lookup_elem(&process_tree_map, &parent_pid);
   613          if (tgid_filtered) {
   614              ret = bpf_map_update_elem(&process_tree_map, &child_pid, tgid_filtered, BPF_ANY);
   615              if (ret < 0)
   616                  tracee_log(ctx, BPF_LOG_LVL_DEBUG, BPF_LOG_ID_MAP_UPDATE_ELEM, ret);
   617          }
   618      }
   619  
   620      if (!should_trace(&p))
   621          return 0;
   622  
   623      // Always follow every pid that passed the should_trace() checks (follow filter)
   624      c_proc_info->follow_in_scopes = p.event->context.matched_policies;
   625  
   626      // Submit the event
   627  
   628      if (should_submit(SCHED_PROCESS_FORK, p.event)) {
   629          // Parent information.
   630          u64 parent_start_time = get_task_start_time(parent);
   631          int parent_tid = get_task_host_pid(parent);
   632          int parent_ns_pid = get_task_ns_tgid(parent);
   633          int parent_ns_tid = get_task_ns_pid(parent);
   634  
   635          // Parent (might be a thread or a process).
   636          save_to_submit_buf(&p.event->args_buf, (void *) &parent_tid, sizeof(int), 0);
   637          save_to_submit_buf(&p.event->args_buf, (void *) &parent_ns_tid, sizeof(int), 1);
   638          save_to_submit_buf(&p.event->args_buf, (void *) &parent_pid, sizeof(int), 2);
   639          save_to_submit_buf(&p.event->args_buf, (void *) &parent_ns_pid, sizeof(int), 3);
   640          save_to_submit_buf(&p.event->args_buf, (void *) &parent_start_time, sizeof(u64), 4);
   641  
   642          // Child (might be an lwp or a process; the sched_process_fork tracepoint is also called by clone()).
   643          save_to_submit_buf(&p.event->args_buf, (void *) &child_tid, sizeof(int), 5);
   644          save_to_submit_buf(&p.event->args_buf, (void *) &child_ns_tid, sizeof(int), 6);
   645          save_to_submit_buf(&p.event->args_buf, (void *) &child_pid, sizeof(int), 7);
   646          save_to_submit_buf(&p.event->args_buf, (void *) &child_ns_pid, sizeof(int), 8);
   647          save_to_submit_buf(&p.event->args_buf, (void *) &child_start_time, sizeof(u64), 9);
   648  
   649          // Process tree information (if needed).
   650          if (p.config->options & OPT_FORK_PROCTREE) {
   651              // Both the thread group leader and the "up_parent" (the first process, not lwp, found
   652              // as a parent of the child in the hierarchy) are needed by the userland process tree.
   653              // The userland process tree's default source of events is the signal events, but there
   654              // is an option to use regular events for maintaining it as well (and it is needed for
   655              // some situations). These arguments will always be removed by userland event processors.
   656              struct task_struct *leader = get_leader_task(child);
   657              struct task_struct *up_parent = get_leader_task(get_parent_task(leader));
   658  
   659              // Up Parent information: Go up in hierarchy until parent is process.
   660              u64 up_parent_start_time = get_task_start_time(up_parent);
   661              int up_parent_pid = get_task_host_tgid(up_parent);
   662              int up_parent_tid = get_task_host_pid(up_parent);
   663              int up_parent_ns_pid = get_task_ns_tgid(up_parent);
   664              int up_parent_ns_tid = get_task_ns_pid(up_parent);
   665              // Leader information.
   666              u64 leader_start_time = get_task_start_time(leader);
   667              int leader_pid = get_task_host_tgid(leader);
   668              int leader_tid = get_task_host_pid(leader);
   669              int leader_ns_pid = get_task_ns_tgid(leader);
   670              int leader_ns_tid = get_task_ns_pid(leader);
   671  
   672              // Up Parent: always a process (might be the same as Parent if parent is a process).
   673              save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_tid, sizeof(int), 10);
   674              save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_ns_tid, sizeof(int), 11);
   675              save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_pid, sizeof(int), 12);
   676              save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_ns_pid, sizeof(int), 13);
   677              save_to_submit_buf(&p.event->args_buf, (void *) &up_parent_start_time, sizeof(u64), 14);
   678              // Leader: always a process (might be the same as the Child if child is a process).
   679              save_to_submit_buf(&p.event->args_buf, (void *) &leader_tid, sizeof(int), 15);
   680              save_to_submit_buf(&p.event->args_buf, (void *) &leader_ns_tid, sizeof(int), 16);
   681              save_to_submit_buf(&p.event->args_buf, (void *) &leader_pid, sizeof(int), 17);
   682              save_to_submit_buf(&p.event->args_buf, (void *) &leader_ns_pid, sizeof(int), 18);
   683              save_to_submit_buf(&p.event->args_buf, (void *) &leader_start_time, sizeof(u64), 19);
   684          }
   685  
   686          // Submit
   687          events_perf_submit(&p, SCHED_PROCESS_FORK, 0);
   688      }
   689  
   690      return 0;
   691  }
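
// The get_task_* accessors used above are thin CO-RE reads; a hedged sketch of
// the host-tgid one (the real helpers live under common/, this reimplementation
// is illustrative only):
#if 0
statfunc u32 sketch_task_host_tgid(struct task_struct *t)
{
    return BPF_CORE_READ(t, tgid); // host (init ns) thread group id
}
#endif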
   692  
   693  // number of iterations - the largest value the verifier was seen to cope with; the higher, the better
   694  #define MAX_NUM_MODULES 100
   695  
   696  enum {
   697      PROC_MODULES = 1 << 0,
   698      KSET = 1 << 1,
   699      MOD_TREE = 1 << 2,
   700      NEW_MOD = 1 << 3,
   701      FULL_SCAN = 1 << 30,
   702      HIDDEN_MODULE = 1 << 31,
   703  };
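
// A hedged sketch of a userspace consumer decoding these scan-source flags as
// sent by lkm_seeker_send_to_userspace() below (hypothetical code):
#if 0
static void describe_mod_flags(__u32 flags)
{
    if (flags & (1U << 0))  printf("missing from /proc/modules walk\n"); // PROC_MODULES
    if (flags & (1U << 1))  printf("found via module_kset\n");           // KSET
    if (flags & (1U << 2))  printf("found via mod_tree\n");              // MOD_TREE
    if (flags & (1U << 3))  printf("new-module heartbeat\n");            // NEW_MOD
    if (flags & (1U << 30)) printf("full scan\n");                       // FULL_SCAN
    if (flags & (1U << 31)) printf("reported as hidden\n");              // HIDDEN_MODULE
}
#endif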
   704  
   705  // Forcibly create the maps in all kernels, even when not needed, since there is
   706  // no support for tolerating map loading errors based on the kernel version.
   707  
   708  BPF_HASH(modules_map, u64, kernel_module_t, MAX_NUM_MODULES);
   709  BPF_HASH(new_module_map, u64, kernel_new_mod_t, MAX_NUM_MODULES);
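
// BPF_HASH()/BPF_LRU_HASH() are project macros (see maps.h); a BTF map
// definition along these lines is what they are assumed to expand to:
#if 0
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, MAX_NUM_MODULES);
    __type(key, u64);
    __type(value, kernel_module_t);
} modules_map_sketch SEC(".maps");
#endif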
   710  
   711  // We only care about modules that got deleted or inserted between our scan and
   712  // the moment we detect something suspicious. Since that is a very small time
   713  // frame, it is unlikely that a large number of modules will be deleted. Instead
   714  // of saving a map of deleted modules, we could have saved the last deleted
   715  // module's timestamp and, upon detecting something suspicious, verified that no
   716  // module got deleted in between. This is preferable space-wise (a u64 instead of
   717  // a map), but an attacker might start unloading modules in the background and
   718  // race with the check in order to abort reporting for hidden modules.
   719  
   720  BPF_LRU_HASH(recent_deleted_module_map, u64, kernel_deleted_mod_t, 50);
   721  BPF_LRU_HASH(recent_inserted_module_map,
   722               u64,
   723               kernel_new_mod_t,
   724               50); // Likewise for module insertion
   725  
   726  u64 start_scan_time_init_shown_mods = 0;
   727  u64 last_module_insert_time = 0;
   728  bool hidden_old_mod_scan_done = false;
   729  static const int HID_MOD_RACE_CONDITION = -1;
   730  static const int HID_MOD_UNCOMPLETED_ITERATIONS = -2;
   731  static const int HID_MOD_MEM_ZEROED = -3;
   732  static const int MOD_HIDDEN = 1;
   733  static const int MOD_NOT_HIDDEN = 0;
   734  
   735  void __always_inline lkm_seeker_send_to_userspace(struct module *mod, u32 *flags, program_data_t *p)
   736  {
   737      reset_event_args(p);
   738      u64 mod_addr = (u64) mod;
   739      char *mod_name = mod->name;
   740      const char *mod_srcversion = BPF_CORE_READ(mod, srcversion);
   741  
   742      save_to_submit_buf(&(p->event->args_buf), &mod_addr, sizeof(u64), 0);
   743      save_bytes_to_buf(&(p->event->args_buf),
   744                        (void *) mod_name,
   745                        MODULE_NAME_LEN & MAX_MEM_DUMP_SIZE,
   746                        1); // string saved as bytes (verifier issues).
   747      save_to_submit_buf(&(p->event->args_buf), flags, sizeof(u32), 2);
   748      save_bytes_to_buf(&(p->event->args_buf),
   749                        (void *) mod_srcversion,
   750                        MODULE_SRCVERSION_MAX_LENGTH & MAX_MEM_DUMP_SIZE,
   751                        3); // string saved as bytes (verifier issues).
   752  
   753      events_perf_submit(p, HIDDEN_KERNEL_MODULE_SEEKER, 0);
   754  }
   755  
   756  // Populate all the modules into an efficiently queryable hash map.
   757  // We can't read it once and then hook on do_init_module and free_module, since a hidden module
   758  // removes itself from the list directly and we wouldn't know (hence, from our perspective, the
   759  // module would still appear to reside in the modules list). So on every trigger, we go over the
   760  // modules list and populate the map. It gets cleaned in userspace before every run.
   761  // Since this mechanism is supposed to be triggered only every once in a while,
   762  // this should be ok.
   763  statfunc int init_shown_modules()
   764  {
   765      char modules_sym[8] = "modules";
   766      struct list_head *head = (struct list_head *) get_symbol_addr(modules_sym);
   767      kernel_module_t ker_mod = {};
   768      bool iterated_all_modules = false;
   769      struct module *pos, *n;
   770  
   771      pos = list_first_entry_ebpf(head, typeof(*pos), list);
   772      n = pos;
   773  
   774  #pragma unroll
   775      for (int i = 0; i < MAX_NUM_MODULES; i++) {
   776          pos = n;
   777          n = list_next_entry_ebpf(n, list);
   778  
   779          if (&pos->list == head) {
   780              return 0;
   781          }
   782  
   783          bpf_map_update_elem(&modules_map, &pos, &ker_mod, BPF_ANY);
   784      }
   785  
   786      return HID_MOD_UNCOMPLETED_ITERATIONS;
   787  }
   788  
   789  statfunc int is_hidden(u64 mod)
   790  {
   791      if (bpf_map_lookup_elem(&modules_map, &mod) != NULL) {
   792          return MOD_NOT_HIDDEN;
   793      }
   794  
   795      // Verify that this module wasn't removed after we initialized modules_map
   796      kernel_deleted_mod_t *deleted_mod = bpf_map_lookup_elem(&recent_deleted_module_map, &mod);
   797      if (deleted_mod && deleted_mod->deleted_time > start_scan_time_init_shown_mods) {
   798          // This module got deleted after the start of the scan, so there
   799          // was a valid removal and it's not hidden.
   800          return MOD_NOT_HIDDEN;
   801      }
   802  
   803      // Check if some module was inserted after we started scanning.
   804      // If that's the case, then if the module got inserted to the modules list after we walked on
   805      // the list, it'll be missing from our eBPF map. If it got inserted to other places (kset for
   806      // example), then it will appear as if the module is hidden (in kset but not in module's list),
   807      // but in fact it only got added in the midst of our scan. Thus, we need to monitor for this
   808      // situation.
   809      if (start_scan_time_init_shown_mods < last_module_insert_time) {
   810          // No point in checking other modules in this scan... abort
   811          return HID_MOD_RACE_CONDITION;
   812      }
   813  
   814      return MOD_HIDDEN;
   815  }
   816  
   817  statfunc int find_modules_from_module_kset_list(program_data_t *p)
   818  {
   819      char module_kset_sym[12] = "module_kset";
   820      struct module *first_mod = NULL;
   821      struct kset *mod_kset = (struct kset *) get_symbol_addr(module_kset_sym);
   822      struct list_head *head = &(mod_kset->list);
   823      struct kobject *pos = list_first_entry_ebpf(head, typeof(*pos), entry);
   824      struct kobject *n = list_next_entry_ebpf(pos, entry);
   825      u32 flags = KSET | HIDDEN_MODULE;
   826  
   827      for (int i = 0; i < MAX_NUM_MODULES; i++) {
   828          if (BPF_CORE_READ(n, name) ==
   829              NULL) { // Without this check the list appears infinite. Also, using pos
   830                      // here seems incorrect, as it starts from an unexpected member.
   831              return 0;
   832          }
   833  
   834          struct module_kobject *mod_kobj =
   835              (struct module_kobject *) container_of(n, struct module_kobject, kobj);
   836          if (mod_kobj) {
   837              struct module *mod = BPF_CORE_READ(mod_kobj, mod);
   838              if (mod) {
   839                  if (first_mod == NULL) {
   840                      first_mod = mod;
   841                  } else if (first_mod == mod) { // Iterated over all modules - stop.
   842                      return 0;
   843                  }
   844                  int ret = is_hidden((u64) mod);
   845                  if (ret == MOD_HIDDEN) {
   846                      lkm_seeker_send_to_userspace(mod, &flags, p);
   847                  } else if (ret == HID_MOD_RACE_CONDITION) {
   848                      return ret;
   849                  }
   850              }
   851          }
   852  
   853          pos = n;
   854          n = list_next_entry_ebpf(n, entry);
   855      }
   856  
   857      return HID_MOD_UNCOMPLETED_ITERATIONS;
   858  }
   859  
   860  BPF_QUEUE(walk_mod_tree_queue, rb_node_t, MAX_NUM_MODULES); // used to walk a rb tree
   861  
   862  statfunc struct latch_tree_node *__lt_from_rb(struct rb_node *node, int idx)
   863  {
   864      return container_of(node, struct latch_tree_node, node[idx]);
   865  }
   866  
   867  statfunc int walk_mod_tree(program_data_t *p, struct rb_node *root, int idx)
   868  {
   869      struct latch_tree_node *ltn;
   870      struct module *mod;
   871      struct rb_node *curr = root;
   872      u32 flags = MOD_TREE | HIDDEN_MODULE;
   873  
   874  #pragma unroll
   875      for (int i = 0; i < MAX_NUM_MODULES; i++) {
   876          if (curr != NULL) {
   877              rb_node_t rb_nod = {.node = curr};
   878              bpf_map_push_elem(&walk_mod_tree_queue, &rb_nod, BPF_EXIST);
   879  
   880              curr = BPF_CORE_READ(curr, rb_left); // Move left
   881          } else {
   882              rb_node_t rb_nod;
   883              if (bpf_map_pop_elem(&walk_mod_tree_queue, &rb_nod) != 0) {
   884                  return 0; // Finished iterating
   885              } else {
   886                  curr = rb_nod.node;
   887                  ltn = __lt_from_rb(curr, idx);
   888                  mod = BPF_CORE_READ(container_of(ltn, struct mod_tree_node, node), mod);
   889  
   890                  int ret = is_hidden((u64) mod);
   891                  if (ret == MOD_HIDDEN) {
   892                      lkm_seeker_send_to_userspace(mod, &flags, p);
   893                  } else if (ret == HID_MOD_RACE_CONDITION) {
   894                      return ret;
   895                  }
   896  
   897                  /* We have visited the node and its left subtree.
   898                  Now it is the right subtree's turn. */
   899                  curr = BPF_CORE_READ(curr, rb_right);
   900              }
   901          }
   902      }
   903  
   904      return HID_MOD_UNCOMPLETED_ITERATIONS;
   905  }
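
// Traversal note: each rb_node is pushed exactly once (while descending left)
// and popped exactly once (before visiting and moving right), so the whole
// latched tree is covered even though the loop is bounded by MAX_NUM_MODULES
// rather than by tree depth.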
   906  
   907  struct mod_tree_root {
   908      struct latch_tree_root root;
   909  };
   910  
   911  statfunc int find_modules_from_mod_tree(program_data_t *p)
   912  {
   913      char mod_tree_sym[9] = "mod_tree";
   914      struct mod_tree_root *m_tree = (struct mod_tree_root *) get_symbol_addr(mod_tree_sym);
   915      unsigned int seq;
   916  
   917      if (bpf_core_field_exists(m_tree->root.seq.sequence)) {
   918          seq = BPF_CORE_READ(m_tree, root.seq.sequence); // below 5.10
   919      } else {
   920          seq = BPF_CORE_READ(m_tree, root.seq.seqcount.sequence); // version >= v5.10
   921      }
   922  
   923      struct rb_node *node = BPF_CORE_READ(m_tree, root.tree[seq & 1].rb_node);
   924  
   925      return walk_mod_tree(p, node, seq & 1);
   926  }
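
// A latch_tree_root keeps two copies of the tree and flips between them under
// the sequence counter; reading tree[seq & 1] picks the currently stable copy,
// mirroring what the kernel's latch tree readers do.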
   927  
   928  static __always_inline u64 check_new_mods_only(program_data_t *p)
   929  {
   930      struct module *pos, *n;
   931      u64 start_scan_time = bpf_ktime_get_ns();
   932      char modules_sym[8] = "modules";
   933      kernel_new_mod_t *new_mod;
   934      u64 mod_addr;
   935      struct list_head *head = (struct list_head *) get_symbol_addr(modules_sym);
   936  
   937      pos = list_first_entry_ebpf(head, typeof(*pos), list);
   938      n = pos;
   939  
   940  #pragma unroll
   941      for (int i = 0; i < MAX_NUM_MODULES; i++) {
   942          pos = n;
   943          n = list_next_entry_ebpf(n, list);
   944          if (&pos->list == head) {
   945              return start_scan_time; // To be used in userspace
   946          }
   947  
   948          mod_addr = (u64) pos;
   949          new_mod = bpf_map_lookup_elem(&new_module_map, &mod_addr);
   950          if (new_mod) {
   951              new_mod->last_seen_time = bpf_ktime_get_ns();
   952          }
   953      }
   954  
   955      return 0;
   956  }
   957  
   958  statfunc int check_is_proc_modules_hooked(program_data_t *p)
   959  {
   960      struct module *pos, *n;
   961      u64 mod_base_addr;
   962      char modules_sym[8] = "modules";
   963      struct list_head *head = (struct list_head *) get_symbol_addr(modules_sym);
   964      u32 flags = PROC_MODULES | HIDDEN_MODULE;
   965  
   966      pos = list_first_entry_ebpf(head, typeof(*pos), list);
   967      n = pos;
   968  
   969  #pragma unroll
   970      for (int i = 0; i < MAX_NUM_MODULES; i++) {
   971          pos = n;
   972          n = list_next_entry_ebpf(n, list);
   973          if (&pos->list == head) {
   974              return 0;
   975          }
   976  
   977          // Use the start of the module's memory area as the address, since
   978          // this is what /proc/modules reports.
   979          if (bpf_core_field_exists(pos->mem)) { // Version >= v6.4
   980              mod_base_addr = (u64) BPF_CORE_READ(pos, mem[MOD_TEXT].base);
   981          } else {
   982              struct module___older_v64 *old_mod = (void *) pos;
   983              mod_base_addr = (u64) BPF_CORE_READ(old_mod, core_layout.base);
   984          }
   985  
   986          if (unlikely(mod_base_addr == 0)) { // Module memory was possibly tampered with - submit an error
   987              return HID_MOD_MEM_ZEROED;
   988          } else if (bpf_map_lookup_elem(&modules_map, &mod_base_addr) == NULL) {
   989              // Was any module inserted since we populated modules_map? If so,
   990              // don't report, as there is a possible race condition. Note that
   991              // this granularity (insertion of any module, and not just this
   992              // particular module) applies only to the /proc/modules logic:
   993              // since there's a context switch between userspace and kernel
   994              // space, it opens a window for more modules to get
   995              // inserted/deleted, and then the LRU size is not enough - modules
   996              // get evicted and we report a false positive. We don't really want
   997              // the init_shown_mods time, but the time the proc modules map was
   998              // filled (in userspace) - so assume it happened at most 2 seconds
   999              // prior to that.
  1000              if (start_scan_time_init_shown_mods - (2 * 1000000000) < last_module_insert_time) {
  1001                  return 0;
  1002              }
  1003  
  1004              // Module was not seen in proc modules and there was no recent insertion, report.
  1005              lkm_seeker_send_to_userspace(pos, &flags, p);
  1006          }
  1007      }
  1008  
  1009      return HID_MOD_UNCOMPLETED_ITERATIONS;
  1010  }
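
// module___older_v64 above is a CO-RE "flavor" (declared in vmlinux_flavors.h):
// libbpf strips the ___suffix during relocation, so the same object matches both
// the pre-v6.4 core_layout field and the newer mem[] array, with
// bpf_core_field_exists() selecting the branch at load time.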
  1011  
  1012  statfunc bool kern_ver_below_min_lkm(struct pt_regs *ctx)
  1013  {
  1014      // If we're below kernel version 5.2, propagate an error to userspace and return
  1015      if (!bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_sk_storage_get)) {
  1016          goto below_threshold;
  1017      }
  1018  
  1019      return false; // lkm seeker may run!
  1020  
  1021      goto below_threshold; // For compiler - avoid "unused label" warning
  1022  below_threshold:
  1023      tracee_log(ctx,
  1024                 BPF_LOG_LVL_ERROR,
  1025                 BPF_LOG_ID_UNSPEC,
  1026                 -1); // notify the user that the event logic isn't loaded even though it's requested
  1027      return true;
  1028  }
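
// The same enum-probing trick can gate other version-dependent logic. A hedged
// sketch (BPF_FUNC_ringbuf_output landed in v5.8; this exact probe is not used
// by this file):
#if 0
statfunc bool sketch_kern_has_ringbuf(void)
{
    return bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_ringbuf_output);
}
#endif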
  1029  
  1030  SEC("uprobe/lkm_seeker_submitter")
  1031  int uprobe_lkm_seeker_submitter(struct pt_regs *ctx)
  1032  {
  1033      // This check is to satisfy the verifier for kernels older than 5.2
  1034      if (kern_ver_below_min_lkm(ctx))
  1035          return 0;
  1036  
  1037      u64 mod_address = 0;
  1038      u64 received_flags = 0;
  1039  
  1040  #if defined(bpf_target_x86)
  1041      mod_address = ctx->bx;    // 1st arg
  1042      received_flags = ctx->cx; // 2nd arg
  1043  #elif defined(bpf_target_arm64)
  1044      mod_address = ctx->user_regs.regs[1];    // 1st arg
  1045      received_flags = ctx->user_regs.regs[2]; // 2nd arg
  1046  #else
  1047      return 0;
  1048  #endif
  1049  
  1050      program_data_t p = {};
  1051      if (!init_program_data(&p, ctx))
  1052          return 0;
  1053  
  1054      // Uprobes are not triggered by syscalls, so override the leftover syscall value.
  1055      p.event->context.syscall = NO_SYSCALL;
  1056  
  1057      u32 trigger_pid = bpf_get_current_pid_tgid() >> 32;
  1058      // Uprobe was triggered from another tracee instance
  1059      if (p.config->tracee_pid != trigger_pid)
  1060          return 0;
  1061  
  1062      u32 flags =
  1063          ((u32) received_flags) | HIDDEN_MODULE; // Convert to 32bit and turn on the bit that will
  1064                                                  // cause it to be sent as an event to the user
  1065      lkm_seeker_send_to_userspace((struct module *) mod_address, &flags, &p);
  1066  
  1067      return 0;
  1068  }
  1069  
  1070  // There are 2 types of scans:
  1071  // - Scan of modules that were loaded before tracee started: this is only done once, at the
  1072  // start of tracee
  1073  // - Scan of modules that were loaded after tracee started: runs periodically and on each new
  1074  // module insertion
  1075  SEC("uprobe/lkm_seeker")
  1076  int uprobe_lkm_seeker(struct pt_regs *ctx)
  1077  {
  1078      if (kern_ver_below_min_lkm(ctx))
  1079          return 0;
  1080  
  1081      program_data_t p = {};
  1082      if (!init_program_data(&p, ctx))
  1083          return 0;
  1084  
  1085      // Uprobes are not triggered by syscalls, so override the leftover syscall value.
  1086      p.event->context.syscall = NO_SYSCALL;
  1087  
  1088      // uprobe was triggered from another tracee instance
  1089      if (p.config->tracee_pid != p.task_info->context.pid &&
  1090          p.config->tracee_pid != p.task_info->context.host_pid) {
  1091          return 0;
  1092      }
  1093  
  1094      start_scan_time_init_shown_mods = bpf_ktime_get_ns();
  1095      int ret = init_shown_modules();
  1096      if (ret != 0) {
  1097          tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret);
  1098          return 1;
  1099      }
  1100  
  1101      // On first run, do a scan only relevant for modules that were inserted before tracee started.
  1102      if (unlikely(!hidden_old_mod_scan_done)) {
  1103          hidden_old_mod_scan_done = true;
  1104          bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_KSET);
  1105          return -1;
  1106      }
  1107  
  1108      bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_PROC);
  1109  
  1110      return -1;
  1111  }
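
// The resulting tail-call pipeline, as wired above and in the *_tail programs
// below:
//   first run:  lkm_seeker -> KSET -> MOD_TREE -> PROC -> NEW_MOD_ONLY
//   later runs: lkm_seeker -> PROC -> NEW_MOD_ONLY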
  1112  
  1113  SEC("uprobe/lkm_seeker_kset_tail")
  1114  int lkm_seeker_kset_tail(struct pt_regs *ctx)
  1115  {
  1116      // This check is to satisfy the verifier for kernels older than 5.2
  1117      // as in runtime we'll never get here (the tail call doesn't happen)
  1118      if (kern_ver_below_min_lkm(ctx))
  1119          return 0;
  1120  
  1121      program_data_t p = {};
  1122      if (!init_tailcall_program_data(&p, ctx))
  1123          return -1;
  1124  
  1125      int ret = find_modules_from_module_kset_list(&p);
  1126      if (ret < 0) {
  1127          tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret);
  1128          return -1;
  1129      }
  1130  
  1131      bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_MOD_TREE);
  1132  
  1133      return -1;
  1134  }
  1135  
  1136  SEC("uprobe/lkm_seeker_mod_tree_tail")
  1137  int lkm_seeker_mod_tree_tail(struct pt_regs *ctx)
  1138  {
  1139      // This check is to satisfy the verifier for kernels older than 5.2
  1140      // as in runtime we'll never get here (the tail call doesn't happen)
  1141      if (kern_ver_below_min_lkm(ctx))
  1142          return 0;
  1143  
  1144      program_data_t p = {};
  1145      if (!init_tailcall_program_data(&p, ctx))
  1146          return -1;
  1147  
  1148      // This method is efficient only when the kernel is compiled with
  1149      // CONFIG_MODULES_TREE_LOOKUP=y
  1150      int ret = find_modules_from_mod_tree(&p);
  1151      if (ret < 0) {
  1152          tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret);
  1153          return -1;
  1154      }
  1155  
  1156      bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_PROC);
  1157  
  1158      return -1;
  1159  }
  1160  
  1161  SEC("uprobe/lkm_seeker_proc_tail")
  1162  int lkm_seeker_proc_tail(struct pt_regs *ctx)
  1163  {
  1164      // This check is to satisfy the verifier for kernels older than 5.2
  1165      // as in runtime we'll never get here (the tail call doesn't happen)
  1166      if (kern_ver_below_min_lkm(ctx))
  1167          return 0;
  1168  
  1169      program_data_t p = {};
  1170      if (!init_tailcall_program_data(&p, ctx))
  1171          return -1;
  1172  
  1173      int ret = check_is_proc_modules_hooked(&p);
  1174      if (ret < 0) {
  1175          tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, ret);
  1176          return -1;
  1177      }
  1178  
  1179      bpf_tail_call(ctx, &prog_array, TAIL_HIDDEN_KERNEL_MODULE_NEW_MOD_ONLY);
  1180  
  1181      return -1;
  1182  }
  1183  
  1184  // We maintain a map of newly loaded modules. At times, we verify that each such module still
  1185  // appears in the modules list. If it does not (and there was no valid deletion), then it's hidden.
  1186  SEC("uprobe/lkm_seeker_new_mod_only_tail")
  1187  int lkm_seeker_new_mod_only_tail(struct pt_regs *ctx)
  1188  {
  1189      // This check is to satisfy the verifier for kernels older than 5.2
  1190      // as in runtime we'll never get here (the tail call doesn't happen)
  1191      if (kern_ver_below_min_lkm(ctx))
  1192          return 0;
  1193  
  1194      program_data_t p = {};
  1195      if (!init_tailcall_program_data(&p, ctx))
  1196          return -1;
  1197  
  1198      u64 start_scan_time = check_new_mods_only(&p);
  1199      if (start_scan_time == 0) {
  1200          tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_HID_KER_MOD, HID_MOD_UNCOMPLETED_ITERATIONS);
  1201          return 1;
  1202      }
  1203  
  1204      struct module *mod =
  1205          (struct module *) start_scan_time; // Smuggle start_scan_time out via the module address arg
  1206      u32 flags = NEW_MOD;
  1207      lkm_seeker_send_to_userspace(mod, &flags, &p);
  1208  
  1209      return 0;
  1210  }
  1211  
  1212  // clang-format off
  1213  
  1214  // trace/events/sched.h: TP_PROTO(struct task_struct *p, pid_t old_pid, struct linux_binprm *bprm)
  1215  SEC("raw_tracepoint/sched_process_exec")
  1216  int tracepoint__sched__sched_process_exec(struct bpf_raw_tracepoint_args *ctx)
  1217  {
  1218      program_data_t p = {};
  1219      if (!init_program_data(&p, ctx)) {
  1220          return 0;
  1221      }
  1222  
  1223      // Perform checks below before should_trace(), so tracee can filter by newly created containers
  1224      // or processes. Assume that a new container, or pod, has started when a process in a newly
  1225      // created cgroup and mount ns executes a binary.
  1226  
  1227      if (p.task_info->container_state == CONTAINER_CREATED) {
  1228          u32 mntns = get_task_mnt_ns_id(p.task);
  1229          struct task_struct *parent = get_parent_task(p.task);
  1230          u32 parent_mntns = get_task_mnt_ns_id(parent);
  1231          if (mntns != parent_mntns) {
  1232              u32 cgroup_id_lsb = p.event->context.task.cgroup_id;
  1233              u8 state = CONTAINER_STARTED;
  1234              bpf_map_update_elem(&containers_map, &cgroup_id_lsb, &state, BPF_ANY);
  1235              p.task_info->container_state = state;
  1236              p.event->context.task.flags |= CONTAINER_STARTED_FLAG; // change for current event
  1237              p.task_info->context.flags |= CONTAINER_STARTED_FLAG;  // change for future task events
  1238          }
  1239      }
  1240  
  1241      struct linux_binprm *bprm = (struct linux_binprm *) ctx->args[2];
  1242      if (bprm == NULL) {
  1243          return -1;
  1244      }
  1245      struct file *file = get_file_ptr_from_bprm(bprm);
  1246      void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  1247  
  1248      proc_info_t *proc_info = p.proc_info;
  1249      proc_info->new_proc = true; // task has started after tracee started running
  1250  
  1251      // extract the binary name to be used in should_trace
  1252      __builtin_memset(proc_info->binary.path, 0, MAX_BIN_PATH_SIZE);
  1253      bpf_probe_read_kernel_str(proc_info->binary.path, MAX_BIN_PATH_SIZE, file_path);
  1254      proc_info->binary.mnt_id = p.event->context.task.mnt_id;
  1255  
  1256      if (!should_trace(&p)) {
  1257          return 0;
  1258      }
  1259  
  1260      proc_info->follow_in_scopes = p.event->context.matched_policies; // follow task for matched scopes
  1261  
  1262      if (!should_submit(SCHED_PROCESS_EXEC, p.event)) {
  1263          return 0;
  1264      }
  1265  
  1266      // Note: From v5.9+, there are two interesting fields in bprm that could be added:
  1267      // 1. struct file *executable: the executable name passed to an interpreter
  1268      // 2. fdpath: generated filename for execveat (after resolving dirfd)
  1269  
  1270      const char *filename = get_binprm_filename(bprm);
  1271      dev_t s_dev = get_dev_from_file(file);
  1272      unsigned long inode_nr = get_inode_nr_from_file(file);
  1273      u64 ctime = get_ctime_nanosec_from_file(file);
  1274      umode_t inode_mode = get_inode_mode_from_file(file);
  1275  
  1276      save_str_to_buf(&p.event->args_buf, (void *) filename, 0);
  1277      save_str_to_buf(&p.event->args_buf, file_path, 1);
  1278      save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 2);
  1279      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 3);
  1280      save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4);
  1281      save_to_submit_buf(&p.event->args_buf, &inode_mode, sizeof(umode_t), 5);
  1282  
  1283      // NOTES:
  1284      // - interp is the real interpreter (sh, bash, python, perl, ...)
  1285      // - interpreter is the binary interpreter (ld.so), also known as the loader
  1286      // - interpreter might be the same as the executable (in which case there is no separate interpreter)
  1287  
  1288      // Check if there is an interpreter and if it is different from the executable:
  1289  
  1290      bool itp_inode_exists = proc_info->interpreter.id.inode != 0;
  1291      bool itp_dev_diff = proc_info->interpreter.id.device != s_dev;
  1292      bool itp_inode_diff = proc_info->interpreter.id.inode != inode_nr;
  1293  
  1294      if (itp_inode_exists && (itp_dev_diff || itp_inode_diff)) {
  1295          save_str_to_buf(&p.event->args_buf, &proc_info->interpreter.pathname, 6);                    // interpreter path
  1296          save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 7); // interpreter device number
  1297          save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.inode, sizeof(u64), 8);    // interpreter inode number
  1298          save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.ctime, sizeof(u64), 9);    // interpreter changed time
  1299      }
  1300  
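            // Continue in sched_process_exec_event_submit_tail below, which gathers argv (and
            // optionally envp) from the new mm and submits the event from a separate program.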
  1301      bpf_tail_call(ctx, &prog_array_tp, TAIL_SCHED_PROCESS_EXEC_EVENT_SUBMIT);
  1302  
  1303      return 0;
  1304  }
  1305  
  1306  // clang-format on
  1307  
  1308  SEC("raw_tracepoint/sched_process_exec_event_submit_tail")
  1309  int sched_process_exec_event_submit_tail(struct bpf_raw_tracepoint_args *ctx)
  1310  {
  1311      program_data_t p = {};
  1312      if (!init_tailcall_program_data(&p, ctx))
  1313          return -1;
  1314  
  1315      struct task_struct *task = (struct task_struct *) ctx->args[0];
  1316      struct linux_binprm *bprm = (struct linux_binprm *) ctx->args[2];
  1317  
  1318      if (bprm == NULL)
  1319          return -1;
  1320  
  1321      // bprm->mm is NULL at this point: begin_new_exec() already handed the new mm to the task, so task->mm is initialized
  1322      struct mm_struct *mm = get_mm_from_task(task);
  1323  
  1324      unsigned long arg_start, arg_end;
  1325      arg_start = get_arg_start_from_mm(mm);
  1326      arg_end = get_arg_end_from_mm(mm);
  1327      int argc = get_argc_from_bprm(bprm);
  1328  
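            // Inspect stdin of the exec'ing task: the file type (S_IFMT bits) and path reveal,
            // for example, whether stdin was redirected from a pipe, socket or regular file.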
  1329      struct file *stdin_file = get_struct_file_from_fd(0);
  1330      unsigned short stdin_type = get_inode_mode_from_file(stdin_file) & S_IFMT;
  1331      void *stdin_path = get_path_str(__builtin_preserve_access_index(&stdin_file->f_path));
  1332      const char *interp = get_binprm_interp(bprm);
  1333  
  1334      int invoked_from_kernel = 0;
  1335      if (get_task_parent_flags(task) & PF_KTHREAD) {
  1336          invoked_from_kernel = 1;
  1337      }
  1338  
  1339      save_args_str_arr_to_buf(&p.event->args_buf, (void *) arg_start, (void *) arg_end, argc, 10);
  1340      save_str_to_buf(&p.event->args_buf, (void *) interp, 11);
  1341      save_to_submit_buf(&p.event->args_buf, &stdin_type, sizeof(unsigned short), 12);
  1342      save_str_to_buf(&p.event->args_buf, stdin_path, 13);
  1343      save_to_submit_buf(&p.event->args_buf, &invoked_from_kernel, sizeof(int), 14);
  1344      if (p.config->options & OPT_EXEC_ENV) {
  1345          unsigned long env_start, env_end;
  1346          env_start = get_env_start_from_mm(mm);
  1347          env_end = get_env_end_from_mm(mm);
  1348          int envc = get_envc_from_bprm(bprm);
  1349  
  1350          save_args_str_arr_to_buf(
  1351              &p.event->args_buf, (void *) env_start, (void *) env_end, envc, 15);
  1352      }
  1353  
  1354      events_perf_submit(&p, SCHED_PROCESS_EXEC, 0);
  1355      return 0;
  1356  }
  1357  
  1358  // trace/events/sched.h: TP_PROTO(struct task_struct *p)
  1359  SEC("raw_tracepoint/sched_process_exit")
  1360  int tracepoint__sched__sched_process_exit(struct bpf_raw_tracepoint_args *ctx)
  1361  {
  1362      program_data_t p = {};
  1363      if (!init_program_data(&p, ctx))
  1364          return 0;
  1365  
  1366      // evaluate should_trace before removing this pid from the maps
  1367      bool traced = !!should_trace(&p);
  1368  
  1369      bpf_map_delete_elem(&task_info_map, &p.event->context.task.host_tid);
  1370  
  1371      bool group_dead = false;
  1372      struct task_struct *task = p.task;
  1373      struct signal_struct *signal = BPF_CORE_READ(task, signal);
  1374      atomic_t live = BPF_CORE_READ(signal, live);
  1375      // This check may hold for more than one exiting thread if the live-thread count had
  1376      // already reached 0 when their hooks fired, e.g. when several threads of the group
  1377      // exit simultaneously on different CPUs.
  1378      if (live.counter == 0) {
  1379          group_dead = true;
  1380      }
  1381  
  1382      bool oom_killed = false;
  1383  
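            // An entry in oom_info keyed by our host pid means the OOM killer picked this task;
            // the map is presumably populated by the OOM-tracking probe elsewhere in this file.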
  1384      if (bpf_map_lookup_elem(&oom_info, &p.task_info->context.host_pid)) {
  1385          oom_killed = true;
  1386          bpf_map_delete_elem(&oom_info, &p.task_info->context.host_pid);
  1387      }
  1388  
  1389      if (!traced)
  1390          return 0;
  1391  
  1392      long exit_code = get_task_exit_code(p.task);
  1393  
  1394      if (oom_killed) {
  1395          if (should_submit(PROCESS_OOM_KILLED, p.event)) {
  1396              save_to_submit_buf(&p.event->args_buf, (void *) &exit_code, sizeof(long), 0);
  1397              save_to_submit_buf(&p.event->args_buf, (void *) &group_dead, sizeof(bool), 1);
  1398  
  1399              events_perf_submit(&p, PROCESS_OOM_KILLED, 0);
  1400          }
  1401  
  1402          return 0;
  1403      }
  1404  
  1405      if (should_submit(SCHED_PROCESS_EXIT, p.event)) {
  1406          save_to_submit_buf(&p.event->args_buf, (void *) &exit_code, sizeof(long), 0);
  1407          save_to_submit_buf(&p.event->args_buf, (void *) &group_dead, sizeof(bool), 1);
  1408  
  1409          events_perf_submit(&p, SCHED_PROCESS_EXIT, 0);
  1410      }
  1411  
  1412      return 0;
  1413  }
  1414  
  1415  // trace/events/sched.h: TP_PROTO(struct task_struct *p)
  1416  SEC("raw_tracepoint/sched_process_free")
  1417  int tracepoint__sched__sched_process_free(struct bpf_raw_tracepoint_args *ctx)
  1418  {
  1419      struct task_struct *task = (struct task_struct *) ctx->args[0];
  1420  
  1421      int pid = get_task_host_pid(task);
  1422      int tgid = get_task_host_tgid(task);
  1423  
  1424      if (pid == tgid) {
  1425          // we only care about process (and not thread) exit
  1426          // if tgid task is freed, we know for sure that the process exited
  1427          // so we can safely remove it from the process map
  1428          bpf_map_delete_elem(&proc_info_map, &tgid);
  1429  
  1430          u32 zero = 0;
  1431          config_entry_t *cfg = bpf_map_lookup_elem(&config_map, &zero);
  1432          if (unlikely(cfg == NULL)) {
  1433              return 0;
  1434          }
  1435  
  1436          bpf_map_delete_elem(&process_tree_map, &tgid);
  1437      }
  1438  
  1439      return 0;
  1440  }
  1441  
  1442  SEC("raw_tracepoint/syscall__accept4")
  1443  int syscall__accept4(void *ctx)
  1444  {
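            // This tail program runs when accept/accept4 returns. The socket pointers and sockfd
            // were stashed by trace_security_socket_accept via save_args(SOCKET_ACCEPT).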
  1445      args_t saved_args;
  1446      if (load_args(&saved_args, SOCKET_ACCEPT) != 0) {
  1447          // missed entry or not traced
  1448          return 0;
  1449      }
  1450      del_args(SOCKET_ACCEPT);
  1451  
  1452      program_data_t p = {};
  1453      if (!init_program_data(&p, ctx))
  1454          return 0;
  1455  
  1456      struct socket *old_sock = (struct socket *) saved_args.args[0];
  1457      struct socket *new_sock = (struct socket *) saved_args.args[1];
  1458      u32 sockfd = (u32) saved_args.args[2];
  1459  
  1460      if (new_sock == NULL) {
  1461          return -1;
  1462      }
  1463      if (old_sock == NULL) {
  1464          return -1;
  1465      }
  1466  
  1467      reset_event_args(&p);
  1468      save_to_submit_buf(&p.event->args_buf, (void *) &sockfd, sizeof(u32), 0);
  1469      save_sockaddr_to_buf(&p.event->args_buf, old_sock, 1);
  1470      save_sockaddr_to_buf(&p.event->args_buf, new_sock, 2);
  1471  
  1472      return events_perf_submit(&p, SOCKET_ACCEPT, 0);
  1473  }
  1474  
  1475  // trace/events/sched.h: TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
  1476  SEC("raw_tracepoint/sched_switch")
  1477  int tracepoint__sched__sched_switch(struct bpf_raw_tracepoint_args *ctx)
  1478  {
  1479      program_data_t p = {};
  1480      if (!init_program_data(&p, ctx))
  1481          return 0;
  1482  
  1483      if (!should_trace(&p))
  1484          return 0;
  1485  
  1486      if (!should_submit(SCHED_SWITCH, p.event))
  1487          return 0;
  1488  
  1489      struct task_struct *prev = (struct task_struct *) ctx->args[1];
  1490      struct task_struct *next = (struct task_struct *) ctx->args[2];
  1491      int prev_pid = get_task_host_pid(prev);
  1492      int next_pid = get_task_host_pid(next);
  1493      int cpu = bpf_get_smp_processor_id();
  1494  
  1495      save_to_submit_buf(&p.event->args_buf, (void *) &cpu, sizeof(int), 0);
  1496      save_to_submit_buf(&p.event->args_buf, (void *) &prev_pid, sizeof(int), 1);
  1497      save_str_to_buf(&p.event->args_buf, prev->comm, 2);
  1498      save_to_submit_buf(&p.event->args_buf, (void *) &next_pid, sizeof(int), 3);
  1499      save_str_to_buf(&p.event->args_buf, next->comm, 4);
  1500  
  1501      return events_perf_submit(&p, SCHED_SWITCH, 0);
  1502  }
  1503  
  1504  SEC("kprobe/filldir64")
  1505  int BPF_KPROBE(trace_filldir64)
  1506  {
  1507      program_data_t p = {};
  1508      if (!init_program_data(&p, ctx))
  1509          return 0;
  1510  
  1511      if (!should_trace(&p))
  1512          return 0;
  1513  
  1514      if (!should_submit(HIDDEN_INODES, p.event))
  1515          return 0;
  1516  
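            // filldir64 is invoked for every directory entry during getdents. An entry whose
            // inode number is 0 but whose name is still present may have been hidden by a
            // rootkit, so its name is reported as a HIDDEN_INODES event.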
  1517      char *process_name = (char *) PT_REGS_PARM2(ctx);
  1518      unsigned long process_inode_number = (unsigned long) PT_REGS_PARM5(ctx);
  1519      if (process_inode_number == 0) {
  1520          save_str_to_buf(&p.event->args_buf, process_name, 0);
  1521          return events_perf_submit(&p, HIDDEN_INODES, 0);
  1522      }
  1523      return 0;
  1524  }
  1525  
  1526  SEC("kprobe/call_usermodehelper")
  1527  int BPF_KPROBE(trace_call_usermodehelper)
  1528  {
  1529      program_data_t p = {};
  1530      if (!init_program_data(&p, ctx))
  1531          return 0;
  1532  
  1533      if (!should_trace(&p))
  1534          return 0;
  1535  
  1536      if (!should_submit(CALL_USERMODE_HELPER, p.event))
  1537          return 0;
  1538  
  1539      void *path = (void *) PT_REGS_PARM1(ctx);
  1540      unsigned long argv = PT_REGS_PARM2(ctx);
  1541      unsigned long envp = PT_REGS_PARM3(ctx);
  1542      int wait = PT_REGS_PARM4(ctx);
  1543  
  1544      save_str_to_buf(&p.event->args_buf, path, 0);
  1545      save_str_arr_to_buf(&p.event->args_buf, (const char *const *) argv, 1);
  1546      save_str_arr_to_buf(&p.event->args_buf, (const char *const *) envp, 2);
  1547      save_to_submit_buf(&p.event->args_buf, (void *) &wait, sizeof(int), 3);
  1548  
  1549      return events_perf_submit(&p, CALL_USERMODE_HELPER, 0);
  1550  }
  1551  
  1552  SEC("kprobe/do_exit")
  1553  int BPF_KPROBE(trace_do_exit)
  1554  {
  1555      program_data_t p = {};
  1556      if (!init_program_data(&p, ctx))
  1557          return 0;
  1558  
  1559      if (!should_trace(&p))
  1560          return 0;
  1561  
  1562      if (!should_submit(DO_EXIT, p.event))
  1563          return 0;
  1564  
  1565      long code = PT_REGS_PARM1(ctx);
  1566  
  1567      return events_perf_submit(&p, DO_EXIT, code);
  1568  }
  1569  
  1570  SEC("uprobe/trigger_seq_ops_event")
  1571  int uprobe_seq_ops_trigger(struct pt_regs *ctx)
  1572  {
  1573      u64 caller_ctx_id = 0;
  1574      u64 *address_array = NULL;
  1575      u64 struct_address = 0;
  1576  
  1577      // clang-format off
  1578      //
  1579      // Golang calling convention per architecture
  1580  
  1581      #if defined(bpf_target_x86)
  1582          caller_ctx_id = ctx->bx;                // 1st arg
  1583          address_array = ((void *) ctx->sp + 8); // 2nd arg
  1584      #elif defined(bpf_target_arm64)
  1585          caller_ctx_id = ctx->user_regs.regs[1]; // 1st arg
  1586          address_array = ((void *) ctx->sp + 8); // 2nd arg
  1587  
  1588      #else
  1589          return 0;
  1590      #endif
  1591      // clang-format on
  1592  
  1593      program_data_t p = {};
  1594      if (!init_program_data(&p, ctx))
  1595          return 0;
  1596  
  1597      // Uprobes do not run in syscall context, so override the default value with NO_SYSCALL.
  1598      p.event->context.syscall = NO_SYSCALL;
  1599  
  1600      // Bail out if the uprobe was triggered by another tracee instance.
  1601      if (p.config->tracee_pid != p.task_info->context.pid &&
  1602          p.config->tracee_pid != p.task_info->context.host_pid)
  1603          return 0;
  1604  
  1605      void *stext_addr = get_stext_addr();
  1606      if (unlikely(stext_addr == NULL))
  1607          return 0;
  1608      void *etext_addr = get_etext_addr();
  1609      if (unlikely(etext_addr == NULL))
  1610          return 0;
  1611  
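            // Legitimate handlers live inside the kernel text section [stext, etext). Below,
            // in-range addresses are zeroed out, so only suspicious (hooked) seq_ops pointers,
            // e.g. from out-of-tree modules or injected code, reach userspace.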
  1612      u32 count_off = p.event->args_buf.offset + 1;
  1613      save_u64_arr_to_buf(&p.event->args_buf, NULL, 0, 0); // init u64 array with size 0
  1614  
  1615  #pragma unroll
  1616      for (int i = 0; i < NET_SEQ_OPS_TYPES; i++) {
  1617          bpf_probe_read_user(&struct_address, 8, (address_array + i));
  1618          struct seq_operations *seq_ops = (struct seq_operations *) struct_address;
  1619  
  1620          u64 show_addr = (u64) BPF_CORE_READ(seq_ops, show);
  1621          if (show_addr == 0)
  1622              return 0;
  1623          if (show_addr >= (u64) stext_addr && show_addr < (u64) etext_addr)
  1624              show_addr = 0;
  1625  
  1626          u64 start_addr = (u64) BPF_CORE_READ(seq_ops, start);
  1627          if (start_addr == 0)
  1628              return 0;
  1629          if (start_addr >= (u64) stext_addr && start_addr < (u64) etext_addr)
  1630              start_addr = 0;
  1631  
  1632          u64 next_addr = (u64) BPF_CORE_READ(seq_ops, next);
  1633          if (next_addr == 0)
  1634              return 0;
  1635          if (next_addr >= (u64) stext_addr && next_addr < (u64) etext_addr)
  1636              next_addr = 0;
  1637  
  1638          u64 stop_addr = (u64) BPF_CORE_READ(seq_ops, stop);
  1639          if (stop_addr == 0)
  1640              return 0;
  1641          if (stop_addr >= (u64) stext_addr && stop_addr < (u64) etext_addr)
  1642              stop_addr = 0;
  1643  
  1644          u64 seq_ops_addresses[NET_SEQ_OPS_SIZE + 1] = {show_addr, start_addr, next_addr, stop_addr};
  1645  
  1646          add_u64_elements_to_buf(&p.event->args_buf, (const u64 *) seq_ops_addresses, 4, count_off);
  1647      }
  1648  
  1649      save_to_submit_buf(&p.event->args_buf, (void *) &caller_ctx_id, sizeof(uint64_t), 1);
  1650      events_perf_submit(&p, PRINT_NET_SEQ_OPS, 0);
  1651      return 0;
  1652  }
  1653  
  1654  SEC("uprobe/trigger_mem_dump_event")
  1655  int uprobe_mem_dump_trigger(struct pt_regs *ctx)
  1656  {
  1657      u64 address = 0;
  1658      u64 size = 0;
  1659      u64 caller_ctx_id = 0;
  1660  
  1661  #if defined(bpf_target_x86)
  1662      address = ctx->bx;       // 1st arg
  1663      size = ctx->cx;          // 2nd arg
  1664      caller_ctx_id = ctx->di; // 3rd arg
  1665  #elif defined(bpf_target_arm64)
  1666      address = ctx->user_regs.regs[1];        // 1st arg
  1667      size = ctx->user_regs.regs[2];           // 2nd arg
  1668      caller_ctx_id = ctx->user_regs.regs[3];  // 3rd arg
  1669  #else
  1670      return 0;
  1671  #endif
  1672  
  1673      program_data_t p = {};
  1674      if (!init_program_data(&p, ctx))
  1675          return 0;
  1676  
  1677      // Uprobes do not run in syscall context, so override the default value with NO_SYSCALL.
  1678      p.event->context.syscall = NO_SYSCALL;
  1679  
  1680      // Bail out if the uprobe was triggered by another tracee instance.
  1681      if (p.config->tracee_pid != p.task_info->context.pid &&
  1682          p.config->tracee_pid != p.task_info->context.host_pid)
  1683          return 0;
  1684  
  1685      if (size == 0) // size is unsigned, so only the zero case needs rejecting
  1686          return 0;
  1687  
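        // The read length is masked with MAX_MEM_DUMP_SIZE to give the verifier a bound;
        // this assumes the constant is an all-ones bitmask (2^n - 1).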
  1688      int ret = save_bytes_to_buf(&p.event->args_buf, (void *) address, size & MAX_MEM_DUMP_SIZE, 0);
  1689      // log and bail out if the memory read failed
  1690      if (ret == 0) {
  1691          tracee_log(ctx, BPF_LOG_LVL_ERROR, BPF_LOG_ID_MEM_READ, ret);
  1692          return 0;
  1693      }
  1694      save_to_submit_buf(&p.event->args_buf, (void *) &address, sizeof(void *), 1);
  1695      save_to_submit_buf(&p.event->args_buf, &size, sizeof(u64), 2);
  1696      save_to_submit_buf(&p.event->args_buf, &caller_ctx_id, sizeof(u64), 3);
  1697  
  1698      return events_perf_submit(&p, PRINT_MEM_DUMP, 0);
  1699  }
  1700  
  1701  statfunc struct trace_kprobe *get_trace_kprobe_from_trace_probe(void *tracep)
  1702  {
  1703      struct trace_kprobe *tracekp =
  1704          (struct trace_kprobe *) container_of(tracep, struct trace_kprobe, tp);
  1705  
  1706      return tracekp;
  1707  }
  1708  
  1709  statfunc struct trace_uprobe *get_trace_uprobe_from_trace_probe(void *tracep)
  1710  {
  1711      struct trace_uprobe *traceup =
  1712          (struct trace_uprobe *) container_of(tracep, struct trace_uprobe, tp);
  1713  
  1714      return traceup;
  1715  }
  1716  
  1717  // This function returns a pointer to struct trace_probe from struct trace_event_call.
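        // On older kernels (flavored here as trace_probe___v53) trace_event_call is embedded
        // directly in struct trace_probe; on newer kernels it lives in struct trace_probe_event,
        // and the first probe is taken from that event's probes list.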
  1718  statfunc void *get_trace_probe_from_trace_event_call(struct trace_event_call *call)
  1719  {
  1720      void *tracep_ptr;
  1721  
  1722      struct trace_probe___v53 *legacy_tracep;
  1723      if (bpf_core_field_exists(legacy_tracep->call)) {
  1724          tracep_ptr = container_of(call, struct trace_probe___v53, call);
  1725      } else {
  1726          struct trace_probe_event *tpe = container_of(call, struct trace_probe_event, call);
  1727          struct list_head probes = BPF_CORE_READ(tpe, probes);
  1728          tracep_ptr = container_of(probes.next, struct trace_probe, list);
  1729      }
  1730  
  1731      return tracep_ptr;
  1732  }
  1733  
  1734  enum bpf_attach_type_e {
  1735      BPF_RAW_TRACEPOINT,
  1736      PERF_TRACEPOINT,
  1737      PERF_KPROBE,
  1738      PERF_KRETPROBE,
  1739      PERF_UPROBE,
  1740      PERF_URETPROBE
  1741  };
  1742  
  1743  statfunc int send_bpf_attach(
  1744      program_data_t *p, struct bpf_prog *prog, void *event_name, u64 probe_addr, int perf_type)
  1745  {
  1746      if (!should_submit(BPF_ATTACH, p->event)) {
  1747          return 0;
  1748      }
  1749  
  1750      // get bpf prog details
  1751  
  1752      int prog_type = BPF_CORE_READ(prog, type);
  1753      struct bpf_prog_aux *prog_aux = BPF_CORE_READ(prog, aux);
  1754      u32 prog_id = BPF_CORE_READ(prog_aux, id);
  1755      char prog_name[BPF_OBJ_NAME_LEN];
  1756      bpf_probe_read_kernel_str(&prog_name, BPF_OBJ_NAME_LEN, prog_aux->name);
  1757  
  1758      // get usage of helpers
  1759      bpf_used_helpers_t *val = bpf_map_lookup_elem(&bpf_attach_map, &prog_id);
  1760      if (val == NULL)
  1761          return 0;
  1762  
  1763      // submit the event
  1764  
  1765      save_to_submit_buf(&(p->event->args_buf), &prog_type, sizeof(int), 0);
  1766      save_str_to_buf(&(p->event->args_buf), (void *) &prog_name, 1);
  1767      save_to_submit_buf(&(p->event->args_buf), &prog_id, sizeof(u32), 2);
  1768      save_u64_arr_to_buf(&(p->event->args_buf), (const u64 *) val->helpers, 4, 3);
  1769      save_str_to_buf(&(p->event->args_buf), event_name, 4);
  1770      save_to_submit_buf(&(p->event->args_buf), &probe_addr, sizeof(u64), 5);
  1771      save_to_submit_buf(&(p->event->args_buf), &perf_type, sizeof(int), 6);
  1772  
  1773      events_perf_submit(p, BPF_ATTACH, 0);
  1774  
  1775      // delete from map
  1776      bpf_map_delete_elem(&bpf_attach_map, &prog_id);
  1777  
  1778      return 0;
  1779  }
  1780  
  1781  // Inspired by bpf_get_perf_event_info() kernel func.
  1782  // https://elixir.bootlin.com/linux/v5.19.2/source/kernel/trace/bpf_trace.c#L2123
  1783  statfunc int
  1784  send_bpf_perf_attach(program_data_t *p, struct file *bpf_prog_file, struct file *perf_event_file)
  1785  {
  1786      if (!should_submit(BPF_ATTACH, p->event)) {
  1787          return 0;
  1788      }
  1789  
  1790      // Determine the real values of TRACE_EVENT_FL_KPROBE and TRACE_EVENT_FL_UPROBE:
  1791      // their bit positions changed in kernels >= 5.15.
  1792      int TRACE_EVENT_FL_KPROBE_BIT;
  1793      int TRACE_EVENT_FL_UPROBE_BIT;
  1794      if (bpf_core_field_exists(((struct trace_event_call *) 0)->module)) { // kernel >= 5.15
  1795          TRACE_EVENT_FL_KPROBE_BIT = 6;
  1796          TRACE_EVENT_FL_UPROBE_BIT = 7;
  1797      } else { // kernel < 5.15
  1798          TRACE_EVENT_FL_KPROBE_BIT = 5;
  1799          TRACE_EVENT_FL_UPROBE_BIT = 6;
  1800      }
  1801      int TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT);
  1802      int TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT);
  1803  
  1804      // get perf event details
  1805  
  1806  // clang-format off
  1807  #define MAX_PERF_EVENT_NAME ((MAX_PATH_PREF_SIZE > MAX_KSYM_NAME_SIZE) ? MAX_PATH_PREF_SIZE : MAX_KSYM_NAME_SIZE)
  1808  #define REQUIRED_SYSTEM_LENGTH 9
  1809      // clang-format on
  1810  
  1811      struct perf_event *event = (struct perf_event *) BPF_CORE_READ(perf_event_file, private_data);
  1812      struct trace_event_call *tp_event = BPF_CORE_READ(event, tp_event);
  1813      char event_name[MAX_PERF_EVENT_NAME];
  1814      u64 probe_addr = 0;
  1815      int perf_type;
  1816  
  1817      int flags = BPF_CORE_READ(tp_event, flags);
  1818  
  1819      // check if syscall_tracepoint
  1820      bool is_syscall_tracepoint = false;
  1821      struct trace_event_class *tp_class = BPF_CORE_READ(tp_event, class);
  1822      char class_system[REQUIRED_SYSTEM_LENGTH];
  1823      bpf_probe_read_kernel_str(
  1824          &class_system, REQUIRED_SYSTEM_LENGTH, BPF_CORE_READ(tp_class, system));
  1825      class_system[REQUIRED_SYSTEM_LENGTH - 1] = '\0';
  1826      if (has_prefix("syscalls", class_system, REQUIRED_SYSTEM_LENGTH)) {
  1827          is_syscall_tracepoint = true;
  1828      }
  1829  
  1830      if (flags & TRACE_EVENT_FL_TRACEPOINT) { // event is tracepoint
  1831  
  1832          perf_type = PERF_TRACEPOINT;
  1833          struct tracepoint *tp = BPF_CORE_READ(tp_event, tp);
  1834          bpf_probe_read_kernel_str(&event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tp, name));
  1835  
  1836      } else if (is_syscall_tracepoint) { // event is syscall tracepoint
  1837  
  1838          perf_type = PERF_TRACEPOINT;
  1839          bpf_probe_read_kernel_str(&event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tp_event, name));
  1840  
  1841      } else {
  1842          bool is_ret_probe = false;
  1843          void *tracep_ptr = get_trace_probe_from_trace_event_call(tp_event);
  1844  
  1845          if (flags & TRACE_EVENT_FL_KPROBE) { // event is kprobe
  1846  
  1847              struct trace_kprobe *tracekp = get_trace_kprobe_from_trace_probe(tracep_ptr);
  1848  
  1849              // check if probe is a kretprobe
  1850              struct kretprobe *krp = &tracekp->rp;
  1851              kretprobe_handler_t handler_f = BPF_CORE_READ(krp, handler);
  1852              if (handler_f != NULL)
  1853                  is_ret_probe = true;
  1854  
  1855              if (is_ret_probe)
  1856                  perf_type = PERF_KRETPROBE;
  1857              else
  1858                  perf_type = PERF_KPROBE;
  1859  
  1860              // get symbol name
  1861              bpf_probe_read_kernel_str(
  1862                  &event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tracekp, symbol));
  1863  
  1864              // fall back to the raw kprobe address when no symbol name was resolved
  1865              if (!event_name[0])
  1866                  probe_addr = (unsigned long) BPF_CORE_READ(krp, kp.addr);
  1867  
  1868          } else if (flags & TRACE_EVENT_FL_UPROBE) { // event is uprobe
  1869  
  1870              struct trace_uprobe *traceup = get_trace_uprobe_from_trace_probe(tracep_ptr);
  1871  
  1872              // determine if ret probe
  1873              struct uprobe_consumer *upc = &traceup->consumer;
  1874              void *handler_f = BPF_CORE_READ(upc, ret_handler);
  1875              if (handler_f != NULL)
  1876                  is_ret_probe = true;
  1877  
  1878              if (is_ret_probe)
  1879                  perf_type = PERF_URETPROBE;
  1880              else
  1881                  perf_type = PERF_UPROBE;
  1882  
  1883              // get binary path
  1884              bpf_probe_read_kernel_str(
  1885                  &event_name, MAX_PATH_PREF_SIZE, BPF_CORE_READ(traceup, filename));
  1886  
  1887              // get symbol offset
  1888              probe_addr = BPF_CORE_READ(traceup, offset);
  1889  
  1890          } else {
  1891              // unsupported perf type
  1892              return 0;
  1893          }
  1894      }
  1895  
  1896      struct bpf_prog *prog = (struct bpf_prog *) BPF_CORE_READ(bpf_prog_file, private_data);
  1897  
  1898      return send_bpf_attach(p, prog, &event_name, probe_addr, perf_type);
  1899  }
  1900  
  1901  SEC("kprobe/security_file_ioctl")
  1902  int BPF_KPROBE(trace_security_file_ioctl)
  1903  {
  1904      program_data_t p = {};
  1905      if (!init_program_data(&p, ctx))
  1906          return 0;
  1907  
  1908      if (!should_trace(&p))
  1909          return 0;
  1910  
  1911      unsigned int cmd = PT_REGS_PARM2(ctx);
  1912  
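            // PERF_EVENT_IOC_SET_BPF is the ioctl used to attach a BPF program (whose fd is the
            // ioctl argument) to a perf event, covering perf-based [k,u,ret]probe attachments.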
  1913      if (cmd == PERF_EVENT_IOC_SET_BPF) {
  1914          struct file *perf_event_file = (struct file *) PT_REGS_PARM1(ctx);
  1915          unsigned long fd = PT_REGS_PARM3(ctx);
  1916          struct file *bpf_prog_file = get_struct_file_from_fd(fd);
  1917  
  1918          send_bpf_perf_attach(&p, bpf_prog_file, perf_event_file);
  1919      }
  1920  
  1921      return 0;
  1922  }
  1923  
  1924  SEC("kprobe/tracepoint_probe_register_prio_may_exist")
  1925  int BPF_KPROBE(trace_tracepoint_probe_register_prio_may_exist)
  1926  {
  1927      program_data_t p = {};
  1928      if (!init_program_data(&p, ctx))
  1929          return 0;
  1930  
  1931      if (!should_trace(&p))
  1932          return 0;
  1933  
  1934      struct tracepoint *tp = (struct tracepoint *) PT_REGS_PARM1(ctx);
  1935      struct bpf_prog *prog = (struct bpf_prog *) PT_REGS_PARM3(ctx);
  1936  
  1937      char event_name[MAX_PERF_EVENT_NAME];
  1938      bpf_probe_read_kernel_str(&event_name, MAX_KSYM_NAME_SIZE, BPF_CORE_READ(tp, name));
  1939  
  1940      int perf_type = BPF_RAW_TRACEPOINT;
  1941      u64 probe_addr = 0;
  1942  
  1943      return send_bpf_attach(&p, prog, &event_name, probe_addr, perf_type);
  1944  }
  1945  
  1946  // trace/events/cgroup.h:
  1947  // TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup)
  1948  SEC("raw_tracepoint/cgroup_attach_task")
  1949  int tracepoint__cgroup__cgroup_attach_task(struct bpf_raw_tracepoint_args *ctx)
  1950  {
  1951      program_data_t p = {};
  1952      if (!init_program_data(&p, ctx))
  1953          return 0;
  1954  
  1955      if (!should_trace(&p))
  1956          return 0;
  1957  
  1958      if (!should_submit(CGROUP_ATTACH_TASK, p.event))
  1959          return 0;
  1960  
  1961      char *path = (char *) ctx->args[1];
  1962      struct task_struct *task = (struct task_struct *) ctx->args[2];
  1963  
  1964      int pid = get_task_host_pid(task);
  1965      char *comm = BPF_CORE_READ(task, comm);
  1966  
  1967      save_str_to_buf(&p.event->args_buf, path, 0);
  1968      save_str_to_buf(&p.event->args_buf, comm, 1);
  1969      save_to_submit_buf(&p.event->args_buf, (void *) &pid, sizeof(int), 2);
  1970      events_perf_submit(&p, CGROUP_ATTACH_TASK, 0);
  1971  
  1972      return 0;
  1973  }
  1974  
  1975  // trace/events/cgroup.h: TP_PROTO(struct cgroup *cgrp, const char *path)
  1976  SEC("raw_tracepoint/cgroup_mkdir")
  1977  int tracepoint__cgroup__cgroup_mkdir(struct bpf_raw_tracepoint_args *ctx)
  1978  {
  1979      program_data_t p = {};
  1980      if (!init_program_data(&p, ctx))
  1981          return 0;
  1982  
  1983      if (!should_trace(&p))
  1984          return 0;
  1985  
  1986      if (!should_submit(CGROUP_MKDIR, p.event))
  1987          return 0;
  1988  
  1989      struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0];
  1990      char *path = (char *) ctx->args[1];
  1991  
  1992      u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp);
  1993      u64 cgroup_id = get_cgroup_id(dst_cgrp);
  1995  
  1996      save_to_submit_buf(&p.event->args_buf, &cgroup_id, sizeof(u64), 0);
  1997      save_str_to_buf(&p.event->args_buf, path, 1);
  1998      save_to_submit_buf(&p.event->args_buf, &hierarchy_id, sizeof(u32), 2);
  1999      events_perf_submit(&p, CGROUP_MKDIR, 0);
  2000  
  2001      return 0;
  2002  }
  2003  
  2004  // trace/events/cgroup.h: TP_PROTO(struct cgroup *cgrp, const char *path)
  2005  SEC("raw_tracepoint/cgroup_rmdir")
  2006  int tracepoint__cgroup__cgroup_rmdir(struct bpf_raw_tracepoint_args *ctx)
  2007  {
  2008      program_data_t p = {};
  2009      if (!init_program_data(&p, ctx))
  2010          return 0;
  2011  
  2012      if (!should_trace(&p))
  2013          return 0;
  2014  
  2015      if (!should_submit(CGROUP_RMDIR, p.event))
  2016          return 0;
  2017  
  2018      struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0];
  2019      char *path = (char *) ctx->args[1];
  2020  
  2021      u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp);
  2022      u64 cgroup_id = get_cgroup_id(dst_cgrp);
  2024  
  2025      save_to_submit_buf(&p.event->args_buf, &cgroup_id, sizeof(u64), 0);
  2026      save_str_to_buf(&p.event->args_buf, path, 1);
  2027      save_to_submit_buf(&p.event->args_buf, &hierarchy_id, sizeof(u32), 2);
  2028      events_perf_submit(&p, CGROUP_RMDIR, 0);
  2029  
  2030      return 0;
  2031  }
  2032  
  2033  SEC("kprobe/security_bprm_check")
  2034  int BPF_KPROBE(trace_security_bprm_check)
  2035  {
  2036      program_data_t p = {};
  2037      if (!init_program_data(&p, ctx))
  2038          return 0;
  2039  
  2040      if (!should_trace(&p))
  2041          return 0;
  2042  
  2043      if (!should_submit(SECURITY_BPRM_CHECK, p.event))
  2044          return 0;
  2045  
  2046      struct linux_binprm *bprm = (struct linux_binprm *) PT_REGS_PARM1(ctx);
  2047      struct file *file = get_file_ptr_from_bprm(bprm);
  2048      dev_t s_dev = get_dev_from_file(file);
  2049      unsigned long inode_nr = get_inode_nr_from_file(file);
  2050      void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  2051  
  2052      syscall_data_t *sys = &p.task_info->syscall_data;
  2053      const char *const *argv = NULL;
  2054      const char *const *envp = NULL;
  2055      switch (sys->id) {
  2056          case SYSCALL_EXECVE:
  2057              argv = (const char *const *) sys->args.args[1];
  2058              envp = (const char *const *) sys->args.args[2];
  2059              break;
  2060          case SYSCALL_EXECVEAT:
  2061              argv = (const char *const *) sys->args.args[2];
  2062              envp = (const char *const *) sys->args.args[3];
  2063              break;
  2064          default:
  2065              break;
  2066      }
  2067  
  2068      save_str_to_buf(&p.event->args_buf, file_path, 0);
  2069      save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 1);
  2070      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 2);
  2071      save_str_arr_to_buf(&p.event->args_buf, argv, 3);
  2072      if (p.config->options & OPT_EXEC_ENV)
  2073          save_str_arr_to_buf(&p.event->args_buf, envp, 4);
  2074  
  2075      return events_perf_submit(&p, SECURITY_BPRM_CHECK, 0);
  2076  }
  2077  
  2078  SEC("kprobe/security_file_open")
  2079  int BPF_KPROBE(trace_security_file_open)
  2080  {
  2081      program_data_t p = {};
  2082      if (!init_program_data(&p, ctx))
  2083          return 0;
  2084  
  2085      if (!should_trace(&p))
  2086          return 0;
  2087  
  2088      if (!should_submit(SECURITY_FILE_OPEN, p.event))
  2089          return 0;
  2090  
  2091      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  2092      dev_t s_dev = get_dev_from_file(file);
  2093      unsigned long inode_nr = get_inode_nr_from_file(file);
  2094      void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  2095      u64 ctime = get_ctime_nanosec_from_file(file);
  2096  
  2097      // Load the arguments given to the open syscall (which eventually invokes this function)
  2098      char empty_string[1] = "";
  2099      void *syscall_pathname = &empty_string;
  2100      syscall_data_t *sys = NULL;
  2101      bool syscall_traced = p.task_info->syscall_traced;
  2102      if (syscall_traced) {
  2103          sys = &p.task_info->syscall_data;
  2104          switch (sys->id) {
  2105              case SYSCALL_EXECVE:
  2106              case SYSCALL_OPEN:
  2107                  syscall_pathname = (void *) sys->args.args[0];
  2108                  break;
  2109  
  2110              case SYSCALL_EXECVEAT:
  2111              case SYSCALL_OPENAT:
  2112              case SYSCALL_OPENAT2:
  2113                  syscall_pathname = (void *) sys->args.args[1];
  2114                  break;
  2115          }
  2116      }
  2117  
  2118      save_str_to_buf(&p.event->args_buf, file_path, 0);
  2119      save_to_submit_buf(&p.event->args_buf,
  2120                         (void *) __builtin_preserve_access_index(&file->f_flags),
  2121                         sizeof(int),
  2122                         1);
  2123      save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 2);
  2124      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 3);
  2125      save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4);
  2126      save_str_to_buf(&p.event->args_buf, syscall_pathname, 5);
  2127  
  2128      return events_perf_submit(&p, SECURITY_FILE_OPEN, 0);
  2129  }
  2130  
  2131  SEC("kprobe/security_sb_mount")
  2132  int BPF_KPROBE(trace_security_sb_mount)
  2133  {
  2134      program_data_t p = {};
  2135      if (!init_program_data(&p, ctx))
  2136          return 0;
  2137  
  2138      if (!should_trace(&p))
  2139          return 0;
  2140  
  2141      if (!should_submit(SECURITY_SB_MOUNT, p.event))
  2142          return 0;
  2143  
  2144      const char *dev_name = (const char *) PT_REGS_PARM1(ctx);
  2145      struct path *path = (struct path *) PT_REGS_PARM2(ctx);
  2146      const char *type = (const char *) PT_REGS_PARM3(ctx);
  2147      unsigned long flags = (unsigned long) PT_REGS_PARM4(ctx);
  2148  
  2149      void *path_str = get_path_str(path);
  2150  
  2151      save_str_to_buf(&p.event->args_buf, (void *) dev_name, 0);
  2152      save_str_to_buf(&p.event->args_buf, path_str, 1);
  2153      save_str_to_buf(&p.event->args_buf, (void *) type, 2);
  2154      save_to_submit_buf(&p.event->args_buf, &flags, sizeof(unsigned long), 3);
  2155  
  2156      return events_perf_submit(&p, SECURITY_SB_MOUNT, 0);
  2157  }
  2158  
  2159  SEC("kprobe/security_inode_unlink")
  2160  int BPF_KPROBE(trace_security_inode_unlink)
  2161  {
  2162      program_data_t p = {};
  2163      if (!init_program_data(&p, ctx))
  2164          return 0;
  2165  
  2166      if (!should_trace(&p))
  2167          return 0;
  2168  
  2169      bool should_trace_inode_unlink = should_submit(SECURITY_INODE_UNLINK, p.event);
  2170      bool should_capture_io = false;
  2171      if ((p.config->options & (OPT_CAPTURE_FILES_READ | OPT_CAPTURE_FILES_WRITE)) != 0)
  2172          should_capture_io = true;
  2173  
  2174      if (!should_trace_inode_unlink && !should_capture_io)
  2175          return 0;
  2176  
  2177      file_id_t unlinked_file_id = {};
  2178      int ret = 0;
  2179  
  2180      // struct inode *dir = (struct inode *)PT_REGS_PARM1(ctx);
  2181      struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx);
  2182      unlinked_file_id.inode = get_inode_nr_from_dentry(dentry);
  2183      unlinked_file_id.device = get_dev_from_dentry(dentry);
  2184  
  2185      if (should_trace_inode_unlink) {
  2186          void *dentry_path = get_dentry_path_str(dentry);
  2187          unlinked_file_id.ctime = get_ctime_nanosec_from_dentry(dentry);
  2188  
  2189          save_str_to_buf(&p.event->args_buf, dentry_path, 0);
  2190          save_to_submit_buf(&p.event->args_buf, &unlinked_file_id.inode, sizeof(unsigned long), 1);
  2191          save_to_submit_buf(&p.event->args_buf, &unlinked_file_id.device, sizeof(dev_t), 2);
  2192          save_to_submit_buf(&p.event->args_buf, &unlinked_file_id.ctime, sizeof(u64), 3);
  2193          ret = events_perf_submit(&p, SECURITY_INODE_UNLINK, 0);
  2194      }
  2195  
  2196      if (should_capture_io) {
  2197          // Delete with ctime zeroed so a later reuse of this inode-device pair cannot inherit stale capture state
  2198          unlinked_file_id.ctime = 0;
  2199          bpf_map_delete_elem(&elf_files_map, &unlinked_file_id);
  2200      }
  2201  
  2202      return ret;
  2203  }
  2204  
  2205  SEC("kprobe/commit_creds")
  2206  int BPF_KPROBE(trace_commit_creds)
  2207  {
  2208      program_data_t p = {};
  2209      if (!init_program_data(&p, ctx))
  2210          return 0;
  2211  
  2212      if (!should_trace(&p))
  2213          return 0;
  2214  
  2215      if (!should_submit(COMMIT_CREDS, p.event))
  2216          return 0;
  2217  
  2218      struct cred *new_cred = (struct cred *) PT_REGS_PARM1(ctx);
  2219      struct cred *old_cred = (struct cred *) get_task_real_cred(p.task);
  2220  
  2221      slim_cred_t old_slim = {0};
  2222      slim_cred_t new_slim = {0};
  2223  
  2224      struct user_namespace *userns_old = BPF_CORE_READ(old_cred, user_ns);
  2225      struct user_namespace *userns_new = BPF_CORE_READ(new_cred, user_ns);
  2226  
  2227      // old credentials
  2228  
  2229      old_slim.uid = BPF_CORE_READ(old_cred, uid.val);
  2230      old_slim.gid = BPF_CORE_READ(old_cred, gid.val);
  2231      old_slim.suid = BPF_CORE_READ(old_cred, suid.val);
  2232      old_slim.sgid = BPF_CORE_READ(old_cred, sgid.val);
  2233      old_slim.euid = BPF_CORE_READ(old_cred, euid.val);
  2234      old_slim.egid = BPF_CORE_READ(old_cred, egid.val);
  2235      old_slim.fsuid = BPF_CORE_READ(old_cred, fsuid.val);
  2236      old_slim.fsgid = BPF_CORE_READ(old_cred, fsgid.val);
  2237      old_slim.user_ns = BPF_CORE_READ(userns_old, ns.inum);
  2238      old_slim.securebits = BPF_CORE_READ(old_cred, securebits);
  2239  
  2240      old_slim.cap_inheritable = credcap_to_slimcap(&old_cred->cap_inheritable);
  2241      old_slim.cap_permitted = credcap_to_slimcap(&old_cred->cap_permitted);
  2242      old_slim.cap_effective = credcap_to_slimcap(&old_cred->cap_effective);
  2243      old_slim.cap_bset = credcap_to_slimcap(&old_cred->cap_bset);
  2244      old_slim.cap_ambient = credcap_to_slimcap(&old_cred->cap_ambient);
  2245  
  2246      // new credentials
  2247  
  2248      new_slim.uid = BPF_CORE_READ(new_cred, uid.val);
  2249      new_slim.gid = BPF_CORE_READ(new_cred, gid.val);
  2250      new_slim.suid = BPF_CORE_READ(new_cred, suid.val);
  2251      new_slim.sgid = BPF_CORE_READ(new_cred, sgid.val);
  2252      new_slim.euid = BPF_CORE_READ(new_cred, euid.val);
  2253      new_slim.egid = BPF_CORE_READ(new_cred, egid.val);
  2254      new_slim.fsuid = BPF_CORE_READ(new_cred, fsuid.val);
  2255      new_slim.fsgid = BPF_CORE_READ(new_cred, fsgid.val);
  2256      new_slim.user_ns = BPF_CORE_READ(userns_new, ns.inum);
  2257      new_slim.securebits = BPF_CORE_READ(new_cred, securebits);
  2258  
  2259      new_slim.cap_inheritable = credcap_to_slimcap(&new_cred->cap_inheritable);
  2260      new_slim.cap_permitted = credcap_to_slimcap(&new_cred->cap_permitted);
  2261      new_slim.cap_effective = credcap_to_slimcap(&new_cred->cap_effective);
  2262      new_slim.cap_bset = credcap_to_slimcap(&new_cred->cap_bset);
  2263      new_slim.cap_ambient = credcap_to_slimcap(&new_cred->cap_ambient);
  2264  
  2265      save_to_submit_buf(&p.event->args_buf, (void *) &old_slim, sizeof(slim_cred_t), 0);
  2266      save_to_submit_buf(&p.event->args_buf, (void *) &new_slim, sizeof(slim_cred_t), 1);
  2267  
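            // Submit only when at least one credential field actually changed; otherwise the
            // event is dropped.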
  2268      // clang-format off
  2269      if (
  2270          (old_slim.uid != new_slim.uid)                          ||
  2271          (old_slim.gid != new_slim.gid)                          ||
  2272          (old_slim.suid != new_slim.suid)                        ||
  2273          (old_slim.sgid != new_slim.sgid)                        ||
  2274          (old_slim.euid != new_slim.euid)                        ||
  2275          (old_slim.egid != new_slim.egid)                        ||
  2276          (old_slim.fsuid != new_slim.fsuid)                      ||
  2277          (old_slim.fsgid != new_slim.fsgid)                      ||
  2278          (old_slim.cap_inheritable != new_slim.cap_inheritable)  ||
  2279          (old_slim.cap_permitted != new_slim.cap_permitted)      ||
  2280          (old_slim.cap_effective != new_slim.cap_effective)      ||
  2281          (old_slim.cap_bset != new_slim.cap_bset)                ||
  2282          (old_slim.cap_ambient != new_slim.cap_ambient)
  2283      ) {
  2284          events_perf_submit(&p, COMMIT_CREDS, 0);
  2285      }
  2286      // clang-format on
  2287  
  2288      return 0;
  2289  }
  2290  
  2291  SEC("kprobe/switch_task_namespaces")
  2292  int BPF_KPROBE(trace_switch_task_namespaces)
  2293  {
  2294      program_data_t p = {};
  2295      if (!init_program_data(&p, ctx))
  2296          return 0;
  2297  
  2298      if (!should_trace(&p))
  2299          return 0;
  2300  
  2301      if (!should_submit(SWITCH_TASK_NS, p.event))
  2302          return 0;
  2303  
  2304      struct task_struct *task = (struct task_struct *) PT_REGS_PARM1(ctx);
  2305      struct nsproxy *new = (struct nsproxy *) PT_REGS_PARM2(ctx);
  2306  
  2307      if (!new)
  2308          return 0;
  2309  
  2310      pid_t pid = BPF_CORE_READ(task, pid);
  2311      u32 old_mnt = p.event->context.task.mnt_id;
  2312      u32 new_mnt = get_mnt_ns_id(new);
  2313      u32 old_pid = get_task_pid_ns_for_children_id(task);
  2314      u32 new_pid = get_pid_ns_for_children_id(new);
  2315      u32 old_uts = get_task_uts_ns_id(task);
  2316      u32 new_uts = get_uts_ns_id(new);
  2317      u32 old_ipc = get_task_ipc_ns_id(task);
  2318      u32 new_ipc = get_ipc_ns_id(new);
  2319      u32 old_net = get_task_net_ns_id(task);
  2320      u32 new_net = get_net_ns_id(new);
  2321      u32 old_cgroup = get_task_cgroup_ns_id(task);
  2322      u32 new_cgroup = get_cgroup_ns_id(new);
  2323  
  2324      save_to_submit_buf(&p.event->args_buf, (void *) &pid, sizeof(int), 0);
  2325  
  2326      if (old_mnt != new_mnt)
  2327          save_to_submit_buf(&p.event->args_buf, (void *) &new_mnt, sizeof(u32), 1);
  2328      if (old_pid != new_pid)
  2329          save_to_submit_buf(&p.event->args_buf, (void *) &new_pid, sizeof(u32), 2);
  2330      if (old_uts != new_uts)
  2331          save_to_submit_buf(&p.event->args_buf, (void *) &new_uts, sizeof(u32), 3);
  2332      if (old_ipc != new_ipc)
  2333          save_to_submit_buf(&p.event->args_buf, (void *) &new_ipc, sizeof(u32), 4);
  2334      if (old_net != new_net)
  2335          save_to_submit_buf(&p.event->args_buf, (void *) &new_net, sizeof(u32), 5);
  2336      if (old_cgroup != new_cgroup)
  2337          save_to_submit_buf(&p.event->args_buf, (void *) &new_cgroup, sizeof(u32), 6);
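            // argnum > 1 means at least one namespace id was saved besides the pid,
            // i.e. something actually changed.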
  2338      if (p.event->args_buf.argnum > 1)
  2339          events_perf_submit(&p, SWITCH_TASK_NS, 0);
  2340  
  2341      return 0;
  2342  }
  2343  
  2344  SEC("kprobe/cap_capable")
  2345  int BPF_KPROBE(trace_cap_capable)
  2346  {
  2347      program_data_t p = {};
  2348      if (!init_program_data(&p, ctx))
  2349          return 0;
  2350  
  2351      if (!should_trace(&p))
  2352          return 0;
  2353  
  2354      if (!should_submit(CAP_CAPABLE, p.event))
  2355          return 0;
  2356  
  2357      int cap = PT_REGS_PARM3(ctx);
  2358      int cap_opt = PT_REGS_PARM4(ctx);
  2359  
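            // CAP_OPT_NOAUDIT marks capability checks the kernel itself does not audit
            // (opportunistic probing rather than enforcement), so they are skipped here too.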
  2360      if (cap_opt & CAP_OPT_NOAUDIT)
  2361          return 0;
  2362  
  2363      save_to_submit_buf(&p.event->args_buf, (void *) &cap, sizeof(int), 0);
  2364  
  2365      return events_perf_submit(&p, CAP_CAPABLE, 0);
  2366  }
  2367  
  2368  SEC("kprobe/security_socket_create")
  2369  int BPF_KPROBE(trace_security_socket_create)
  2370  {
  2371      program_data_t p = {};
  2372      if (!init_program_data(&p, ctx))
  2373          return 0;
  2374  
  2375      if (!should_trace(&p))
  2376          return 0;
  2377  
  2378      if (!should_submit(SECURITY_SOCKET_CREATE, p.event))
  2379          return 0;
  2380  
  2381      int family = (int) PT_REGS_PARM1(ctx);
  2382      int type = (int) PT_REGS_PARM2(ctx);
  2383      int protocol = (int) PT_REGS_PARM3(ctx);
  2384      int kern = (int) PT_REGS_PARM4(ctx);
  2385  
  2386      save_to_submit_buf(&p.event->args_buf, (void *) &family, sizeof(int), 0);
  2387      save_to_submit_buf(&p.event->args_buf, (void *) &type, sizeof(int), 1);
  2388      save_to_submit_buf(&p.event->args_buf, (void *) &protocol, sizeof(int), 2);
  2389      save_to_submit_buf(&p.event->args_buf, (void *) &kern, sizeof(int), 3);
  2390  
  2391      return events_perf_submit(&p, SECURITY_SOCKET_CREATE, 0);
  2392  }
  2393  
  2394  SEC("kprobe/security_inode_symlink")
  2395  int BPF_KPROBE(trace_security_inode_symlink)
  2396  {
  2397      program_data_t p = {};
  2398      if (!init_program_data(&p, ctx))
  2399          return 0;
  2400  
  2401      if (!should_trace(&p))
  2402          return 0;
  2403  
  2404      if (!should_submit(SECURITY_INODE_SYMLINK, p.event))
  2405          return 0;
  2406  
  2407      // struct inode *dir = (struct inode *)PT_REGS_PARM1(ctx);
  2408      struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx);
  2409      const char *old_name = (const char *) PT_REGS_PARM3(ctx);
  2410  
  2411      void *dentry_path = get_dentry_path_str(dentry);
  2412  
  2413      save_str_to_buf(&p.event->args_buf, dentry_path, 0);
  2414      save_str_to_buf(&p.event->args_buf, (void *) old_name, 1);
  2415  
  2416      return events_perf_submit(&p, SECURITY_INODE_SYMLINK, 0);
  2417  }
  2418  
  2419  SEC("kprobe/proc_create")
  2420  int BPF_KPROBE(trace_proc_create)
  2421  {
  2422      program_data_t p = {};
  2423      if (!init_program_data(&p, ctx))
  2424          return 0;
  2425  
  2426      if (!should_trace(&p))
  2427          return 0;
  2428  
  2429      if (!should_submit(PROC_CREATE, p.event))
  2430          return 0;
  2431  
  2432      char *name = (char *) PT_REGS_PARM1(ctx);
  2433      unsigned long proc_ops_addr = (unsigned long) PT_REGS_PARM4(ctx);
  2434  
  2435      save_str_to_buf(&p.event->args_buf, name, 0);
  2436      save_to_submit_buf(&p.event->args_buf, (void *) &proc_ops_addr, sizeof(u64), 1);
  2437  
  2438      return events_perf_submit(&p, PROC_CREATE, 0);
  2439  }
  2440  
  2441  SEC("kprobe/debugfs_create_file")
  2442  int BPF_KPROBE(trace_debugfs_create_file)
  2443  {
  2444      program_data_t p = {};
  2445      if (!init_program_data(&p, ctx))
  2446          return 0;
  2447  
  2448      if (!should_trace(&p))
  2449          return 0;
  2450  
  2451      if (!should_submit(DEBUGFS_CREATE_FILE, p.event))
  2452          return 0;
  2453  
  2454      char *name = (char *) PT_REGS_PARM1(ctx);
  2455      mode_t mode = (unsigned short) PT_REGS_PARM2(ctx);
  2456      struct dentry *dentry = (struct dentry *) PT_REGS_PARM3(ctx);
  2457      void *dentry_path = get_dentry_path_str(dentry);
  2458      unsigned long proc_ops_addr = (unsigned long) PT_REGS_PARM5(ctx);
  2459  
  2460      save_str_to_buf(&p.event->args_buf, name, 0);
  2461      save_str_to_buf(&p.event->args_buf, dentry_path, 1);
  2462      save_to_submit_buf(&p.event->args_buf, &mode, sizeof(mode_t), 2);
  2463      save_to_submit_buf(&p.event->args_buf, (void *) &proc_ops_addr, sizeof(u64), 3);
  2464  
  2465      return events_perf_submit(&p, DEBUGFS_CREATE_FILE, 0);
  2466  }
  2467  
  2468  SEC("kprobe/debugfs_create_dir")
  2469  int BPF_KPROBE(trace_debugfs_create_dir)
  2470  {
  2471      program_data_t p = {};
  2472      if (!init_program_data(&p, ctx))
  2473          return 0;
  2474  
  2475      if (!should_trace(&p))
  2476          return 0;
  2477  
  2478      if (!should_submit(DEBUGFS_CREATE_DIR, p.event))
  2479          return 0;
  2480  
  2481      char *name = (char *) PT_REGS_PARM1(ctx);
  2482      struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx);
  2483      void *dentry_path = get_dentry_path_str(dentry);
  2484  
  2485      save_str_to_buf(&p.event->args_buf, name, 0);
  2486      save_str_to_buf(&p.event->args_buf, dentry_path, 1);
  2487  
  2488      return events_perf_submit(&p, DEBUGFS_CREATE_DIR, 0);
  2489  }
  2490  
  2491  SEC("kprobe/security_socket_listen")
  2492  int BPF_KPROBE(trace_security_socket_listen)
  2493  {
  2494      program_data_t p = {};
  2495      if (!init_program_data(&p, ctx))
  2496          return 0;
  2497  
  2498      if (!should_trace(&p))
  2499          return 0;
  2500  
  2501      if (!should_submit(SECURITY_SOCKET_LISTEN, p.event))
  2502          return 0;
  2503  
  2504      struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx);
  2505      int backlog = (int) PT_REGS_PARM2(ctx);
  2506  
  2507      // Load the arguments given to the listen syscall (which eventually invokes this function)
  2508      syscall_data_t *sys = &p.task_info->syscall_data;
  2509      if (!p.task_info->syscall_traced)
  2510          return 0;
  2511  
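            // On x86, i386 binaries reach listen() through socketcall(SYS_LISTEN, args): the real
            // arguments live in user memory pointed to by args[1], so the fd is read from there.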
  2512      switch (sys->id) {
  2513          case SYSCALL_LISTEN:
  2514              save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0);
  2515              break;
  2516  #if defined(bpf_target_x86) // armhf makes use of SYSCALL_LISTEN
  2517          case SYSCALL_SOCKETCALL:
  2518              save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0);
  2519              break;
  2520  #endif
  2521          default:
  2522              return 0;
  2523      }
  2524  
  2525      save_sockaddr_to_buf(&p.event->args_buf, sock, 1);
  2526      save_to_submit_buf(&p.event->args_buf, (void *) &backlog, sizeof(int), 2);
  2527  
  2528      return events_perf_submit(&p, SECURITY_SOCKET_LISTEN, 0);
  2529  }
  2530  
  2531  SEC("kprobe/security_socket_connect")
  2532  int BPF_KPROBE(trace_security_socket_connect)
  2533  {
  2534      program_data_t p = {};
  2535      if (!init_program_data(&p, ctx))
  2536          return 0;
  2537  
  2538      if (!should_trace(&p))
  2539          return 0;
  2540  
  2541      if (!should_submit(SECURITY_SOCKET_CONNECT, p.event))
  2542          return 0;
  2543  
  2544      u64 addr_len = PT_REGS_PARM3(ctx);
  2545  
  2546      struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx);
  2547      if (!sock)
  2548          return 0;
  2549  
  2550      struct sockaddr *address = (struct sockaddr *) PT_REGS_PARM2(ctx);
  2551      if (!address)
  2552          return 0;
  2553  
  2554      // Check if the socket type is supported.
  2555      u32 type = BPF_CORE_READ(sock, type);
  2556      switch (type) {
  2557          // TODO: case SOCK_DCCP:
  2558          case SOCK_DGRAM:
  2559          case SOCK_SEQPACKET:
  2560          case SOCK_STREAM:
  2561              break;
  2562          default:
  2563              return 0;
  2564      }
  2565  
  2566      // Check if the socket family is supported.
  2567      sa_family_t sa_fam = get_sockaddr_family(address);
  2568      switch (sa_fam) {
  2569          case AF_INET:
  2570          case AF_INET6:
  2571          case AF_UNIX:
  2572              break;
  2573          default:
  2574              return 0;
  2575      }
  2576  
  2577      // Load args given to the syscall that invoked this function.
  2578      syscall_data_t *sys = &p.task_info->syscall_data;
  2579      if (!p.task_info->syscall_traced)
  2580          return 0;
  2581  
  2582      // Short-named temporaries keep the lines below within column limits.
  2583      int (*stsb)(args_buffer_t *, void *, u32, u8) = save_to_submit_buf;
  2584      void *args_buf = &p.event->args_buf;
  2585      void *to = (void *) &sys->args.args[0];
  2586  
  2587      if (is_x86_compat(p.task)) // only i386 binaries use socketcall
  2588          to = (void *) sys->args.args[1];
  2589  
  2590      // Save the socket fd, depending on the syscall.
  2591      switch (sys->id) {
  2592          case SYSCALL_CONNECT:
  2593          case SYSCALL_SOCKETCALL:
  2594              break;
  2595          default:
  2596              return 0;
  2597      }
  2598  
  2599      // Save the socket fd argument to the event.
  2600      stsb(args_buf, to, sizeof(u32), 0);
  2601  
  2602      // Save the socket type argument to the event.
  2603      stsb(args_buf, &type, sizeof(u32), 1);
  2604  
  2605      bool need_workaround = false;
  2606  
  2607      // Save the sockaddr struct, depending on the family.
  2608      size_t sockaddr_len = 0;
  2609      switch (sa_fam) {
  2610          case AF_INET:
  2611              sockaddr_len = sizeof(struct sockaddr_in);
  2612              break;
  2613          case AF_INET6:
  2614              sockaddr_len = sizeof(struct sockaddr_in6);
  2615              break;
  2616          case AF_UNIX:
  2617              sockaddr_len = sizeof(struct sockaddr_un);
  2618              if (addr_len < sockaddr_len)
  2619                  need_workaround = true;
  2620  
  2621              break;
  2622      }
  2623  
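        // sockaddr_un is often passed shorter than sizeof(struct sockaddr_un); copying only
        // addr_len bytes into a zeroed struct avoids reading past the user buffer (issue #1129).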
  2624  #if defined(bpf_target_x86)
  2625      if (need_workaround) {
  2626          // Workaround for sockaddr_un struct length (issue: #1129).
  2627          struct sockaddr_un sockaddr = {0};
  2628          bpf_probe_read(&sockaddr, (u32) addr_len, (void *) address);
  2629          stsb(args_buf, (void *) &sockaddr, sizeof(struct sockaddr_un), 2);
  2630      }
  2631  #endif
  2632  
  2633      // Save the sockaddr struct argument to the event.
  2634      if (!need_workaround) {
  2635          stsb(args_buf, (void *) address, sockaddr_len, 2);
  2636      }
  2637  
  2638      // Submit the event.
  2639      return events_perf_submit(&p, SECURITY_SOCKET_CONNECT, 0);
  2640  }
  2641  
  2642  SEC("kprobe/security_socket_accept")
  2643  int BPF_KPROBE(trace_security_socket_accept)
  2644  {
  2645      program_data_t p = {};
  2646      if (!init_program_data(&p, ctx))
  2647          return 0;
  2648  
  2649      if (!should_trace(&p))
  2650          return 0;
  2651  
  2652      struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx);
  2653      struct socket *new_sock = (struct socket *) PT_REGS_PARM2(ctx);
  2654      syscall_data_t *sys = &p.task_info->syscall_data;
  2655  
  2656      // Save the sockets for the "socket_accept" event, consumed by syscall__accept4 on syscall exit.
  2657      if (should_submit(SOCKET_ACCEPT, p.event)) {
  2658          args_t args = {};
  2659          args.args[0] = (unsigned long) sock;
  2660          args.args[1] = (unsigned long) new_sock;
  2661          args.args[2] = sys->args.args[0]; // sockfd
  2662          save_args(&args, SOCKET_ACCEPT);
  2663      }
  2664  
  2665      if (!should_submit(SECURITY_SOCKET_ACCEPT, p.event))
  2666          return 0;
  2667  
  2668      // Load the arguments given to the accept syscall (which eventually invokes this function).
  2669      // The switch below filters the syscall ids we handle.
  2670      if (!p.task_info->syscall_traced)
  2671          return 0;
  2671  
  2672      switch (sys->id) {
  2673          case SYSCALL_ACCEPT:
  2674          case SYSCALL_ACCEPT4:
  2675              save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0);
  2676              break;
  2677  #if defined(bpf_target_x86) // armhf makes use of SYSCALL_ACCEPT/4
  2678          case SYSCALL_SOCKETCALL:
  2679              save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0);
  2680              break;
  2681  #endif
  2682          default:
  2683              return 0;
  2684      }
  2685  
  2686      save_sockaddr_to_buf(&p.event->args_buf, sock, 1);
  2687  
  2688      return events_perf_submit(&p, SECURITY_SOCKET_ACCEPT, 0);
  2689  }
  2690  
  2691  SEC("kprobe/security_socket_bind")
  2692  int BPF_KPROBE(trace_security_socket_bind)
  2693  {
  2694      program_data_t p = {};
  2695      if (!init_program_data(&p, ctx))
  2696          return 0;
  2697  
  2698      if (!should_trace(&p))
  2699          return 0;
  2700  
  2701      if (!should_submit(SECURITY_SOCKET_BIND, p.event))
  2702          return 0;
  2703  
  2704      struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx);
  2705      struct sock *sk = get_socket_sock(sock);
  2706  
  2707      struct sockaddr *address = (struct sockaddr *) PT_REGS_PARM2(ctx);
  2708  #if defined(__TARGET_ARCH_x86) // TODO: issue: #1129
  2709      uint addr_len = (uint) PT_REGS_PARM3(ctx);
  2710  #endif
  2711  
  2712      sa_family_t sa_fam = get_sockaddr_family(address);
  2713      if ((sa_fam != AF_INET) && (sa_fam != AF_INET6) && (sa_fam != AF_UNIX)) {
  2714          return 0;
  2715      }
  2716  
  2717      // Load the arguments given to the bind syscall (which eventually invokes this function)
  2718      syscall_data_t *sys = &p.task_info->syscall_data;
  2719      if (!p.task_info->syscall_traced || sys->id != SYSCALL_BIND)
  2720          return 0;
  2721  
  2722      switch (sys->id) {
  2723          case SYSCALL_BIND:
  2724              save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0);
  2725              break;
  2726  #if defined(bpf_target_x86) // armhf makes use of SYSCALL_BIND
  2727          case SYSCALL_SOCKETCALL:
  2728              save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0);
  2729              break;
  2730  #endif
  2731          default:
  2732              return 0;
  2733      }
  2734  
  2735      u16 protocol = get_sock_protocol(sk);
  2736      net_id_t connect_id = {0};
  2737      connect_id.protocol = protocol;
  2738  
  2739      if (sa_fam == AF_INET) {
  2740          save_to_submit_buf(&p.event->args_buf, (void *) address, sizeof(struct sockaddr_in), 1);
  2741  
  2742          struct sockaddr_in *addr = (struct sockaddr_in *) address;
  2743  
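                // Record the IPv4 address in IPv4-mapped IPv6 form (::ffff:a.b.c.d),
                // so v4 and v6 binds share the same net_id_t representation.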
  2744          if (protocol == IPPROTO_UDP && BPF_CORE_READ(addr, sin_port)) {
  2745              connect_id.address.s6_addr32[3] = BPF_CORE_READ(addr, sin_addr).s_addr;
  2746              connect_id.address.s6_addr16[5] = 0xffff;
  2747              connect_id.port = BPF_CORE_READ(addr, sin_port);
  2748          }
  2749      } else if (sa_fam == AF_INET6) {
  2750          save_to_submit_buf(&p.event->args_buf, (void *) address, sizeof(struct sockaddr_in6), 1);
  2751  
  2752          struct sockaddr_in6 *addr = (struct sockaddr_in6 *) address;
  2753  
  2754          if (protocol == IPPROTO_UDP && BPF_CORE_READ(addr, sin6_port)) {
  2755              connect_id.address = BPF_CORE_READ(addr, sin6_addr);
  2756              connect_id.port = BPF_CORE_READ(addr, sin6_port);
  2757          }
  2758      } else if (sa_fam == AF_UNIX) {
  2759  #if defined(__TARGET_ARCH_x86) // TODO: this is broken in arm64 (issue: #1129)
  2760          if (addr_len <= sizeof(struct sockaddr_un)) {
  2761              struct sockaddr_un sockaddr = {};
  2762              bpf_probe_read(&sockaddr, addr_len, (void *) address);
  2763              save_to_submit_buf(
  2764                  &p.event->args_buf, (void *) &sockaddr, sizeof(struct sockaddr_un), 1);
  2765          } else
  2766  #endif
  2767              save_to_submit_buf(&p.event->args_buf, (void *) address, sizeof(struct sockaddr_un), 1);
  2768      }
  2769  
  2770      return events_perf_submit(&p, SECURITY_SOCKET_BIND, 0);
  2771  }
  2772  
  2773  SEC("kprobe/security_socket_setsockopt")
  2774  int BPF_KPROBE(trace_security_socket_setsockopt)
  2775  {
  2776      program_data_t p = {};
  2777      if (!init_program_data(&p, ctx))
  2778          return 0;
  2779  
  2780      if (!should_trace(&p))
  2781          return 0;
  2782  
  2783      if (!should_submit(SECURITY_SOCKET_SETSOCKOPT, p.event))
  2784          return 0;
  2785  
  2786      struct socket *sock = (struct socket *) PT_REGS_PARM1(ctx);
  2787      int level = (int) PT_REGS_PARM2(ctx);
  2788      int optname = (int) PT_REGS_PARM3(ctx);
  2789  
  2790      // Load the arguments given to the setsockopt syscall (which eventually invokes this function)
  2791      syscall_data_t *sys = &p.task_info->syscall_data;
  2795  
  2796      if (!p.task_info->syscall_traced || sys->id != SYSCALL_SETSOCKOPT)
  2797          return 0;
  2798  
  2799      switch (sys->id) {
  2800          case SYSCALL_SETSOCKOPT:
  2801              save_to_submit_buf(&p.event->args_buf, (void *) &sys->args.args[0], sizeof(u32), 0);
  2802              break;
  2803  #if defined(bpf_target_x86) // armhf makes use of SYSCALL_SETSOCKOPT
  2804          case SYSCALL_SOCKETCALL:
  2805              save_to_submit_buf(&p.event->args_buf, (void *) sys->args.args[1], sizeof(u32), 0);
  2806              break;
  2807  #endif
  2808          default:
  2809              return 0;
  2810      }
  2811  
  2812      save_to_submit_buf(&p.event->args_buf, (void *) &level, sizeof(int), 1);
  2813      save_to_submit_buf(&p.event->args_buf, (void *) &optname, sizeof(int), 2);
  2814      save_sockaddr_to_buf(&p.event->args_buf, sock, 3);
  2815  
  2816      return events_perf_submit(&p, SECURITY_SOCKET_SETSOCKOPT, 0);
  2817  }
  2818  
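        // Tags for the chunked perf records sent through the file_writes buffer,
        // so userspace can route each captured stream to the right handler.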
  2819  enum bin_type_e {
  2820      SEND_VFS_WRITE = 1,
  2821      SEND_MPROTECT,
  2822      SEND_KERNEL_MODULE,
  2823      SEND_BPF_OBJECT,
  2824      SEND_VFS_READ
  2825  };
  2826  
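        // Stashes bin_args at the current end of the per-cpu event args buffer and
        // tail-calls into send_bin/send_bin_tp; send_bin_helper() below re-reads
        // the struct from event_data_map at the same offset after the tail call.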
  2827  statfunc u32 tail_call_send_bin(void *ctx, program_data_t *p, bin_args_t *bin_args, int tail_call)
  2828  {
  2829      if (p->event->args_buf.offset < ARGS_BUF_SIZE - sizeof(bin_args_t)) {
  2830          bpf_probe_read_kernel(
  2831              &(p->event->args_buf.args[p->event->args_buf.offset]), sizeof(bin_args_t), bin_args);
  2832          if (tail_call == TAIL_SEND_BIN)
  2833              bpf_tail_call(ctx, &prog_array, tail_call);
  2834          else if (tail_call == TAIL_SEND_BIN_TP)
  2835              bpf_tail_call(ctx, &prog_array_tp, tail_call);
  2836      }
  2837  
  2838      return 0;
  2839  }
  2840  
  2841  statfunc u32 send_bin_helper(void *ctx, void *prog_array, int tail_call)
  2842  {
  2843      // Note: sending the data to userspace has the following constraints:
  2844      //
  2845      // 1. We need a buffer whose exact size we know
  2846      //    (so we can send chunks of known sizes in BPF)
  2847      // 2. There can be multiple cpus - so we need a percpu array
  2848      // 3. We have to use perf submit and not maps, as data
  2849      //    can be overwritten if userspace doesn't consume
  2850      //    it fast enough
  2851  
  2852      int i = 0;
  2853      unsigned int chunk_size;
  2854      u32 zero = 0;
  2855  
  2856      event_data_t *event = bpf_map_lookup_elem(&event_data_map, &zero);
  2857      if (!event || (event->args_buf.offset > ARGS_BUF_SIZE - sizeof(bin_args_t)))
  2858          return 0;
  2859  
  2860      bin_args_t *bin_args = (bin_args_t *) &(event->args_buf.args[event->args_buf.offset]);
  2861  
  2862      if (bin_args->full_size <= 0) {
  2863          // If there are more vector elements, continue to the next one
  2864          bin_args->iov_idx++;
  2865          if (bin_args->iov_idx < bin_args->iov_len) {
  2866              // Handle the rest of write recursively
  2867              bin_args->start_off += bin_args->full_size;
  2868              struct iovec io_vec;
  2869              bpf_probe_read(&io_vec, sizeof(struct iovec), &bin_args->vec[bin_args->iov_idx]);
  2870              bin_args->ptr = io_vec.iov_base;
  2871              bin_args->full_size = io_vec.iov_len;
  2872              bpf_tail_call(ctx, prog_array, tail_call);
  2873          }
  2874          return 0;
  2875      }
  2876  
  2877      buf_t *file_buf_p = get_buf(FILE_BUF_IDX);
  2878      if (file_buf_p == NULL)
  2879          return 0;
  2880  
  2881  #define F_SEND_TYPE  0
  2882  #define F_CGROUP_ID  (F_SEND_TYPE + sizeof(u8))
  2883  #define F_META_OFF   (F_CGROUP_ID + sizeof(u64))
  2884  #define F_SZ_OFF     (F_META_OFF + SEND_META_SIZE)
  2885  #define F_POS_OFF    (F_SZ_OFF + sizeof(unsigned int))
  2886  #define F_CHUNK_OFF  (F_POS_OFF + sizeof(off_t))
  2887  #define F_CHUNK_SIZE (MAX_PERCPU_BUFSIZE >> 1)
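        // Sketch of each perf record assembled below (offsets follow directly from
        // the #defines above):
        //   [F_SEND_TYPE]  u8             bin type (enum bin_type_e)
        //   [F_CGROUP_ID]  u64            cgroup id
        //   [F_META_OFF]   SEND_META_SIZE bytes of metadata (used for the filename)
        //   [F_SZ_OFF]     unsigned int   chunk size
        //   [F_POS_OFF]    off_t          file offset of this chunk
        //   [F_CHUNK_OFF]  up to F_CHUNK_SIZE payload bytes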
  2888  
  2889      bpf_probe_read_kernel((void **) &(file_buf_p->buf[F_SEND_TYPE]), sizeof(u8), &bin_args->type);
  2890  
  2891      u64 cgroup_id = event->context.task.cgroup_id;
  2892      bpf_probe_read_kernel((void **) &(file_buf_p->buf[F_CGROUP_ID]), sizeof(u64), &cgroup_id);
  2893  
  2894      // Save metadata to be used in filename
  2895      bpf_probe_read_kernel(
  2896          (void **) &(file_buf_p->buf[F_META_OFF]), SEND_META_SIZE, bin_args->metadata);
  2897  
  2898      // Save number of written bytes. Set this to CHUNK_SIZE for full chunks
  2899      chunk_size = F_CHUNK_SIZE;
  2900      bpf_probe_read_kernel(
  2901          (void **) &(file_buf_p->buf[F_SZ_OFF]), sizeof(unsigned int), &chunk_size);
  2902  
  2903      unsigned int full_chunk_num = bin_args->full_size / F_CHUNK_SIZE;
  2904      void *data = file_buf_p->buf;
  2905  
  2906  // Handle full chunks in loop
  2907  #pragma unroll
  2908      for (i = 0; i < MAX_BIN_CHUNKS; i++) {
  2909          // Dummy instruction, as break instruction can't be first with unroll optimization
  2910          chunk_size = F_CHUNK_SIZE;
  2911  
  2912          if (i == full_chunk_num)
  2913              break;
  2914  
  2915          // Save binary chunk and file position of write
  2916          bpf_probe_read_kernel(
  2917              (void **) &(file_buf_p->buf[F_POS_OFF]), sizeof(off_t), &bin_args->start_off);
  2918          bpf_probe_read_kernel(
  2919              (void **) &(file_buf_p->buf[F_CHUNK_OFF]), F_CHUNK_SIZE, bin_args->ptr);
  2920          bin_args->ptr += F_CHUNK_SIZE;
  2921          bin_args->start_off += F_CHUNK_SIZE;
  2922  
  2923          bpf_perf_event_output(
  2924              ctx, &file_writes, BPF_F_CURRENT_CPU, data, F_CHUNK_OFF + F_CHUNK_SIZE);
  2925      }
  2926  
  2927      chunk_size = bin_args->full_size - i * F_CHUNK_SIZE;
  2928  
  2929      if (chunk_size > F_CHUNK_SIZE) {
  2930          // Handle the rest of write recursively
  2931          bin_args->full_size = chunk_size;
  2932          bpf_tail_call(ctx, prog_array, tail_call);
  2933          return 0;
  2934      }
  2935  
  2936      if (chunk_size) {
  2937          // Save last chunk
  2938          chunk_size = chunk_size & ((MAX_PERCPU_BUFSIZE >> 1) - 1);
  2939          bpf_probe_read_kernel((void **) &(file_buf_p->buf[F_CHUNK_OFF]), chunk_size, bin_args->ptr);
  2940          bpf_probe_read_kernel(
  2941              (void **) &(file_buf_p->buf[F_SZ_OFF]), sizeof(unsigned int), &chunk_size);
  2942          bpf_probe_read_kernel(
  2943              (void **) &(file_buf_p->buf[F_POS_OFF]), sizeof(off_t), &bin_args->start_off);
  2944  
  2945          // Satisfy the verifier by explicitly bounding the buffer size
  2946          int size = (F_CHUNK_OFF + chunk_size) & (MAX_PERCPU_BUFSIZE - 1);
  2947          bpf_perf_event_output(ctx, &file_writes, BPF_F_CURRENT_CPU, data, size);
  2948      }
  2949  
  2950      // We finished writing an element of the vector - continue to next element
  2951      bin_args->iov_idx++;
  2952      if (bin_args->iov_idx < bin_args->iov_len) {
  2953          // Handle the rest of write recursively
  2954          bin_args->start_off += bin_args->full_size;
  2955          struct iovec io_vec;
  2956          bpf_probe_read(&io_vec, sizeof(struct iovec), &bin_args->vec[bin_args->iov_idx]);
  2957          bin_args->ptr = io_vec.iov_base;
  2958          bin_args->full_size = io_vec.iov_len;
  2959          bpf_tail_call(ctx, prog_array, tail_call);
  2960      }
  2961  
  2962      return 0;
  2963  }
  2964  
  2965  SEC("kprobe/send_bin")
  2966  int BPF_KPROBE(send_bin)
  2967  {
  2968      return send_bin_helper(ctx, &prog_array, TAIL_SEND_BIN);
  2969  }
  2970  
  2971  SEC("raw_tracepoint/send_bin_tp")
  2972  int send_bin_tp(void *ctx)
  2973  {
  2974      return send_bin_helper(ctx, &prog_array_tp, TAIL_SEND_BIN_TP);
  2975  }
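        // Note: the "handle the rest recursively" paths in send_bin_helper() are
        // tail calls into the same program, so the recursion depth is bounded by
        // the kernel's tail-call chain limit rather than being truly unbounded.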
  2976  
  2977  statfunc bool should_submit_io_event(u32 event_id, program_data_t *p)
  2978  {
  2979      return ((event_id == VFS_READ || event_id == VFS_READV || event_id == VFS_WRITE ||
  2980               event_id == VFS_WRITEV || event_id == __KERNEL_WRITE) &&
  2981              should_submit(event_id, p->event));
  2982  }
  2983  
  2984  statfunc int is_elf(io_data_t io_data, u8 header[FILE_MAGIC_HDR_SIZE])
  2985  {
  2986      // ELF files start with a 4-byte magic: 0x7f 'E' 'L' 'F'
  2987      if (io_data.len < 4) {
  2988          return false;
  2989      }
  2990  
  2991      return header[0] == 0x7F && header[1] == 'E' && header[2] == 'L' && header[3] == 'F';
  2992  }
  2993  
  2994  /** do_file_io_operation - generic file IO (read and write) event creator.
  2995   *
  2996   * @ctx:            the state of the registers prior to the hook.
  2997   * @event_id:       the ID of the event to be created.
  2998   * @tail_call_id:   the ID of the tail call to be called before function return.
  2999   * @is_read:        true if the operation is read. False if write.
  3000   * @is_buf:         true if the non-file side of the operation is a buffer. False if io_vector.
  3001   */
  3002  statfunc int
  3003  do_file_io_operation(struct pt_regs *ctx, u32 event_id, u32 tail_call_id, bool is_read, bool is_buf)
  3004  {
  3005      args_t saved_args;
  3006      if (load_args(&saved_args, event_id) != 0) {
  3007          // missed entry or not traced
  3008          return 0;
  3009      }
  3010      // We shouldn't call del_args(event_id) here as the arguments are also used by the tail call
  3011  
  3012      program_data_t p = {};
  3013      if (!init_program_data(&p, ctx)) {
  3014          goto out;
  3015      }
  3016  
  3017      if (!should_trace(&p)) {
  3018          goto out;
  3019      }
  3020  
  3021      if (!should_submit_io_event(event_id, &p)) {
  3022          goto tail;
  3023      }
  3024  
  3025      loff_t start_pos;
  3026      io_data_t io_data;
  3027      file_info_t file_info;
  3028  
  3029      struct file *file = (struct file *) saved_args.args[0];
  3030      file_info.pathname_p = get_path_str_cached(file);
  3031  
  3032      io_data.is_buf = is_buf;
  3033      io_data.ptr = (void *) saved_args.args[1];
  3034      io_data.len = (unsigned long) saved_args.args[2];
  3035      loff_t *pos = (loff_t *) saved_args.args[3];
  3036  
  3037      // Extract device id, inode number, and pos (offset)
  3038      file_info.id.device = get_dev_from_file(file);
  3039      file_info.id.inode = get_inode_nr_from_file(file);
  3040      bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos);
  3041  
  3042      u32 io_bytes_amount = PT_REGS_RC(ctx);
  3043  
  3044      // Calculate the I/O start offset (this path serves both reads and writes)
  3045      if (start_pos != 0)
  3046          start_pos -= io_bytes_amount;
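        // (By the time the kretprobe runs, *pos has already been advanced by the
        // bytes transferred, so subtracting the return value recovers the offset
        // at which the operation started.)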
  3047  
  3048      save_str_to_buf(&p.event->args_buf, file_info.pathname_p, 0);
  3049      save_to_submit_buf(&p.event->args_buf, &file_info.id.device, sizeof(dev_t), 1);
  3050      save_to_submit_buf(&p.event->args_buf, &file_info.id.inode, sizeof(unsigned long), 2);
  3051      save_to_submit_buf(&p.event->args_buf, &io_data.len, sizeof(unsigned long), 3);
  3052      save_to_submit_buf(&p.event->args_buf, &start_pos, sizeof(off_t), 4);
  3053  
  3054      // Submit io event
  3055      events_perf_submit(&p, event_id, PT_REGS_RC(ctx));
  3056  
  3057  tail:
  3058      bpf_tail_call(ctx, &prog_array, tail_call_id);
  3059  out:
  3060      del_args(event_id);
  3061  
  3062      return 0;
  3063  }
  3064  
  3065  statfunc void
  3066  extract_vfs_ret_io_data(struct pt_regs *ctx, args_t *saved_args, io_data_t *io_data, bool is_buf)
  3067  {
  3068      io_data->is_buf = is_buf;
  3069      if (is_buf) {
  3070          io_data->ptr = (void *) saved_args->args[1];
  3071          io_data->len = (size_t) PT_REGS_RC(ctx);
  3072      } else {
  3073          io_data->ptr = (struct iovec *) saved_args->args[1];
  3074          io_data->len = saved_args->args[2];
  3075      }
  3076  }
  3077  
  3078  // Filter capture of file writes according to path prefix, type and fd.
  3079  statfunc bool
  3080  filter_file_write_capture(program_data_t *p, struct file *file, io_data_t io_data, off_t start_pos)
  3081  {
  3082      return filter_file_path(p->ctx, &file_write_path_filter, file) ||
  3083             filter_file_type(p->ctx,
  3084                              &file_type_filter,
  3085                              CAPTURE_WRITE_TYPE_FILTER_IDX,
  3086                              file,
  3087                              io_data,
  3088                              start_pos) ||
  3089             filter_file_fd(p->ctx, &file_type_filter, CAPTURE_WRITE_TYPE_FILTER_IDX, file);
  3090  }
  3091  
  3092  // Capture file write
  3093  // Will only capture if:
  3094  // 1. File write capture was configured
  3095  // 2. File matches the filters given
  3096  statfunc int capture_file_write(struct pt_regs *ctx, u32 event_id, bool is_buf)
  3097  {
  3098      args_t saved_args;
  3099      io_data_t io_data;
  3100  
  3101      if (load_args(&saved_args, event_id) != 0)
  3102          return 0;
  3103      del_args(event_id);
  3104  
  3105      program_data_t p = {};
  3106      if (!init_program_data(&p, ctx))
  3107          return 0;
  3108  
  3109      if ((p.config->options & OPT_CAPTURE_FILES_WRITE) == 0)
  3110          return 0;
  3111  
  3112      extract_vfs_ret_io_data(ctx, &saved_args, &io_data, is_buf);
  3113      struct file *file = (struct file *) saved_args.args[0];
  3114      loff_t *pos = (loff_t *) saved_args.args[3];
  3115      size_t written_bytes = PT_REGS_RC(ctx);
  3116  
  3117      off_t start_pos;
  3118      bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos);
  3119      // Calculate write start offset
  3120      if (start_pos != 0)
  3121          start_pos -= written_bytes;
  3122  
  3123      if (filter_file_write_capture(&p, file, io_data, start_pos)) {
  3124          // There is a filter, but no match
  3125          return 0;
  3126      }
  3127      // No filter was given, or filter match - continue
  3128  
  3129      // Because we don't pass the file path in the capture map, we can't do path checks in user mode.
  3130      // We don't want to pass the PID for most file writes, because we want to save writes according
  3131      // to the inode-device only. In the case of writes to /dev/null, we want to pass the PID because
  3132      // otherwise the capture will overwrite itself.
  3133      int pid = 0;
  3134      void *path_buf = get_path_str_cached(file);
  3135      if (path_buf != NULL && has_prefix("/dev/null", (char *) path_buf, 10)) {
  3136          pid = p.event->context.task.pid;
  3137      }
  3138  
  3139      bin_args_t bin_args = {};
  3140      fill_vfs_file_bin_args(SEND_VFS_WRITE, file, pos, io_data, PT_REGS_RC(ctx), pid, &bin_args);
  3141  
  3142      // Send file data
  3143      tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN);
  3144      return 0;
  3145  }
  3146  
  3147  // Filter capture of file reads according to path prefix, type and fd.
  3148  statfunc bool
  3149  filter_file_read_capture(program_data_t *p, struct file *file, io_data_t io_data, off_t start_pos)
  3150  {
  3151      return filter_file_path(p->ctx, &file_read_path_filter, file) ||
  3152             filter_file_type(
  3153                 p->ctx, &file_type_filter, CAPTURE_READ_TYPE_FILTER_IDX, file, io_data, start_pos) ||
  3154             filter_file_fd(p->ctx, &file_type_filter, CAPTURE_READ_TYPE_FILTER_IDX, file);
  3155  }
  3156  
  3157  statfunc int capture_file_read(struct pt_regs *ctx, u32 event_id, bool is_buf)
  3158  {
  3159      args_t saved_args;
  3160      io_data_t io_data;
  3161  
  3162      if (load_args(&saved_args, event_id) != 0)
  3163          return 0;
  3164      del_args(event_id);
  3165  
  3166      program_data_t p = {};
  3167      if (!init_program_data(&p, ctx))
  3168          return 0;
  3169  
  3170      if ((p.config->options & OPT_CAPTURE_FILES_READ) == 0)
  3171          return 0;
  3172  
  3173      extract_vfs_ret_io_data(ctx, &saved_args, &io_data, is_buf);
  3174      struct file *file = (struct file *) saved_args.args[0];
  3175      loff_t *pos = (loff_t *) saved_args.args[3];
  3176      size_t read_bytes = PT_REGS_RC(ctx);
  3177  
  3178      off_t start_pos;
  3179      bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos);
  3180      // Calculate read start offset
  3181      if (start_pos != 0)
  3182          start_pos -= read_bytes;
  3183  
  3184      if (filter_file_read_capture(&p, file, io_data, start_pos)) {
  3185          // There is a filter, but no match
  3186          return 0;
  3187      }
  3188      // No filter was given, or filter match - continue
  3189  
  3190      bin_args_t bin_args = {};
  3192      fill_vfs_file_bin_args(SEND_VFS_READ, file, pos, io_data, PT_REGS_RC(ctx), 0, &bin_args);
  3193  
  3194      // Send file data
  3195      tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN);
  3196      return 0;
  3197  }
  3198  
  3199  SEC("kprobe/vfs_write")
  3200  TRACE_ENT_FUNC(vfs_write, VFS_WRITE);
  3201  
  3202  SEC("kretprobe/vfs_write")
  3203  int BPF_KPROBE(trace_ret_vfs_write)
  3204  {
  3205      return do_file_io_operation(ctx, VFS_WRITE, TAIL_VFS_WRITE, false, true);
  3206  }
  3207  
  3208  SEC("kretprobe/vfs_write_tail")
  3209  int BPF_KPROBE(trace_ret_vfs_write_tail)
  3210  {
  3211      return capture_file_write(ctx, VFS_WRITE, true);
  3212  }
  3213  
  3214  SEC("kprobe/vfs_writev")
  3215  TRACE_ENT_FUNC(vfs_writev, VFS_WRITEV);
  3216  
  3217  SEC("kretprobe/vfs_writev")
  3218  int BPF_KPROBE(trace_ret_vfs_writev)
  3219  {
  3220      return do_file_io_operation(ctx, VFS_WRITEV, TAIL_VFS_WRITEV, false, false);
  3221  }
  3222  
  3223  SEC("kretprobe/vfs_writev_tail")
  3224  int BPF_KPROBE(trace_ret_vfs_writev_tail)
  3225  {
  3226      return capture_file_write(ctx, VFS_WRITEV, false);
  3227  }
  3228  
  3229  SEC("kprobe/__kernel_write")
  3230  TRACE_ENT_FUNC(kernel_write, __KERNEL_WRITE);
  3231  
  3232  SEC("kretprobe/__kernel_write")
  3233  int BPF_KPROBE(trace_ret_kernel_write)
  3234  {
  3235      return do_file_io_operation(ctx, __KERNEL_WRITE, TAIL_KERNEL_WRITE, false, true);
  3236  }
  3237  
  3238  SEC("kretprobe/__kernel_write_tail")
  3239  int BPF_KPROBE(trace_ret_kernel_write_tail)
  3240  {
  3241      return capture_file_write(ctx, __KERNEL_WRITE, true);
  3242  }
  3243  
  3244  SEC("kprobe/vfs_read")
  3245  TRACE_ENT_FUNC(vfs_read, VFS_READ);
  3246  
  3247  SEC("kretprobe/vfs_read")
  3248  int BPF_KPROBE(trace_ret_vfs_read)
  3249  {
  3250      return do_file_io_operation(ctx, VFS_READ, TAIL_VFS_READ, true, true);
  3251  }
  3252  
  3253  SEC("kretprobe/vfs_read_tail")
  3254  int BPF_KPROBE(trace_ret_vfs_read_tail)
  3255  {
  3256      return capture_file_read(ctx, VFS_READ, true);
  3257  }
  3258  
  3259  SEC("kprobe/vfs_readv")
  3260  TRACE_ENT_FUNC(vfs_readv, VFS_READV);
  3261  
  3262  SEC("kretprobe/vfs_readv")
  3263  int BPF_KPROBE(trace_ret_vfs_readv)
  3264  {
  3265      return do_file_io_operation(ctx, VFS_READV, TAIL_VFS_READV, true, false);
  3266  }
  3267  
  3268  SEC("kretprobe/vfs_readv_tail")
  3269  int BPF_KPROBE(trace_ret_vfs_readv_tail)
  3270  {
  3271      return capture_file_read(ctx, VFS_READV, false);
  3272  }
  3273  
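        // Entry half of the magic_write pair: only record writes that start at
        // offset 0 of a regular file, so the matching return probe can examine the
        // first bytes (the "magic") that the write placed there.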
  3274  statfunc int do_vfs_write_magic_enter(struct pt_regs *ctx)
  3275  {
  3276      loff_t start_pos;
  3277      loff_t *pos = (loff_t *) PT_REGS_PARM4(ctx);
  3278      bpf_probe_read_kernel(&start_pos, sizeof(off_t), pos);
  3279      if (start_pos != 0) {
  3280          return 0;
  3281      }
  3282      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  3283      unsigned short i_mode = get_inode_mode_from_file(file);
  3284      if ((i_mode & S_IFMT) != S_IFREG) {
  3285          return 0;
  3286      }
  3287  
  3288      args_t args = {};
  3289      args.args[0] = PT_REGS_PARM1(ctx);
  3290      args.args[1] = PT_REGS_PARM2(ctx);
  3291      args.args[2] = PT_REGS_PARM3(ctx);
  3292      args.args[3] = PT_REGS_PARM4(ctx);
  3293      args.args[4] = PT_REGS_PARM5(ctx);
  3294      args.args[5] = PT_REGS_PARM6(ctx);
  3295  
  3296      return save_args(&args, MAGIC_WRITE);
  3297  }
  3298  
  3299  statfunc int do_vfs_write_magic_return(struct pt_regs *ctx, bool is_buf)
  3300  {
  3301      args_t saved_args;
  3302      if (load_args(&saved_args, MAGIC_WRITE) != 0) {
  3303          // missed entry or not traced
  3304          return 0;
  3305      }
  3306      del_args(MAGIC_WRITE);
  3307  
  3308      program_data_t p = {};
  3309      if (!init_program_data(&p, ctx))
  3310          return 0;
  3311  
  3312      if (!should_trace(&p)) {
  3313          return 0;
  3314      }
  3315  
  3316      if (!should_submit(MAGIC_WRITE, p.event)) {
  3317          return 0;
  3318      }
  3319  
  3320      u32 bytes_written = PT_REGS_RC(ctx);
  3321      if (bytes_written == 0) {
  3322          return 0;
  3323      }
  3324  
  3325      io_data_t io_data;
  3326      file_info_t file_info;
  3327  
  3328      struct file *file = (struct file *) saved_args.args[0];
  3329      file_info.pathname_p = get_path_str_cached(file);
  3330  
  3331      io_data.is_buf = is_buf;
  3332      io_data.ptr = (void *) saved_args.args[1];
  3333      io_data.len = (unsigned long) saved_args.args[2];
  3334  
  3335      // Extract device id, inode number, and pos (offset)
  3336      file_info.id.device = get_dev_from_file(file);
  3337      file_info.id.inode = get_inode_nr_from_file(file);
  3338  
  3339      u32 header_bytes = FILE_MAGIC_HDR_SIZE;
  3340      if (header_bytes > bytes_written)
  3341          header_bytes = bytes_written;
  3342  
  3343      u8 header[FILE_MAGIC_HDR_SIZE];
  3344      __builtin_memset(&header, 0, sizeof(header));
  3345  
  3346      save_str_to_buf(&(p.event->args_buf), file_info.pathname_p, 0);
  3347  
  3348      fill_file_header(header, io_data);
  3349  
  3350      if (!is_elf(io_data, header)) {
  3351          return 0;
  3352      }
  3353  
  3354      save_bytes_to_buf(&(p.event->args_buf), header, header_bytes, 1);
  3355      save_to_submit_buf(&(p.event->args_buf), &file_info.id.device, sizeof(dev_t), 2);
  3356      save_to_submit_buf(&(p.event->args_buf), &file_info.id.inode, sizeof(unsigned long), 3);
  3357  
  3358      // Submit magic_write event
  3359      return events_perf_submit(&p, MAGIC_WRITE, bytes_written);
  3360  }
  3361  
  3362  SEC("kprobe/vfs_write")
  3363  int BPF_KPROBE(vfs_write_magic_enter)
  3364  {
  3365      return do_vfs_write_magic_enter(ctx);
  3366  }
  3367  
  3368  SEC("kprobe/vfs_writev")
  3369  int BPF_KPROBE(vfs_writev_magic_enter)
  3370  {
  3371      return do_vfs_write_magic_enter(ctx);
  3372  }
  3373  
  3374  SEC("kprobe/__kernel_write")
  3375  int BPF_KPROBE(kernel_write_magic_enter)
  3376  {
  3377      return do_vfs_write_magic_enter(ctx);
  3378  }
  3379  
  3380  SEC("kretprobe/vfs_write")
  3381  int BPF_KPROBE(vfs_write_magic_return)
  3382  {
  3383      return do_vfs_write_magic_return(ctx, true);
  3384  }
  3385  
  3386  SEC("kretprobe/vfs_writev")
  3387  int BPF_KPROBE(vfs_writev_magic_return)
  3388  {
  3389      return do_vfs_write_magic_return(ctx, false);
  3390  }
  3391  
  3392  SEC("kretprobe/__kernel_write")
  3393  int BPF_KPROBE(kernel_write_magic_return)
  3394  {
  3395      return do_vfs_write_magic_return(ctx, true);
  3396  }
  3397  // A macro is used because of a verifier problem in the NONCORE kinetic519 build.
        // Note: the macro expects a local `program_data_t p` to be in scope at the call site.
  3398  #define submit_mem_prot_alert_event(event, alert, addr, len, prot, previous_prot, file_info)       \
  3399      {                                                                                              \
  3400          save_to_submit_buf(event, &alert, sizeof(u32), 0);                                         \
  3401          save_to_submit_buf(event, &addr, sizeof(void *), 1);                                       \
  3402          save_to_submit_buf(event, &len, sizeof(size_t), 2);                                        \
  3403          save_to_submit_buf(event, &prot, sizeof(int), 3);                                          \
  3404          save_to_submit_buf(event, &previous_prot, sizeof(int), 4);                                 \
  3405          if (file_info.pathname_p != NULL) {                                                        \
  3406              save_str_to_buf(event, file_info.pathname_p, 5);                                       \
  3407              save_to_submit_buf(event, &file_info.id.device, sizeof(dev_t), 6);                     \
  3408              save_to_submit_buf(event, &file_info.id.inode, sizeof(unsigned long), 7);              \
  3409              save_to_submit_buf(event, &file_info.id.ctime, sizeof(u64), 8);                        \
  3410          }                                                                                          \
  3411          events_perf_submit(&p, MEM_PROT_ALERT, 0);                                                 \
  3412      }
  3413  
  3414  SEC("kprobe/security_mmap_addr")
  3415  int BPF_KPROBE(trace_mmap_alert)
  3416  {
  3417      program_data_t p = {};
  3418      if (!init_program_data(&p, ctx))
  3419          return 0;
  3420  
  3421      if (!should_trace(&p))
  3422          return 0;
  3423  
  3424      // Load the arguments given to the mmap syscall (which eventually invokes this function)
  3425      syscall_data_t *sys = &p.task_info->syscall_data;
  3426      if (!p.task_info->syscall_traced || sys->id != SYSCALL_MMAP)
  3427          return 0;
  3428  
  3429      int prot = sys->args.args[2];
  3430  
  3431      if ((prot & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC) &&
  3432          should_submit(MEM_PROT_ALERT, p.event)) {
  3433          u32 alert = ALERT_MMAP_W_X;
  3434          int fd = sys->args.args[4];
  3435          void *addr = (void *) sys->args.args[0];
  3436          size_t len = sys->args.args[1];
  3437          int prev_prot = 0;
  3438          file_info_t file_info = {.pathname_p = NULL};
  3439          if (fd >= 0) {
  3440              struct file *file = get_struct_file_from_fd(fd);
  3441              file_info = get_file_info(file);
  3442          }
  3443          submit_mem_prot_alert_event(
  3444              &p.event->args_buf, alert, addr, len, prot, prev_prot, file_info);
  3445      }
  3446  
  3447      return 0;
  3448  }
  3449  
  3450  SEC("kprobe/do_mmap")
  3451  TRACE_ENT_FUNC(do_mmap, DO_MMAP)
  3452  
  3453  SEC("kretprobe/do_mmap")
  3454  int BPF_KPROBE(trace_ret_do_mmap)
  3455  {
  3456      args_t saved_args;
  3457      if (load_args(&saved_args, DO_MMAP) != 0) {
  3458          // missed entry or not traced
  3459          return 0;
  3460      }
  3461  
  3462      program_data_t p = {};
  3463      if (!init_program_data(&p, ctx))
  3464          return 0;
  3465  
  3466      if (!should_trace(&p))
  3467          return 0;
  3468  
  3469      if (!should_submit(DO_MMAP, p.event)) {
  3470          return 0;
  3471      }
  3472  
  3473      dev_t s_dev;
  3474      unsigned long inode_nr;
  3475      void *file_path;
  3476      u64 ctime;
  3477      unsigned int flags = 0;
  3478  
  3479      struct file *file = (struct file *) saved_args.args[0];
  3480      if (file != NULL) {
  3481          s_dev = get_dev_from_file(file);
  3482          inode_nr = get_inode_nr_from_file(file);
  3483          file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  3484          ctime = get_ctime_nanosec_from_file(file);
                // Open flags of the backing file (f_flags), submitted as arg 2 below.
                flags = BPF_CORE_READ(file, f_flags);
  3485      }
  3486      unsigned long len = (unsigned long) saved_args.args[2];
  3487      unsigned long prot = (unsigned long) saved_args.args[3];
  3488      unsigned long mmap_flags = (unsigned long) saved_args.args[4];
  3489      unsigned long pgoff = (unsigned long) saved_args.args[5];
  3490      unsigned long addr = (unsigned long) PT_REGS_RC(ctx);
  3491  
  3492      save_to_submit_buf(&p.event->args_buf, &addr, sizeof(void *), 0);
  3493      if (file != NULL) {
  3494          save_str_to_buf(&p.event->args_buf, file_path, 1);
  3495          save_to_submit_buf(&p.event->args_buf, &flags, sizeof(unsigned int), 2);
  3496          save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 3);
  3497          save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 4);
  3498          save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 5);
  3499      }
  3500      save_to_submit_buf(&p.event->args_buf, &pgoff, sizeof(unsigned long), 6);
  3501      save_to_submit_buf(&p.event->args_buf, &len, sizeof(unsigned long), 7);
  3502      save_to_submit_buf(&p.event->args_buf, &prot, sizeof(unsigned long), 8);
  3503      save_to_submit_buf(&p.event->args_buf, &mmap_flags, sizeof(unsigned long), 9);
  3504  
  3505      return events_perf_submit(&p, DO_MMAP, 0);
  3506  }
  3507  
  3508  SEC("kprobe/security_mmap_file")
  3509  int BPF_KPROBE(trace_security_mmap_file)
  3510  {
  3511      program_data_t p = {};
  3512      if (!init_program_data(&p, ctx))
  3513          return 0;
  3514  
  3515      if (!should_trace(&p))
  3516          return 0;
  3517  
  3518      bool submit_sec_mmap_file = should_submit(SECURITY_MMAP_FILE, p.event);
  3519      bool submit_shared_object_loaded = should_submit(SHARED_OBJECT_LOADED, p.event);
  3520  
  3521      if (!submit_sec_mmap_file && !submit_shared_object_loaded)
  3522          return 0;
  3523  
  3524      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  3525      if (file == NULL)
  3526          return 0;
  3527      dev_t s_dev = get_dev_from_file(file);
  3528      unsigned long inode_nr = get_inode_nr_from_file(file);
  3529      void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  3530      u64 ctime = get_ctime_nanosec_from_file(file);
  3531      unsigned long prot = (unsigned long) PT_REGS_PARM2(ctx);
  3532      unsigned long mmap_flags = (unsigned long) PT_REGS_PARM3(ctx);
  3533  
  3534      save_str_to_buf(&p.event->args_buf, file_path, 0);
  3535      save_to_submit_buf(&p.event->args_buf,
  3536                         (void *) __builtin_preserve_access_index(&file->f_flags),
  3537                         sizeof(int),
  3538                         1);
  3539      save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 2);
  3540      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 3);
  3541      save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4);
  3542  
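            // Args 0-4 above are shared by both events: SHARED_OBJECT_LOADED is
            // submitted with only those, while SECURITY_MMAP_FILE appends prot and
            // mmap flags below.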
  3543      if (submit_shared_object_loaded) {
  3544          if ((prot & VM_EXEC) == VM_EXEC && p.event->context.syscall == SYSCALL_MMAP) {
  3545              events_perf_submit(&p, SHARED_OBJECT_LOADED, 0);
  3546          }
  3547      }
  3548  
  3549      if (submit_sec_mmap_file) {
  3550          save_to_submit_buf(&p.event->args_buf, &prot, sizeof(unsigned long), 5);
  3551          save_to_submit_buf(&p.event->args_buf, &mmap_flags, sizeof(unsigned long), 6);
  3552          return events_perf_submit(&p, SECURITY_MMAP_FILE, 0);
  3553      }
  3554  
  3555      return 0;
  3556  }
  3557  
  3558  SEC("kprobe/security_file_mprotect")
  3559  int BPF_KPROBE(trace_security_file_mprotect)
  3560  {
  3561      bin_args_t bin_args = {};
  3562  
  3563      program_data_t p = {};
  3564      if (!init_program_data(&p, ctx))
  3565          return 0;
  3566  
  3567      if (!should_trace(&p))
  3568          return 0;
  3569  
  3570      // Load the arguments given to the mprotect syscall (which eventually invokes this function)
  3571      syscall_data_t *sys = &p.task_info->syscall_data;
  3572      if (!p.task_info->syscall_traced ||
  3573          (sys->id != SYSCALL_MPROTECT && sys->id != SYSCALL_PKEY_MPROTECT))
  3574          return 0;
  3575  
  3576      int should_submit_mprotect = should_submit(SECURITY_FILE_MPROTECT, p.event);
  3577      int should_submit_mem_prot_alert = should_submit(MEM_PROT_ALERT, p.event);
  3578  
  3579      if (!should_submit_mprotect && !should_submit_mem_prot_alert) {
  3580          return 0;
  3581      }
  3582  
  3583      struct vm_area_struct *vma = (struct vm_area_struct *) PT_REGS_PARM1(ctx);
  3584      unsigned long reqprot = PT_REGS_PARM2(ctx);
  3585      unsigned long prev_prot = get_vma_flags(vma);
  3586  
  3587      struct file *file = (struct file *) BPF_CORE_READ(vma, vm_file);
  3588      file_info_t file_info = get_file_info(file);
  3589  
  3590      if (should_submit_mprotect) {
  3591          void *addr = (void *) sys->args.args[0];
  3592          size_t len = sys->args.args[1];
  3593  
  3594          save_str_to_buf(&p.event->args_buf, file_info.pathname_p, 0);
  3595          save_to_submit_buf(&p.event->args_buf, &reqprot, sizeof(int), 1);
  3596          save_to_submit_buf(&p.event->args_buf, &file_info.id.ctime, sizeof(u64), 2);
  3597          save_to_submit_buf(&p.event->args_buf, &prev_prot, sizeof(int), 3);
  3598          save_to_submit_buf(&p.event->args_buf, &addr, sizeof(void *), 4);
  3599          save_to_submit_buf(&p.event->args_buf, &len, sizeof(size_t), 5);
  3600  
  3601          if (sys->id == SYSCALL_PKEY_MPROTECT) {
  3602              int pkey = sys->args.args[3];
  3603              save_to_submit_buf(&p.event->args_buf, &pkey, sizeof(int), 6);
  3604          }
  3605  
  3606          events_perf_submit(&p, SECURITY_FILE_MPROTECT, 0);
  3607      }
  3608  
  3609      if (should_submit_mem_prot_alert) {
  3610          void *addr = (void *) sys->args.args[0];
  3611          size_t len = sys->args.args[1];
  3612  
  3613          if (addr == NULL)
  3614              return 0;
  3615  
  3616          // If length is 0, the current page permissions are changed
  3617          if (len == 0)
  3618              len = PAGE_SIZE;
  3619  
  3620          u32 alert;
  3621          bool should_alert = false;
  3622          bool should_extract_code = false;
  3623  
  3624          if ((!(prev_prot & VM_EXEC)) && (reqprot & VM_EXEC)) {
  3625              alert = ALERT_MPROT_X_ADD;
  3626              should_alert = true;
  3627          }
  3628  
  3629          if ((prev_prot & VM_EXEC) && !(prev_prot & VM_WRITE) &&
  3630              ((reqprot & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC))) {
  3631              alert = ALERT_MPROT_W_ADD;
  3632              should_alert = true;
  3633          }
  3634  
  3635          if ((prev_prot & VM_WRITE) && (reqprot & VM_EXEC) && !(reqprot & VM_WRITE)) {
  3636              alert = ALERT_MPROT_W_REM;
  3637              should_alert = true;
  3638  
  3639              if (p.config->options & OPT_EXTRACT_DYN_CODE) {
  3640                  should_extract_code = true;
  3641              }
  3642          }
  3643          if (should_alert) {
  3644              reset_event_args(&p);
  3645              submit_mem_prot_alert_event(
  3646                  &p.event->args_buf, alert, addr, len, reqprot, prev_prot, file_info);
  3647          }
  3648          if (should_extract_code) {
  3649              u32 pid = p.event->context.task.host_pid;
  3650              bin_args.type = SEND_MPROTECT;
  3651              bpf_probe_read(bin_args.metadata, sizeof(u64), &p.event->context.ts);
  3652              bpf_probe_read(&bin_args.metadata[8], 4, &pid);
  3653              bin_args.ptr = (char *) addr;
  3654              bin_args.start_off = 0;
  3655              bin_args.full_size = len;
  3656  
  3657              tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN);
  3658          }
  3659      }
  3660  
  3661      return 0;
  3662  }
  3663  
  3664  SEC("raw_tracepoint/sys_init_module")
  3665  int syscall__init_module(void *ctx)
  3666  {
  3667      program_data_t p = {};
  3668      if (!init_program_data(&p, ctx))
  3669          return 0;
  3670  
  3671      syscall_data_t *sys = &p.task_info->syscall_data;
  3672      if (!p.task_info->syscall_traced)
  3673          return -1;
  3674  
  3675      bin_args_t bin_args = {};
  3676  
  3677      u32 pid = p.event->context.task.host_pid;
  3678      u64 dummy = 0;
  3679      void *addr = (void *) sys->args.args[0];
  3680      unsigned long len = (unsigned long) sys->args.args[1];
  3681  
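            // Metadata layout for this capture: dev(4) + inode(8) + pid(4) + len(8).
            // dev and inode are zeroed (dummy) here, since init_module loads the
            // module from a user buffer rather than from a backing file.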
  3682      if (p.config->options & OPT_CAPTURE_MODULES) {
  3683          bin_args.type = SEND_KERNEL_MODULE;
  3684          bpf_probe_read_kernel(bin_args.metadata, 4, &dummy);
  3685          bpf_probe_read_kernel(&bin_args.metadata[4], 8, &dummy);
  3686          bpf_probe_read_kernel(&bin_args.metadata[12], 4, &pid);
  3687          bpf_probe_read_kernel(&bin_args.metadata[16], 8, &len);
  3688          bin_args.ptr = (char *) addr;
  3689          bin_args.start_off = 0;
  3690          bin_args.full_size = (unsigned int) len;
  3691  
  3692          tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN_TP);
  3693      }
  3694      return 0;
  3695  }
  3696  
  3697  statfunc int do_check_bpf_link(program_data_t *p, union bpf_attr *attr, int cmd)
  3698  {
  3699      if (cmd == BPF_LINK_CREATE) {
  3700          u32 prog_fd = BPF_CORE_READ(attr, link_create.prog_fd);
  3701          u32 perf_fd = BPF_CORE_READ(attr, link_create.target_fd);
  3702  
  3703          struct file *bpf_prog_file = get_struct_file_from_fd(prog_fd);
  3704          struct file *perf_event_file = get_struct_file_from_fd(perf_fd);
  3705  
  3706          send_bpf_perf_attach(p, bpf_prog_file, perf_event_file);
  3707      }
  3708  
  3709      return 0;
  3710  }
  3711  
  3712  statfunc int check_bpf_link(program_data_t *p, union bpf_attr *attr, int cmd)
  3713  {
  3714      // The BPF_LINK_CREATE command was only introduced in kernel 5.7, so there is
  3715      // nothing to check on older kernels. bpf_core_field_exists() is resolved at
            // load time via CO-RE, so this branch is pruned where link_create is absent.
  3716  
  3717      if (bpf_core_field_exists(attr->link_create)) {
  3718          do_check_bpf_link(p, attr, cmd);
  3719      }
  3720  
  3721      return 0;
  3722  }
  3723  
  3724  // TODO: This fails on 5.4 kernel with error:
  3725  // loading ebpf module: field TraceSecurityBpf: program trace_security_bpf:
  3726  // load program: permission denied: 1595: (73) *(u8 *)(r10 -120) = r1: ; return (const str
  3727  // (truncated, 951 line(s) omitted)
  3728  SEC("kprobe/security_bpf")
  3729  int BPF_KPROBE(trace_security_bpf)
  3730  {
  3731      program_data_t p = {};
  3732      if (!init_program_data(&p, ctx))
  3733          return 0;
  3734  
  3735      if (!should_trace(&p))
  3736          return 0;
  3737  
  3738      int cmd = (int) PT_REGS_PARM1(ctx);
  3739  
  3740      if (should_submit(SECURITY_BPF, p.event)) {
  3741          // 1st argument == cmd (int)
  3742          save_to_submit_buf(&p.event->args_buf, (void *) &cmd, sizeof(int), 0);
  3743          events_perf_submit(&p, SECURITY_BPF, 0);
  3744      }
  3745      union bpf_attr *attr = (union bpf_attr *) PT_REGS_PARM2(ctx);
  3746  
  3747      reset_event_args(&p);
  3748      check_bpf_link(&p, attr, cmd);
  3749  
  3750      // Capture BPF object loaded
  3751      if (cmd == BPF_PROG_LOAD && p.config->options & OPT_CAPTURE_BPF) {
  3752          bin_args_t bin_args = {};
  3753          u32 pid = p.task_info->context.host_pid;
  3754  
  3755          u32 insn_cnt = get_attr_insn_cnt(attr);
  3756          const struct bpf_insn *insns = get_attr_insns(attr);
  3757          unsigned int insn_size = (unsigned int) (sizeof(struct bpf_insn) * insn_cnt);
  3758  
  3759          bin_args.type = SEND_BPF_OBJECT;
  3760          char prog_name[16] = {0};
  3761          long sz = bpf_probe_read_kernel_str(prog_name, 16, attr->prog_name);
  3762          if (sz > 0) {
  3763              sz = bpf_probe_read_kernel_str(bin_args.metadata, 16, prog_name);
  3764          }
  3765  
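                // Metadata layout for this capture: prog_name[0..15], random
                // id[16..19], host pid[20..23], instructions size in bytes[24..27].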
  3766          u32 rand = bpf_get_prandom_u32();
  3767          bpf_probe_read_kernel(&bin_args.metadata[16], 4, &rand);
  3768          bpf_probe_read_kernel(&bin_args.metadata[20], 4, &pid);
  3769          bpf_probe_read_kernel(&bin_args.metadata[24], 4, &insn_size);
  3770          bin_args.ptr = (char *) insns;
  3771          bin_args.start_off = 0;
  3772          bin_args.full_size = insn_size;
  3773  
  3774          tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN);
  3775      }
  3776      return 0;
  3777  }
  3778  
  3779  // arm_kprobe can't be hooked on the arm64 architecture; hook the enable logic instead.
  3780  
  3781  statfunc int arm_kprobe_handler(struct pt_regs *ctx)
  3782  {
  3783      args_t saved_args;
  3784      if (load_args(&saved_args, KPROBE_ATTACH) != 0) {
  3785          return 0;
  3786      }
  3787      del_args(KPROBE_ATTACH);
  3788  
  3789      program_data_t p = {};
  3790      if (!init_program_data(&p, ctx))
  3791          return 0;
  3792  
  3793      if (!should_trace(&p))
  3794          return 0;
  3795  
  3796      struct kprobe *kp = (struct kprobe *) saved_args.args[0];
  3797      unsigned int retcode = PT_REGS_RC(ctx);
  3798  
  3799      if (retcode)
  3800          return 0; // register_kprobe() failed
  3801  
  3802      char *symbol_name = (char *) BPF_CORE_READ(kp, symbol_name);
  3803      u64 pre_handler = (u64) BPF_CORE_READ(kp, pre_handler);
  3804      u64 post_handler = (u64) BPF_CORE_READ(kp, post_handler);
  3805  
  3806      save_str_to_buf(&p.event->args_buf, (void *) symbol_name, 0);
  3807      save_to_submit_buf(&p.event->args_buf, (void *) &pre_handler, sizeof(u64), 1);
  3808      save_to_submit_buf(&p.event->args_buf, (void *) &post_handler, sizeof(u64), 2);
  3809  
  3810      return events_perf_submit(&p, KPROBE_ATTACH, 0);
  3811  }
  3812  
  3813  // register_kprobe and enable_kprobe share the same execution path, and both
  3814  // call arm_kprobe, which is the function we are interested in. Nevertheless,
  3815  // there is another function, register_aggr_kprobes, that might also reach
  3816  // arm_kprobe, so instead of hooking into enable_kprobe we hook into
  3817  // register_kprobe, covering all execution paths.
  3818  
  3819  SEC("kprobe/register_kprobe")
  3820  TRACE_ENT_FUNC(register_kprobe, KPROBE_ATTACH);
  3821  
  3822  SEC("kretprobe/register_kprobe")
  3823  int BPF_KPROBE(trace_ret_register_kprobe)
  3824  {
  3825      return arm_kprobe_handler(ctx);
  3826  }
  3827  
  3828  SEC("kprobe/security_bpf_map")
  3829  int BPF_KPROBE(trace_security_bpf_map)
  3830  {
  3831      program_data_t p = {};
  3832      if (!init_program_data(&p, ctx))
  3833          return 0;
  3834  
  3835      if (!should_trace(&p))
  3836          return 0;
  3837  
  3838      if (!should_submit(SECURITY_BPF_MAP, p.event))
  3839          return 0;
  3840  
  3841      struct bpf_map *map = (struct bpf_map *) PT_REGS_PARM1(ctx);
  3842  
  3843      // 1st argument == map_id (u32)
  3844      save_to_submit_buf(
  3845          &p.event->args_buf, (void *) __builtin_preserve_access_index(&map->id), sizeof(int), 0);
  3846      // 2nd argument == map_name (const char *)
  3847      save_str_to_buf(&p.event->args_buf, (void *) __builtin_preserve_access_index(&map->name), 1);
  3848  
  3849      return events_perf_submit(&p, SECURITY_BPF_MAP, 0);
  3850  }
  3851  
  3852  SEC("kprobe/security_bpf_prog")
  3853  int BPF_KPROBE(trace_security_bpf_prog)
  3854  {
  3855      program_data_t p = {};
  3856      if (!init_program_data(&p, ctx))
  3857          return 0;
  3858  
  3859      if (!should_trace(&p))
  3860          return 0;
  3861  
  3862      struct bpf_prog *prog = (struct bpf_prog *) PT_REGS_PARM1(ctx);
  3863      struct bpf_prog_aux *prog_aux = BPF_CORE_READ(prog, aux);
  3864      u32 prog_id = BPF_CORE_READ(prog_aux, id);
  3865  
  3866      // On some systems, the 'check_map_func_compatibility' and 'check_helper_call' symbols are
  3867      // not available. In these cases, the temporary map 'bpf_attach_tmp_map' will not hold any
  3868      // information about the helpers used by the prog. Nevertheless, we always want to output
  3869      // the 'bpf_attach' event to the user, so we use zero values.
  3870      bpf_used_helpers_t val = {0};
  3871  
  3872      // if there is a value, use it
  3873      bpf_used_helpers_t *existing_val;
  3874      existing_val = bpf_map_lookup_elem(&bpf_attach_tmp_map, &p.event->context.task.host_tid);
  3875      if (existing_val != NULL) {
  3876          __builtin_memcpy(&val.helpers, &existing_val->helpers, sizeof(bpf_used_helpers_t));
  3877      }
  3878  
  3879      bpf_map_delete_elem(&bpf_attach_tmp_map, &p.event->context.task.host_tid);
  3880  
  3881      if (should_submit(BPF_ATTACH, p.event)) {
  3882          bpf_map_update_elem(&bpf_attach_map, &prog_id, &val, BPF_ANY);
  3883      }
  3884  
  3885      if (!should_submit(SECURITY_BPF_PROG, p.event)) {
  3886          return 0;
  3887      }
  3888  
  3889      bool is_load = false;
  3890      void **aux_ptr = bpf_map_lookup_elem(&bpf_prog_load_map, &p.event->context.task.host_tid);
  3891      if (aux_ptr != NULL) {
  3892          if (*aux_ptr == (void *) prog_aux) {
  3893              is_load = true;
  3894          }
  3895  
  3896          bpf_map_delete_elem(&bpf_prog_load_map, &p.event->context.task.host_tid);
  3897      }
  3898  
  3899      int prog_type = BPF_CORE_READ(prog, type);
  3900  
  3901      char prog_name[BPF_OBJ_NAME_LEN];
  3902      bpf_probe_read_kernel_str(&prog_name, BPF_OBJ_NAME_LEN, prog_aux->name);
  3903  
  3904      save_to_submit_buf(&p.event->args_buf, &prog_type, sizeof(int), 0);
  3905      save_str_to_buf(&p.event->args_buf, (void *) &prog_name, 1);
  3906      save_u64_arr_to_buf(&p.event->args_buf, (const u64 *) val.helpers, 4, 2);
  3907      save_to_submit_buf(&p.event->args_buf, &prog_id, sizeof(u32), 3);
  3908      save_to_submit_buf(&p.event->args_buf, &is_load, sizeof(bool), 4);
  3909  
  3910      events_perf_submit(&p, SECURITY_BPF_PROG, 0);
  3911  
  3912      return 0;
  3913  }
  3914  
  3915  SEC("kprobe/bpf_check")
  3916  int BPF_KPROBE(trace_bpf_check)
  3917  {
  3918      program_data_t p = {};
  3919      if (!init_program_data(&p, ctx))
  3920          return 0;
  3921  
  3922      if (!should_trace(&p))
  3923          return 0;
  3924  
  3925      // this probe is triggered when a bpf program is loaded.
  3926      // we save the aux pointer to be used in security_bpf_prog, to indicate this prog is being
  3927      // loaded - security_bpf_prog is triggered not only on prog load.
  3928  
  3929      if (!should_submit(SECURITY_BPF_PROG, p.event))
  3930          return 0;
  3931  
  3932      struct bpf_prog **prog;
  3933      struct bpf_prog *prog_ptr;
  3934      struct bpf_prog_aux *prog_aux;
  3935  
  3936      prog = (struct bpf_prog **) PT_REGS_PARM1(ctx);
  3937      bpf_core_read(&prog_ptr, sizeof(void *), prog);
  3938      prog_aux = BPF_CORE_READ(prog_ptr, aux);
  3939  
  3940      bpf_map_update_elem(&bpf_prog_load_map, &p.event->context.task.host_tid, &prog_aux, BPF_ANY);
  3941  
  3942      return 0;
  3943  }
  3944  
  3945  // Save in the temporary map 'bpf_attach_tmp_map' whether bpf_probe_write_user and
  3946  // bpf_override_return are used in the bpf program. Get this information in the verifier phase of
  3947  // the bpf program load lifecycle, before a prog_id is set for the bpf program. Save this
  3948  // information in a temporary map which includes the host_tid as key instead of the prog_id.
  3949  //
  3950  // Later on, in security_bpf_prog, save this information in the stable map 'bpf_attach_map', which
  3951  // contains the prog_id in its key.
  3952  
  3953  statfunc int handle_bpf_helper_func_id(u32 host_tid, int func_id)
  3954  {
  3955      bpf_used_helpers_t val = {0};
  3956  
  3957      // We want to take the existing value in the map and just update it with the current func_id.
  3958      bpf_used_helpers_t *existing_val = bpf_map_lookup_elem(&bpf_attach_tmp_map, &host_tid);
  3959      if (existing_val != NULL) {
  3960          __builtin_memcpy(&val.helpers, &existing_val->helpers, sizeof(bpf_used_helpers_t));
  3961      }
  3962  
  3963      // calculate where to encode usage of this func_id in bpf_used_helpers_t.
  3964      // this method is used in order to stay in bounds of the helpers array and pass verifier checks.
  3965      // it is equivalent to:
  3966      //  val.helpers[func_id / 64] |= (1ULL << (func_id % 64));
  3967      // which the verifier doesn't like.
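            //
            // For example, assuming 64-bit elements (SIZE_OF_HELPER_ELEM == 64), a
            // func_id of 130 ends up as arr_num = 2, arr_idx = 2, i.e. bit 2 of
            // helpers[2] (130 = 2 * 64 + 2).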
  3968      int arr_num;
  3969      int arr_idx = func_id;
  3970  
  3971  #pragma unroll
  3972      for (int i = 0; i < NUM_OF_HELPERS_ELEMS; i++) {
  3973          arr_num = i;
  3974          if (arr_idx - SIZE_OF_HELPER_ELEM >= 0) {
  3975              arr_idx = arr_idx - SIZE_OF_HELPER_ELEM;
  3976          } else {
  3977              break;
  3978          }
  3979      }
  3980      if (arr_idx >= SIZE_OF_HELPER_ELEM) {
  3981          // unsupported func_id
  3982          return 0;
  3983      }
  3984  
  3985      val.helpers[arr_num] |= (1ULL << (arr_idx));
  3986  
  3987      // update the map with the current func_id
  3988      bpf_map_update_elem(&bpf_attach_tmp_map, &host_tid, &val, BPF_ANY);
  3989  
  3990      return 0;
  3991  }
  3992  
  3993  SEC("kprobe/check_map_func_compatibility")
  3994  int BPF_KPROBE(trace_check_map_func_compatibility)
  3995  {
  3996      program_data_t p = {};
  3997      if (!init_program_data(&p, ctx))
  3998          return 0;
  3999  
  4000      if (!should_trace(&p))
  4001          return 0;
  4002  
  4003      int func_id = (int) PT_REGS_PARM3(ctx);
  4004  
  4005      return handle_bpf_helper_func_id(p.event->context.task.host_tid, func_id);
  4006  }
  4007  
  4008  SEC("kprobe/check_helper_call")
  4009  int BPF_KPROBE(trace_check_helper_call)
  4010  {
  4011      program_data_t p = {};
  4012      if (!init_program_data(&p, ctx))
  4013          return 0;
  4014  
  4015      if (!should_trace(&p))
  4016          return 0;
  4017  
  4018      int func_id;
  4019  
  4020      if (!bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_for_each_map_elem)) {
  4021          // if BPF_FUNC_for_each_map_elem doesn't exist under bpf_func_id - kernel version < 5.13
  4022          func_id = (int) PT_REGS_PARM2(ctx);
  4023      } else {
  4024          struct bpf_insn *insn = (struct bpf_insn *) PT_REGS_PARM2(ctx);
  4025          func_id = BPF_CORE_READ(insn, imm);
  4026      }
  4027  
  4028      return handle_bpf_helper_func_id(p.event->context.task.host_tid, func_id);
  4029  }
  4030  
  4031  SEC("kprobe/security_kernel_read_file")
  4032  int BPF_KPROBE(trace_security_kernel_read_file)
  4033  {
  4034      program_data_t p = {};
  4035      if (!init_program_data(&p, ctx))
  4036          return 0;
  4037  
  4038      if (!should_trace(&p))
  4039          return 0;
  4040  
  4041      if (!should_submit(SECURITY_KERNEL_READ_FILE, p.event))
  4042          return 0;
  4043  
  4044      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  4045      dev_t s_dev = get_dev_from_file(file);
  4046      unsigned long inode_nr = get_inode_nr_from_file(file);
  4047      enum kernel_read_file_id type_id = (enum kernel_read_file_id) PT_REGS_PARM2(ctx);
  4048      void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  4049      u64 ctime = get_ctime_nanosec_from_file(file);
  4050  
  4051      save_str_to_buf(&p.event->args_buf, file_path, 0);
  4052      save_to_submit_buf(&p.event->args_buf, &s_dev, sizeof(dev_t), 1);
  4053      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 2);
  4054      save_to_submit_buf(&p.event->args_buf, &type_id, sizeof(int), 3);
  4055      save_to_submit_buf(&p.event->args_buf, &ctime, sizeof(u64), 4);
  4056  
  4057      return events_perf_submit(&p, SECURITY_KERNEL_READ_FILE, 0);
  4058  }
  4059  
  4060  SEC("kprobe/security_kernel_post_read_file")
  4061  int BPF_KPROBE(trace_security_kernel_post_read_file)
  4062  {
  4063      program_data_t p = {};
  4064      if (!init_program_data(&p, ctx))
  4065          return 0;
  4066  
  4067      if (!should_trace(&p))
  4068          return 0;
  4069  
  4070      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  4071      char *buf = (char *) PT_REGS_PARM2(ctx);
  4072      loff_t size = (loff_t) PT_REGS_PARM3(ctx);
  4073      enum kernel_read_file_id type_id = (enum kernel_read_file_id) PT_REGS_PARM4(ctx);
  4074  
  4075      // Send event if chosen
  4076      if (should_submit(SECURITY_POST_READ_FILE, p.event)) {
  4077          void *file_path = get_path_str(&file->f_path);
  4078          save_str_to_buf(&p.event->args_buf, file_path, 0);
  4079          save_to_submit_buf(&p.event->args_buf, &size, sizeof(loff_t), 1);
  4080          save_to_submit_buf(&p.event->args_buf, &type_id, sizeof(int), 2);
  4081          events_perf_submit(&p, SECURITY_POST_READ_FILE, 0);
  4082      }
  4083  
  4084      if (p.config->options & OPT_CAPTURE_MODULES) {
  4085          // Do not extract files of 4GB or larger
  4086          if (size >= (u64) 1 << 32) {
  4087              return 0;
  4088          }
  4089          // Extract device id and inode number for the file name
  4090          dev_t s_dev = get_dev_from_file(file);
  4091          unsigned long inode_nr = get_inode_nr_from_file(file);
  4092          bin_args_t bin_args = {};
  4093          u32 pid = p.event->context.task.host_pid;
  4094  
  4095          bin_args.type = SEND_KERNEL_MODULE;
  4096          bpf_probe_read_kernel(bin_args.metadata, 4, &s_dev);
  4097          bpf_probe_read_kernel(&bin_args.metadata[4], 8, &inode_nr);
  4098          bpf_probe_read_kernel(&bin_args.metadata[12], 4, &pid);
  4099          bpf_probe_read_kernel(&bin_args.metadata[16], 4, &size);
  4100          bin_args.start_off = 0;
  4101          bin_args.ptr = buf;
  4102          bin_args.full_size = size;
  4103  
  4104          tail_call_send_bin(ctx, &p, &bin_args, TAIL_SEND_BIN);
  4105      }
  4106  
  4107      return 0;
  4108  }
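
        // For reference, the fixed offsets written into bin_args.metadata above match
        // this packed layout (sketch only; send_kernel_module_meta is a hypothetical
        // struct for illustration, not one defined by this code):
        //
        //     struct send_kernel_module_meta {
        //         u32 s_dev;    // offset 0
        //         u64 inode_nr; // offset 4 (unaligned)
        //         u32 pid;      // offset 12
        //         u32 size;     // offset 16: low 32 bits of loff_t (>= 4GB filtered above)
        //     } __attribute__((packed));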
  4109  
  4110  SEC("kprobe/security_inode_mknod")
  4111  int BPF_KPROBE(trace_security_inode_mknod)
  4112  {
  4113      program_data_t p = {};
  4114      if (!init_program_data(&p, ctx))
  4115          return 0;
  4116  
  4117      if (!should_trace(&p))
  4118          return 0;
  4119  
  4120      if (!should_submit(SECURITY_INODE_MKNOD, p.event))
  4121          return 0;
  4122  
  4123      struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx);
  4124      unsigned short mode = (unsigned short) PT_REGS_PARM3(ctx);
  4125      unsigned int dev = (unsigned int) PT_REGS_PARM4(ctx);
  4126      void *dentry_path = get_dentry_path_str(dentry);
  4127  
  4128      save_str_to_buf(&p.event->args_buf, dentry_path, 0);
  4129      save_to_submit_buf(&p.event->args_buf, &mode, sizeof(unsigned short), 1);
  4130      save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2);
  4131  
  4132      return events_perf_submit(&p, SECURITY_INODE_MKNOD, 0);
  4133  }
  4134  
  4135  SEC("kprobe/device_add")
  4136  int BPF_KPROBE(trace_device_add)
  4137  {
  4138      program_data_t p = {};
  4139      if (!init_program_data(&p, ctx))
  4140          return 0;
  4141  
  4142      if (!should_trace(&p))
  4143          return 0;
  4144  
  4145      if (!should_submit(DEVICE_ADD, p.event))
  4146          return 0;
  4147  
  4148      struct device *dev = (struct device *) PT_REGS_PARM1(ctx);
  4149      const char *name = get_device_name(dev);
  4150  
  4151      struct device *parent_dev = BPF_CORE_READ(dev, parent);
  4152      const char *parent_name = get_device_name(parent_dev);
  4153  
  4154      save_str_to_buf(&p.event->args_buf, (void *) name, 0);
  4155      save_str_to_buf(&p.event->args_buf, (void *) parent_name, 1);
  4156  
  4157      return events_perf_submit(&p, DEVICE_ADD, 0);
  4158  }
  4159  
  4160  SEC("kprobe/__register_chrdev")
  4161  TRACE_ENT_FUNC(__register_chrdev, REGISTER_CHRDEV);
  4162  
  4163  SEC("kretprobe/__register_chrdev")
  4164  int BPF_KPROBE(trace_ret__register_chrdev)
  4165  {
  4166      args_t saved_args;
  4167      if (load_args(&saved_args, REGISTER_CHRDEV) != 0) {
  4168          // missed entry or not traced
  4169          return 0;
  4170      }
  4171      del_args(REGISTER_CHRDEV);
  4172  
  4173      program_data_t p = {};
  4174      if (!init_program_data(&p, ctx))
  4175          return 0;
  4176  
  4177      if (!should_trace(&p))
  4178          return 0;
  4179  
  4180      if (!should_submit(REGISTER_CHRDEV, p.event))
  4181          return 0;
  4182  
  4183      unsigned int major_number = (unsigned int) saved_args.args[0];
  4184      unsigned int returned_major = PT_REGS_RC(ctx);
  4185  
  4186      // sets the returned major to the requested one in case of a successful registration
  4187      if (major_number > 0 && returned_major == 0) {
  4188          returned_major = major_number;
  4189      }
  4190  
  4191      char *char_device_name = (char *) saved_args.args[3];
  4192      struct file_operations *char_device_fops = (struct file_operations *) saved_args.args[4];
  4193  
  4194      save_to_submit_buf(&p.event->args_buf, &major_number, sizeof(unsigned int), 0);
  4195      save_to_submit_buf(&p.event->args_buf, &returned_major, sizeof(unsigned int), 1);
  4196      save_str_to_buf(&p.event->args_buf, char_device_name, 2);
  4197      save_to_submit_buf(&p.event->args_buf, &char_device_fops, sizeof(void *), 3);
  4198  
  4199      return events_perf_submit(&p, REGISTER_CHRDEV, 0);
  4200  }
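
        // The pair above follows the generic entry/return pattern used throughout this
        // file: TRACE_ENT_FUNC() stores the entry pt_regs arguments in a map keyed by
        // the current task, and the matching kretprobe pairs them with the return
        // value. A minimal sketch with a hypothetical probe (some_kernel_func and
        // SOME_EVENT_ID are placeholders, not real symbols):
        //
        //     SEC("kprobe/some_kernel_func")
        //     TRACE_ENT_FUNC(some_kernel_func, SOME_EVENT_ID);
        //
        //     SEC("kretprobe/some_kernel_func")
        //     int BPF_KPROBE(trace_ret_some_kernel_func)
        //     {
        //         args_t saved_args;
        //         if (load_args(&saved_args, SOME_EVENT_ID) != 0)
        //             return 0; // missed entry or not traced
        //         del_args(SOME_EVENT_ID);
        //         // saved_args.args[0..5] hold the entry arguments and
        //         // PT_REGS_RC(ctx) holds the return value.
        //         return 0;
        //     }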
  4201  
  4202  statfunc struct pipe_buffer *get_last_write_pipe_buffer(struct pipe_inode_info *pipe)
  4203  {
  4204      // Extract the last page buffer used in the pipe for write
  4205      struct pipe_buffer *bufs = BPF_CORE_READ(pipe, bufs);
  4206      unsigned int curbuf;
  4207  
  4208      struct pipe_inode_info___v54 *legacy_pipe = (struct pipe_inode_info___v54 *) pipe;
  4209      if (bpf_core_field_exists(legacy_pipe->nrbufs)) {
  4210          unsigned int nrbufs = BPF_CORE_READ(legacy_pipe, nrbufs);
  4211          if (nrbufs > 0) {
  4212              nrbufs--;
  4213          }
  4214          curbuf = (BPF_CORE_READ(legacy_pipe, curbuf) + nrbufs) &
  4215                   (BPF_CORE_READ(legacy_pipe, buffers) - 1);
  4216      } else {
  4217          int head = BPF_CORE_READ(pipe, head);
  4218          int ring_size = BPF_CORE_READ(pipe, ring_size);
  4219          curbuf = (head - 1) & (ring_size - 1);
  4220      }
  4221  
  4222      struct pipe_buffer *current_buffer = get_node_addr(bufs, curbuf);
  4223      return current_buffer;
  4224  }
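
        // Worked example for the non-legacy branch above: kernel pipe rings are sized
        // in powers of two, so with ring_size = 16 the mask is 15, and a just-wrapped
        // head = 0 yields curbuf = (0 - 1) & 15 = 15, i.e. the slot written last.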
  4225  
  4226  SEC("kprobe/do_splice")
  4227  TRACE_ENT_FUNC(do_splice, DIRTY_PIPE_SPLICE);
  4228  
  4229  SEC("kretprobe/do_splice")
  4230  int BPF_KPROBE(trace_ret_do_splice)
  4231  {
  4232      // The Dirty Pipe vulnerability exists in the kernel since version 5.8, so
  4233      // there is no use running this logic if the version is older. In non-CORE
  4234      // builds it would even mean using defines which are not available in the
  4235      // kernel headers, which would cause bugs.
  4236  
  4237      // Check if field of struct exist to determine kernel version - some fields
  4238      // change between versions. In version 5.8 of the kernel, the field
  4239      // "high_zoneidx" changed its name to "highest_zoneidx". This means that the
  4240      // existence of the field "high_zoneidx" can indicate that the kernel
  4241      // version is lower than v5.8
  4242  
  4243      struct alloc_context *check_508;
  4244      if (bpf_core_field_exists(check_508->high_zoneidx)) {
  4245          del_args(DIRTY_PIPE_SPLICE);
  4246          return 0;
  4247      }
  4248  
  4249      args_t saved_args;
  4250      if (load_args(&saved_args, DIRTY_PIPE_SPLICE) != 0) {
  4251          // missed entry or not traced
  4252          return 0;
  4253      }
  4254      del_args(DIRTY_PIPE_SPLICE);
  4255  
  4256      program_data_t p = {};
  4257      if (!init_program_data(&p, ctx))
  4258          return 0;
  4259  
  4260      if (!should_trace(&p))
  4261          return 0;
  4262  
  4263      if (!should_submit(DIRTY_PIPE_SPLICE, p.event))
  4264          return 0;
  4265  
  4266      // Catch only successful splice
  4267      if ((int) PT_REGS_RC(ctx) <= 0) {
  4268          return 0;
  4269      }
  4270  
  4271      struct file *out_file = (struct file *) saved_args.args[2];
  4272      struct pipe_inode_info *out_pipe = get_file_pipe_info(out_file);
  4273      // Check that output is a pipe
  4274      if (!out_pipe) {
  4275          return 0;
  4276      }
  4277  
  4278      // dirty_pipe_splice is a splice to a pipe which results that the last page copied could be
  4279      // modified (the PIPE_BUF_CAN_MERGE flag is on in the pipe_buffer struct).
  4280      struct pipe_buffer *last_write_page_buffer = get_last_write_pipe_buffer(out_pipe);
  4281      unsigned int out_pipe_last_buffer_flags = BPF_CORE_READ(last_write_page_buffer, flags);
  4282      if ((out_pipe_last_buffer_flags & PIPE_BUF_FLAG_CAN_MERGE) == 0) {
  4283          return 0;
  4284      }
  4285  
  4286      struct file *in_file = (struct file *) saved_args.args[0];
  4287      struct inode *in_inode = BPF_CORE_READ(in_file, f_inode);
  4288      u64 in_inode_number = BPF_CORE_READ(in_inode, i_ino);
  4289      unsigned short in_file_type = BPF_CORE_READ(in_inode, i_mode) & S_IFMT;
  4290      void *in_file_path = get_path_str(__builtin_preserve_access_index(&in_file->f_path));
  4291      size_t write_len = (size_t) saved_args.args[4];
  4292  
  4293      loff_t *off_in_addr = (loff_t *) saved_args.args[1];
  4294      // Since kernel v5.10 the pointer passed is no longer a user-space pointer, so
  4295      // flexibility is needed when reading it
  4296      loff_t off_in;
  4297  
  4298      //
  4299      // Check if field of struct exist to determine kernel version - some fields change between
  4300      // versions. Field 'data' of struct 'public_key_signature' was introduced between v5.9 and
  4301      // v5.10, so its existence can be used to determine whether the running kernel is at most
  4302      // v5.9 or at least v5.10.
  4303      //
  4304      // https://lore.kernel.org/stable/20210821203108.215937-1-rafaeldtinoco@gmail.com/
  4305      //
  4306      struct public_key_signature *check;
  4307  
  4308      if (!bpf_core_field_exists(check->data)) // version < v5.10
  4309          bpf_core_read_user(&off_in, sizeof(off_in), off_in_addr);
  4310  
  4311      else // version >= v5.10
  4312          bpf_core_read(&off_in, sizeof(off_in), off_in_addr);
  4313  
  4314      struct inode *out_inode = BPF_CORE_READ(out_file, f_inode);
  4315      u64 out_inode_number = BPF_CORE_READ(out_inode, i_ino);
  4316  
  4317      // Only last page written to pipe is vulnerable from the end of written data
  4318      loff_t next_exposed_data_offset_in_out_pipe_last_page =
  4319          BPF_CORE_READ(last_write_page_buffer, offset) + BPF_CORE_READ(last_write_page_buffer, len);
  4320      size_t in_file_size = BPF_CORE_READ(in_inode, i_size);
  4321      size_t exposed_data_len = (PAGE_SIZE - 1) - next_exposed_data_offset_in_out_pipe_last_page;
  4322      loff_t current_file_offset = off_in + write_len;
  4323      if (current_file_offset + exposed_data_len > in_file_size) {
  4324          exposed_data_len = in_file_size - current_file_offset - 1;
  4325      }
  4326  
  4327      save_to_submit_buf(&p.event->args_buf, &in_inode_number, sizeof(u64), 0);
  4328      save_to_submit_buf(&p.event->args_buf, &in_file_type, sizeof(unsigned short), 1);
  4329      save_str_to_buf(&p.event->args_buf, in_file_path, 2);
  4330      save_to_submit_buf(&p.event->args_buf, &current_file_offset, sizeof(loff_t), 3);
  4331      save_to_submit_buf(&p.event->args_buf, &exposed_data_len, sizeof(size_t), 4);
  4332      save_to_submit_buf(&p.event->args_buf, &out_inode_number, sizeof(u64), 5);
  4333      save_to_submit_buf(&p.event->args_buf, &out_pipe_last_buffer_flags, sizeof(unsigned int), 6);
  4334  
  4335      return events_perf_submit(&p, DIRTY_PIPE_SPLICE, 0);
  4336  }
  4337  
  4338  SEC("raw_tracepoint/module_load")
  4339  int tracepoint__module__module_load(struct bpf_raw_tracepoint_args *ctx)
  4340  {
  4341      program_data_t p = {};
  4342      if (!init_program_data(&p, ctx))
  4343          return 0;
  4344  
  4345      if (!should_trace(&p))
  4346          return 0;
  4347  
  4348      bool should_submit_module_load = should_submit(MODULE_LOAD, p.event);
  4349      bool should_submit_hidden_module = should_submit(HIDDEN_KERNEL_MODULE_SEEKER, p.event);
  4350      if (!(should_submit_module_load || should_submit_hidden_module))
  4351          return 0;
  4352  
  4353      struct module *mod = (struct module *) ctx->args[0];
  4354  
  4355      if (should_submit_hidden_module) {
  4356          u64 insert_time = bpf_ktime_get_ns();
  4357          kernel_new_mod_t new_mod = {.insert_time = insert_time};
  4358          u64 mod_addr = (u64) mod;
  4359          // new_module_map - must be after the module is added to modules list,
  4360          // otherwise there's a risk of a race condition
  4361          bpf_map_update_elem(&new_module_map, &mod_addr, &new_mod, BPF_ANY);
  4362  
  4363          last_module_insert_time = insert_time;
  4364  
  4365          if (!should_submit_module_load)
  4366              return 0;
  4367      }
  4368  
  4369      const char *version = BPF_CORE_READ(mod, version);
  4370      const char *srcversion = BPF_CORE_READ(mod, srcversion);
  4371      save_str_to_buf(&p.event->args_buf, &mod->name, 0);
  4372      save_str_to_buf(&p.event->args_buf, (void *) version, 1);
  4373      save_str_to_buf(&p.event->args_buf, (void *) srcversion, 2);
  4374  
  4375      return events_perf_submit(&p, MODULE_LOAD, 0);
  4376  }
  4377  
  4378  SEC("raw_tracepoint/module_free")
  4379  int tracepoint__module__module_free(struct bpf_raw_tracepoint_args *ctx)
  4380  {
  4381      program_data_t p = {};
  4382      if (!init_program_data(&p, ctx))
  4383          return 0;
  4384  
  4385      if (!should_trace(&p))
  4386          return 0;
  4387  
  4388      bool should_submit_module_free = should_submit(MODULE_FREE, p.event);
  4389      bool should_submit_hidden_module = should_submit(HIDDEN_KERNEL_MODULE_SEEKER, p.event);
  4390      if (!(should_submit_module_free || should_submit_hidden_module))
  4391          return 0;
  4392  
  4393      struct module *mod = (struct module *) ctx->args[0];
  4394      if (should_submit_hidden_module) {
  4395          u64 mod_addr = (u64) mod;
  4396          // We must delete before the actual deletion from modules list occurs, otherwise there's a
  4397          // risk of a race condition
  4398          bpf_map_delete_elem(&new_module_map, &mod_addr);
  4399  
  4400          kernel_deleted_mod_t deleted_mod = {.deleted_time = bpf_ktime_get_ns()};
  4401          bpf_map_update_elem(&recent_deleted_module_map, &mod_addr, &deleted_mod, BPF_ANY);
  4402  
  4403          if (!should_submit_module_free)
  4404              return 0;
  4405      }
  4406  
  4407      const char *version = BPF_CORE_READ(mod, version);
  4408      const char *srcversion = BPF_CORE_READ(mod, srcversion);
  4409      save_str_to_buf(&p.event->args_buf, &mod->name, 0);
  4410      save_str_to_buf(&p.event->args_buf, (void *) version, 1);
  4411      save_str_to_buf(&p.event->args_buf, (void *) srcversion, 2);
  4412  
  4413      return events_perf_submit(&p, MODULE_FREE, 0);
  4414  }
  4415  
  4416  SEC("kprobe/do_init_module")
  4417  TRACE_ENT_FUNC(do_init_module, DO_INIT_MODULE);
  4418  
  4419  SEC("kretprobe/do_init_module")
  4420  int BPF_KPROBE(trace_ret_do_init_module)
  4421  {
  4422      args_t saved_args;
  4423      if (load_args(&saved_args, DO_INIT_MODULE) != 0) {
  4424          // missed entry or not traced
  4425          return 0;
  4426      }
  4427      del_args(DO_INIT_MODULE);
  4428  
  4429      program_data_t p = {};
  4430      if (!init_program_data(&p, ctx))
  4431          return 0;
  4432  
  4433      if (!should_trace(&p))
  4434          return 0;
  4435  
  4436      bool should_submit_do_init_module = should_submit(DO_INIT_MODULE, p.event);
  4437      bool should_submit_hidden_module = should_submit(HIDDEN_KERNEL_MODULE_SEEKER, p.event);
  4438      if (!(should_submit_do_init_module || should_submit_hidden_module))
  4439          return 0;
  4440  
  4441      struct module *mod = (struct module *) saved_args.args[0];
  4442  
  4443      // trigger the lkm seeker
  4444      if (should_submit_hidden_module) {
  4445          u64 addr = (u64) mod;
  4446          u32 flags = FULL_SCAN;
  4447          lkm_seeker_send_to_userspace((struct module *) addr, &flags, &p);
  4448          reset_event_args(&p); // Do not corrupt the buffer for the do_init_module event
  4449          if (!should_submit_do_init_module)
  4450              return 0;
  4451      }
  4452  
  4453      // save strings to buf
  4454      const char *version = BPF_CORE_READ(mod, version);
  4455      const char *srcversion = BPF_CORE_READ(mod, srcversion);
  4456      save_str_to_buf(&p.event->args_buf, &mod->name, 0);
  4457      save_str_to_buf(&p.event->args_buf, (void *) version, 1);
  4458      save_str_to_buf(&p.event->args_buf, (void *) srcversion, 2);
  4459  
  4460      int ret_val = PT_REGS_RC(ctx);
  4461      return events_perf_submit(&p, DO_INIT_MODULE, ret_val);
  4462  }
  4463  
  4464  // clang-format off
  4465  
  4466  SEC("kprobe/load_elf_phdrs")
  4467  int BPF_KPROBE(trace_load_elf_phdrs)
  4468  {
  4469      program_data_t p = {};
  4470      if (!init_program_data(&p, ctx))
  4471          return 0;
  4472  
  4473      if (!should_trace(&p))
  4474          return 0;
  4475  
  4476      proc_info_t *proc_info = p.proc_info;
  4477  
  4478      struct file *loaded_elf = (struct file *) PT_REGS_PARM2(ctx);
  4479      const char *elf_pathname = (char *) get_path_str(__builtin_preserve_access_index(&loaded_elf->f_path));
  4480  
  4481      // The interpreter field will be updated for any loading of an elf, both for the binary and for
  4482      // the interpreter. Because the interpreter is loaded only after the executed elf is loaded, the
  4483      // value of the executed binary should be overridden by the interpreter.
  4484  
  4485      size_t sz = sizeof(proc_info->interpreter.pathname);
  4486      bpf_probe_read_kernel_str(proc_info->interpreter.pathname, sz, elf_pathname);
  4487      proc_info->interpreter.id.device = get_dev_from_file(loaded_elf);
  4488      proc_info->interpreter.id.inode = get_inode_nr_from_file(loaded_elf);
  4489      proc_info->interpreter.id.ctime = get_ctime_nanosec_from_file(loaded_elf);
  4490  
  4491      if (should_submit(LOAD_ELF_PHDRS, p.event)) {
  4492          save_str_to_buf(&p.event->args_buf, (void *) elf_pathname, 0);
  4493          save_to_submit_buf(&p.event->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 1);
  4494          save_to_submit_buf(
  4495              &p.event->args_buf, &proc_info->interpreter.id.inode, sizeof(unsigned long), 2);
  4496  
  4497          events_perf_submit(&p, LOAD_ELF_PHDRS, 0);
  4498      }
  4499  
  4500      return 0;
  4501  }
  4502  
  4503  // clang-format on
  4504  
  4505  SEC("kprobe/security_file_permission")
  4506  int BPF_KPROBE(trace_security_file_permission)
  4507  {
  4508      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  4509      if (file == NULL)
  4510          return 0;
  4511      struct inode *f_inode = get_inode_from_file(file);
  4512      struct super_block *i_sb = get_super_block_from_inode(f_inode);
  4513      unsigned long s_magic = get_s_magic_from_super_block(i_sb);
  4514  
  4515      // Only check procfs entries
  4516      if (s_magic != PROC_SUPER_MAGIC) {
  4517          return 0;
  4518      }
  4519  
  4520      program_data_t p = {};
  4521      if (!init_program_data(&p, ctx))
  4522          return 0;
  4523  
  4524      if (!should_trace(&p))
  4525          return 0;
  4526  
  4527      if (!should_submit(HOOKED_PROC_FOPS, p.event))
  4528          return 0;
  4529  
  4530      struct file_operations *fops = (struct file_operations *) BPF_CORE_READ(f_inode, i_fop);
  4531      if (fops == NULL)
  4532          return 0;
  4533  
  4534      unsigned long iterate_addr = 0;
  4535      unsigned long iterate_shared_addr = (unsigned long) BPF_CORE_READ(fops, iterate_shared);
  4536  
  4537      // iterate() removed by commit 3e3271549670 at v6.5-rc4
  4538      if (bpf_core_field_exists(fops->iterate))
  4539          iterate_addr = (unsigned long) BPF_CORE_READ(fops, iterate);
  4540  
  4541      if (iterate_addr == 0 && iterate_shared_addr == 0)
  4542          return 0;
  4543  
  4544      // get text segment bounds
  4545      void *stext_addr = get_stext_addr();
  4546      if (unlikely(stext_addr == NULL))
  4547          return 0;
  4548      void *etext_addr = get_etext_addr();
  4549      if (unlikely(etext_addr == NULL))
  4550          return 0;
  4551  
  4552      // mark as 0 if in bounds
  4553      if (iterate_shared_addr >= (u64) stext_addr && iterate_shared_addr < (u64) etext_addr)
  4554          iterate_shared_addr = 0;
  4555      if (iterate_addr >= (u64) stext_addr && iterate_addr < (u64) etext_addr)
  4556          iterate_addr = 0;
  4557  
  4558      // now check again, if both are in text bounds, return
  4559      if (iterate_addr == 0 && iterate_shared_addr == 0)
  4560          return 0;
  4561  
  4562      unsigned long fops_addresses[2] = {iterate_shared_addr, iterate_addr};
  4563  
  4564      save_u64_arr_to_buf(&p.event->args_buf, (const u64 *) fops_addresses, 2, 0);
  4565      events_perf_submit(&p, HOOKED_PROC_FOPS, 0);
  4566      return 0;
  4567  }
  4568  
  4569  SEC("raw_tracepoint/task_rename")
  4570  int tracepoint__task__task_rename(struct bpf_raw_tracepoint_args *ctx)
  4571  {
  4572      program_data_t p = {};
  4573      if (!init_program_data(&p, ctx))
  4574          return 0;
  4575  
  4576      if (!should_trace(&p))
  4577          return 0;
  4578  
  4579      if (!should_submit(TASK_RENAME, p.event))
  4580          return 0;
  4581  
  4582      struct task_struct *tsk = (struct task_struct *) ctx->args[0];
  4583      char old_name[TASK_COMM_LEN];
  4584      bpf_probe_read_kernel_str(&old_name, TASK_COMM_LEN, tsk->comm);
  4585      const char *new_name = (const char *) ctx->args[1];
  4586  
  4587      save_str_to_buf(&p.event->args_buf, (void *) old_name, 0);
  4588      save_str_to_buf(&p.event->args_buf, (void *) new_name, 1);
  4589  
  4590      return events_perf_submit(&p, TASK_RENAME, 0);
  4591  }
  4592  
  4593  SEC("kprobe/security_inode_rename")
  4594  int BPF_KPROBE(trace_security_inode_rename)
  4595  {
  4596      program_data_t p = {};
  4597      if (!init_program_data(&p, ctx))
  4598          return 0;
  4599  
  4600      if (!should_trace(&p))
  4601          return 0;
  4602  
  4603      if (!should_submit(SECURITY_INODE_RENAME, p.event))
  4604          return 0;
  4605  
  4606      struct dentry *old_dentry = (struct dentry *) PT_REGS_PARM2(ctx);
  4607      struct dentry *new_dentry = (struct dentry *) PT_REGS_PARM4(ctx);
  4608  
  4609      void *old_dentry_path = get_dentry_path_str(old_dentry);
  4610      save_str_to_buf(&p.event->args_buf, old_dentry_path, 0);
  4611      void *new_dentry_path = get_dentry_path_str(new_dentry);
  4612      save_str_to_buf(&p.event->args_buf, new_dentry_path, 1);
  4613      return events_perf_submit(&p, SECURITY_INODE_RENAME, 0);
  4614  }
  4615  
  4616  SEC("kprobe/kallsyms_lookup_name")
  4617  TRACE_ENT_FUNC(kallsyms_lookup_name, KALLSYMS_LOOKUP_NAME);
  4618  
  4619  SEC("kretprobe/kallsyms_lookup_name")
  4620  int BPF_KPROBE(trace_ret_kallsyms_lookup_name)
  4621  {
  4622      args_t saved_args;
  4623      if (load_args(&saved_args, KALLSYMS_LOOKUP_NAME) != 0)
  4624          return 0;
  4625      del_args(KALLSYMS_LOOKUP_NAME);
  4626  
  4627      program_data_t p = {};
  4628      if (!init_program_data(&p, ctx))
  4629          return 0;
  4630  
  4631      if (!should_trace(&p))
  4632          return 0;
  4633  
  4634      if (!should_submit(KALLSYMS_LOOKUP_NAME, p.event))
  4635          return 0;
  4636  
  4637      char *name = (char *) saved_args.args[0];
  4638      unsigned long address = PT_REGS_RC(ctx);
  4639  
  4640      save_str_to_buf(&p.event->args_buf, name, 0);
  4641      save_to_submit_buf(&p.event->args_buf, &address, sizeof(unsigned long), 1);
  4642  
  4643      return events_perf_submit(&p, KALLSYMS_LOOKUP_NAME, 0);
  4644  }
  4645  
  4646  enum signal_handling_method_e {
  4647      SIG_DFL,
  4648      SIG_IGN,
  4649      SIG_HND = 2 // Doesn't exist in the kernel, but signifies that the method is through
  4650                  // user-defined handler
  4651  };
  4652  
  4653  SEC("kprobe/do_sigaction")
  4654  int BPF_KPROBE(trace_do_sigaction)
  4655  {
  4656      program_data_t p = {};
  4657      if (!init_program_data(&p, ctx))
  4658          return 0;
  4659  
  4660      if (!should_trace(&p))
  4661          return 0;
  4662  
  4663      if (!should_submit(DO_SIGACTION, p.event))
  4664          return 0;
  4665  
  4666      // Initialize all relevant argument values
  4667      int sig = (int) PT_REGS_PARM1(ctx);
  4668      u8 old_handle_method = 0, new_handle_method = 0;
  4669      unsigned long new_sa_flags, old_sa_flags;
  4670      void *new_sa_handler, *old_sa_handler;
  4671      unsigned long new_sa_mask, old_sa_mask;
  4672  
  4673      // Extract old signal handler values
  4674      struct task_struct *task = p.task;
  4675      struct sighand_struct *sighand = BPF_CORE_READ(task, sighand);
  4676      struct k_sigaction *sig_actions = &(sighand->action[0]);
  4677      if (sig > 0 && sig < _NSIG) {
  4678          struct k_sigaction *old_act = get_node_addr(sig_actions, sig - 1);
  4679          old_sa_flags = BPF_CORE_READ(old_act, sa.sa_flags);
  4680          // On a 64-bit system there is only 1 node in the mask array
  4681          old_sa_mask = BPF_CORE_READ(old_act, sa.sa_mask.sig[0]);
  4682          old_sa_handler = BPF_CORE_READ(old_act, sa.sa_handler);
  4683          if (old_sa_handler >= (void *) SIG_HND)
  4684              old_handle_method = SIG_HND;
  4685          else {
  4686              old_handle_method = (u8) ((unsigned long) old_sa_handler & 0xFF);
  4687              old_sa_handler = NULL;
  4688          }
  4689      }
  4690  
  4691      // Check if a pointer for storing old signal handler is given
  4692      struct k_sigaction *recv_old_act = (struct k_sigaction *) PT_REGS_PARM3(ctx);
  4693      bool old_act_initialized = recv_old_act != NULL;
  4694  
  4695      // Extract new signal handler values if initialized
  4696      struct k_sigaction *new_act = (struct k_sigaction *) PT_REGS_PARM2(ctx);
  4697      bool new_act_initialized = new_act != NULL;
  4698      if (new_act_initialized) {
  4699          struct sigaction *new_sigaction = &new_act->sa;
  4700          new_sa_flags = BPF_CORE_READ(new_sigaction, sa_flags);
  4701          // In 64-bit system there is only 1 node in the mask array
  4702          // On a 64-bit system there is only 1 node in the mask array
  4703          new_sa_handler = BPF_CORE_READ(new_sigaction, sa_handler);
  4704          if (new_sa_handler >= (void *) SIG_HND)
  4705              new_handle_method = SIG_HND;
  4706          else {
  4707              new_handle_method = (u8) ((unsigned long) new_sa_handler & 0xFF);
  4708              new_sa_handler = NULL;
  4709          }
  4710      }
  4711  
  4712      save_to_submit_buf(&p.event->args_buf, &sig, sizeof(int), 0);
  4713      save_to_submit_buf(&p.event->args_buf, &new_act_initialized, sizeof(bool), 1);
  4714      if (new_act_initialized) {
  4715          save_to_submit_buf(&p.event->args_buf, &new_sa_flags, sizeof(unsigned long), 2);
  4716          save_to_submit_buf(&p.event->args_buf, &new_sa_mask, sizeof(unsigned long), 3);
  4717          save_to_submit_buf(&p.event->args_buf, &new_handle_method, sizeof(u8), 4);
  4718          save_to_submit_buf(&p.event->args_buf, &new_sa_handler, sizeof(void *), 5);
  4719      }
  4720      save_to_submit_buf(&p.event->args_buf, &old_act_initialized, sizeof(bool), 6);
  4721      save_to_submit_buf(&p.event->args_buf, &old_sa_flags, sizeof(unsigned long), 7);
  4722      save_to_submit_buf(&p.event->args_buf, &old_sa_mask, sizeof(unsigned long), 8);
  4723      save_to_submit_buf(&p.event->args_buf, &old_handle_method, sizeof(u8), 9);
  4724      save_to_submit_buf(&p.event->args_buf, &old_sa_handler, sizeof(void *), 10);
  4725  
  4726      return events_perf_submit(&p, DO_SIGACTION, 0);
  4727  }
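
        // Decoding note for the event above (userland-side sketch mirroring the enum
        // before this probe): handle_method is SIG_DFL (0) or SIG_IGN (1) when
        // sa_handler carried one of the kernel's sentinel values, and SIG_HND (2) when
        // it carried a real user-space function pointer; only in the SIG_HND case is a
        // non-NULL handler address submitted.
        //
        //     switch (handle_method) {
        //         case SIG_DFL: /* default disposition */           break;
        //         case SIG_IGN: /* signal ignored */                break;
        //         case SIG_HND: /* user handler at given address */ break;
        //     }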
  4728  
  4729  statfunc int common_utimes(struct pt_regs *ctx)
  4730  {
  4731      program_data_t p = {};
  4732      if (!init_program_data(&p, ctx))
  4733          return 0;
  4734  
  4735      if (!should_trace(&p))
  4736          return 0;
  4737  
  4738      if (!should_submit(VFS_UTIMES, p.event))
  4739          return 0;
  4740  
  4741      struct path *path = (struct path *) PT_REGS_PARM1(ctx);
  4742      struct timespec64 *times = (struct timespec64 *) PT_REGS_PARM2(ctx);
  4743  
  4744      void *path_str = get_path_str(path);
  4745  
  4746      struct dentry *dentry = BPF_CORE_READ(path, dentry);
  4747      u64 inode_nr = get_inode_nr_from_dentry(dentry);
  4748      dev_t dev = get_dev_from_dentry(dentry);
  4749  
  4750      u64 atime = get_time_nanosec_timespec(times);
  4751      u64 mtime = get_time_nanosec_timespec(&times[1]);
  4752  
  4753      save_str_to_buf(&p.event->args_buf, path_str, 0);
  4754      save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 1);
  4755      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(u64), 2);
  4756      save_to_submit_buf(&p.event->args_buf, &atime, sizeof(u64), 3);
  4757      save_to_submit_buf(&p.event->args_buf, &mtime, sizeof(u64), 4);
  4758  
  4759      return events_perf_submit(&p, VFS_UTIMES, 0);
  4760  }
  4761  
  4762  SEC("kprobe/vfs_utimes")
  4763  int BPF_KPROBE(trace_vfs_utimes)
  4764  {
  4765      return common_utimes(ctx);
  4766  }
  4767  
  4768  SEC("kprobe/utimes_common")
  4769  int BPF_KPROBE(trace_utimes_common)
  4770  {
  4771      return common_utimes(ctx);
  4772  }
  4773  
  4774  SEC("kprobe/do_truncate")
  4775  int BPF_KPROBE(trace_do_truncate)
  4776  {
  4777      program_data_t p = {};
  4778      if (!init_program_data(&p, ctx))
  4779          return 0;
  4780  
  4781      if (!should_trace(&p))
  4782          return 0;
  4783  
  4784      if (!should_submit(DO_TRUNCATE, p.event))
  4785          return 0;
  4786  
  4787      struct dentry *dentry = (struct dentry *) PT_REGS_PARM2(ctx);
  4788      u64 length = (long) PT_REGS_PARM3(ctx);
  4789  
  4790      void *dentry_path = get_dentry_path_str(dentry);
  4791      unsigned long inode_nr = get_inode_nr_from_dentry(dentry);
  4792      dev_t dev = get_dev_from_dentry(dentry);
  4793  
  4794      save_str_to_buf(&p.event->args_buf, dentry_path, 0);
  4795      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 1);
  4796      save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2);
  4797      save_to_submit_buf(&p.event->args_buf, &length, sizeof(u64), 3);
  4798  
  4799      return events_perf_submit(&p, DO_TRUNCATE, 0);
  4800  }
  4801  
  4802  SEC("kprobe/fd_install")
  4803  int BPF_KPROBE(trace_fd_install)
  4804  {
  4805      program_data_t p = {};
  4806      if (!init_program_data(&p, ctx))
  4807          return 0;
  4808  
  4809      if (!should_trace(&p))
  4810          return 0;
  4811  
  4812      struct file *file = (struct file *) PT_REGS_PARM2(ctx);
  4813  
  4814      // check if regular file. otherwise don't save the file_mod_key_t in file_modification_map.
  4815      unsigned short file_mode = get_inode_mode_from_file(file);
  4816      if ((file_mode & S_IFMT) != S_IFREG) {
  4817          return 0;
  4818      }
  4819  
  4820      file_info_t file_info = get_file_info(file);
  4821  
  4822      file_mod_key_t file_mod_key = {
  4823          p.task_info->context.host_pid, file_info.id.device, file_info.id.inode};
  4824      int op = FILE_MODIFICATION_SUBMIT;
  4825  
  4826      bpf_map_update_elem(&file_modification_map, &file_mod_key, &op, BPF_ANY);
  4827  
  4828      return 0;
  4829  }
  4830  
  4831  SEC("kprobe/filp_close")
  4832  int BPF_KPROBE(trace_filp_close)
  4833  {
  4834      program_data_t p = {};
  4835      if (!init_program_data(&p, ctx))
  4836          return 0;
  4837  
  4838      if (!should_trace(&p))
  4839          return 0;
  4840  
  4841      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  4842      file_info_t file_info = get_file_info(file);
  4843  
  4844      file_mod_key_t file_mod_key = {
  4845          p.task_info->context.host_pid, file_info.id.device, file_info.id.inode};
  4846  
  4847      bpf_map_delete_elem(&file_modification_map, &file_mod_key);
  4848  
  4849      return 0;
  4850  }
  4851  
  4852  statfunc int common_file_modification_ent(struct pt_regs *ctx)
  4853  {
  4854      struct file *file = (struct file *) PT_REGS_PARM1(ctx);
  4855  
  4856      // check if regular file. otherwise don't output the event.
  4857      unsigned short file_mode = get_inode_mode_from_file(file);
  4858      if ((file_mode & S_IFMT) != S_IFREG) {
  4859          return 0;
  4860      }
  4861  
  4862      u64 ctime = get_ctime_nanosec_from_file(file);
  4863  
  4864      args_t args = {};
  4865      args.args[0] = (unsigned long) file;
  4866      args.args[1] = ctime;
  4867      save_args(&args, FILE_MODIFICATION);
  4868  
  4869      return 0;
  4870  }
  4871  
  4872  statfunc int common_file_modification_ret(struct pt_regs *ctx)
  4873  {
  4874      args_t saved_args;
  4875      if (load_args(&saved_args, FILE_MODIFICATION) != 0)
  4876          return 0;
  4877      del_args(FILE_MODIFICATION);
  4878  
  4879      program_data_t p = {};
  4880      if (!init_program_data(&p, ctx))
  4881          return 0;
  4882  
  4883      if (!should_trace(&p))
  4884          return 0;
  4885  
  4886      if (!should_submit(FILE_MODIFICATION, p.event))
  4887          return 0;
  4888  
  4889      struct file *file = (struct file *) saved_args.args[0];
  4890      u64 old_ctime = saved_args.args[1];
  4891  
  4892      file_info_t file_info = get_file_info(file);
  4893  
  4894      file_mod_key_t file_mod_key = {
  4895          p.task_info->context.host_pid, file_info.id.device, file_info.id.inode};
  4896  
  4897      int *op = bpf_map_lookup_elem(&file_modification_map, &file_mod_key);
  4898      if (op == NULL || *op == FILE_MODIFICATION_SUBMIT) {
  4899          // we should submit the event once and mark as done.
  4900          int op = FILE_MODIFICATION_DONE;
  4901          bpf_map_update_elem(&file_modification_map, &file_mod_key, &op, BPF_ANY);
  4902      } else {
  4903          // no need to submit. return.
  4904          return 0;
  4905      }
  4906  
  4907      save_str_to_buf(&p.event->args_buf, file_info.pathname_p, 0);
  4908      save_to_submit_buf(&p.event->args_buf, &file_info.id.device, sizeof(dev_t), 1);
  4909      save_to_submit_buf(&p.event->args_buf, &file_info.id.inode, sizeof(unsigned long), 2);
  4910      save_to_submit_buf(&p.event->args_buf, &old_ctime, sizeof(u64), 3);
  4911      save_to_submit_buf(&p.event->args_buf, &file_info.id.ctime, sizeof(u64), 4);
  4912  
  4913      events_perf_submit(&p, FILE_MODIFICATION, 0);
  4914  
  4915      return 0;
  4916  }
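
        // Taken together, the probes around this helper form a small state machine,
        // keyed by (host pid, device, inode), that deduplicates file modification
        // events:
        //
        //     fd_install   -> key seeded with FILE_MODIFICATION_SUBMIT
        //     first modify -> event submitted, key flipped to FILE_MODIFICATION_DONE
        //     later modify -> swallowed while the key stays DONE
        //     filp_close   -> key deleted, so a reopen starts the cycle again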
  4917  
  4918  SEC("kprobe/file_update_time")
  4919  int BPF_KPROBE(trace_file_update_time)
  4920  {
  4921      return common_file_modification_ent(ctx);
  4922  }
  4923  
  4924  SEC("kretprobe/file_update_time")
  4925  int BPF_KPROBE(trace_ret_file_update_time)
  4926  {
  4927      return common_file_modification_ret(ctx);
  4928  }
  4929  
  4930  SEC("kprobe/file_modified")
  4931  int BPF_KPROBE(trace_file_modified)
  4932  {
  4933      /*
  4934       * we want this probe to run only on kernel versions >= 6.
  4935       * this is because on older kernels the file_modified() function calls the file_update_time()
  4936       * function. in those cases, we don't need this probe active.
  4937       */
  4938      if (bpf_core_field_exists(((struct file *) 0)->f_iocb_flags)) {
  4939          /* kernel version >= 6 */
  4940          return common_file_modification_ent(ctx);
  4941      }
  4942  
  4943      return 0;
  4944  }
  4945  
  4946  SEC("kretprobe/file_modified")
  4947  int BPF_KPROBE(trace_ret_file_modified)
  4948  {
  4949      /*
  4950       * we want this probe to run only on kernel versions >= 6.
  4951       * this is because on older kernels the file_modified() function calls the file_update_time()
  4952       * function. in those cases, we don't need this probe active.
  4953       */
  4954      if (bpf_core_field_exists(((struct file *) 0)->f_iocb_flags)) {
  4955          /* kernel version >= 6 */
  4956          return common_file_modification_ret(ctx);
  4957      }
  4958  
  4959      return 0;
  4960  }
  4961  
  4962  SEC("kprobe/inotify_find_inode")
  4963  TRACE_ENT_FUNC(inotify_find_inode, INOTIFY_WATCH);
  4964  
  4965  SEC("kretprobe/inotify_find_inode")
  4966  int BPF_KPROBE(trace_ret_inotify_find_inode)
  4967  {
  4968      args_t saved_args;
  4969      if (load_args(&saved_args, INOTIFY_WATCH) != 0)
  4970          return 0;
  4971      del_args(INOTIFY_WATCH);
  4972  
  4973      program_data_t p = {};
  4974      if (!init_program_data(&p, ctx))
  4975          return 0;
  4976  
  4977      if (!should_trace(&p))
  4978          return 0;
  4979  
  4980      if (!should_submit(INOTIFY_WATCH, p.event))
  4981          return 0;
  4982  
  4983      struct path *path = (struct path *) saved_args.args[1];
  4984  
  4985      void *path_str = get_path_str(path);
  4986  
  4987      struct dentry *dentry = BPF_CORE_READ(path, dentry);
  4988      u64 inode_nr = get_inode_nr_from_dentry(dentry);
  4989      dev_t dev = get_dev_from_dentry(dentry);
  4990  
  4991      save_str_to_buf(&p.event->args_buf, path_str, 0);
  4992      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 1);
  4993      save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2);
  4994  
  4995      return events_perf_submit(&p, INOTIFY_WATCH, 0);
  4996  }
  4997  
  4998  SEC("kprobe/exec_binprm")
  4999  TRACE_ENT_FUNC(exec_binprm, EXEC_BINPRM);
  5000  
  5001  SEC("kretprobe/exec_binprm")
  5002  int BPF_KPROBE(trace_ret_exec_binprm)
  5003  {
  5004      args_t saved_args;
  5005      if (load_args(&saved_args, EXEC_BINPRM) != 0) {
  5006          // missed entry or not traced
  5007          return 0;
  5008      }
  5009      del_args(EXEC_BINPRM);
  5010  
  5011      program_data_t p = {};
  5012      if (!init_program_data(&p, ctx))
  5013          return 0;
  5014  
  5015      if (!should_trace(&p))
  5016          return 0;
  5017  
  5018      if (!should_submit(PROCESS_EXECUTION_FAILED, p.event))
  5019          return 0;
  5020  
  5021      int ret_val = PT_REGS_RC(ctx);
  5022      if (ret_val == 0)
  5023          return 0; // not interested in successful execution - for that we have sched_process_exec
  5024  
  5025      struct linux_binprm *bprm = (struct linux_binprm *) saved_args.args[0];
  5026      if (bprm == NULL) {
  5027          return -1;
  5028      }
  5029  
  5030      struct file *file = get_file_ptr_from_bprm(bprm);
  5031  
  5032      const char *path = get_binprm_filename(bprm);
  5033      save_str_to_buf(&p.event->args_buf, (void *) path, 0);
  5034  
  5035      void *binary_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  5036      save_str_to_buf(&p.event->args_buf, binary_path, 1);
  5037  
  5038      dev_t binary_device_id = get_dev_from_file(file);
  5039      save_to_submit_buf(&p.event->args_buf, &binary_device_id, sizeof(dev_t), 2);
  5040  
  5041      unsigned long binary_inode_number = get_inode_nr_from_file(file);
  5042      save_to_submit_buf(&p.event->args_buf, &binary_inode_number, sizeof(unsigned long), 3);
  5043  
  5044      u64 binary_ctime = get_ctime_nanosec_from_file(file);
  5045      save_to_submit_buf(&p.event->args_buf, &binary_ctime, sizeof(u64), 4);
  5046  
  5047      umode_t binary_inode_mode = get_inode_mode_from_file(file);
  5048      save_to_submit_buf(&p.event->args_buf, &binary_inode_mode, sizeof(umode_t), 5);
  5049  
  5050      const char *interpreter_path = get_binprm_interp(bprm);
  5051      save_str_to_buf(&p.event->args_buf, (void *) interpreter_path, 6);
  5052  
  5053      bpf_tail_call(ctx, &prog_array, TAIL_EXEC_BINPRM1);
  5054      return -1;
  5055  }
  5056  
  5057  SEC("kretprobe/trace_ret_exec_binprm1")
  5058  int BPF_KPROBE(trace_ret_exec_binprm1)
  5059  {
  5060      program_data_t p = {};
  5061      if (!init_tailcall_program_data(&p, ctx))
  5062          return -1;
  5063  
  5064      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
  5065      struct file *stdin_file = get_struct_file_from_fd(0);
  5066  
  5067      unsigned short stdin_type = get_inode_mode_from_file(stdin_file) & S_IFMT;
  5068      save_to_submit_buf(&p.event->args_buf, &stdin_type, sizeof(unsigned short), 7);
  5069  
  5070      void *stdin_path = get_path_str(__builtin_preserve_access_index(&stdin_file->f_path));
  5071      save_str_to_buf(&p.event->args_buf, stdin_path, 8);
  5072  
  5073      int kernel_invoked = (get_task_parent_flags(task) & PF_KTHREAD) ? 1 : 0;
  5074      save_to_submit_buf(&p.event->args_buf, &kernel_invoked, sizeof(int), 9);
  5075  
  5076      bpf_tail_call(ctx, &prog_array, TAIL_EXEC_BINPRM2);
  5077      return -1;
  5078  }
  5079  
  5080  SEC("kretprobe/trace_ret_exec_binprm2")
  5081  int BPF_KPROBE(trace_ret_exec_binprm2)
  5082  {
  5083      program_data_t p = {};
  5084      if (!init_tailcall_program_data(&p, ctx))
  5085          return -1;
  5086  
  5087      syscall_data_t *sys = &p.task_info->syscall_data;
  5088      save_str_arr_to_buf(
  5089          &p.event->args_buf, (const char *const *) sys->args.args[1], 10); // userspace argv
  5090  
  5091      if (p.config->options & OPT_EXEC_ENV) {
  5092          save_str_arr_to_buf(
  5093              &p.event->args_buf, (const char *const *) sys->args.args[2], 11); // userspace envp
  5094      }
  5095  
  5096      int ret = PT_REGS_RC(ctx); // needs to be int
  5097  
  5098      return events_perf_submit(&p, PROCESS_EXECUTION_FAILED, ret);
  5099  }
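
        // The three programs above build a single PROCESS_EXECUTION_FAILED event
        // across bpf_tail_call() boundaries: each stage appends to the shared
        // p.event->args_buf (argument indexes 0-6, then 7-9, then 10-11), and only the
        // last stage submits. A minimal sketch of one chaining step (TAIL_NEXT_STAGE
        // is a placeholder index, not a real one):
        //
        //     bpf_tail_call(ctx, &prog_array, TAIL_NEXT_STAGE);
        //     return -1; // only reached if the tail call itself fails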
  5100  
  5101  SEC("kprobe/security_path_notify")
  5102  int BPF_KPROBE(trace_security_path_notify)
  5103  {
  5104      program_data_t p = {};
  5105      if (!init_program_data(&p, ctx))
  5106          return 0;
  5107  
  5108      if (!should_trace(&p))
  5109          return 0;
  5110  
  5111      if (!should_submit(SECURITY_PATH_NOTIFY, p.event))
  5112          return 0;
  5113  
  5114      struct path *path = (struct path *) PT_REGS_PARM1(ctx);
  5115      void *path_str = get_path_str(path);
  5116      struct dentry *dentry = BPF_CORE_READ(path, dentry);
  5117      u64 inode_nr = get_inode_nr_from_dentry(dentry);
  5118      dev_t dev = get_dev_from_dentry(dentry);
  5119  
  5120      u64 mask = PT_REGS_PARM2(ctx);
  5121      unsigned int obj_type = PT_REGS_PARM3(ctx);
  5122  
  5123      save_str_to_buf(&p.event->args_buf, path_str, 0);
  5124      save_to_submit_buf(&p.event->args_buf, &inode_nr, sizeof(unsigned long), 1);
  5125      save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev_t), 2);
  5126      save_to_submit_buf(&p.event->args_buf, &mask, sizeof(u64), 3);
  5127      save_to_submit_buf(&p.event->args_buf, &obj_type, sizeof(unsigned int), 4);
  5128  
  5129      return events_perf_submit(&p, SECURITY_PATH_NOTIFY, 0);
  5130  }
  5131  
  5132  // clang-format off
  5133  
  5134  // Network Packets (works from ~5.2 and beyond)
  5135  
  5136  // To track ingress/egress traffic we always need to link a flow to its related
  5137  // task (particularly when hooking ingress skb bpf programs, where the current
  5138  // task is typically a kernel thread).
  5139  
  5140  // In older kernels, managing cgroup skb programs can be more difficult due to
  5141  // the lack of bpf helpers and buggy/incomplete verifier. To deal with this,
  5142  // this approach uses a technique of kprobing the function responsible for
  5143  // calling the cgroup/skb programs.
  5144  
  5145  // Tracee utilizes a technique of kprobing the function responsible for calling
  5146  // the cgroup/skb programs in order to perform the tasks which cgroup skb
  5147  // programs would usually accomplish. Through this method, all the data needed
  5148  // by the cgroup/skb programs is already stored in a map.
  5149  
  5150  // Unfortunately this approach has some cons: the kprobe to cgroup/skb execution
  5151  // flow does not have preemption disabled, so the map shared between all the
  5152  // hooks needs a key that is available in every hook's context (the packet
  5153  // contents themselves: e.g. L3 header fields).
  5154  
  5155  // In the end, the logic is simple: every time a socket is created, an inode is
  5156  // also created. The task owning the socket is indexed by the socket inode, so
  5157  // every time the socket is used we know which task it belongs to (especially
  5158  // during the ingress hook, executed from softirq context within a kthread).
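
        // A minimal sketch of that indexing idea (illustrative only; the real map and
        // its types live in maps.h, and example_inode_to_task is a hypothetical name,
        // not a map this object defines):
        //
        //     struct {
        //         __uint(type, BPF_MAP_TYPE_LRU_HASH);
        //         __uint(max_entries, 65535);
        //         __type(key, u64);                  // socket inode number
        //         __type(value, net_task_context_t); // owning task context
        //     } example_inode_to_task SEC(".maps");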
  5159  
  5160  //
  5161  // network helper functions
  5162  //
  5163  
  5164  statfunc bool is_family_supported(struct socket *sock)
  5165  {
  5166      struct sock *sk = (void *) BPF_CORE_READ(sock, sk);
  5167      struct sock_common *common = (void *) sk;
  5168      u8 family = BPF_CORE_READ(common, skc_family);
  5169  
  5170      switch (family) {
  5171          case PF_INET:
  5172          case PF_INET6:
  5173              break;
  5174          // case PF_UNSPEC:
  5175          // case PF_LOCAL:      // PF_UNIX or PF_FILE
  5176          // case PF_NETLINK:
  5177          // case PF_VSOCK:
  5178          // case PF_XDP:
  5179          // case PF_BRIDGE:
  5180          // case PF_PACKET:
  5181          // case PF_MPLS:
  5182          // case PF_BLUETOOTH:
  5183          // case PF_IB:
  5184          // ...
  5185          default:
  5186              return 0; // not supported
  5187      }
  5188  
  5189      return 1; // supported
  5190  }
  5191  
  5192  statfunc bool is_socket_supported(struct socket *sock)
  5193  {
  5194      struct sock *sk = (void *) BPF_CORE_READ(sock, sk);
  5195      u16 protocol = get_sock_protocol(sk);
  5196      switch (protocol) {
  5197          // case IPPROTO_IPIP:
  5198          // case IPPROTO_DCCP:
  5199          // case IPPROTO_SCTP:
  5200          // case IPPROTO_UDPLITE:
  5201          case IPPROTO_IP:
  5202          case IPPROTO_IPV6:
  5203          case IPPROTO_TCP:
  5204          case IPPROTO_UDP:
  5205          case IPPROTO_ICMP:
  5206          case IPPROTO_ICMPV6:
  5207              break;
  5208          default:
  5209              return 0; // not supported
  5210      }
  5211  
  5212      return 1; // supported
  5213  }
  5214  
  5215  //
  5216  // Support functions for network code
  5217  //
  5218  
  5219  statfunc u64 sizeof_net_event_context_t(void)
  5220  {
  5221      return sizeof(net_event_context_t) - sizeof(net_event_contextmd_t);
  5222  }
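
        // The subtraction above only works because the metadata member is the trailing
        // field of net_event_context_t: everything before it is what userland should
        // receive, while the md tail stays kernel-only. A compile-time guard for that
        // layout assumption could look like this (sketch, assuming the md member name
        // used elsewhere in this file):
        //
        //     _Static_assert(offsetof(net_event_context_t, md) ==
        //                        sizeof(net_event_context_t) - sizeof(net_event_contextmd_t),
        //                    "md must be the trailing member");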
  5223  
  5224  statfunc void set_net_task_context(program_data_t *p, net_task_context_t *netctx)
  5225  {
  5226      netctx->task = p->task;
  5227      netctx->matched_policies = p->event->context.matched_policies;
  5228      netctx->syscall = p->event->context.syscall;
  5229      __builtin_memset(&netctx->taskctx, 0, sizeof(task_context_t));
  5230      __builtin_memcpy(&netctx->taskctx, &p->event->context.task, sizeof(task_context_t));
  5231  
  5232      // Normally this is filled inside events_perf_submit, but for some events,
  5233      // like set_socket_state, we want to prefill the full network context.
  5234      init_task_context(&netctx->taskctx, p->task, p->config->options);
  5235  }
  5236  
  5237  statfunc enum event_id_e net_packet_to_net_event(net_packet_t packet_type)
  5238  {
  5239      switch (packet_type) {
  5240          case CAP_NET_PACKET:
  5241              return NET_PACKET_CAP_BASE;
  5242          // Packets
  5243          case SUB_NET_PACKET_IP:
  5244              return NET_PACKET_IP;
  5245          case SUB_NET_PACKET_TCP:
  5246              return NET_PACKET_TCP;
  5247          case SUB_NET_PACKET_UDP:
  5248              return NET_PACKET_UDP;
  5249          case SUB_NET_PACKET_ICMP:
  5250              return NET_PACKET_ICMP;
  5251          case SUB_NET_PACKET_ICMPV6:
  5252              return NET_PACKET_ICMPV6;
  5253          case SUB_NET_PACKET_DNS:
  5254              return NET_PACKET_DNS;
  5255          case SUB_NET_PACKET_HTTP:
  5256              return NET_PACKET_HTTP;
  5257          case SUB_NET_PACKET_SOCKS5:
  5258              return NET_PACKET_SOCKS5;
  5259      }
  5260      return MAX_EVENT_ID;
  5261  }
  5262  
  5263  // The address of &neteventctx->eventctx is aligned because eventctx is the
  5264  // first member of that packed struct, so the warning is a false positive;
  5265  // we do need the neteventctx struct to be fully packed.
  5266  #pragma clang diagnostic push
  5267  #pragma clang diagnostic ignored "-Waddress-of-packed-member"
  5268  
  5269  // Return whether a network event should be submitted: if any of the policies
  5270  // matched, submit the network event. This means that if any of the policies
  5271  // need a network event, the kernel can submit the network base event and let
  5272  // userland deal with it (derived events will match the appropriate policies).
  5273  statfunc u64 should_submit_net_event(net_event_context_t *neteventctx,
  5274                                       net_packet_t packet_type)
  5275  {
  5276      // TODO after v0.15.0: After some testing, the caching is never used, as the net context is
  5277      // always a new one (created by the cgroup/skb program caller), AND there is a single map
  5278      // check per protocol for each protocol's submission decision. Go back to the changes made by
  5279      // commit #4e9bb610049 ("network: ebpf: lazy submit checks for net events"), but still using an
  5280      // enum and better code (this will improve the callers' syntax as well).
  5281      enum event_id_e evt_id = net_packet_to_net_event(packet_type);
  5282  
  5283      event_config_t *evt_config = bpf_map_lookup_elem(&events_map, &evt_id);
  5284      if (evt_config == NULL)
  5285          return 0;
  5286  
  5287      return evt_config->submit_for_policies & neteventctx->eventctx.matched_policies;
  5288  }
  5289  
  5290  #pragma clang diagnostic pop // -Waddress-of-packed-member
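
        // Worked example of the policy intersection above: with
        // submit_for_policies = 0b0110 (policies 1 and 2 want this event) and
        // matched_policies = 0b0100 (only policy 2 matched this task), the AND yields
        // 0b0100 != 0, so the kernel submits the base event and userland derives the
        // per-policy events from it.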
  5291  
  5292  // Return whether a network flow event should be submitted.
  5293  statfunc bool should_submit_flow_event(net_event_context_t *neteventctx)
  5294  {
  5295      switch (neteventctx->md.should_flow) {
  5296          case 0:
  5297              break;
  5298          case 1:
  5299              return true;
  5300          case 2:
  5301              return false;
  5302      }
  5303  
  5304      u32 evt_id = NET_FLOW_BASE;
  5305  
  5306      // Again, if any policy matched, submit the flow base event so other flow
  5307      // events can be derived in userland and their policies matched in userland.
  5308      event_config_t *evt_config = bpf_map_lookup_elem(&events_map, &evt_id);
  5309      if (evt_config == NULL)
  5310          return 0;
  5311  
  5312      u64 should = evt_config->submit_for_policies & neteventctx->eventctx.matched_policies;
  5313  
  5314      // Cache the result so next time we don't need to check again.
  5315      if (should)
  5316          neteventctx->md.should_flow = 1; // cache result: submit flow events
  5317      else
  5318          neteventctx->md.should_flow = 2; // cache result: don't submit flow events
  5319  
  5320      return should ? true : false;
  5321  }
  5322  
  5323  // Return whether a network capture event should be submitted.
  5324  statfunc u64 should_capture_net_event(net_event_context_t *neteventctx, net_packet_t packet_type)
  5325  {
  5326      if (neteventctx->md.captured) // already captured
  5327          return 0;
  5328  
  5329      return should_submit_net_event(neteventctx, packet_type);
  5330  }
  5331  
  5332  //
  5333  // Protocol parsing functions
  5334  //
  5335  
  5336  #define CGROUP_SKB_HANDLE_FUNCTION(name)                                       \
  5337  statfunc u32 cgroup_skb_handle_##name(                                         \
  5338      struct __sk_buff *ctx,                                                     \
  5339      net_event_context_t *neteventctx,                                          \
  5340      nethdrs *nethdrs,                                                          \
  5341      bool ingress                                                               \
  5342  )
  5343  
  5344  CGROUP_SKB_HANDLE_FUNCTION(family);
  5345  CGROUP_SKB_HANDLE_FUNCTION(proto);
  5346  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp);
  5347  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_dns);
  5348  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_http);
  5349  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_socks5);
  5350  CGROUP_SKB_HANDLE_FUNCTION(proto_udp);
  5351  CGROUP_SKB_HANDLE_FUNCTION(proto_udp_dns);
  5352  CGROUP_SKB_HANDLE_FUNCTION(proto_icmp);
  5353  CGROUP_SKB_HANDLE_FUNCTION(proto_icmpv6);
  5354  CGROUP_SKB_HANDLE_FUNCTION(proto_socks5);
  5355  
  5356  #define CGROUP_SKB_HANDLE(name) cgroup_skb_handle_##name(ctx, neteventctx, nethdrs, ingress);
  5357  
  5358  //
  5359  // Network submission functions
  5360  //
  5361  
  5362  // Submit a network event (packet, capture, flow) to userland.
  5363  statfunc u32 cgroup_skb_submit(void *map, struct __sk_buff *ctx,
  5364                                 net_event_context_t *neteventctx,
  5365                                 u32 event_type, u32 size)
  5366  {
  5367      size = size > FULL ? FULL : size;
  5368      switch (size) {
  5369          case HEADERS: // submit only headers
  5370              size = neteventctx->md.header_size;
  5371              break;
  5372          case FULL: // submit full packet
  5373              size = ctx->len;
  5374              break;
  5375          default: // submit size bytes
  5376              size += neteventctx->md.header_size;
  5377              size = size > ctx->len ? ctx->len : size;
  5378              break;
  5379      }
  5380  
  5381      // Flag eBPF subsystem to use current CPU and copy size bytes of payload.
  5382      u64 flags = BPF_F_CURRENT_CPU | (u64) size << 32;
  5383      neteventctx->bytes = size;
  5384  
  5385      // Set the event type before submitting event.
  5386      neteventctx->eventctx.eventid = event_type;
  5387  
  5388      // Submit the event.
  5389      return bpf_perf_event_output(ctx, map, flags, neteventctx, sizeof_net_event_context_t());
  5390  }
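
        // The flags encoding above follows the bpf_perf_event_output() contract for
        // skb programs: the low 32 bits select the CPU (BPF_F_CURRENT_CPU here), and
        // the high 32 bits (BPF_F_CTX_LEN_MASK) request that many bytes of the skb
        // payload be appended after the event struct. Worked example: size = 64 gives
        // flags = BPF_F_CURRENT_CPU | (64ULL << 32), so userland receives
        // sizeof_net_event_context_t() bytes of context followed by 64 packet bytes.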
  5391  
  5392  // Submit a network event.
  5393  #define cgroup_skb_submit_event(a, b, c, d) cgroup_skb_submit(&events, a, b, c, d)
  5394  
  5395  // Check if a flag is set in the retval.
  5396  #define retval_hasflag(flag) ((neteventctx->eventctx.retval & (flag)) == (flag))
  5397  
  5398  statfunc void update_flow_stats(struct __sk_buff *skb, netflowvalue_t *val, bool ingress) {
  5399      if (ingress) {
  5400          val->rx_bytes += skb->len;
  5401          val->rx_packets += 1;
  5402      } else {
  5403          val->tx_bytes += skb->len;
  5404          val->tx_packets += 1;
  5405      }
  5406  }
  5407  
  5408  statfunc void reset_flow_stats(netflowvalue_t *val) {
  5409  
  5410      val->tx_bytes = 0;
  5411      val->rx_bytes = 0;
  5412      val->tx_packets = 0;
  5413      val->rx_packets = 0;
  5414  }
  5415  
  5416  statfunc u32 submit_netflow_event(struct __sk_buff *ctx, net_event_context_t *neteventctx, netflowvalue_t *netflowvalptr) {
  5417      event_data_t *e = init_netflows_event_data();
  5418      if (unlikely(e == NULL))
  5419          return 0;
  5420  
  5421      __builtin_memcpy(&e->context.task, &neteventctx->eventctx.task, sizeof(task_context_t));
  5422      e->context.retval = neteventctx->eventctx.retval;
  5423  
  5424      save_to_submit_buf_kernel(&e->args_buf, (void *) &neteventctx->md.flow.proto, sizeof(u8), 0);
  5425      save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->direction, sizeof(u8), 1);
  5426      save_to_submit_buf_kernel(&e->args_buf, (void *) &neteventctx->md.flow.tuple, sizeof(tuple_t), 2);
  5427      save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->tx_bytes, sizeof(u64), 3);
  5428      save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->rx_bytes, sizeof(u64), 4);
  5429      save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->tx_packets, sizeof(u64), 5);
  5430      save_to_submit_buf_kernel(&e->args_buf, (void *) &netflowvalptr->rx_packets, sizeof(u64), 6);
  5431      net_events_perf_submit(ctx, NET_FLOW_BASE, e);
  5432      return 0;
  5433  }
  5434  
  5435  // Keep track of flow events if they are enabled and if any policy matched.
  5436  // Submit the flow base event so userland can derive the flow events.
  5437  statfunc u32 cgroup_skb_handle_flow(struct __sk_buff *ctx,
  5438                                      net_event_context_t *neteventctx,
  5439                                      u32 event_type, u32 size, u32 flow)
  5440  {
  5441      netflowvalue_t *netflowvalptr, netflowvalue = {
  5442                                         .last_update = bpf_ktime_get_ns(),
  5443                                         .direction = flow_unknown,
  5444                                     };
  5445  
  5446      // Set the current netctx task as the flow task.
  5447      neteventctx->md.flow.host_pid = neteventctx->eventctx.task.host_pid;
  5448  
  5449      // Set the flow event type in retval.
  5450      neteventctx->eventctx.retval |= flow;
  5451  
  5452      // Check if the current packet source is the flow initiator.
  5453      bool is_initiator = 0;
  5454      bool ingress = 0;
  5455  
  5456      switch (flow) {
  5457          // 1) TCP connection is being established.
  5458          case flow_tcp_begin:
  5459              // Ingress: Remote (src) is sending SYN+ACK: this host (dst) is the initiator.
  5460              if (retval_hasflag(packet_ingress)) {
  5461                  netflowvalue.direction = flow_outgoing;
  5462                  ingress = 1;
  5463              }
  5464  
  5465              // Egress: Host (src) is sending SYN+ACK: remote (dst) host is the initiator.
  5466              if (retval_hasflag(packet_egress))
  5467                  netflowvalue.direction = flow_incoming;
  5468  
  5469              // Invert src/dst: The flowmap src should always be set to flow initiator.
  5470              neteventctx->md.flow = invert_netflow(neteventctx->md.flow);
  5471              update_flow_stats(ctx, &netflowvalue, ingress);
  5472  
  5473              // Update the flow map.
  5474              bpf_map_update_elem(&netflowmap, &neteventctx->md.flow, &netflowvalue, BPF_NOEXIST);
  5475  
  5476              return submit_netflow_event(ctx, neteventctx, &netflowvalue);
  5477          // 2) TCP Flow sample with current statistics.
  5478          case flow_tcp_sample:
  5479              netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow);
  5480              if (!netflowvalptr) {
  5481                  neteventctx->md.flow = invert_netflow(neteventctx->md.flow);
  5482                  netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow);
  5483                  if (!netflowvalptr) {
  5484                      return 0;
  5485                  }
  5486              }
  5487  
  5488              update_flow_stats(ctx, netflowvalptr, retval_hasflag(packet_ingress));
  5489  
  5490              u64 now = bpf_ktime_get_ns();
  5491              u64 last_submit_seconds = (now - netflowvalptr->last_update) / 1000000000;
  5492              // Check if it's time to submit flow sample.
  5493              if (last_submit_seconds >= global_config.flow_sample_submit_interval_seconds) {
  5494                  netflowvalptr->last_update = now;
  5495                  submit_netflow_event(ctx, neteventctx, netflowvalptr);
  5496                  reset_flow_stats(netflowvalptr);
  5497                  return 0;
  5498              }
  5499  
  5500              // Flow sample should not be submitted yet, exit.
  5501              return 0;
  5502  
  5503          // 3) TCP connection is being closed/terminated.
  5504          case flow_tcp_end:
  5505              // Any side can close the connection (FIN, RST, etc). Need heuristics.
  5506  
  5507              // Attempt 01: Try to find the flow using current src/dst.
  5508              for (int n = 0; n < 3; n++) {
  5509                  netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow);
  5510                  if (netflowvalptr)
  5511                      break;
  5512              }
  5513  
  5514              // FIN could be sent by either side, by both, or by none (RST). Need heuristics.
  5515              if (!netflowvalptr) {
  5516                  // Attempt 02: Maybe this packet src wasn't the flow initiator, invert src/dst.
  5517                  neteventctx->md.flow = invert_netflow(neteventctx->md.flow);
  5518  
  5519                  for (int n = 0; n < 3; n++) {
  5520                      netflowvalptr = bpf_map_lookup_elem(&netflowmap, &neteventctx->md.flow);
  5521                      if (netflowvalptr)
  5522                          break;
  5523                  }
  5524  
  5525                  // After first FIN packet is processed the flow is deleted, so the second
  5526                  // FIN packet, if ever processed, will not find the flow in the map, and
  5527                  // that is ok.
  5528                  if (!netflowvalptr)
  5529                      return 0;
  5530  
  5531                  // Flow was found using inverted src/dst: current pkt dst was the flow initiator.
  5532                  is_initiator = 0;
  5533  
  5534              } else {
  5535                  // Flow was found using current src/dst: current pkt src was the flow initiator.
  5536                  is_initiator = 1;
  5537              }
  5538  
  5539              // Inform userland the flow being terminated started by current packet src.
  5540              // This is important so userland knows how to report flow termination correctly.
  5541              if (is_initiator)
  5542                  neteventctx->eventctx.retval |= flow_src_initiator;
  5543  
  5544              update_flow_stats(ctx, netflowvalptr, retval_hasflag(packet_ingress));
  5545              submit_netflow_event(ctx, neteventctx, netflowvalptr);
  5546  
  5547              // Delete the flow from the map (make sure to delete both sides).
  5548              bpf_map_delete_elem(&netflowmap, &neteventctx->md.flow);
  5549              neteventctx->md.flow = invert_netflow(neteventctx->md.flow);
  5550              bpf_map_delete_elem(&netflowmap, &neteventctx->md.flow);
  5551  
  5552              return 0;
  5553  
  5554          // 4) TODO: UDP flow is considered started when the first packet is sent.
  5555          // case flow_udp_begin:
  5556          //
  5557          // 5) TODO: UDP flow is considered terminated when the socket is closed.
  5558          // case flow_udp_end:
  5559          //
  5560          default:
  5561              return 0;
  5562      }
  5563  
  5564      return 0;
  5565  }
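        // A concrete walk-through of the heuristics above, with illustrative
        // roles: host A connects to host B and this program runs on A. The
        // SYN+ACK from B is seen as ingress, so direction is flow_outgoing and
        // src/dst are inverted before the map update, leaving A (the initiator)
        // as the flowmap key's src. FIN/RST packets may later be seen with
        // either orientation, which is why flow_tcp_end looks the tuple up
        // as-is first and inverted second, and sets flow_src_initiator only
        // when the terminating packet's src matches the stored initiator.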
  5566  
  5567  // Check if capture event should be submitted, cache the result and submit.
  5568  #define cgroup_skb_capture()                                                                       \
  5569      {                                                                                              \
  5570          if (should_submit_net_event(neteventctx, CAP_NET_PACKET)) {                                \
  5571              if (neteventctx->md.captured == 0) {                                                   \
  5572                  cgroup_skb_capture_event(ctx, neteventctx, NET_CAPTURE_BASE);                      \
  5573                  neteventctx->md.captured = 1;                                                      \
  5574              }                                                                                      \
  5575          }                                                                                          \
  5576      }
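        // Note: cgroup_skb_capture() is deliberately a macro rather than a
        // function: it picks up ctx and neteventctx from the calling handler's
        // scope, and its body is only expanded at the call sites below, after
        // cgroup_skb_capture_event() has been defined.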
  5577  
  5578  // Check if packet should be captured and submit the capture base event.
  5579  statfunc u32 cgroup_skb_capture_event(struct __sk_buff *ctx,
  5580                                        net_event_context_t *neteventctx,
  5581                                        u32 event_type)
  5582  {
  5583      int zero = 0;
  5584  
  5585      // Pick the network config map to know the requested capture length.
  5586      netconfig_entry_t *nc = bpf_map_lookup_elem(&netconfig_map, &zero);
  5587      if (nc == NULL)
  5588          return 0;
  5589  
  5590      // Submit the capture base event.
  5591      return cgroup_skb_submit(&net_cap_events, ctx, neteventctx, event_type, nc->capture_length);
  5592  }
  5593  
  5594  //
  5595  // Socket creation and socket <=> task context updates
  5596  //
  5597  
  5598  // Used to create a file descriptor for a socket. After a file descriptor is
  5599  // created, it can be associated with the file operations of the socket; this
  5600  // allows a socket to be used with the standard file operations (read, write,
  5601  // etc). By having a file descriptor, the kernel can keep track of the socket
  5602  // state, and also of the inode associated with the socket (which is used to
  5603  // link the socket to a task).
  5604  SEC("kprobe/sock_alloc_file")
  5605  int BPF_KPROBE(trace_sock_alloc_file)
  5606  {
  5607      // runs every time a socket is created (entry)
  5608  
  5609      struct socket *sock = (void *) PT_REGS_PARM1(ctx);
  5610  
  5611      if (!is_family_supported(sock))
  5612          return 0;
  5613  
  5614      if (!is_socket_supported(sock))
  5615          return 0;
  5616  
  5617      struct entry entry = {0};
  5618  
  5619      // save args for retprobe
  5620      entry.args[0] = PT_REGS_PARM1(ctx); // struct socket *sock
  5621      entry.args[1] = PT_REGS_PARM2(ctx); // int flags
  5622      entry.args[2] = PT_REGS_PARM3(ctx); // char *dname
  5623  
  5624      // prepare for kretprobe using entrymap
  5625      u32 host_tid = bpf_get_current_pid_tgid();
  5626      bpf_map_update_elem(&entrymap, &host_tid, &entry, BPF_ANY);
  5627  
  5628      return 0;
  5629  }
  5630  
  5631  // Ditto.
  5632  SEC("kretprobe/sock_alloc_file")
  5633  int BPF_KRETPROBE(trace_ret_sock_alloc_file)
  5634  {
  5635      // runs every time a socket is created (return)
  5636  
  5637      program_data_t p = {};
  5638      if (!init_program_data(&p, ctx))
  5639          return 0;
  5640  
  5641      if (!should_trace(&p))
  5642          return 0;
  5643  
  5644      // pick from entry from entrymap
  5645      u32 host_tid = p.event->context.task.host_tid;
  5646      struct entry *entry = bpf_map_lookup_elem(&entrymap, &host_tid);
  5647      if (!entry) // no entry == no tracing
  5648          return 0;
  5649  
  5650      // pick args from entry point's entry
  5651      // struct socket *sock = (void *) entry->args[0];
  5652      // int flags = entry->args[1];
  5653      // char *dname = (void *) entry->args[2];
  5654      struct file *sock_file = (void *) PT_REGS_RC(ctx);
  5655  
  5656      // cleanup entrymap
  5657      bpf_map_delete_elem(&entrymap, &host_tid);
  5658  
  5659      if (!sock_file)
  5660          return 0; // socket() failed ?
  5661  
  5662      u64 inode = BPF_CORE_READ(sock_file, f_inode, i_ino);
  5663      if (inode == 0)
  5664          return 0;
  5665  
  5666      // save context to further create an event when no context exists
  5667      net_task_context_t netctx = {0};
  5668      set_net_task_context(&p, &netctx);
  5669  
  5670      // update inodemap correlating inode <=> task
  5671      bpf_map_update_elem(&inodemap, &inode, &netctx, BPF_ANY);
  5672  
  5673      return 0;
  5674  }
  5675  
  5676  SEC("kprobe/security_sk_clone")
  5677  int BPF_KPROBE(trace_security_sk_clone)
  5678  {
  5679      // When a "sock" is cloned because of a SYN packet, a new "sock" is created
  5680      // and the return value is the new "sock" (not the original one).
  5681      //
  5682      // There is a problem though, the "sock" does not contain a valid "socket"
  5683      // associated to it yet (sk_socket is NULL as this is running with SoftIRQ
  5684      // context). Without a "socket" we also don't have a "file" associated to
  5685      // it, nor an inode associated to that file. This is the way tracee links
  5686      // a network flow (packets) to a task.
  5687      //
  5688      // The only way we can relate this new "sock", just cloned by a kernel
  5689      // thread, to a task, is through the existence of the old "sock" struct,
  5690      // describing the listening socket (one accept() was called for).
  5691      //
  5692      // Then, by knowing the old "sock" (with an existing socket, an existing
  5693      // file, an existing inode), we're able to link this new "sock" to the task
  5694      // we're tracing for the old "sock".
  5695      //
  5696      // In bullets:
  5697      //
  5698      // - tracing a process that has a socket listening for connections.
  5699      // - it receives a SYN packet and a new socket can be created (accept).
  5700      // - a sock (socket descriptor) is created for the socket to be created.
  5701      // - no socket/inode exists yet (sock->sk_socket is NULL).
  5702      // - accept() traces are too late for initial pkts (socket does not exist).
  5703      // - by linking old "sock" to the new "sock" we can relate the task.
  5704      // - some of the initial packets, sometimes with big length, are traced now.
  5705      //
  5706      // More at: https://github.com/aquasecurity/tracee/issues/2739
  5707  
  5708      struct sock *osock = (void *) PT_REGS_PARM1(ctx);
  5709      struct sock *nsock = (void *) PT_REGS_PARM2(ctx);
  5710  
  5711      struct socket *osocket = BPF_CORE_READ(osock, sk_socket);
  5712      if (!osocket)
  5713          return 0;
  5714  
  5715      // obtain old socket inode
  5716      u64 inode = BPF_CORE_READ(osocket, file, f_inode, i_ino);
  5717      if (inode == 0)
  5718          return 0;
  5719  
  5720      // check if old socket family is supported
  5721      if (!is_family_supported(osocket))
  5722          return 0;
  5723  
  5724      // if the original socket isn't linked to a task, then the newly cloned
  5725      // socket won't need to be linked as well: return in that case
  5726  
  5727      net_task_context_t *netctx = bpf_map_lookup_elem(&inodemap, &inode);
  5728      if (!netctx) {
  5729          return 0; // e.g. task isn't being traced
  5730      }
  5731  
  5732      u64 nsockptr = (u64)(void *) nsock;
  5733  
  5734      // link the new "sock" to the old inode, so it can be linked to a task later
  5735  
  5736      bpf_map_update_elem(&sockmap, &nsockptr, &inode, BPF_ANY);
  5737  
  5738      return 0;
  5739  }
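        // Putting the pieces together (illustrative walk-through): a traced
        // server's listening socket has inode 4242, recorded in inodemap by the
        // probes in this file. A SYN arrives, the kernel clones a new "sock"
        // with no socket/file/inode yet, and this probe maps the new sock
        // pointer to inode 4242 in sockmap. When the cgroup/skb kprobe later
        // sees packets on the cloned sock, it resolves the task context via
        // sockmap -> inodemap even before accept() completes.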
  5740  
  5741  // Associate a socket to a task. This is done by linking the socket inode to the
  5742  // task context (inside netctx). This is done when a socket is created, and also
  5743  // when a socket is cloned (e.g. when a SYN packet is received and a new socket
  5744  // is created).
  5745  statfunc u32 update_net_inodemap(struct socket *sock, program_data_t *p)
  5746  {
  5747      struct file *sock_file = BPF_CORE_READ(sock, file);
  5748      if (!sock_file)
  5749          return 0;
  5750  
  5751      u64 inode = BPF_CORE_READ(sock_file, f_inode, i_ino);
  5752      if (inode == 0)
  5753          return 0;
  5754  
  5755      // save updated context to the inode map (inode <=> task ctx relation)
  5756      net_task_context_t netctx = {0};
  5757      set_net_task_context(p, &netctx);
  5758  
  5759      bpf_map_update_elem(&inodemap, &inode, &netctx, BPF_ANY);
  5760  
  5761      return 0;
  5762  }
  5763  
  5764  // Called by recv system calls (e.g. recvmsg, recvfrom, recv, ...), when data
  5765  // arrives at the network stack destined for a socket, or during socket buffer
  5766  // management when the kernel copies data from the network buffer to the
  5767  // socket buffer.
  5768  SEC("kprobe/security_socket_recvmsg")
  5769  int BPF_KPROBE(trace_security_socket_recvmsg)
  5770  {
  5771      struct socket *sock = (void *) PT_REGS_PARM1(ctx);
  5772      if (sock == NULL)
  5773          return 0;
  5774      if (!is_family_supported(sock))
  5775          return 0;
  5776      if (!is_socket_supported(sock))
  5777          return 0;
  5778  
  5779      program_data_t p = {};
  5780      if (!init_program_data(&p, ctx))
  5781          return 0;
  5782  
  5783      if (!should_trace(&p))
  5784          return 0;
  5785  
  5786      return update_net_inodemap(sock, &p);
  5787  }
  5788  
  5789  // Called by send system calls (e.g. sendmsg, sendto, send, ...), when data
  5790  // is queued for transmission by the network stack, or during socket buffer
  5791  // management when the kernel copies data from the socket buffer to the
  5792  // network buffer.
  5793  SEC("kprobe/security_socket_sendmsg")
  5794  int BPF_KPROBE(trace_security_socket_sendmsg)
  5795  {
  5796      struct socket *sock = (void *) PT_REGS_PARM1(ctx);
  5797      if (sock == NULL)
  5798          return 0;
  5799      if (!is_family_supported(sock))
  5800          return 0;
  5801      if (!is_socket_supported(sock))
  5802          return 0;
  5803  
  5804      program_data_t p = {};
  5805      if (!init_program_data(&p, ctx))
  5806          return 0;
  5807  
  5808      if (!should_trace(&p))
  5809          return 0;
  5810  
  5811      return update_net_inodemap(sock, &p);
  5812  }
  5813  
  5814  //
  5815  // Socket Ingress/Egress eBPF program loader (this kprobe runs right before the CGROUP/SKB programs)
  5816  //
  5817  
  5818  SEC("kprobe/__cgroup_bpf_run_filter_skb")
  5819  int BPF_KPROBE(cgroup_bpf_run_filter_skb)
  5820  {
  5821      // runs BEFORE the CGROUP/SKB eBPF program
  5822  
  5823      void *cgrpctxmap = NULL;
  5824  
  5825      struct sock *sk = (void *) PT_REGS_PARM1(ctx);
  5826      struct sk_buff *skb = (void *) PT_REGS_PARM2(ctx);
  5827      int type = PT_REGS_PARM3(ctx);
  5828  
  5829      if (!sk || !skb)
  5830          return 0;
  5831  
  5832      s64 packet_dir_flag; // used later to set packet direction flag
  5833      switch (type) {
  5834          case BPF_CGROUP_INET_INGRESS:
  5835              cgrpctxmap = &cgrpctxmap_in;
  5836              packet_dir_flag = packet_ingress;
  5837              break;
  5838          case BPF_CGROUP_INET_EGRESS:
  5839              cgrpctxmap = &cgrpctxmap_eg;
  5840              packet_dir_flag = packet_egress;
  5841              break;
  5842          default:
  5843              return 0; // other attachment type, return fast
  5844      }
  5845  
  5846      struct sock_common *common = (void *) sk;
  5847      u8 family = BPF_CORE_READ(common, skc_family);
  5848  
  5849      switch (family) {
  5850          case PF_INET:
  5851          case PF_INET6:
  5852              break;
  5853          default:
  5854              return 1; // return fast for unsupported socket families
  5855      }
  5856  
  5857      //
  5858      // EVENT CONTEXT (from current task, might be a kernel context/thread)
  5859      //
  5860  
  5861      u32 zero = 0;
  5862      event_data_t *e = bpf_map_lookup_elem(&net_heap_event, &zero);
  5863      if (unlikely(e == NULL))
  5864          return 0;
  5865  
  5866      program_data_t p = {};
  5867      p.scratch_idx = 1;
  5868      p.event = e;
  5869      if (!init_program_data(&p, ctx))
  5870          return 0;
  5871  
  5872      bool mightbecloned = false; // cloned sock structs come from accept()
  5873  
  5874      // obtain the socket inode using current "sock" structure
  5875  
  5876      u64 inode = BPF_CORE_READ(sk, sk_socket, file, f_inode, i_ino);
  5877      if (inode == 0)
  5878          mightbecloned = true; // kernel threads might have zero inode
  5879  
  5880      struct net_task_context *netctx;
  5881  
  5882      // obtain the task ctx using the obtained socket inode
  5883  
  5884      if (!mightbecloned) {
  5885          // pick network context from the inodemap (inode <=> task)
  5886          netctx = bpf_map_lookup_elem(&inodemap, &inode);
  5887          if (!netctx)
  5888              mightbecloned = true; // e.g. task isn't being traced
  5889      }
  5890  
  5891      // If inode is zero, or task context couldn't be found, try to find it using
  5892      // the "sock" pointer from sockmap (this sock struct might be new, just
  5893      // cloned, and a socket might not exist yet, but the sockmap is likely to
  5894      // have the entry). Check trace_security_sk_clone() for more details.
  5895  
  5896      if (mightbecloned) {
  5897          // pick network context from the sockmap (new sockptr <=> old inode <=> task)
  5898          u64 skptr = (u64) (void *) sk;
  5899          u64 *o = bpf_map_lookup_elem(&sockmap, &skptr);
  5900          if (o == 0)
  5901              return 0;
  5902          u64 oinode = *o;
  5903  
  5904          // with the old inode, find the netctx for the task
  5905          netctx = bpf_map_lookup_elem(&inodemap, &oinode);
  5906          if (!netctx)
  5907              return 0; // old inode wasn't being traced as well
  5908  
  5909          // refresh the old inode <=> task context entry (keeps the mapping warm)
  5910          bpf_map_update_elem(&inodemap, &oinode, netctx, BPF_ANY);
  5911      }
  5912  
  5913  // CHECK: should_submit_net_event() for more info
  5914  #pragma clang diagnostic push
  5915  #pragma clang diagnostic ignored "-Waddress-of-packed-member"
  5916  
  5917      //
  5918      // PREPARE SKB PROGRAM EVENT CONTEXT (cgrpctxmap value)
  5919      //
  5920  
  5921      // Prepare [event_context_t][args1,arg2,arg3...] to be sent by the cgroup/skb
  5922      // program. The [...] part of the event can't use the existing per-cpu submit
  5923      // buffer helpers because the window between this kprobe firing and the
  5924      // cgroup/skb program running might suffer a preemption.
  5925  
  5926      net_event_context_t neteventctx = {0}; // to be sent by cgroup/skb program
  5927      event_context_t *eventctx = &neteventctx.eventctx;
  5928  
  5929  #pragma clang diagnostic pop
  5930  
  5931      // copy orig task ctx (from the netctx) to event ctx and build the rest
  5932      __builtin_memcpy(&eventctx->task, &netctx->taskctx, sizeof(task_context_t));
  5933      eventctx->ts = p.event->context.ts;                     // copy timestamp from current ctx
  5934      neteventctx.argnum = 1;                                 // 1 argument (add more if needed)
  5935      eventctx->eventid = NET_PACKET_IP;                      // will be changed in skb program
  5936      eventctx->stack_id = 0;                                 // no stack trace
  5937      eventctx->processor_id = p.event->context.processor_id; // copy from current ctx
  5938      eventctx->matched_policies = netctx->matched_policies;  // pick matched_policies from net ctx
  5939      eventctx->syscall = NO_SYSCALL;                         // ingress has no orig syscall
  5940      if (type == BPF_CGROUP_INET_EGRESS)
  5941          eventctx->syscall = netctx->syscall; // egress does have an orig syscall
  5942  
  5943      //
  5944      // SKB PROGRAM CONTEXT INDEXER (cgrpctxmap key)
  5945      //
  5946  
  5947      u32 l3_size = 0;
  5948      nethdrs hdrs = {0}, *nethdrs = &hdrs;
  5949  
  5950      // inform userland about protocol family (for correct L3 header parsing)...
  5951      switch (family) {
  5952          case PF_INET:
  5953              eventctx->retval |= family_ipv4;
  5954              l3_size = get_type_size(struct iphdr);
  5955              break;
  5956          case PF_INET6:
  5957              eventctx->retval |= family_ipv6;
  5958              l3_size = get_type_size(struct ipv6hdr);
  5959              break;
  5960          default:
  5961              return 1;
  5962      }
  5963  
  5964      // ... and packet direction (ingress/egress) ...
  5965      eventctx->retval |= packet_dir_flag;
  5966      // ... through event ctx ret val.
  5967  
  5968      // Read packet headers from the skb.
  5969      void *data_ptr = BPF_CORE_READ(skb, head) + BPF_CORE_READ(skb, network_header);
  5970      bpf_core_read(nethdrs, l3_size, data_ptr);
  5971  
  5972      // Prepare the inter-eBPF-program indexer.
  5973      indexer_t indexer = {0};
  5974      indexer.ts = BPF_CORE_READ(skb, tstamp);
  5975  
  5976      u8 proto = 0;
  5977  
  5978      // Parse the packet layer 3 headers.
  5979      switch (family) {
  5980          case PF_INET:
  5981              if (nethdrs->iphdrs.iphdr.version != 4) // IPv4
  5982                  return 1;
  5983  
  5984              if (nethdrs->iphdrs.iphdr.ihl > 5) { // re-read IP header if needed
  5985                  l3_size -= get_type_size(struct iphdr);
  5986                  l3_size += nethdrs->iphdrs.iphdr.ihl * 4;
  5987                  bpf_core_read(nethdrs, l3_size, data_ptr);
  5988              }
  5989  
  5990              proto = nethdrs->iphdrs.iphdr.protocol;
  5991              switch (proto) {
  5992                  case IPPROTO_TCP:
  5993                  case IPPROTO_UDP:
  5994                  case IPPROTO_ICMP:
  5995                      break;
  5996                  default:
  5997                      return 1; // ignore other protocols
  5998              }
  5999  
  6000              // Update inter-eBPF-program indexer with IPv4 header items.
  6001              indexer.ip_csum = nethdrs->iphdrs.iphdr.check;
  6002              indexer.src.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.saddr;
  6003              indexer.dst.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.daddr;
  6004              break;
  6005  
  6006          case PF_INET6:
  6007              // TODO: dual-stack IP implementation unsupported for now
  6008              // https://en.wikipedia.org/wiki/IPv6_transition_mechanism
  6009              if (nethdrs->iphdrs.ipv6hdr.version != 6) // IPv6
  6010                  return 1;
  6011  
  6012              proto = nethdrs->iphdrs.ipv6hdr.nexthdr;
  6013              switch (proto) {
  6014                  case IPPROTO_TCP:
  6015                  case IPPROTO_UDP:
  6016                  case IPPROTO_ICMPV6:
  6017                      break;
  6018                  default:
  6019                      return 1; // ignore other protocols
  6020              }
  6021  
  6022              // Update inter-eBPF-program indexer with IPv6 header items.
  6023              __builtin_memcpy(&indexer.src.in6_u, &nethdrs->iphdrs.ipv6hdr.saddr.in6_u, 4 * sizeof(u32));
  6024              __builtin_memcpy(&indexer.dst.in6_u, &nethdrs->iphdrs.ipv6hdr.daddr.in6_u, 4 * sizeof(u32));
  6025              break;
  6026  
  6027          default:
  6028              return 1;
  6029      }
  6030  
  6031      //
  6032      // LINK CONTENT INDEXER TO EVENT CONTEXT
  6033      //
  6034  
  6035      neteventctx.bytes = 0; // event arg size: no payload by default (changed inside skb prog)
  6036  
  6037      // TODO: log collisions
  6038      bpf_map_update_elem(cgrpctxmap, &indexer, &neteventctx, BPF_NOEXIST);
  6039  
  6040      return 0;
  6041  }
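        // How this kprobe and the cgroup/skb programs below meet: both build an
        // identical indexer_t key for the same skb (skb timestamp, the IPv4
        // checksum, and src/dst addresses), so the event context prepared here
        // can be looked up from cgrpctxmap_in/cgrpctxmap_eg moments later
        // without relying on per-cpu scratch space that a preemption could
        // clobber.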
  6042  
  6043  //
  6044  // SKB eBPF programs
  6045  //
  6046  
  6047  statfunc u32 cgroup_skb_generic(struct __sk_buff *ctx, void *cgrpctxmap, bool ingress)
  6048  {
  6049      // IMPORTANT: runs for EVERY packet of tasks belonging to root cgroup
  6050  
  6051      switch (ctx->family) {
  6052          case PF_INET:
  6053          case PF_INET6:
  6054              break;
  6055          default:
  6056              return 1; // PF_INET and PF_INET6 only
  6057      }
  6058  
  6059      // HANDLE SOCKET FAMILY
  6060  
  6061      struct bpf_sock *sk = ctx->sk;
  6062      if (!sk)
  6063          return 1;
  6064  
  6065      sk = bpf_sk_fullsock(sk);
  6066      if (!sk)
  6067          return 1;
  6068  
  6069      nethdrs hdrs = {0}, *nethdrs = &hdrs;
  6070  
  6071      void *dest;
  6072  
  6073      u32 size = 0;
  6074      u32 family = ctx->family;
  6075  
  6076      switch (family) {
  6077          case PF_INET:
  6078              dest = &nethdrs->iphdrs.iphdr;
  6079              size = get_type_size(struct iphdr);
  6080              break;
  6081          case PF_INET6:
  6082              dest = &nethdrs->iphdrs.ipv6hdr;
  6083              size = get_type_size(struct ipv6hdr);
  6084              break;
  6085          default:
  6086              return 1; // verifier
  6087      }
  6088  
  6089      // load layer 3 headers (for cgrpctxmap key/indexer)
  6090  
  6091      if (bpf_skb_load_bytes_relative(ctx, 0, dest, size, BPF_HDR_START_NET))
  6092          return 1;
  6093  
  6094      //
  6095      // IGNORE UNSUPPORTED PROTOCOLS, CREATE INDEXER TO OBTAIN EVENT
  6096      //
  6097  
  6098      indexer_t indexer = {0};
  6099      indexer.ts = ctx->tstamp;
  6100  
  6101      u32 ihl = 0;
  6102      switch (family) {
  6103          case PF_INET:
  6104              if (nethdrs->iphdrs.iphdr.version != 4) // IPv4
  6105                  return 1;
  6106  
  6107              ihl = nethdrs->iphdrs.iphdr.ihl;
  6108              if (ihl > 5) { // re-read IPv4 header if needed
  6109                  size -= get_type_size(struct iphdr);
  6110                  size += ihl * 4;
  6111                  bpf_skb_load_bytes_relative(ctx, 0, dest, size, BPF_HDR_START_NET);
  6112              }
  6113  
  6114              switch (nethdrs->iphdrs.iphdr.protocol) {
  6115                  case IPPROTO_TCP:
  6116                  case IPPROTO_UDP:
  6117                  case IPPROTO_ICMP:
  6118                      break;
  6119                  default:
  6120                      return 1; // unsupported proto
  6121              }
  6122  
  6123              // add IPv4 header items to indexer
  6124              indexer.ip_csum = nethdrs->iphdrs.iphdr.check;
  6125              indexer.src.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.saddr;
  6126              indexer.dst.in6_u.u6_addr32[0] = nethdrs->iphdrs.iphdr.daddr;
  6127              break;
  6128  
  6129          case PF_INET6:
  6130              // TODO: dual-stack IP implementation unsupported for now
  6131              // https://en.wikipedia.org/wiki/IPv6_transition_mechanism
  6132              if (nethdrs->iphdrs.ipv6hdr.version != 6) // IPv6
  6133                  return 1;
  6134  
  6135              switch (nethdrs->iphdrs.ipv6hdr.nexthdr) {
  6136                  case IPPROTO_TCP:
  6137                  case IPPROTO_UDP:
  6138                  case IPPROTO_ICMPV6:
  6139                      break;
  6140                  default:
  6141                      return 1; // unsupported proto
  6142              }
  6143  
  6144              // add IPv6 header items to indexer
  6145              __builtin_memcpy(&indexer.src.in6_u, &nethdrs->iphdrs.ipv6hdr.saddr.in6_u, 4 * sizeof(u32));
  6146              __builtin_memcpy(&indexer.dst.in6_u, &nethdrs->iphdrs.ipv6hdr.daddr.in6_u, 4 * sizeof(u32));
  6147              break;
  6148  
  6149          default:
  6150              return 1; // verifier
  6151      }
  6152  
  6153      net_event_context_t *neteventctx;
  6154      neteventctx = bpf_map_lookup_elem(cgrpctxmap, &indexer); // obtain event context
  6155      if (!neteventctx) {
  6156          // 1. kthreads receiving ICMP and ICMPv6 (e.g. dest unreach)
  6157          // 2. tasks not being traced
  6158          // 3. unknown (yet) sockets (need egress packet to link task and inode)
  6159          // ...
  6160          return 1;
  6161      }
  6162  
  6163      // Skip if cgroup is muted.
  6164      u64 cgroup_id = neteventctx->eventctx.task.cgroup_id;
  6165      if (bpf_map_lookup_elem(&ignored_cgroups_map, &cgroup_id)) {
  6166          return 1;
  6167      }
  6168  
  6169      neteventctx->md.header_size = size; // add header size to offset
  6170  
  6171      u32 ret = CGROUP_SKB_HANDLE(proto);
  6172  
  6173      bpf_map_delete_elem(cgrpctxmap, &indexer); // cleanup
  6174  
  6175      return ret; // important for network blocking
  6176  }
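        // Reminder on return values: for cgroup/skb (BPF_PROG_TYPE_CGROUP_SKB)
        // programs, returning 1 allows the packet and 0 drops it, which is why
        // the handlers return 1 even for packets they ignore.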
  6177  
  6178  SEC("cgroup_skb/ingress")
  6179  int cgroup_skb_ingress(struct __sk_buff *ctx)
  6180  {
  6181      return cgroup_skb_generic(ctx, &cgrpctxmap_in, true);
  6182  }
  6183  
  6184  SEC("cgroup_skb/egress")
  6185  int cgroup_skb_egress(struct __sk_buff *ctx)
  6186  {
  6187      return cgroup_skb_generic(ctx, &cgrpctxmap_eg, false);
  6188  }
  6189  
  6190  //
  6191  // Network Protocol Events Logic
  6192  //
  6193  
  6194  //
  6195  // SUPPORTED L3 NETWORK PROTOCOLS (ip, ipv6) HANDLERS
  6196  //
  6197  
  6198  CGROUP_SKB_HANDLE_FUNCTION(proto)
  6199  {
  6200      void *dest = NULL;
  6201      u32 prev_hdr_size = neteventctx->md.header_size;
  6202      u32 size = 0;
  6203      u8 next_proto = 0;
  6204  
  6205      // NOTE: might block IP and IPv6 here if needed (return 0)
  6206  
  6207      switch (ctx->family) {
  6208          case PF_INET:
  6209              if (nethdrs->iphdrs.iphdr.version != 4) // IPv4
  6210                  return 1;
  6211  
  6212              next_proto = nethdrs->iphdrs.iphdr.protocol;
  6213              switch (next_proto) {
  6214                  case IPPROTO_TCP:
  6215                      dest = &nethdrs->protohdrs.tcphdr;
  6216                      size = get_type_size(struct tcphdr);
  6217                      break;
  6218                  case IPPROTO_UDP:
  6219                      dest = &nethdrs->protohdrs.udphdr;
  6220                      size = get_type_size(struct udphdr);
  6221                      break;
  6222                  case IPPROTO_ICMP:
  6223                      dest = &nethdrs->protohdrs.icmphdr;
  6224                      size = 0; // header size is set later, by the ICMP handler
  6225                      break;
  6226                  default:
  6227                      return 1; // other protocols are not an error
  6228              }
  6229  
  6230              // Update the network flow map indexer with the packet headers.
  6231              neteventctx->md.flow.tuple.saddr.v4addr = nethdrs->iphdrs.iphdr.saddr;
  6232              neteventctx->md.flow.tuple.daddr.v4addr = nethdrs->iphdrs.iphdr.daddr;
  6233              neteventctx->md.flow.tuple.family = AF_INET;
  6234              break;
  6235  
  6236          case PF_INET6:
  6237              // TODO: dual-stack IP implementation unsupported for now
  6238              // https://en.wikipedia.org/wiki/IPv6_transition_mechanism
  6239              if (nethdrs->iphdrs.ipv6hdr.version != 6) // IPv6
  6240                  return 1;
  6241  
  6242              next_proto = nethdrs->iphdrs.ipv6hdr.nexthdr;
  6243              switch (next_proto) {
  6244                  case IPPROTO_TCP:
  6245                      dest = &nethdrs->protohdrs.tcphdr;
  6246                      size = get_type_size(struct tcphdr);
  6247                      break;
  6248                  case IPPROTO_UDP:
  6249                      dest = &nethdrs->protohdrs.udphdr;
  6250                      size = get_type_size(struct udphdr);
  6251                      break;
  6252                  case IPPROTO_ICMPV6:
  6253                      dest = &nethdrs->protohdrs.icmp6hdr;
  6254                      size = 0; // header size is set later, by the ICMPv6 handler
  6255                      break;
  6256                  default:
  6257                      return 1; // other protocols are not an error
  6258              }
  6259  
  6260              // Update the network flow map indexer with the packet headers.
  6261              __builtin_memcpy(&neteventctx->md.flow.tuple.saddr.v6addr, &nethdrs->iphdrs.ipv6hdr.saddr.in6_u, 4 * sizeof(u32));
  6262              __builtin_memcpy(&neteventctx->md.flow.tuple.daddr.v6addr, &nethdrs->iphdrs.ipv6hdr.daddr.in6_u, 4 * sizeof(u32));
  6263              break;
  6264  
  6265          default:
  6266              return 1; // verifier needs
  6267      }
  6268  
  6269      // Update the network flow map indexer with the packet headers.
  6270      neteventctx->md.flow.proto = next_proto;
  6271  
  6272      if (!dest)
  6273          return 1; // satisfy verifier for clang-12 generated binaries
  6274  
  6275      // fastpath: submit the IP base event
  6276  
  6277      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_IP))
  6278          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_IP, HEADERS);
  6279  
  6280      // fastpath: capture all packets if filtered pcap-option is not set
  6281  
  6282      u32 zero = 0;
  6283      netconfig_entry_t *nc = bpf_map_lookup_elem(&netconfig_map, &zero);
  6284      if (nc == NULL)
  6285          return 0;
  6286  
  6287      if (!(nc->capture_options & NET_CAP_OPT_FILTERED))
  6288          cgroup_skb_capture(); // will avoid extra lookups further if not needed
  6289  
  6290      // Update the network event context with payload size.
  6291      neteventctx->md.header_size += size;
  6292  
  6293      // Load the next protocol header.
  6294      if (size) {
  6295          if (bpf_skb_load_bytes_relative(ctx, prev_hdr_size, dest, size, BPF_HDR_START_NET))
  6296              return 1;
  6297      }
  6298  
  6299      // Call the next protocol handler.
  6300      switch (next_proto) {
  6301          case IPPROTO_TCP:
  6302              return CGROUP_SKB_HANDLE(proto_tcp);
  6303          case IPPROTO_UDP:
  6304              return CGROUP_SKB_HANDLE(proto_udp);
  6305          case IPPROTO_ICMP:
  6306              return CGROUP_SKB_HANDLE(proto_icmp);
  6307          case IPPROTO_ICMPV6:
  6308              return CGROUP_SKB_HANDLE(proto_icmpv6);
  6309          default:
  6310              return 1; // verifier needs
  6311      }
  6312  
        // NOTE: the code below is currently unreachable: every case in the
        //       switch above returns.
  6313      // TODO: If the cmdline is tracing net_packet_ipv6 only, then ipv4 packets
  6314      //       shouldn't be added to the pcap file. Filters will have to be
  6315      //       applied to the capture pipeline to obey derived-events-only
  6316      //       filters + capture.
  6317  
  6318      // capture IPv4/IPv6 packets (filtered)
  6319      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP))
  6320          cgroup_skb_capture();
  6321  
  6322      return 1;
  6323  }
  6324  
  6325  //
  6326  // GUESS L7 NETWORK PROTOCOLS (http, dns, etc)
  6327  //
  6328  
  6329  // when guessing by src/dst ports, declare at network.h
  6330  
  6331  // when guessing through l7 layer, here
  6332  
  6333  statfunc int net_l7_is_http(struct __sk_buff *skb, u32 l7_off)
  6334  {
  6335      char http_min_str[http_min_len];
  6336      __builtin_memset((void *) &http_min_str, 0, sizeof(char) * http_min_len);
  6337  
  6338      // load first http_min_len bytes from layer 7 in packet.
  6339      if (bpf_skb_load_bytes(skb, l7_off, http_min_str, http_min_len) < 0) {
  6340          return 0; // failed loading data into http_min_str - return.
  6341      }
  6342  
  6343      // check if HTTP response
  6344      if (has_prefix("HTTP/", http_min_str, 6)) {
  6345          return proto_http_resp;
  6346      }
  6347  
  6348      // check if HTTP request
  6349      if (has_prefix("GET ", http_min_str, 5)    ||
  6350          has_prefix("POST ", http_min_str, 6)   ||
  6351          has_prefix("PUT ", http_min_str, 5)    ||
  6352          has_prefix("DELETE ", http_min_str, 8) ||
  6353          has_prefix("HEAD ", http_min_str, 6)) {
  6354          return proto_http_req;
  6355      }
  6356  
  6357      return 0;
  6358  }
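        // Illustrative example: a payload starting with "GET /index.html
        // HTTP/1.1\r\n" matches the "GET " prefix and yields proto_http_req,
        // while one starting with "HTTP/1.1 200 OK\r\n" yields proto_http_resp.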
  6359  
  6360  // clang-format on
  6361  
  6362  #define SOCKS5_VERSION(buf)     buf[0]
  6363  #define SOCKS5_NUM_METHODS(buf) buf[1]
  6364  #define SOCKS5_CMD(buf)         buf[1]
  6365  #define SOCKS5_RESERVED(buf)    buf[2]
  6366  #define SOCKS5_ADDR_TYPE(buf)   buf[3]
  6367  
  6368  // see https://datatracker.ietf.org/doc/html/rfc1928 for the definition of the socks5 protocol
  6369  statfunc bool net_l7_is_socks5(struct __sk_buff *skb, u32 l7_off)
  6370  {
  6371      // We treat all messages to or from the default SOCKS5 port as potential
  6372      // SOCKS5 messages and try to parse them in userspace.
  6373      if (skb->remote_port == TCP_PORT_SOCKS5) {
  6374          return true;
  6375      }
  6376  
  6377      if (skb->local_port == TCP_PORT_SOCKS5) {
  6378          return true;
  6379      }
  6380  
  6381      char buf[socks5_min_len];
  6382      __builtin_memset(&buf, 0, sizeof(buf));
  6383  
  6384      if (skb->len < l7_off) {
  6385          return false;
  6386      }
  6387  
  6388      u32 payload_len = skb->len - l7_off;
  6389      u32 read_len = payload_len;
  6390      // inline bounds check to force the compiler to use the register holding size;
  6391      // the asm modifies read_len, hence the read-write ("+r") output constraint
  6392      asm volatile("if %[size] < %[max_size] goto +1;\n"
  6393                   "%[size] = %[max_size];\n"
  6394                   : [size] "+r"(read_len) : [max_size] "i"(socks5_min_len));
  6395  
  6396      // make the verifier happy by ensuring we read more than a single byte;
  6397      // the check is against 2, since we expect at least 2 bytes for socks5 anyway
  6398      asm goto("if %[size] < 2 goto %l[out]" ::[size] "r"(read_len)::out);
  6399  
  6400      if (read_len < 2) {
  6401          return false;
  6402      }
  6403  
  6404      // load the first socks5_min_len bytes from layer 7 of the packet.
  6405      if (bpf_skb_load_bytes(skb, l7_off, buf, read_len) < 0) {
  6406          return false; // failed loading data into buf - return.
  6407      }
  6408  
  6409      if (SOCKS5_VERSION(buf) != 5) {
  6410          return false; // all socks5 messages begin with the version (which is 5 for socks5)
  6411      }
  6412  
  6413      // this might be a bit of a leap of faith here, since the first server response only selects the
  6414      // method used for auth. This requires more massaging in userspace.
  6415      if (payload_len == 2) {
  6416          return true;
  6417      }
  6418  
  6419      // The client starts by sending a message carrying the number of auth methods
  6420      // in the second byte. Each of these methods is then listed in the following
  6421      // bytes, meaning that if our message length equals that number of methods + 2
  6422      // (the two header bytes), we should have ourselves a client request.
  6423      if (payload_len == (u32) SOCKS5_NUM_METHODS(buf) + 2) {
  6424          return true;
  6425      }
  6426  
  6427      // we now access fields beyond the first two bytes
  6428      if (read_len < socks5_min_len) {
  6429          return false;
  6430      }
  6431  
  6432      // both request and response have the 3rd byte reserved and it needs to be set to 0x00
  6433      if (SOCKS5_RESERVED(buf) != 0x00) {
  6434          return false;
  6435      }
  6436  
  6437      if (SOCKS5_ADDR_TYPE(buf) == 0x01       // IPv4 address
  6438          || SOCKS5_ADDR_TYPE(buf) == 0x03    // domain name
  6439          || SOCKS5_ADDR_TYPE(buf) == 0x04) { // IPv6 address
  6440          return true;
  6441      }
  6442  
  6443  out:
  6444      return false;
  6445  }
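        // Worked example (illustrative bytes, per RFC 1928): a client greeting
        // of 0x05 0x01 0x00 (version 5, one auth method, method 0x00) gives
        // payload_len == 3 == SOCKS5_NUM_METHODS(buf) + 2, so the function
        // returns true; a two-byte server method selection 0x05 0x00 is
        // accepted by the payload_len == 2 case instead.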
  6446  // clang-format off
  6447  
  6448  //
  6449  // SUPPORTED L4 NETWORK PROTOCOL (tcp, udp, icmp) HANDLERS
  6450  //
  6451  
  6452  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp)
  6453  {
  6454      // check flag for dynamic header size (TCP: data offset flag)
  6455  
  6456      if (nethdrs->protohdrs.tcphdr.doff > 5) { // offset flag set
  6457          u32 doff = nethdrs->protohdrs.tcphdr.doff * (32 / 8);
  6458          neteventctx->md.header_size -= get_type_size(struct tcphdr);
  6459          neteventctx->md.header_size += doff;
  6460      }
  6461  
  6462      // Pick src/dst ports.
  6463      u16 srcport = bpf_ntohs(nethdrs->protohdrs.tcphdr.source);
  6464      u16 dstport = bpf_ntohs(nethdrs->protohdrs.tcphdr.dest);
  6465  
  6466      // Update the network flow map indexer with the packet headers.
  6467      neteventctx->md.flow.tuple.sport = srcport;
  6468      neteventctx->md.flow.tuple.dport = dstport;
  6469  
  6470      if (should_submit_flow_event(neteventctx)) {
  6471          // Check if TCP flow needs to be submitted (only headers).
  6472          bool is_rst = nethdrs->protohdrs.tcphdr.rst;
  6473          bool is_syn = nethdrs->protohdrs.tcphdr.syn;
  6474          bool is_ack = nethdrs->protohdrs.tcphdr.ack;
  6475          bool is_fin = nethdrs->protohdrs.tcphdr.fin;
  6476  
  6477          // Has the TCP flow started?
  6478          if (is_syn && is_ack)
  6479              cgroup_skb_handle_flow(ctx, neteventctx, NET_FLOW_BASE, HEADERS, flow_tcp_begin);
  6480  
  6481          if (!is_syn && !is_fin && !is_rst) {
  6482              cgroup_skb_handle_flow(ctx, neteventctx, NET_FLOW_BASE, HEADERS, flow_tcp_sample);
  6483          }
  6484  
  6485          // Has the TCP flow ended?
  6486          if (is_fin || is_rst)
  6487              cgroup_skb_handle_flow(ctx, neteventctx, NET_FLOW_BASE, HEADERS, flow_tcp_end);
  6488      }
  6489  
  6490      // Submit TCP base event if needed (only headers)
  6491  
  6492      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_TCP))
  6493          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_TCP, HEADERS);
  6494  
  6495      // Fastpath: return if no other L7 network events.
  6496  
  6497      if (!should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS) &&
  6498          !should_submit_net_event(neteventctx, SUB_NET_PACKET_HTTP) &&
  6499          !should_submit_net_event(neteventctx, SUB_NET_PACKET_SOCKS5))
  6500          goto capture;
  6501  
  6502      // Guess layer 7 protocols by src/dst ports ...
  6503  
  6504      switch (srcport < dstport ? srcport : dstport) {
  6505          case TCP_PORT_DNS:
  6506              return CGROUP_SKB_HANDLE(proto_tcp_dns);
  6507          case TCP_PORT_SOCKS5:
  6508              return CGROUP_SKB_HANDLE(proto_tcp_socks5);
  6509      }
  6510  
  6511      // ... and by analyzing payload.
  6512  
  6513      int http_proto = net_l7_is_http(ctx, neteventctx->md.header_size);
  6514      if (http_proto) {
  6515          neteventctx->eventctx.retval |= http_proto;
  6516          return CGROUP_SKB_HANDLE(proto_tcp_http);
  6517      }
  6518  
  6519      int socks5_proto = net_l7_is_socks5(ctx, neteventctx->md.header_size);
  6520      if (socks5_proto) {
  6521          return CGROUP_SKB_HANDLE(proto_tcp_socks5);
  6522      }
  6523  
  6524      // ... continue with net_l7_is_protocol_xxx
  6525  
  6526  capture:
  6527      // Capture IP or TCP packets (filtered)
  6528      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6529          should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP)) {
  6530          cgroup_skb_capture();
  6531      }
  6532  
  6533      return 1; // NOTE: might block TCP here if needed (return 0)
  6534  }
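        // Note on the min-port heuristic above (illustrative numbers): for a
        // DNS-over-TCP query from an ephemeral port, e.g. 43210 -> 53,
        // min(43210, 53) == 53 selects proto_tcp_dns regardless of which side
        // sent the packet; proto_udp below uses the same trick.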
  6535  
  6536  CGROUP_SKB_HANDLE_FUNCTION(proto_udp)
  6537  {
  6538      // Submit UDP base event if needed (only headers).
  6539  
  6540      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_UDP))
  6541          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_UDP, HEADERS);
  6542  
  6543      // Fastpath: return if no other L7 network events.
  6544  
  6545      if (!should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS) &&
  6546          !should_submit_net_event(neteventctx, SUB_NET_PACKET_HTTP))
  6547          goto capture;
  6548  
  6549      // Guess layer 7 protocols ...
  6550  
  6551      u16 source = bpf_ntohs(nethdrs->protohdrs.udphdr.source);
  6552      u16 dest = bpf_ntohs(nethdrs->protohdrs.udphdr.dest);
  6553  
  6554      // ... by src/dst ports
  6555  
  6556      switch (source < dest ? source : dest) {
  6557          case UDP_PORT_DNS:
  6558              return CGROUP_SKB_HANDLE(proto_udp_dns);
  6559      }
  6560  
  6561      // ... by analyzing payload
  6562      // ...
  6563  
  6564      // ... continue with net_l7_is_protocol_xxx
  6565  
  6566  capture:
  6567      // Capture IP or UDP packets (filtered).
  6568      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6569          should_capture_net_event(neteventctx, SUB_NET_PACKET_UDP)) {
  6570          cgroup_skb_capture();
  6571      }
  6572  
  6573      return 1; // NOTE: might block UDP here if needed (return 0)
  6574  }
  6575  
  6576  CGROUP_SKB_HANDLE_FUNCTION(proto_icmp)
  6577  {
  6578      // submit ICMP base event if needed (full packet)
  6579      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_ICMP))
  6580          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_ICMP, FULL);
  6581  
  6582      // capture ip or icmp packets (filtered)
  6583      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6584          should_capture_net_event(neteventctx, SUB_NET_PACKET_ICMP)) {
  6585          neteventctx->md.header_size = ctx->len; // full ICMP header
  6586          cgroup_skb_capture();
  6587      }
  6588  
  6589      return 1; // NOTE: might block ICMP here if needed (return 0)
  6590  }
  6591  
  6592  CGROUP_SKB_HANDLE_FUNCTION(proto_icmpv6)
  6593  {
  6594      // submit ICMPv6 base event if needed (full packet)
  6595      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_ICMPV6))
  6596          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_ICMPV6, FULL);
  6597  
  6598      // capture ip or icmpv6 packets (filtered)
  6599      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6600          should_capture_net_event(neteventctx, SUB_NET_PACKET_ICMPV6)) {
  6601          neteventctx->md.header_size = ctx->len; // full ICMPv6 header
  6602          cgroup_skb_capture();
  6603      }
  6604  
  6605      return 1; // NOTE: might block ICMPv6 here if needed (return 0)
  6606  }
  6607  
  6608  //
  6609  // SUPPORTED L7 NETWORK PROTOCOL (dns) HANDLERS
  6610  //
  6611  
  6612  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_dns)
  6613  {
  6614      // submit DNS base event if needed (full packet)
  6615      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS))
  6616          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_DNS, FULL);
  6617  
  6618      // capture DNS-TCP, TCP or IP packets (filtered)
  6619      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6620          should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP) ||
  6621          should_capture_net_event(neteventctx, SUB_NET_PACKET_DNS)) {
  6622          neteventctx->md.header_size = ctx->len; // full dns header
  6623          cgroup_skb_capture();
  6624      }
  6625  
  6626      return 1; // NOTE: might block DNS here if needed (return 0)
  6627  }
  6628  
  6629  CGROUP_SKB_HANDLE_FUNCTION(proto_udp_dns)
  6630  {
  6631      // submit DNS base event if needed (full packet)
  6632      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_DNS))
  6633          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_DNS, FULL);
  6634  
  6635      // capture DNS-UDP, UDP or IP packets (filtered)
  6636      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6637          should_capture_net_event(neteventctx, SUB_NET_PACKET_UDP) ||
  6638          should_capture_net_event(neteventctx, SUB_NET_PACKET_DNS)) {
  6639          neteventctx->md.header_size = ctx->len; // full dns header
  6640          cgroup_skb_capture();
  6641      }
  6642  
  6643      return 1; // NOTE: might block DNS here if needed (return 0)
  6644  }
  6645  
  6646  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_http)
  6647  {
  6648      // submit HTTP base event if needed (full packet)
  6649      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_HTTP))
  6650          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_HTTP, FULL);
  6651  
  6652      // capture HTTP-TCP, TCP or IP packets (filtered)
  6653      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6654          should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP) ||
  6655          should_capture_net_event(neteventctx, SUB_NET_PACKET_HTTP)) {
  6656          cgroup_skb_capture(); // http header size is dynamic, do not change header_size
  6657      }
  6658  
  6659      return 1; // NOTE: might block HTTP here if needed (return 0)
  6660  }
  6661  
  6662  CGROUP_SKB_HANDLE_FUNCTION(proto_tcp_socks5)
  6663  {
  6664      u32 payload_len = ctx->len - neteventctx->md.header_size;
  6665  
  6666      // submit SOCKS5 base event if needed (full packet)
  6667      // we only care about packets that have a payload though
  6668      if (should_submit_net_event(neteventctx, SUB_NET_PACKET_SOCKS5) && payload_len > 0) {
  6669          cgroup_skb_submit_event(ctx, neteventctx, NET_PACKET_SOCKS5, FULL);
  6670      }
  6671  
  6672      // capture SOCKS5-TCP, TCP or IP packets (filtered)
  6673      if (should_capture_net_event(neteventctx, SUB_NET_PACKET_IP) ||
  6674          should_capture_net_event(neteventctx, SUB_NET_PACKET_TCP) ||
  6675          should_capture_net_event(neteventctx, SUB_NET_PACKET_SOCKS5)) {
  6676          neteventctx->md.header_size = ctx->len; // full socks5 packet
  6677          cgroup_skb_capture();
  6678      }
  6679  
  6680      return 1; // NOTE: might block SOCKS5 here if needed (return 0)
  6681  }
  6682  
  6683  // clang-format on
  6684  
  6685  //
  6686  // Control Plane Programs
  6687  //
  6688  // Control Plane programs are near-duplicates of select event programs; their events are sent
  6689  // as direct signals to tracee in a separate buffer. This is done to mitigate the consequences
  6690  // of losing these events in the main perf buffer.
  6691  //
  6692  
  6693  // Containers Lifecycle
  6694  
  6695  SEC("raw_tracepoint/cgroup_mkdir_signal")
  6696  int cgroup_mkdir_signal(struct bpf_raw_tracepoint_args *ctx)
  6697  {
  6698      u32 zero = 0;
  6699      config_entry_t *cfg = bpf_map_lookup_elem(&config_map, &zero);
  6700      if (unlikely(cfg == NULL))
  6701          return 0;
  6702      controlplane_signal_t *signal = init_controlplane_signal();
  6703      if (unlikely(signal == NULL))
  6704          return 0;
  6705  
  6706      struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0];
  6707      char *path = (char *) ctx->args[1];
  6708  
  6709      u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp);
  6710      u64 cgroup_id = get_cgroup_id(dst_cgrp);
  6711      u32 cgroup_id_lsb = cgroup_id;
  6712  
  6713      bool should_update = true;
  6714      if ((cfg->options & OPT_CGROUP_V1) && (cfg->cgroup_v1_hid != hierarchy_id))
  6715          should_update = false;
  6716  
  6717      if (should_update) {
  6718          // Assume this is a new container. If not, userspace code will delete this entry
  6719          u8 state = CONTAINER_CREATED;
  6720          bpf_map_update_elem(&containers_map, &cgroup_id_lsb, &state, BPF_ANY);
  6721      }
  6722  
  6723      save_to_submit_buf(&signal->args_buf, &cgroup_id, sizeof(u64), 0);
  6724      save_str_to_buf(&signal->args_buf, path, 1);
  6725      save_to_submit_buf(&signal->args_buf, &hierarchy_id, sizeof(u32), 2);
  6726      signal_perf_submit(ctx, signal, SIGNAL_CGROUP_MKDIR);
  6727  
  6728      return 0;
  6729  }
  6730  
  6731  SEC("raw_tracepoint/cgroup_rmdir_signal")
  6732  int cgroup_rmdir_signal(struct bpf_raw_tracepoint_args *ctx)
  6733  {
  6734      u32 zero = 0;
  6735      config_entry_t *cfg = bpf_map_lookup_elem(&config_map, &zero);
  6736      if (unlikely(cfg == NULL))
  6737          return 0;
  6738      controlplane_signal_t *signal = init_controlplane_signal();
  6739      if (unlikely(signal == NULL))
  6740          return 0;
  6741  
  6742      struct cgroup *dst_cgrp = (struct cgroup *) ctx->args[0];
  6743      char *path = (char *) ctx->args[1];
  6744  
  6745      u32 hierarchy_id = get_cgroup_hierarchy_id(dst_cgrp);
  6746      u64 cgroup_id = get_cgroup_id(dst_cgrp);
  6747      u32 cgroup_id_lsb = cgroup_id; // containers_map is keyed by the lower 32 bits of the cgroup id
  6748  
  6749      bool should_update = true;
  6750      if ((cfg->options & OPT_CGROUP_V1) && (cfg->cgroup_v1_hid != hierarchy_id))
  6751          should_update = false;
  6752  
  6753      if (should_update)
  6754          bpf_map_delete_elem(&containers_map, &cgroup_id_lsb);
  6755  
  6756      save_to_submit_buf(&signal->args_buf, &cgroup_id, sizeof(u64), 0);
  6757      save_str_to_buf(&signal->args_buf, path, 1);
  6758      save_to_submit_buf(&signal->args_buf, &hierarchy_id, sizeof(u32), 2);
  6759      signal_perf_submit(ctx, signal, SIGNAL_CGROUP_RMDIR);
  6760  
  6761      return 0;
  6762  }
  6763  
  6764  // Processes Lifecycle
  6765  
  6766  // NOTE: sched_process_fork is called by kernel_clone(), which is executed during
  6767  //       clone() calls as well, not only fork(). This means that sched_process_fork()
  6768  //       also picks up the creation of LWPs (threads) through clone(), as illustrated below.
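        //
        //       For illustration (userspace): both of the following end up in kernel_clone()
        //       and therefore fire this tracepoint:
        //
        //           pid_t child = fork();                      // new process
        //
        //           pthread_t t;
        //           pthread_create(&t, NULL, worker, NULL);    // new LWP (worker is a hypothetical fn)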
  6769  
  6770  SEC("raw_tracepoint/sched_process_fork")
  6771  int sched_process_fork_signal(struct bpf_raw_tracepoint_args *ctx)
  6772  {
  6773      controlplane_signal_t *signal = init_controlplane_signal();
  6774      if (unlikely(signal == NULL))
  6775          return 0;
  6776  
  6777      struct task_struct *parent = (struct task_struct *) ctx->args[0];
  6778      struct task_struct *child = (struct task_struct *) ctx->args[1];
  6779      struct task_struct *leader = get_leader_task(child);
  6780      struct task_struct *up_parent = get_leader_task(get_parent_task(leader));
  6781  
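            // Worked example (hypothetical pids, for illustration): process A (pid 100) has a
            // thread T (pid 101, tgid 100); T calls clone() to create thread T2 (pid 102,
            // tgid 100). For the resulting event:
            //
            //   parent    = T  (the task that called clone())
            //   child     = T2
            //   leader    = A  (thread-group leader of the child)
            //   up_parent = thread-group leader of A's parent process
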
  6782      // In the Linux kernel:
  6783      //
  6784      // Every task (a process or a thread) is represented by a `task_struct`:
  6785      //
  6786      // - `pid`: Inside the `task_struct`, there's a field called `pid`. This is a unique identifier
  6787      //   for every task, which can be thought of as the thread ID (TID) from a user space
  6788      //   perspective. Every task, whether it's the main thread of a process or an additional thread,
  6789      //   has a unique `pid`.
  6790      //
  6791      // - `tgid` (Thread Group ID): This field in the `task_struct` is used to group threads from the
  6792      //   same process. For the main thread of a process, the `tgid` is the same as its `pid`. For
  6793      //   other threads created by that process, the `tgid` matches the `pid` of the main thread.
  6794      //
  6795      // In userspace:
  6796      //
  6797      // - `getpid()` returns the TGID, effectively the traditional process ID.
  6798      // - `gettid()` returns the PID (from the `task_struct`), effectively the thread ID.
  6799      //
  6800      // This design in the Linux kernel leads to a unified handling of processes and threads. In the
  6801      // kernel's view, every thread is a task with potentially shared resources, but each has a
  6802      // unique PID. In user space, the distinction is made where processes have a unique PID, and
  6803      // threads within those processes have unique TIDs.
  6804  
  6805      // Summary:
  6806      // userland pid = kernel tgid
  6807      // userland tgid = kernel pid
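            //
            // A minimal userspace sketch of that mapping (illustration only, not part of this
            // program; gettid() needs _GNU_SOURCE and glibc >= 2.30):
            //
            //     #define _GNU_SOURCE
            //     #include <stdio.h>
            //     #include <unistd.h>
            //
            //     int main(void)
            //     {
            //         printf("getpid() = %d  (kernel tgid)\n", getpid());
            //         printf("gettid() = %d  (kernel pid)\n", gettid());
            //         return 0; // equal in the main thread, different in any other thread
            //     }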
  6808  
  6809      // The event timestamp, so process tree info can be changelog'ed.
  6810      u64 timestamp = bpf_ktime_get_ns();
  6811      save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0);
  6812  
  6813      // Parent information.
  6814      u64 parent_start_time = get_task_start_time(parent);
  6815      int parent_pid = get_task_host_tgid(parent);
  6816      int parent_tid = get_task_host_pid(parent);
  6817      int parent_ns_pid = get_task_ns_tgid(parent);
  6818      int parent_ns_tid = get_task_ns_pid(parent);
  6819  
  6820      // Child information.
  6821      u64 child_start_time = get_task_start_time(child);
  6822      int child_pid = get_task_host_tgid(child);
  6823      int child_tid = get_task_host_pid(child);
  6824      int child_ns_pid = get_task_ns_tgid(child);
  6825      int child_ns_tid = get_task_ns_pid(child);
  6826  
  6827      // Up Parent information: walk up the hierarchy until the parent is a real process.
  6828      u64 up_parent_start_time = get_task_start_time(up_parent);
  6829      int up_parent_pid = get_task_host_tgid(up_parent);
  6830      int up_parent_tid = get_task_host_pid(up_parent);
  6831      int up_parent_ns_pid = get_task_ns_tgid(up_parent);
  6832      int up_parent_ns_tid = get_task_ns_pid(up_parent);
  6833  
  6834      // Leader information.
  6835      u64 leader_start_time = get_task_start_time(leader);
  6836      int leader_pid = get_task_host_tgid(leader);
  6837      int leader_tid = get_task_host_pid(leader);
  6838      int leader_ns_pid = get_task_ns_tgid(leader);
  6839      int leader_ns_tid = get_task_ns_pid(leader);
  6840  
  6841      // Parent (might be a thread or a process).
  6842      save_to_submit_buf(&signal->args_buf, (void *) &parent_tid, sizeof(int), 1);
  6843      save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_tid, sizeof(int), 2);
  6844      save_to_submit_buf(&signal->args_buf, (void *) &parent_pid, sizeof(int), 3);
  6845      save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_pid, sizeof(int), 4);
  6846      save_to_submit_buf(&signal->args_buf, (void *) &parent_start_time, sizeof(u64), 5);
  6847  
  6848      // Child (might be a thread or a process; the sched_process_fork tracepoint is also hit by clone()).
  6849      save_to_submit_buf(&signal->args_buf, (void *) &child_tid, sizeof(int), 6);
  6850      save_to_submit_buf(&signal->args_buf, (void *) &child_ns_tid, sizeof(int), 7);
  6851      save_to_submit_buf(&signal->args_buf, (void *) &child_pid, sizeof(int), 8);
  6852      save_to_submit_buf(&signal->args_buf, (void *) &child_ns_pid, sizeof(int), 9);
  6853      save_to_submit_buf(&signal->args_buf, (void *) &child_start_time, sizeof(u64), 10);
  6854  
  6855      // Up Parent: always a real process (might be the same as Parent if it is a real process).
  6856      save_to_submit_buf(&signal->args_buf, (void *) &up_parent_tid, sizeof(int), 11);
  6857      save_to_submit_buf(&signal->args_buf, (void *) &up_parent_ns_tid, sizeof(int), 12);
  6858      save_to_submit_buf(&signal->args_buf, (void *) &up_parent_pid, sizeof(int), 13);
  6859      save_to_submit_buf(&signal->args_buf, (void *) &up_parent_ns_pid, sizeof(int), 14);
  6860      save_to_submit_buf(&signal->args_buf, (void *) &up_parent_start_time, sizeof(u64), 15);
  6861  
  6862      // Leader: always a real process (might be the same as the Child if child is a real process).
  6863      save_to_submit_buf(&signal->args_buf, (void *) &leader_tid, sizeof(int), 16);
  6864      save_to_submit_buf(&signal->args_buf, (void *) &leader_ns_tid, sizeof(int), 17);
  6865      save_to_submit_buf(&signal->args_buf, (void *) &leader_pid, sizeof(int), 18);
  6866      save_to_submit_buf(&signal->args_buf, (void *) &leader_ns_pid, sizeof(int), 19);
  6867      save_to_submit_buf(&signal->args_buf, (void *) &leader_start_time, sizeof(u64), 20);
  6868  
  6869      signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_FORK);
  6870  
  6871      return 0;
  6872  }
  6873  
  6874  // clang-format off
  6875  
  6876  SEC("raw_tracepoint/sched_process_exec")
  6877  int sched_process_exec_signal(struct bpf_raw_tracepoint_args *ctx)
  6878  {
  6879      controlplane_signal_t *signal = init_controlplane_signal();
  6880      if (unlikely(signal == NULL))
  6881          return 0;
  6882  
  6883      // Hashes
  6884  
  6885      struct task_struct *task = (struct task_struct *) ctx->args[0];
  6886      if (task == NULL)
  6887          return -1;
  6888      struct task_struct *leader = get_leader_task(task);
  6889      struct task_struct *parent = get_leader_task(get_parent_task(leader));
  6890  
  6891      // The hash is always calculated with "task_struct->pid + start_time": pids are
            // recycled by the kernel, so the start time disambiguates a reused pid.
  6892      u32 task_hash = hash_task_id(get_task_host_pid(task), get_task_start_time(task));
  6893      u32 parent_hash = hash_task_id(get_task_host_pid(parent), get_task_start_time(parent));
  6894      u32 leader_hash = hash_task_id(get_task_host_pid(leader), get_task_start_time(leader));
  6895  
  6896      // The event timestamp, so process tree info can be changelog'ed.
  6897      u64 timestamp = bpf_ktime_get_ns();
  6898      save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0);
  6899  
  6900      save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 1);
  6901      save_to_submit_buf(&signal->args_buf, (void *) &parent_hash, sizeof(u32), 2);
  6902      save_to_submit_buf(&signal->args_buf, (void *) &leader_hash, sizeof(u32), 3);
  6903  
  6904      // Exec logic
  6905  
  6906      struct linux_binprm *bprm = (struct linux_binprm *) ctx->args[2];
  6907      if (bprm == NULL)
  6908          return -1;
  6909  
  6910      // Pick the interpreter path from the proc_info map, which is set by the "load_elf_phdrs" kprobe program.
  6911      u32 host_pid = get_task_host_tgid(task);
  6912      proc_info_t *proc_info = bpf_map_lookup_elem(&proc_info_map, &host_pid);
  6913      if (proc_info == NULL) {
  6914          proc_info = init_proc_info(host_pid, 0);
  6915          if (unlikely(proc_info == NULL)) {
  6916              tracee_log(ctx, BPF_LOG_LVL_WARN, BPF_LOG_ID_MAP_LOOKUP_ELEM, 0);
  6917              return 0;
  6918          }
  6919      }
  6920  
  6921      struct file *file = get_file_ptr_from_bprm(bprm);
  6922      void *file_path = get_path_str(__builtin_preserve_access_index(&file->f_path));
  6923      const char *filename = get_binprm_filename(bprm);
  6924      dev_t s_dev = get_dev_from_file(file);
  6925      unsigned long inode_nr = get_inode_nr_from_file(file);
  6926      u64 ctime = get_ctime_nanosec_from_file(file);
  6927      umode_t inode_mode = get_inode_mode_from_file(file);
  6928  
  6929      save_str_to_buf(&signal->args_buf, (void *) filename, 4);                   // executable name
  6930      save_str_to_buf(&signal->args_buf, file_path, 5);                           // executable path
  6931      save_to_submit_buf(&signal->args_buf, &s_dev, sizeof(dev_t), 6);            // device number
  6932      save_to_submit_buf(&signal->args_buf, &inode_nr, sizeof(unsigned long), 7); // inode number
  6933      save_to_submit_buf(&signal->args_buf, &ctime, sizeof(u64), 8);              // creation time
  6934      save_to_submit_buf(&signal->args_buf, &inode_mode, sizeof(umode_t), 9);     // inode mode
  6935  
  6936      // The proc_info interpreter field is set by "load_elf_phdrs" kprobe program.
  6937      save_str_to_buf(&signal->args_buf, &proc_info->interpreter.pathname, 10);                    // interpreter path
  6938      save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 11); // interpreter device number
  6939      save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.inode, sizeof(u64), 12);    // interpreter inode number
  6940      save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.ctime, sizeof(u64), 13);    // interpreter creation time
  6941  
  6942      struct mm_struct *mm = get_mm_from_task(task); // bprm->mm is null here, but task->mm is not
  6943  
  6944      unsigned long arg_start, arg_end;
  6945      arg_start = get_arg_start_from_mm(mm);
  6946      arg_end = get_arg_end_from_mm(mm);
  6947      int argc = get_argc_from_bprm(bprm);
  6948  
  6949      struct file *stdin_file = get_struct_file_from_fd(0);
  6950      unsigned short stdin_type = get_inode_mode_from_file(stdin_file) & S_IFMT;
  6951      void *stdin_path = get_path_str(__builtin_preserve_access_index(&stdin_file->f_path));
  6952      const char *interp = get_binprm_interp(bprm);
  6953  
  6954      int invoked_from_kernel = 0;
  6955      if (get_task_parent_flags(task) & PF_KTHREAD)
  6956          invoked_from_kernel = 1;
  6957  
  6958      save_args_str_arr_to_buf(&signal->args_buf, (void *) arg_start, (void *) arg_end, argc, 14); // argv
  6959      save_str_to_buf(&signal->args_buf, (void *) interp, 15);                                     // interp
  6960      save_to_submit_buf(&signal->args_buf, &stdin_type, sizeof(unsigned short), 16);              // stdin type
  6961      save_str_to_buf(&signal->args_buf, stdin_path, 17);                                          // stdin path
  6962      save_to_submit_buf(&signal->args_buf, &invoked_from_kernel, sizeof(int), 18);                // invoked from kernel ?
  6963  
  6964      signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_EXEC);
  6965  
  6966      return 0;
  6967  }
  6968  
  6969  // clang-format on
  6970  
  6971  SEC("raw_tracepoint/sched_process_exit")
  6972  int sched_process_exit_signal(struct bpf_raw_tracepoint_args *ctx)
  6973  {
  6974      controlplane_signal_t *signal = init_controlplane_signal();
  6975      if (unlikely(signal == NULL))
  6976          return 0;
  6977  
  6978      // Hashes
  6979  
  6980      struct task_struct *task = (struct task_struct *) bpf_get_current_task();
  6981      if (task == NULL)
  6982          return -1;
  6983      struct task_struct *leader = get_leader_task(task);
  6984      struct task_struct *parent = get_leader_task(get_parent_task(leader));
  6985  
  6986      // The hash is always calculated with "task_struct->pid + start_time".
  6987      u32 task_hash = hash_task_id(get_task_host_pid(task), get_task_start_time(task));
  6988      u32 parent_hash = hash_task_id(get_task_host_pid(parent), get_task_start_time(parent));
  6989      u32 leader_hash = hash_task_id(get_task_host_pid(leader), get_task_start_time(leader));
  6990  
  6991      // The event timestamp, so process tree info can be changelog'ed.
  6992      u64 timestamp = bpf_ktime_get_ns();
  6993      save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0);
  6994  
  6995      save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 1);
  6996      save_to_submit_buf(&signal->args_buf, (void *) &parent_hash, sizeof(u32), 2);
  6997      save_to_submit_buf(&signal->args_buf, (void *) &leader_hash, sizeof(u32), 3);
  6998  
  6999      // Exit logic.
  7000  
            // signal->live counts the live threads in the thread group; do_exit() decrements
            // it before this tracepoint fires, so a count of zero means this exit takes down
            // the whole group.
  7001      bool group_dead = false;
  7002      struct signal_struct *s = BPF_CORE_READ(task, signal);
  7003      atomic_t live = BPF_CORE_READ(s, live);
  7004  
  7005      if (live.counter == 0)
  7006          group_dead = true;
  7007  
  7008      long exit_code = get_task_exit_code(task);
  7009  
  7010      save_to_submit_buf(&signal->args_buf, (void *) &exit_code, sizeof(long), 4);
  7011      save_to_submit_buf(&signal->args_buf, (void *) &group_dead, sizeof(bool), 5);
  7012  
  7013      signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_EXIT);
  7014  
  7015      return 0;
  7016  }
  7017  
  7018  // END OF Control Plane Programs
  7019  
  7020  // TODO: Instead of returning sock state, emit tcp_connect, tcp_listen and tcp_connect_error events.
  7021  // That would allow subscribing only to the wanted events and make handling easier.
  7022  statfunc bool should_trace_sock_set_state(int old_state, int new_state)
  7023  {
            // TCP_CLOSE -> TCP_LISTEN: a socket starts listening.
  7024      if (old_state == TCP_CLOSE && new_state == TCP_LISTEN) {
  7025          return true;
  7026      }
            // TCP_LISTEN -> TCP_CLOSE: a listening socket is closed.
  7027      if (old_state == TCP_LISTEN && new_state == TCP_CLOSE) {
  7028          return true;
  7029      }
            // TCP_SYN_SENT -> TCP_ESTABLISHED: an outgoing connection succeeded.
  7030      if (old_state == TCP_SYN_SENT && new_state == TCP_ESTABLISHED) {
  7031          return true;
  7032      }
            // TCP_SYN_SENT -> TCP_CLOSE: an outgoing connection failed.
  7033      if (old_state == TCP_SYN_SENT && new_state == TCP_CLOSE) {
  7034          return true;
  7035      }
            // TCP_ESTABLISHED -> TCP_FIN_WAIT1/TCP_CLOSE_WAIT: connection teardown is
            // intentionally not traced; the branch is kept explicit for documentation.
  7036      if (old_state == TCP_ESTABLISHED &&
  7037          (new_state == TCP_FIN_WAIT1 || new_state == TCP_CLOSE_WAIT)) {
  7038          return false;
  7039      }
  7040      return false;
  7041  }
  7042  
  7043  // TP_PROTO(const struct sock *sk, const int oldstate, const int newstate)
  7044  SEC("raw_tracepoint/inet_sock_set_state")
  7045  int trace_inet_sock_set_state(struct bpf_raw_tracepoint_args *ctx)
  7046  {
  7047      struct sock *sk = (struct sock *) ctx->args[0];
  7048      int old_state = ctx->args[1];
  7049      int new_state = ctx->args[2];
  7050  
  7051      if (!should_trace_sock_set_state(old_state, new_state)) {
  7052          return 0;
  7053      }
  7054  
  7055      bool mightbecloned = false; // cloned sock structs come from accept()
  7056      u64 inode = BPF_CORE_READ(sk, sk_socket, file, f_inode, i_ino);
  7057      if (inode == 0)
  7058          mightbecloned = true; // kernel threads might have zero inode
  7059  
  7060      struct net_task_context *netctx;
  7061      if (!mightbecloned) {
  7062          // pick network context from the inodemap (inode <=> task)
  7063          netctx = bpf_map_lookup_elem(&inodemap, &inode);
  7064          if (!netctx)
  7065              mightbecloned = true; // e.g. task isn't being traced
  7066      }
  7067      if (mightbecloned) {
  7068          // pick network context from the sockmap (new sockptr <=> old inode <=> task)
  7069          u64 skptr = (u64) (void *) sk;
  7070          u64 *o = bpf_map_lookup_elem(&sockmap, &skptr);
  7071          if (o == 0)
  7072              return 0;
  7073          u64 oinode = *o;
  7074          // with the old inode, find the netctx for the task
  7075          netctx = bpf_map_lookup_elem(&inodemap, &oinode);
  7076          if (!netctx)
  7077              return 0; // old inode wasn't being traced as well
  7078      }
  7079  
  7080      u32 zero = 0;
  7081      event_data_t *e = bpf_map_lookup_elem(&net_heap_event, &zero);
  7082      if (unlikely(e == NULL))
  7083          return 0;
  7084  
  7085      program_data_t p = {};
  7086      p.scratch_idx = 1;
  7087      p.event = e;
  7088      if (!init_program_data(&p, ctx))
  7089          return 0;
  7090      __builtin_memcpy(&p.event->context.task, &netctx->taskctx, sizeof(task_context_t));
  7091  
  7092      tuple_t tuple = {};
  7093      fill_tuple(sk, &tuple);
  7094  
  7095      save_to_submit_buf(&p.event->args_buf, (void *) &old_state, sizeof(u32), 0);
  7096      save_to_submit_buf(&p.event->args_buf, (void *) &new_state, sizeof(u32), 1);
  7097      save_to_submit_buf(&p.event->args_buf, &tuple, sizeof(tuple), 2);
  7098      events_perf_submit(&p, SOCK_SET_STATE, 0);
  7099  
  7100      return 0;
  7101  }
  7102  // clang-format on
  7103  
  7104  SEC("raw_tracepoint/oom/mark_victim")
  7105  int oom_mark_victim(struct bpf_raw_tracepoint_args *ctx)
  7106  {
  7107      __u32 pid = ctx->args[0];
  7108  
            // Record the victim's pid so that later events for the task (e.g. its exit)
            // can be correlated with the OOM kill.
  7109      bpf_map_update_elem(&oom_info, &pid, &pid, BPF_ANY);
  7110  
  7111      return 0;
  7112  }
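
        // For illustration only (userspace side, libbpf): a hedged sketch of how the oom_info
        // map could be drained, assuming the map fd was obtained elsewhere (e.g. via
        // bpf_object__find_map_fd_by_name()):
        //
        //     #include <bpf/bpf.h>
        //     #include <stdio.h>
        //
        //     static void drain_oom_info(int map_fd)
        //     {
        //         __u32 key, next, val;
        //         __u32 *cur = NULL; // a NULL key starts iteration at the first element
        //
        //         while (bpf_map_get_next_key(map_fd, cur, &next) == 0) {
        //             if (bpf_map_lookup_elem(map_fd, &next, &val) == 0)
        //                 printf("oom-killed pid: %u\n", val);
        //             key = next;
        //             cur = &key;
        //         }
        //     }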
  7113  
  7114  SEC("kprobe/tty_open")
  7115  int BPF_KPROBE(tty_open, struct inode *inode, struct file *filep)
  7116  {
  7117      program_data_t p = {};
  7118      if (!init_program_data(&p, ctx)) {
  7119          return 0;
  7120      }
  7121  
  7122      if (!should_trace(&p)) {
  7123          return 0;
  7124      }
  7125  
  7126      if (!should_submit(TTY_OPEN, p.event)) {
  7127          return 0;
  7128      }
  7129  
  7130      void *file_path = get_path_str(__builtin_preserve_access_index(&filep->f_path));
  7131      unsigned long ino = BPF_CORE_READ(inode, i_ino);
  7132      dev_t dev = BPF_CORE_READ(inode, i_rdev);
  7133      umode_t inode_mode = get_inode_mode_from_file(filep);
  7134  
  7135      save_str_to_buf(&p.event->args_buf, file_path, 0);
  7136      save_to_submit_buf(&p.event->args_buf, &ino, sizeof(ino), 1);
  7137      save_to_submit_buf(&p.event->args_buf, &inode_mode, sizeof(inode_mode), 2);
  7138      save_to_submit_buf(&p.event->args_buf, &dev, sizeof(dev), 3);
  7139  
  7140      return events_perf_submit(&p, TTY_OPEN, 0);
  7141  }