github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/container-hook/bpf/execruntime.bpf.c

// SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR Apache-2.0
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

#include "execruntime.h"

const volatile int max_args = DEFAULT_MAXARGS;

static const struct record empty_record = {};

// configured by userspace
const volatile u64 tracer_group = 0;

// ig_fa_pick_ctx keeps context for kprobe/kretprobe fsnotify_remove_first_event
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, u64); // tgid_pid
	__type(value, u64); // dummy
} ig_fa_pick_ctx SEC(".maps");

// ig_fa_records is consumed by userspace
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 64);
	__type(value, struct record);
} ig_fa_records SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 128);
	__type(key, u32); // pid (not tgid)
	__type(value, struct record);
} exec_args SEC(".maps");

// man clone(2):
//   If any of the threads in a thread group performs an
//   execve(2), then all threads other than the thread group
//   leader are terminated, and the new program is executed in
//   the thread group leader.
//
// sys_enter_execve might be called from a thread, and the corresponding
// sys_exit_execve will be called from the thread group leader if the
// execve succeeds, or from the same thread if it fails. So we need to
// look up the pid from the tgid in sys_exit_execve.
//
// We don't know in advance which execve(2) will succeed, so we need to keep
// track of all tgid<->pid mappings in a BPF map.
//
// We don't want to use bpf_for_each_map_elem() because it requires Linux 5.13.
//
// If several execve(2) are performed in parallel from different threads, only
// one can succeed. The kernel runs the tracepoint syscalls/sys_exit_execve
// for the failing execve(2) calls first and for the successful one last.
//
// So we can insert a tgid->pid mapping into the same hash entry by adding
// the pid to the value, and remove it by subtracting the pid again. By the
// time we need to look up the pid by the tgid, only one pid is left in the
// hash entry.
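// To make the add/subtract trick concrete, here is a worked scenario (the
// pids below are made up for illustration, they are not from this file):
// tgid 100 has threads 101 and 102, both call execve(2), 102's call fails
// and 101's succeeds.
//
//	sys_enter_execve (pid 101):          pid_sum = 101, pid_count = 1
//	sys_enter_execve (pid 102):          pid_sum = 203, pid_count = 2
//	sys_exit_execve  (pid 102, failure): delete exec_args[102];
//	                                     pid_sum = 101, pid_count = 1
//	sys_exit_execve  (pid 100, success): runs in the thread group leader,
//	                                     but pid_sum is 101, the only pid
//	                                     left, so exec_args[101] can still
//	                                     be found and deleted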
struct pid_set {
	__u64 pid_sum;
	__u64 pid_count;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, pid_t); // tgid
	__type(value, struct pid_set);
	__uint(max_entries, 1024);
} pid_by_tgid SEC(".maps");

SEC("kprobe/fsnotify_remove_first_event")
int BPF_KPROBE(ig_fa_pick_e, struct fsnotify_group *group)
{
	u64 current_pid_tgid;
	u64 dummy = 0;

	if (tracer_group != (u64)group)
		return 0;

	current_pid_tgid = bpf_get_current_pid_tgid();

	// Keep context for kretprobe/fsnotify_remove_first_event
	bpf_map_update_elem(&ig_fa_pick_ctx, &current_pid_tgid, &dummy, 0);

	return 0;
}

SEC("kretprobe/fsnotify_remove_first_event")
int BPF_KRETPROBE(ig_fa_pick_x, struct fanotify_event *ret)
{
	struct record *record;
	u64 current_pid_tgid;
	u32 event_tgid;
	u32 pid;
	u64 *exists;
	struct pid_set *pid_set;

	// current_pid_tgid is the Inspektor Gadget task
	current_pid_tgid = bpf_get_current_pid_tgid();

	exists = bpf_map_lookup_elem(&ig_fa_pick_ctx, &current_pid_tgid);
	if (!exists)
		return 0;

	// event_tgid is the tgid of the process that triggered the fanotify
	// event. Since Inspektor Gadget didn't use FAN_REPORT_TID, this is
	// the process id and not the thread id.
	event_tgid = BPF_CORE_READ(ret, pid, numbers[0].nr);

	pid_set = bpf_map_lookup_elem(&pid_by_tgid, &event_tgid);
	if (!pid_set)
		goto fail;

	if (pid_set->pid_count != 1)
		goto fail;
	pid = pid_set->pid_sum;

	record = bpf_map_lookup_elem(&exec_args, &pid);
	if (!record) {
		// No record found, but we still need to push an empty record
		// to the queue so that userspace knows there is no record for
		// this event.
		goto fail;
	}

	bpf_map_push_elem(&ig_fa_records, record, 0);
	bpf_map_delete_elem(&ig_fa_pick_ctx, &current_pid_tgid);
	return 0;

fail:
	bpf_map_push_elem(&ig_fa_records, &empty_record, 0);
	bpf_map_delete_elem(&ig_fa_pick_ctx, &current_pid_tgid);
	return 0;
}
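// A minimal sketch of how userspace might drain ig_fa_records after handling
// a fanotify event, assuming libbpf and a file descriptor for the map; the
// variable names are illustrative, not part of this repository:
//
//	struct record rec;
//
//	// For BPF_MAP_TYPE_QUEUE, the key must be NULL: this pops the
//	// oldest element from the queue.
//	if (bpf_map_lookup_and_delete_elem(records_fd, NULL, &rec) == 0) {
//		if (rec.args_size == 0) {
//			// empty_record was pushed: no exec args are
//			// available for this event
//		} else {
//			// rec.caller_comm, rec.pid and rec.args describe
//			// the execve(2) that triggered the event
//		}
//	}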
SEC("tracepoint/syscalls/sys_enter_execve")
int ig_execve_e(struct syscall_trace_enter *ctx)
{
	u64 pid_tgid;
	u32 tgid, pid;
	struct record *record;
	struct task_struct *task;
	uid_t uid = (u32)bpf_get_current_uid_gid();
	struct pid_set zero_pid_set = { 0, 0 };
	struct pid_set *pid_set;
	u64 *pid_sum;

	int ret;
	const char **args = (const char **)(ctx->args[1]);
	const char *argp;
	int i;

	pid_tgid = bpf_get_current_pid_tgid();
	tgid = pid_tgid >> 32;
	pid = (u32)pid_tgid;

	bpf_map_update_elem(&pid_by_tgid, &tgid, &zero_pid_set, BPF_NOEXIST);

	pid_set = bpf_map_lookup_elem(&pid_by_tgid, &tgid);
	if (!pid_set)
		return 0;

	__atomic_add_fetch(&pid_set->pid_sum, (u64)pid, __ATOMIC_RELAXED);
	__atomic_add_fetch(&pid_set->pid_count, 1, __ATOMIC_RELAXED);

	// Add a new entry, but not from the stack, due to size limitations
	if (bpf_map_update_elem(&exec_args, &pid, &empty_record, 0))
		return 0;
	record = bpf_map_lookup_elem(&exec_args, &pid);
	if (!record)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	bpf_get_current_comm(&record->caller_comm, sizeof(record->caller_comm));
	record->pid = tgid;
	record->args_size = 0;

	ret = bpf_probe_read_user_str(record->args, ARGSIZE,
				      (const char *)ctx->args[0]);
	if (ret > 0 && ret <= ARGSIZE) {
		record->args_size += ret;
	} else {
		// write an empty string
		record->args[0] = '\0';
		record->args_size++;
	}

#pragma unroll
	for (i = 1; i < TOTAL_MAX_ARGS && i < max_args; i++) {
		ret = bpf_probe_read_user(&argp, sizeof(argp), &args[i]);
		if (ret != 0 || !argp)
			return 0;

		if (record->args_size > LAST_ARG)
			return 0;

		ret = bpf_probe_read_user_str(&record->args[record->args_size],
					      ARGSIZE, argp);
		if (ret > 0 && ret <= ARGSIZE) {
			record->args_size += ret;
		} else {
			return 0;
		}
	}

	return 0;
}

SEC("tracepoint/syscalls/sys_exit_execve")
int ig_execve_x(struct syscall_trace_exit *ctx)
{
	u64 pid_tgid;
	u32 tgid, pid;
	u32 execs_lookup_key;
	int ret;
	struct pid_set *pid_set;

	pid_tgid = bpf_get_current_pid_tgid();
	tgid = pid_tgid >> 32;
	pid = (u32)pid_tgid;
	ret = ctx->ret;

	pid_set = bpf_map_lookup_elem(&pid_by_tgid, &tgid);
	if (!pid_set)
		return 0;

	// sys_enter_execve and sys_exit_execve might be called from different
	// threads. We need to look up the pid from the tgid.
	execs_lookup_key = (ret == 0) ? pid_set->pid_sum : pid;
	bpf_map_delete_elem(&exec_args, &execs_lookup_key);

	// Remove the tgid->pid mapping if the value reaches 0
	// or the execve() call was successful.
	// Convert pid to u64 before applying the negative sign to ensure
	// it's not truncated.
	__atomic_add_fetch(&pid_set->pid_sum, -((u64)pid), __ATOMIC_RELAXED);
	__atomic_add_fetch(&pid_set->pid_count, -1ULL, __ATOMIC_RELAXED);
	if (pid_set->pid_sum == 0 || ret == 0)
		bpf_map_delete_elem(&pid_by_tgid, &tgid);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";
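// tracer_group is declared const volatile above, so userspace has to set it
// in the object's read-only data before loading. With a bpftool-generated
// libbpf skeleton this could look like the sketch below; the skeleton name
// and the way the fsnotify_group kernel address is obtained are assumptions,
// not part of this repository, which loads the object from Go:
//
//	struct execruntime_bpf *skel = execruntime_bpf__open();
//	skel->rodata->tracer_group = group_kaddr; // kernel address of the
//	                                          // fanotify fsnotify_group
//	execruntime_bpf__load(skel);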