github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/container-hook/bpf/execruntime.bpf.c

// SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR Apache-2.0
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

#include "execruntime.h"

const volatile int max_args = DEFAULT_MAXARGS;

static const struct record empty_record = {};

// configured by userspace
const volatile u64 tracer_group = 0;

// ig_fa_pick_ctx keeps context for kprobe/kretprobe fsnotify_remove_first_event
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, u64); // tgid_pid
	__type(value, u64); // dummy
} ig_fa_pick_ctx SEC(".maps");

// ig_fa_records is consumed by userspace
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 64);
	__type(value, struct record);
} ig_fa_records SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 128);
	__type(key, u32); // pid (not tgid)
	__type(value, struct record);
} exec_args SEC(".maps");

// man clone(2):
//   If any of the threads in a thread group performs an
//   execve(2), then all threads other than the thread group
//   leader are terminated, and the new program is executed in
//   the thread group leader.
//
// sys_enter_execve might be called from a thread, and the corresponding
// sys_exit_execve will be called from the thread group leader if the
// execve succeeds, or from the same thread if it fails. So we need to
// look up the pid from the tgid in sys_exit_execve.
//
// We don't know in advance which execve(2) will succeed, so we need to keep
// track of all tgid<->pid mappings in a BPF map.
//
// We don't want to use bpf_for_each_map_elem() because it requires Linux 5.13.
//
// If several execve(2) are performed in parallel from different threads, only
// one can succeed. The kernel runs the tracepoint syscalls/sys_exit_execve
// for the failing execve(2) calls first and for the successful one last.
//
// So we can insert a tgid->pid mapping into the same hash entry by adding
// the pid to the value, and remove it by subtracting the pid again. By the
// time we need to look up the pid by the tgid, only one pid is left in the
// hash entry.
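// To make the add/subtract trick concrete, here is a worked scenario (the
// pids below are made up for illustration, they are not from this file):
// tgid 100 has threads 101 and 102, both call execve(2), 102's call fails
// and 101's succeeds.
//
//	sys_enter_execve (pid 101):          pid_sum = 101, pid_count = 1
//	sys_enter_execve (pid 102):          pid_sum = 203, pid_count = 2
//	sys_exit_execve  (pid 102, failure): delete exec_args[102];
//	                                     pid_sum = 101, pid_count = 1
//	sys_exit_execve  (pid 100, success): runs in the thread group leader,
//	                                     but pid_sum is 101, the only pid
//	                                     left, so exec_args[101] can still
//	                                     be found and deleted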
struct pid_set {
	__u64 pid_sum;
	__u64 pid_count;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, pid_t); // tgid
	__type(value, struct pid_set);
	__uint(max_entries, 1024);
} pid_by_tgid SEC(".maps");

SEC("kprobe/fsnotify_remove_first_event")
int BPF_KPROBE(ig_fa_pick_e, struct fsnotify_group *group)
{
	u64 current_pid_tgid;
	u64 dummy = 0;

	if (tracer_group != (u64)group)
		return 0;

	current_pid_tgid = bpf_get_current_pid_tgid();

	// Keep context for kretprobe/fsnotify_remove_first_event
	bpf_map_update_elem(&ig_fa_pick_ctx, &current_pid_tgid, &dummy, 0);

	return 0;
}

SEC("kretprobe/fsnotify_remove_first_event")
int BPF_KRETPROBE(ig_fa_pick_x, struct fanotify_event *ret)
{
	struct record *record;
	u64 current_pid_tgid;
	u32 event_tgid;
	u32 pid;
	u64 *exists;
	struct pid_set *pid_set;

	// current_pid_tgid is the Inspektor Gadget task
	current_pid_tgid = bpf_get_current_pid_tgid();

	exists = bpf_map_lookup_elem(&ig_fa_pick_ctx, &current_pid_tgid);
	if (!exists)
		return 0;

	// event_tgid is the tgid of the process that triggered the fanotify
	// event. Since Inspektor Gadget didn't use FAN_REPORT_TID, this is
	// the process id and not the thread id.
	event_tgid = BPF_CORE_READ(ret, pid, numbers[0].nr);

	pid_set = bpf_map_lookup_elem(&pid_by_tgid, &event_tgid);
	if (!pid_set)
		goto fail;

	if (pid_set->pid_count != 1)
		goto fail;
	pid = pid_set->pid_sum;

	record = bpf_map_lookup_elem(&exec_args, &pid);
	if (!record) {
		// No record found, but we still need to push an empty record
		// to the queue so that userspace knows there is no record for
		// this event.
		goto fail;
	}

	bpf_map_push_elem(&ig_fa_records, record, 0);
	bpf_map_delete_elem(&ig_fa_pick_ctx, &current_pid_tgid);
	return 0;

fail:
	bpf_map_push_elem(&ig_fa_records, &empty_record, 0);
	bpf_map_delete_elem(&ig_fa_pick_ctx, &current_pid_tgid);
	return 0;
}
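// A minimal sketch of how userspace might drain ig_fa_records after handling
// a fanotify event, assuming libbpf and a file descriptor for the map; the
// variable names are illustrative, not part of this repository:
//
//	struct record rec;
//
//	// For BPF_MAP_TYPE_QUEUE, the key must be NULL: this pops the
//	// oldest element from the queue.
//	if (bpf_map_lookup_and_delete_elem(records_fd, NULL, &rec) == 0) {
//		if (rec.args_size == 0) {
//			// empty_record was pushed: no exec args are
//			// available for this event
//		} else {
//			// rec.caller_comm, rec.pid and rec.args describe
//			// the execve(2) that triggered the event
//		}
//	}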
SEC("tracepoint/syscalls/sys_enter_execve")
int ig_execve_e(struct syscall_trace_enter *ctx)
{
	u64 pid_tgid;
	u32 tgid, pid;
	struct record *record;
	struct task_struct *task;
	uid_t uid = (u32)bpf_get_current_uid_gid();
	struct pid_set zero_pid_set = { 0, 0 };
	struct pid_set *pid_set;
	u64 *pid_sum;

	int ret;
	const char **args = (const char **)(ctx->args[1]);
	const char *argp;
	int i;

	pid_tgid = bpf_get_current_pid_tgid();
	tgid = pid_tgid >> 32;
	pid = (u32)pid_tgid;

	bpf_map_update_elem(&pid_by_tgid, &tgid, &zero_pid_set, BPF_NOEXIST);

	pid_set = bpf_map_lookup_elem(&pid_by_tgid, &tgid);
	if (!pid_set)
		return 0;

	__atomic_add_fetch(&pid_set->pid_sum, (u64)pid, __ATOMIC_RELAXED);
	__atomic_add_fetch(&pid_set->pid_count, 1, __ATOMIC_RELAXED);

	// Add a new entry, but not from the stack, due to size limitations
	if (bpf_map_update_elem(&exec_args, &pid, &empty_record, 0))
		return 0;
	record = bpf_map_lookup_elem(&exec_args, &pid);
	if (!record)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	bpf_get_current_comm(&record->caller_comm, sizeof(record->caller_comm));
	record->pid = tgid;
	record->args_size = 0;

	ret = bpf_probe_read_user_str(record->args, ARGSIZE,
				      (const char *)ctx->args[0]);
	if (ret > 0 && ret <= ARGSIZE) {
		record->args_size += ret;
	} else {
		// write an empty string
		record->args[0] = '\0';
		record->args_size++;
	}

#pragma unroll
	for (i = 1; i < TOTAL_MAX_ARGS && i < max_args; i++) {
		ret = bpf_probe_read_user(&argp, sizeof(argp), &args[i]);
		if (ret != 0 || !argp)
			return 0;

		if (record->args_size > LAST_ARG)
			return 0;

		ret = bpf_probe_read_user_str(&record->args[record->args_size],
					      ARGSIZE, argp);
		if (ret > 0 && ret <= ARGSIZE) {
			record->args_size += ret;
		} else {
			return 0;
		}
	}

	return 0;
}

SEC("tracepoint/syscalls/sys_exit_execve")
int ig_execve_x(struct syscall_trace_exit *ctx)
{
	u64 pid_tgid;
	u32 tgid, pid;
	u32 execs_lookup_key;
	int ret;
	struct pid_set *pid_set;

	pid_tgid = bpf_get_current_pid_tgid();
	tgid = pid_tgid >> 32;
	pid = (u32)pid_tgid;
	ret = ctx->ret;

	pid_set = bpf_map_lookup_elem(&pid_by_tgid, &tgid);
	if (!pid_set)
		return 0;

	// sys_enter_execve and sys_exit_execve might be called from different
	// threads. We need to look up the pid from the tgid.
	execs_lookup_key = (ret == 0) ? pid_set->pid_sum : pid;
	bpf_map_delete_elem(&exec_args, &execs_lookup_key);

	// Remove the tgid->pid mapping if the value reaches 0
	// or the execve() call was successful.
	// Convert pid to u64 before applying the negative sign to ensure
	// it's not truncated.
	__atomic_add_fetch(&pid_set->pid_sum, -((u64)pid), __ATOMIC_RELAXED);
	__atomic_add_fetch(&pid_set->pid_count, -1ULL, __ATOMIC_RELAXED);
	if (pid_set->pid_sum == 0 || ret == 0)
		bpf_map_delete_elem(&pid_by_tgid, &tgid);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";
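// tracer_group is declared const volatile above, so userspace has to set it
// in the object's read-only data before loading. With a bpftool-generated
// libbpf skeleton this could look like the sketch below; the skeleton name
// and the way the fsnotify_group kernel address is obtained are assumptions,
// not part of this repository, which loads the object from Go:
//
//	struct execruntime_bpf *skel = execruntime_bpf__open();
//	skel->rodata->tracer_group = group_kaddr; // kernel address of the
//	                                          // fanotify fsnotify_group
//	execruntime_bpf__load(skel);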