github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/gadgets/traceloop/tracer/bpf/traceloop.bpf.c

// SPDX-License-Identifier: GPL-2.0
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>
#include <gadget/mntns_filter.h>
#include "traceloop.h"

/*
 * Taken from:
 * https://github.com/seccomp/libseccomp/blob/afbde6ddaec7c58c3b281d43b0b287269ffca9bd/src/syscalls.csv
 */
#if defined(__TARGET_ARCH_arm64)
#define __NR_rt_sigreturn 139
#define __NR_exit_group 94
#define __NR_exit 93
#elif defined(__TARGET_ARCH_x86)
#define __NR_rt_sigreturn 15
#define __NR_exit_group 231
#define __NR_exit 60
#else
#error "Traceloop is not supported on your architecture."
#endif

/* Compile with -DSHOW_DEBUG to print debug messages. */
#if defined(SHOW_DEBUG)
#define bpf_debug_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
#else /* !defined(SHOW_DEBUG) */
#define bpf_debug_printk(fmt, ...)
#endif /* !defined(SHOW_DEBUG) */

/* Compile with -DSHOW_ERROR to print error messages. */
#if defined(SHOW_ERROR)
#define bpf_error_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
#else /* !defined(SHOW_ERROR) */
#define bpf_error_printk(fmt, ...)
#endif /* !defined(SHOW_ERROR) */
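
/*
 * Both macros expand to bpf_printk(), so their output lands in the kernel
 * trace buffer. A typical way to read it while debugging, assuming tracefs
 * is mounted at the usual location, is:
 *
 *   sudo cat /sys/kernel/debug/tracing/trace_pipe
 */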

const volatile bool filter_syscall = false;

const struct syscall_event_t *unused_event __attribute__((unused));
const struct syscall_event_cont_t *unused_event_cont __attribute__((unused));
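
/*
 * These otherwise unused declarations keep struct syscall_event_t and
 * struct syscall_event_cont_t in the generated BTF, so that the userspace
 * side (e.g. bpf2go-style bindings) can recover the exact type layouts.
 * This is a common CO-RE gadget pattern rather than anything specific to
 * traceloop.
 */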

/*
 * We need this zeroed pt_regs to initialize regs_map entries without a
 * stack variable: pt_regs contains several u64 fields, so it is quite big
 * and would eat into the 512-byte BPF stack limit.
 */
static const struct pt_regs empty;
static struct syscall_def_t default_definition;

struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	/*
	 * We use the mount namespace ID to look up the perf buffer
	 * corresponding to this container.
	 */
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(u32));
	__uint(max_entries, 1024);
	__array(
		values, struct {
			__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
			__uint(key_size, sizeof(u32));
			__uint(value_size, sizeof(u32));
		});
} map_of_perf_buffers SEC(".maps");
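
/*
 * Illustration only (not part of this file): with a hash-of-maps,
 * userspace is expected to create one BPF_MAP_TYPE_PERF_EVENT_ARRAY per
 * traced container and insert its fd here, keyed by the container's mount
 * namespace inode number. A libbpf sketch, with hypothetical fd variables:
 *
 *   u64 mntns_id = ...;       // ns.inum of the container's mount namespace
 *   int perf_array_fd = ...;  // fd of a freshly created perf event array
 *   bpf_map_update_elem(outer_map_fd, &mntns_id, &perf_array_fd, BPF_ANY);
 *
 * Lookups from BPF then return a pointer usable directly with
 * bpf_perf_event_output().
 */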

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct syscall_def_t));
	/*
	 * We have around 300 syscalls, so let's use the next greater power
	 * of 2.
	 */
	__uint(max_entries, 512);
} syscalls SEC(".maps");
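
/*
 * A sketch of what a definition could look like for read(2), whose second
 * argument is a buffer best dumped at sys_exit with the return value as its
 * length (illustrative values only; the real table is filled by userspace):
 *
 *   struct syscall_def_t def = {};
 *   def.args_len[1] = USE_RET_AS_PARAM_LENGTH | PARAM_PROBE_AT_EXIT_MASK;
 *   // args_len[0] (the fd) stays 0: plain integers need no cont event.
 */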

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	/*
	 * We do not care about the value here, so let's use a bool to
	 * consume one byte per value.
	 */
	__uint(value_size, sizeof(bool));
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__uint(max_entries, SYSCALL_FILTERS);
} syscall_filters SEC(".maps");
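
/*
 * When filter_syscall is set, this map acts as an allow-list: userspace
 * inserts the syscall numbers to trace, and everything absent from the map
 * is dropped in should_filter_out_syscall() below.
 */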

/*
 * This key/value store maps thread IDs to the syscall arguments remembered
 * at sys_enter, so that sys_exit can read the pointed-to buffers and emit
 * syscall events carrying the resulting content.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct remembered_args));
	__uint(max_entries, 1024);
} probe_at_sys_exit SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct pt_regs));
	__uint(max_entries, 1024);
} regs_map SEC(".maps");
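
/*
 * Note on keys: bpf_get_current_pid_tgid() returns the tgid in the upper
 * 32 bits and the thread id in the lower 32 bits, so entries in regs_map
 * and probe_at_sys_exit are effectively per-thread. Storing pt_regs in a
 * map instead of on the stack sidesteps the 512-byte BPF stack limit
 * mentioned above.
 */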

static __always_inline int skip_exit_probe(int nr)
{
	return !!(nr == __NR_exit || nr == __NR_exit_group ||
		  nr == __NR_rt_sigreturn);
}

/*
 * Highly inspired by ksnoop.bpf.c:
 * https://github.com/iovisor/bcc/blob/f90126bb3770ea1bdd915ff3b47e451c6dde5c40/libbpf-tools/ksnoop.bpf.c#L280
 */
static __always_inline u64 get_arg(struct pt_regs *regs, int i)
{
	switch (i) {
	case 1:
		return PT_REGS_PARM1_CORE_SYSCALL(regs);
	case 2:
		return PT_REGS_PARM2_CORE_SYSCALL(regs);
	case 3:
		return PT_REGS_PARM3_CORE_SYSCALL(regs);
	case 4:
		return PT_REGS_PARM4_CORE_SYSCALL(regs);
	case 5:
		return PT_REGS_PARM5_CORE_SYSCALL(regs);
	case 6:
		return PT_REGS_PARM6_CORE_SYSCALL(regs);
	default:
		bpf_error_printk(
			"There is no PT_REGS_PARM%d_SYSCALL macro, check the argument!\n",
			i);
		return 0;
	}
}
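
/*
 * The PT_REGS_PARMn_CORE_SYSCALL macros resolve to the per-architecture
 * syscall argument registers via CO-RE reads, e.g. rdi/rsi/rdx/r10/r8/r9
 * on x86_64 and x0..x5 on arm64, so this helper works unmodified on both
 * architectures supported above.
 */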

static __always_inline bool should_filter_out_syscall(u64 syscall_nr)
{
	return filter_syscall &&
	       bpf_map_lookup_elem(&syscall_filters, &syscall_nr) == NULL;
}

/*
 * sys_enter is defined as:
 * TP_PROTO(struct pt_regs *regs, long id)
 * (https://elixir.bootlin.com/linux/v5.19/source/include/trace/events/syscalls.h#L20)
 * So, ctx->args[0] contains a struct pt_regs and ctx->args[1] the syscall ID.
 */
SEC("raw_tracepoint/sys_enter")
int ig_traceloop_e(struct bpf_raw_tracepoint_args *ctx)
{
	struct remembered_args remembered = {};
	u64 pid = bpf_get_current_pid_tgid();
	struct syscall_def_t *syscall_def;
	/*
	 * Initialize the struct to empty to be sure all fields (even
	 * padding) are zeroed:
	 * https://github.com/iovisor/bcc/issues/2623#issuecomment-560214481
	 */
	struct syscall_event_t sc = {};
	struct task_struct *task;
	u64 nr = ctx->args[1];
	struct pt_regs *args;
	void *perf_buffer;
	u64 mntns_id;
	int ret;
	int i;

	if (should_filter_out_syscall(nr))
		return 0;

	/* The boot time timestamp is used to give the timestamp to users. It
	 * is converted to the wall-clock time in userspace. It only works
	 * from Linux 5.7. On older kernels, the BPF bytecode for
	 * bpf_ktime_get_boot_ns is automatically removed by the BPF loader,
	 * see FixBpfKtimeGetBootNs. In this way, this BPF program can still be
	 * loaded on older kernels. */
	u64 boot_ts = bpf_ktime_get_boot_ns();

	/* The monotonic timestamp is used by traceloop to match the sys_enter
	 * event with the cont and sys_exit events. This is an internal
	 * implementation detail not exposed to the user. */
	u64 monotonic_ts = bpf_ktime_get_ns();

	sc.boot_timestamp = boot_ts;
	sc.monotonic_timestamp = monotonic_ts;
	sc.cont_nr = 0;
	sc.cpu = bpf_get_smp_processor_id();
	sc.pid = pid >> 32;
	sc.typ = SYSCALL_EVENT_TYPE_ENTER;
	sc.id = nr;

	remembered.monotonic_timestamp = monotonic_ts;
	remembered.nr = nr;

	syscall_def = bpf_map_lookup_elem(&syscalls, &nr);
	/*
	 * The syscalls map contains definitions for syscalls whose arguments
	 * need specific handling, like read or write.
	 * All other syscalls, like nanosleep, are not in this map because
	 * their arguments need no special treatment; in that case, we use
	 * the default definition.
	 */
	if (syscall_def == NULL)
		syscall_def = &default_definition;

	task = (struct task_struct *)bpf_get_current_task();
	mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);

	perf_buffer = bpf_map_lookup_elem(&map_of_perf_buffers, &mntns_id);
	if (!perf_buffer)
		return 0;

	bpf_get_current_comm(sc.comm, sizeof(sc.comm));

	ret = bpf_map_update_elem(&regs_map, &pid, &empty, BPF_NOEXIST);
	if (ret) {
		bpf_error_printk(
			"enter: there should not be any pt_regs for key %lu: %d\n",
			pid, ret);

		return 0;
	}

	args = bpf_map_lookup_elem(&regs_map, &pid);
	if (!args) {
		bpf_error_printk(
			"enter: there should be a pt_regs for key %lu\n", pid);

		goto end;
	}

	bpf_probe_read(args, sizeof(*args), (void *)ctx->args[0]);

	for (i = 0; i < SYSCALL_ARGS; i++) {
		/* + 1 because PT_REGS_PARM begins from 1. */
		u64 arg = get_arg(args, i + 1);
		sc.args[i] = arg;
		remembered.args[i] = arg;
		if (syscall_def->args_len[i])
			sc.cont_nr++;
	}

	bpf_debug_printk(
		"Perf event output: sc.id: %d; sc.comm: %s; sizeof(sc): %d\n",
		sc.id, sc.comm, sizeof(sc));
	ret = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU, &sc,
				    sizeof(sc));
	if (ret != 0) {
		bpf_error_printk("Problem outputting perf event: %d", ret);
	}

	// Avoid using probe_at_sys_exit for exit() and exit_group() because sys_exit
	// would not be called and the map would not be cleaned up and would get full.
	// Note that a process can still get killed in the middle, so we would need
	// a userspace cleaner for this case (TODO).
	if (!skip_exit_probe(nr))
		bpf_map_update_elem(&probe_at_sys_exit, &pid, &remembered,
				    BPF_ANY);
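
	/*
	 * For reference, each args_len[i] entry encodes how to capture
	 * argument i: 0 means no capture, USE_NULL_BYTE_LENGTH means a
	 * NULL-terminated string, USE_RET_AS_PARAM_LENGTH means the length
	 * is only known at sys_exit (the return value),
	 * USE_ARG_INDEX_AS_PARAM_LENGTH combined with an index means another
	 * argument holds the length, and PARAM_PROBE_AT_EXIT_MASK defers the
	 * whole capture to sys_exit. Arguments deferred to sys_exit are
	 * skipped in the loop below.
	 */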
// We need to unroll this loop to make this work on kernels 5.4.0-x on Ubuntu, see
// https://github.com/inspektor-gadget/inspektor-gadget/issues/1465 for more details.
#pragma unroll
	for (i = 0; i < SYSCALL_ARGS; i++) {
		__u64 arg_len = syscall_def->args_len[i];

		if (!arg_len || (arg_len & PARAM_PROBE_AT_EXIT_MASK) ||
		    arg_len == USE_RET_AS_PARAM_LENGTH)
			continue;

		bool null_terminated = false;
		struct syscall_event_cont_t sc_cont = {};

		sc_cont.monotonic_timestamp = monotonic_ts;
		sc_cont.index = i;
		sc_cont.failed = false;

		if (arg_len == USE_NULL_BYTE_LENGTH) {
			null_terminated = true;
			arg_len = 0;
		} else if (arg_len >= USE_ARG_INDEX_AS_PARAM_LENGTH) {
			__u64 idx = arg_len &
				    USE_ARG_INDEX_AS_PARAM_LENGTH_MASK;

			/*
			 * Access args via the previously saved map entry instead of
			 * the ctx pointer or 'remembered' struct to avoid this verifier
			 * issue (which does not occur in sys_exit for the same code):
			 * "variable ctx access var_off=(0x0; 0x38) disallowed"
			 */
			struct remembered_args *remembered_ctx_workaround;
			if (idx < SYSCALL_ARGS) {
				remembered_ctx_workaround = bpf_map_lookup_elem(
					&probe_at_sys_exit, &pid);
				if (remembered_ctx_workaround)
					arg_len = remembered_ctx_workaround
							  ->args[idx];
				else
					arg_len = 0;
			} else {
				arg_len = PARAM_LEN;
			}
		}

		if (arg_len > sizeof(sc_cont.param))
			arg_len = sizeof(sc_cont.param);

		if (null_terminated)
			sc_cont.length = USE_NULL_BYTE_LENGTH;
		else
			sc_cont.length = arg_len;

		/* + 1 because PT_REGS_PARM begins from 1. */
		u64 arg = get_arg(args, i + 1);

		if (!arg_len &&
		    null_terminated /* NULL-terminated argument like a string */
		    && bpf_probe_read_user_str(sc_cont.param, PARAM_LEN,
					       (void *)(arg)) < 0)
			sc_cont.failed = true;
		else if (sizeof(u8) <= arg_len &&
			 arg_len <=
				 sizeof(u64) /* Conventional argument types (char, int, etc.) */
			 && bpf_probe_read_user(sc_cont.param, arg_len,
						(void *)(arg)))
			sc_cont.failed = true;
		else if (bpf_probe_read_user(
				 sc_cont.param, PARAM_LEN,
				 (void *)(arg))) /* TODO Struct arguments? */
			sc_cont.failed = true;

		bpf_debug_printk(
			"Perf event output: sc_cont.index: %d; sizeof(sc_cont): %d\n",
			sc_cont.index, sizeof(sc_cont));
		ret = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU,
					    &sc_cont, sizeof(sc_cont));
		if (ret != 0) {
			bpf_error_printk(
				"Problem outputting continued perf event: %d",
				ret);
		}
	}

end:
	bpf_map_delete_elem(&regs_map, &pid);

	return 0;
}
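
/*
 * To summarize, a single traced syscall produces up to 2 + SYSCALL_ARGS
 * perf events: one SYSCALL_EVENT_TYPE_ENTER event, one
 * syscall_event_cont_t event per captured argument, and one
 * SYSCALL_EVENT_TYPE_EXIT event from the program below. Userspace stitches
 * them back together using the shared monotonic_timestamp.
 */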

/*
 * syscall_get_nr() is defined for each architecture in the Linux kernel.
 * As we cannot use trace_event_raw_sys_exit, we need to get the current
 * syscall number from the registers.
 * So, this function has to be extended for each architecture we want to
 * support.
 */
static __always_inline int syscall_get_nr(struct pt_regs *regs)
{
#if defined(__TARGET_ARCH_arm64)
	return regs->syscallno;
#elif defined(__TARGET_ARCH_x86)
	return regs->orig_ax;
#else
#error "Traceloop is not supported on your architecture."
#endif
}
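
/*
 * On x86_64, orig_ax is used instead of ax because ax already holds the
 * return value by the time sys_exit fires; the kernel preserves the
 * original syscall number in orig_ax precisely for this kind of
 * introspection. arm64 similarly keeps it in the dedicated syscallno
 * field.
 */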

/*
 * sys_exit is defined as:
 * TP_PROTO(struct pt_regs *regs, long ret)
 * (https://elixir.bootlin.com/linux/v5.19/source/include/trace/events/syscalls.h#L46)
 * So, ctx->args[0] contains a struct pt_regs and ctx->args[1] the syscall
 * return value.
 */
SEC("raw_tracepoint/sys_exit")
int ig_traceloop_x(struct bpf_raw_tracepoint_args *ctx)
{
	u64 pid = bpf_get_current_pid_tgid();
	struct remembered_args *remembered;
	struct syscall_def_t *syscall_def;
	struct task_struct *task;
	long ret = ctx->args[1];
	struct pt_regs *args;
	void *perf_buffer;
	u64 mntns_id;
	int i, r;
	u64 nr;

	r = bpf_map_update_elem(&regs_map, &pid, &empty, BPF_NOEXIST);
	if (r) {
		bpf_error_printk(
			"exit: there should not be any pt_regs for key %lu: %d\n",
			pid, r);

		return 0;
	}

	args = bpf_map_lookup_elem(&regs_map, &pid);
	if (!args) {
		bpf_error_printk(
			"exit: there should be a pt_regs for key %lu\n", pid);

		goto end;
	}

	bpf_probe_read(args, sizeof(*args), (void *)ctx->args[0]);
	nr = syscall_get_nr(args);
	/* TODO Why can this happen? */
	if (nr == -1)
		goto end;
	struct syscall_event_t sc = {
		.boot_timestamp = bpf_ktime_get_boot_ns(),
		.cpu = bpf_get_smp_processor_id(),
		.pid = pid >> 32,
		.typ = SYSCALL_EVENT_TYPE_EXIT,
		.id = nr,
	};
	sc.args[0] = ret;

	syscall_def = bpf_map_lookup_elem(&syscalls, &nr);
	if (syscall_def == NULL)
		syscall_def = &default_definition;

	task = (struct task_struct *)bpf_get_current_task();
	mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);

	perf_buffer = bpf_map_lookup_elem(&map_of_perf_buffers, &mntns_id);
	if (!perf_buffer)
		goto end;

	remembered = bpf_map_lookup_elem(&probe_at_sys_exit, &pid);
	if (!remembered)
		goto end;

	/*
	 * This ensures all events (enter, exit and cont) related to a given
	 * syscall have the same timestamp.
	 */
	sc.monotonic_timestamp = remembered->monotonic_timestamp;

	for (i = 0; i < SYSCALL_ARGS; i++) {
		__u64 arg_len = syscall_def->args_len[i];

		if (!arg_len || !(arg_len & PARAM_PROBE_AT_EXIT_MASK))
			goto end_loop;

		bool null_terminated = false;
		struct syscall_event_cont_t sc_cont = {
			.monotonic_timestamp = remembered->monotonic_timestamp,
			.index = i,
			.failed = false,
		};

		arg_len &= ~PARAM_PROBE_AT_EXIT_MASK;

		if (arg_len == USE_RET_AS_PARAM_LENGTH) {
			if ((signed long)ret < 0)
				arg_len = 0;
			else
				arg_len = ret;
		} else if (arg_len == USE_NULL_BYTE_LENGTH) {
			null_terminated = true;
			arg_len = 0;
		} else if (arg_len >= USE_ARG_INDEX_AS_PARAM_LENGTH) {
			__u64 idx = arg_len &
				    USE_ARG_INDEX_AS_PARAM_LENGTH_MASK;
			if (idx < SYSCALL_ARGS)
				arg_len = remembered->args[idx];
			else
				arg_len = PARAM_LEN;
		}

		if (arg_len > sizeof(sc_cont.param))
			arg_len = sizeof(sc_cont.param);

		if (null_terminated)
			sc_cont.length = USE_NULL_BYTE_LENGTH;
		else
			sc_cont.length = arg_len;

		if (arg_len == 0 && null_terminated) {
			if (bpf_probe_read_user_str(
				    sc_cont.param, PARAM_LEN,
				    (void *)(remembered->args[i])) < 0)
				sc_cont.failed = true;
		} else if (sizeof(u8) <= arg_len && arg_len <= sizeof(u64) &&
			   bpf_probe_read_user(sc_cont.param, arg_len,
					       (void *)(remembered->args[i]))) {
			sc_cont.failed = true;
		} else if (bpf_probe_read_user(sc_cont.param, PARAM_LEN,
					       (void *)(remembered->args[i]))) {
			sc_cont.failed = true;
		}

		bpf_debug_printk(
			"Perf event output (exit): sc_cont.index: %d; sizeof(sc_cont): %d\n",
			sc_cont.index, sizeof(sc_cont));
		r = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU,
					  &sc_cont, sizeof(sc_cont));
		if (r != 0) {
			bpf_error_printk(
				"Problem outputting continued perf event: %d",
				r);
		}
end_loop:
		bpf_map_delete_elem(&probe_at_sys_exit, &pid);
	}

	bpf_get_current_comm(sc.comm, sizeof(sc.comm));

	bpf_debug_printk(
		"Perf event output (exit): sc.id: %d; sc.comm: %s; sizeof(sc): %d\n",
		sc.id, sc.comm, sizeof(sc));
	r = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU, &sc,
				  sizeof(sc));
	if (r != 0) {
		bpf_error_printk("Problem outputting perf event: %d", r);
	}
end:
	bpf_map_delete_elem(&regs_map, &pid);

	return 0;
}
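
/*
 * For reference, both programs are meant to be attached as raw tracepoints
 * (SEC("raw_tracepoint/...")). A minimal libbpf sketch, assuming a
 * skeleton named traceloop_bpf (hypothetical here; the real gadget does
 * the equivalent from Go):
 *
 *   struct traceloop_bpf *obj = traceloop_bpf__open_and_load();
 *   bpf_program__attach_raw_tracepoint(obj->progs.ig_traceloop_e, "sys_enter");
 *   bpf_program__attach_raw_tracepoint(obj->progs.ig_traceloop_x, "sys_exit");
 */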

char LICENSE[] SEC("license") = "GPL";