github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/gadgets/trace/capabilities/tracer/bpf/capable.bpf.c (about)

     1  // SPDX-License-Identifier: GPL-2.0
     2  //
     3  // Unique filtering based on
     4  // https://github.com/libbpf/libbpf-rs/tree/master/examples/capable
     5  //
     6  // Copyright 2022 Sony Group Corporation
     7  
     8  #include <vmlinux.h>
     9  #include <bpf/bpf_core_read.h>
    10  #include <bpf/bpf_helpers.h>
    11  #include <bpf/bpf_tracing.h>
    12  #include "capable.h"
    13  #include <gadget/mntns_filter.h>
    14  
    15  // include/linux/security.h
    16  #ifndef CAP_OPT_NOAUDIT
    17  #define CAP_OPT_NOAUDIT 1 << 1
    18  #endif
    19  
    20  #define MAX_ENTRIES 10240
    21  
    22  const volatile pid_t my_pid = -1;
    23  const volatile pid_t targ_pid = -1;
    24  const volatile u32 linux_version_code = 0;
    25  const volatile bool audit_only = false;
    26  const volatile bool unique = false;
    27  
    28  extern int LINUX_KERNEL_VERSION __kconfig;
    29  
    30  // we need this to make sure the compiler doesn't remove our struct
    31  const struct cap_event *unusedcapevent __attribute__((unused));
    32  
// Data captured at cap_capable() entry and handed to the kretprobe
// through the "start" map, keyed by pid_tgid.
struct args_t {
	u64 current_userns; // ns.inum of the task's real_cred user namespace
	u64 target_userns; // ns.inum of the namespace the check targets
	u64 cap_effective; // task's effective capability set (widened to u64)
	int cap; // capability number being checked
	int cap_opt; // cap_capable() option argument (bitfield since Linux 5.1)
};
    40  
    41  struct {
    42  	__uint(type, BPF_MAP_TYPE_HASH);
    43  	__uint(max_entries, 10240);
    44  	__type(key, u64);
    45  	__type(value, struct args_t);
    46  } start SEC(".maps");
    47  
// Perf ring buffer used to stream struct cap_event records to userspace.
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} events SEC(".maps");
    53  
// Deduplication key used when the "unique" flag is set: one event
// per capability per mount namespace.
struct unique_key {
	int cap; // capability number
	u64 mntns_id; // mount namespace inode number
};
    58  
    59  struct {
    60  	__uint(type, BPF_MAP_TYPE_HASH);
    61  	__uint(max_entries, 10240);
    62  	__type(key, struct unique_key);
    63  	__type(value, u64);
    64  } seen SEC(".maps");
    65  
// Context of the syscall a thread is currently executing, recorded by
// the sys_enter tracepoint and looked up from the kretprobe.
struct syscall_context {
	// Syscall id
	// -1 for unknown syscall
	u64 nr;

	// We could add more fields for the arguments if desired
};
    73  
// Maps pid_tgid -> syscall context for threads currently inside a
// syscall; entries are removed on sys_exit (or never created for
// syscalls whose exit tracepoint does not fire, see skip_exit_probe).
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct syscall_context));
	__uint(max_entries,
	       1048576); // There can be many threads sleeping in some futex/poll syscalls
} current_syscall SEC(".maps");
    81  
    82  SEC("kprobe/cap_capable")
    83  int BPF_KPROBE(ig_trace_cap_e, const struct cred *cred,
    84  	       struct user_namespace *targ_ns, int cap, int cap_opt)
    85  {
    86  	__u32 pid;
    87  	u64 mntns_id;
    88  	__u64 pid_tgid;
    89  	struct task_struct *task;
    90  
    91  	task = (struct task_struct *)bpf_get_current_task();
    92  	mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);
    93  
    94  	if (gadget_should_discard_mntns_id(mntns_id))
    95  		return 0;
    96  
    97  	const struct cred *real_cred = BPF_CORE_READ(task, real_cred);
    98  	if (cred != real_cred) {
    99  		// the subjective credentials are in an overridden state with
   100  		// override_creds/revert_creds (e.g. during overlayfs cache or copyup)
   101  		// https://kernel.org/doc/html/v6.2-rc8/security/credentials.html#overriding-the-vfs-s-use-of-credentials
   102  		return 0;
   103  	}
   104  
   105  	pid_tgid = bpf_get_current_pid_tgid();
   106  	pid = pid_tgid >> 32;
   107  
   108  	if (pid == my_pid)
   109  		return 0;
   110  
   111  	if (targ_pid != -1 && targ_pid != pid)
   112  		return 0;
   113  
   114  	if (audit_only) {
   115  		if (LINUX_KERNEL_VERSION >= KERNEL_VERSION(5, 1, 0)) {
   116  			if (cap_opt & CAP_OPT_NOAUDIT)
   117  				return 0;
   118  		} else {
   119  			if (!cap_opt)
   120  				return 0;
   121  		}
   122  	}
   123  
   124  	if (unique) {
   125  		struct unique_key key = {
   126  			.cap = cap,
   127  			.mntns_id = mntns_id,
   128  		};
   129  
   130  		if (bpf_map_lookup_elem(&seen, &key) != NULL) {
   131  			return 0;
   132  		}
   133  		u64 zero = 0;
   134  		bpf_map_update_elem(&seen, &key, &zero, 0);
   135  	}
   136  
   137  	struct args_t args = {};
   138  	args.current_userns =
   139  		(u64)BPF_CORE_READ(task, real_cred, user_ns, ns.inum);
   140  	args.target_userns = (u64)BPF_CORE_READ(targ_ns, ns.inum);
   141  	/*
   142  	 * cap_effective has kernel_cap_t for type.
   143  	 * This type definition changed along the time:
   144  	 * 1. It was defined as a __u32 in:
   145  	 * https://github.com/torvalds/linux/commit/1da177e4c3f4
   146  	 * 2. It later was modified to be an array of __u32, so 64 bits kernel
   147  	 * can use 64 bits for capabilities while supporting legacy 32 bits
   148  	 * ones:
   149  	 * https://github.com/torvalds/linux/commit/e338d263a76a
   150  	 * 3. It was recently defined to be a simple u64:
   151  	 * https://github.com/torvalds/linux/commit/f122a08b197d
   152  	 * BPF_CORE_READ_INTO() will handle the different size for us and in any
   153  	 * case, we define args.cap_effective as u64 which is enough to contain
   154  	 * the information.
   155  	 */
   156  	BPF_CORE_READ_INTO(&args.cap_effective, task, real_cred, cap_effective);
   157  	args.cap = cap;
   158  	args.cap_opt = cap_opt;
   159  	bpf_map_update_elem(&start, &pid_tgid, &args, 0);
   160  
   161  	return 0;
   162  }
   163  
   164  SEC("kretprobe/cap_capable")
   165  int BPF_KRETPROBE(ig_trace_cap_x)
   166  {
   167  	__u64 pid_tgid;
   168  	__u64 uid_gid = bpf_get_current_uid_gid();
   169  	struct args_t *ap;
   170  	int ret;
   171  
   172  	pid_tgid = bpf_get_current_pid_tgid();
   173  	ap = bpf_map_lookup_elem(&start, &pid_tgid);
   174  	if (!ap)
   175  		return 0; /* missed entry */
   176  
   177  	struct cap_event event = {};
   178  	event.current_userns = ap->current_userns;
   179  	event.target_userns = ap->target_userns;
   180  	event.cap_effective = ap->cap_effective;
   181  	event.pid = pid_tgid >> 32;
   182  	event.tgid = pid_tgid;
   183  	event.cap = ap->cap;
   184  	event.uid = (u32)uid_gid;
   185  	event.gid = (u32)(uid_gid >> 32);
   186  	event.mntnsid = gadget_get_mntns_id();
   187  	bpf_get_current_comm(&event.task, sizeof(event.task));
   188  	event.ret = PT_REGS_RC(ctx);
   189  	event.timestamp = bpf_ktime_get_boot_ns();
   190  
   191  	if (LINUX_KERNEL_VERSION >= KERNEL_VERSION(5, 1, 0)) {
   192  		event.audit = (ap->cap_opt & CAP_OPT_NOAUDIT) == 0;
   193  		event.insetid = (ap->cap_opt & CAP_OPT_INSETID) != 0;
   194  	} else {
   195  		event.audit = ap->cap_opt;
   196  		event.insetid = -1;
   197  	}
   198  
   199  	struct syscall_context *sc_ctx;
   200  	sc_ctx = bpf_map_lookup_elem(&current_syscall, &pid_tgid);
   201  	if (sc_ctx) {
   202  		event.syscall = sc_ctx->nr;
   203  	} else {
   204  		event.syscall = -1;
   205  	}
   206  
   207  	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event,
   208  			      sizeof(event));
   209  
   210  	bpf_map_delete_elem(&start, &pid_tgid);
   211  
   212  	return 0;
   213  }
   214  
/*
 * Taken from:
 * https://github.com/seccomp/libseccomp/blob/afbde6ddaec7c58c3b281d43b0b287269ffca9bd/src/syscalls.csv
 */
// Per-architecture numbers of the syscalls that never reach the
// sys_exit tracepoint (see skip_exit_probe() below).
#if defined(__TARGET_ARCH_arm64)
#define __NR_rt_sigreturn 139
#define __NR_exit_group 94
#define __NR_exit 93
#elif defined(__TARGET_ARCH_x86)
#define __NR_rt_sigreturn 15
#define __NR_exit_group 231
#define __NR_exit 60
#else
#error "The trace capabilities gadget is not supported on your architecture."
#endif
   230  
   231  static __always_inline int skip_exit_probe(int nr)
   232  {
   233  	return !!(nr == __NR_exit || nr == __NR_exit_group ||
   234  		  nr == __NR_rt_sigreturn);
   235  }
   236  
   237  SEC("raw_tracepoint/sys_enter")
   238  int ig_cap_sys_enter(struct bpf_raw_tracepoint_args *ctx)
   239  {
   240  	u64 pid_tgid = bpf_get_current_pid_tgid();
   241  	struct pt_regs regs = {};
   242  	struct syscall_context sc_ctx = {};
   243  
   244  	u64 mntns_id = gadget_get_mntns_id();
   245  
   246  	if (gadget_should_discard_mntns_id(mntns_id))
   247  		return 0;
   248  
   249  	u64 nr = ctx->args[1];
   250  	sc_ctx.nr = nr;
   251  
   252  	// The sys_exit tracepoint is not called for some syscalls.
   253  	if (!skip_exit_probe(nr))
   254  		bpf_map_update_elem(&current_syscall, &pid_tgid, &sc_ctx,
   255  				    BPF_ANY);
   256  
   257  	return 0;
   258  }
   259  
   260  SEC("raw_tracepoint/sys_exit")
   261  int ig_cap_sys_exit(struct bpf_raw_tracepoint_args *ctx)
   262  {
   263  	u64 pid_tgid = bpf_get_current_pid_tgid();
   264  	bpf_map_delete_elem(&current_syscall, &pid_tgid);
   265  	return 0;
   266  }
   267  
   268  char LICENSE[] SEC("license") = "GPL";