github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/gadgets/trace/dns/tracer/bpf/dns.c (about)

     1  // SPDX-License-Identifier: GPL-2.0
     2  /* Copyright (c) 2024 The Inspektor Gadget authors */
     3  
     4  #include <linux/bpf.h>
     5  #include <linux/if_ether.h>
     6  #include <linux/ip.h>
     7  #include <linux/in.h>
     8  #include <linux/udp.h>
     9  #include <sys/socket.h>
    10  #include <stdbool.h>
    11  
    12  #include <bpf/bpf_helpers.h>
    13  #include <bpf/bpf_endian.h>
    14  
    15  #define GADGET_TYPE_NETWORKING
    16  #include <gadget/sockets-map.h>
    17  
    18  #include "dns-common.h"
    19  
    20  #ifndef PACKET_HOST
    21  #define PACKET_HOST 0x0
    22  #endif
    23  
    24  #ifndef PACKET_OUTGOING
    25  #define PACKET_OUTGOING 0x4
    26  #endif
    27  
    28  #define DNS_QR_QUERY 0
    29  #define DNS_QR_RESP 1
    30  
    31  #define MAX_PORTS 16
    32  const volatile __u16 ports[MAX_PORTS] = { 53, 5353 };
    33  const volatile __u16 ports_len = 2;
    34  
    35  static __always_inline bool is_dns_port(__u16 port)
    36  {
    37  	for (int i = 0; i < ports_len; i++) {
    38  		if (ports[i] == port)
    39  			return true;
    40  	}
    41  	return false;
    42  }
    43  
    44  // we need this to make sure the compiler doesn't remove our struct
    45  const struct event_t *unusedevent __attribute__((unused));
    46  
// Perf event array used to stream DNS events (struct event_t plus the raw
// packet bytes) to userspace; written by bpf_perf_event_output() below.
// key/value sizes are the ones required for BPF_MAP_TYPE_PERF_EVENT_ARRAY.
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} events SEC(".maps");
    52  
    53  // https://datatracker.ietf.org/doc/html/rfc1035#section-4.1.1
    54  union dnsflags {
    55  	struct {
    56  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    57  		__u8 rcode : 4; // response code
    58  		__u8 z : 3; // reserved
    59  		__u8 ra : 1; // recursion available
    60  		__u8 rd : 1; // recursion desired
    61  		__u8 tc : 1; // truncation
    62  		__u8 aa : 1; // authoritive answer
    63  		__u8 opcode : 4; // kind of query
    64  		__u8 qr : 1; // 0=query; 1=response
    65  #elif __BYTE_ORDER == __ORDER_BIG_ENDIAN__
    66  		__u8 qr : 1; // 0=query; 1=response
    67  		__u8 opcode : 4; // kind of query
    68  		__u8 aa : 1; // authoritive answer
    69  		__u8 tc : 1; // truncation
    70  		__u8 rd : 1; // recursion desired
    71  		__u8 ra : 1; // recursion available
    72  		__u8 z : 3; // reserved
    73  		__u8 rcode : 4; // response code
    74  #else
    75  #error "Fix your compiler's __BYTE_ORDER__?!"
    76  #endif
    77  	};
    78  	__u16 flags;
    79  };
    80  
// Fixed 12-byte DNS message header (RFC 1035 section 4.1.1). Only used for
// offsetof() below: fields are fetched individually with load_half(), so no
// endianness-converted copy of the header is ever stored in this struct.
struct dnshdr {
	__u16 id; // transaction id; echoed back by the server in the response

	union dnsflags flags;

	__u16 qdcount; // number of question entries
	__u16 ancount; // number of answer entries
	__u16 nscount; // number of authority records
	__u16 arcount; // number of additional records
};
    91  
    92  // Map of DNS query to timestamp so we can calculate latency from query sent to answer received.
// Key identifying an in-flight DNS query: the querying thread group plus the
// DNS transaction id, so a response can be matched back to its query.
struct query_key_t {
	__u64 pid_tgid; // from gadget_socket_lookup(); upper 32 bits = pid
	__u16 id; // DNS transaction id (network order as loaded by load_half)
	__u16 pad[3]; // this is needed, otherwise the verifier claims an invalid read from stack
};
    98  
// In-flight queries: query_key_t -> bpf_ktime_get_boot_ns() timestamp of when
// the query was sent. Entries are deleted when the matching response arrives
// (see the latency block in ig_trace_dns); userspace garbage-collects stale
// entries for queries that never receive a response.
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct query_key_t);
	__type(value, __u64); // timestamp of the query
	__uint(max_entries, 1024);
} query_map SEC(".maps");
   105  
   106  SEC("socket1")
   107  int ig_trace_dns(struct __sk_buff *skb)
   108  {
   109  	struct event_t event;
   110  	__u16 sport, dport, l4_off, dns_off, h_proto, id;
   111  	__u8 proto;
   112  	int i;
   113  
   114  	// Do a first pass only to extract the port and drop the packet if it's not DNS
   115  	h_proto = load_half(skb, offsetof(struct ethhdr, h_proto));
   116  	switch (h_proto) {
   117  	case ETH_P_IP:
   118  		proto = load_byte(skb,
   119  				  ETH_HLEN + offsetof(struct iphdr, protocol));
   120  		// An IPv4 header doesn't have a fixed size. The IHL field of a packet
   121  		// represents the size of the IP header in 32-bit words, so we need to
   122  		// multiply this value by 4 to get the header size in bytes.
   123  		__u8 ihl_byte = load_byte(skb, ETH_HLEN);
   124  		struct iphdr *iph = (struct iphdr *)&ihl_byte;
   125  		__u8 ip_header_len = iph->ihl * 4;
   126  		l4_off = ETH_HLEN + ip_header_len;
   127  		break;
   128  
   129  	case ETH_P_IPV6:
   130  		proto = load_byte(skb,
   131  				  ETH_HLEN + offsetof(struct ipv6hdr, nexthdr));
   132  		l4_off = ETH_HLEN + sizeof(struct ipv6hdr);
   133  
   134  // Parse IPv6 extension headers
   135  // Up to 6 extension headers can be chained. See ipv6_ext_hdr().
   136  #pragma unroll
   137  		for (i = 0; i < 6; i++) {
   138  			__u8 nextproto;
   139  
   140  			// TCP or UDP found
   141  			if (proto == NEXTHDR_TCP || proto == NEXTHDR_UDP)
   142  				break;
   143  
   144  			nextproto = load_byte(skb, l4_off);
   145  
   146  			// Unfortunately, each extension header has a different way to calculate the header length.
   147  			// Support the ones defined in ipv6_ext_hdr(). See ipv6_skip_exthdr().
   148  			switch (proto) {
   149  			case NEXTHDR_FRAGMENT:
   150  				// No hdrlen in the fragment header
   151  				l4_off += 8;
   152  				break;
   153  			case NEXTHDR_AUTH:
   154  				// See ipv6_authlen()
   155  				l4_off += 4 * (load_byte(skb, l4_off + 1) + 2);
   156  				break;
   157  			case NEXTHDR_HOP:
   158  			case NEXTHDR_ROUTING:
   159  			case NEXTHDR_DEST:
   160  				// See ipv6_optlen()
   161  				l4_off += 8 * (load_byte(skb, l4_off + 1) + 1);
   162  				break;
   163  			case NEXTHDR_NONE:
   164  				// Nothing more in the packet. Not even TCP or UDP.
   165  				return 0;
   166  			default:
   167  				// Unknown header
   168  				return 0;
   169  			}
   170  			proto = nextproto;
   171  		}
   172  		break;
   173  
   174  	default:
   175  		return 0;
   176  	}
   177  
   178  	switch (proto) {
   179  	case IPPROTO_UDP:
   180  		sport = load_half(skb,
   181  				  l4_off + offsetof(struct udphdr, source));
   182  		dport = load_half(skb, l4_off + offsetof(struct udphdr, dest));
   183  		dns_off = l4_off + sizeof(struct udphdr);
   184  		break;
   185  	// TODO: support TCP
   186  	default:
   187  		return 0;
   188  	}
   189  
   190  	if (!is_dns_port(sport) && !is_dns_port(dport))
   191  		return 0;
   192  
   193  	// Initialize event here only after we know we're interested in this packet to avoid
   194  	// spending useless cycles.
   195  	__builtin_memset(&event, 0, sizeof(event));
   196  
   197  	event.netns = skb->cb[0]; // cb[0] initialized by dispatcher.bpf.c
   198  	event.timestamp = bpf_ktime_get_boot_ns();
   199  	event.proto = proto;
   200  	event.dns_off = dns_off;
   201  	event.pkt_type = skb->pkt_type;
   202  	event.sport = sport;
   203  	event.dport = dport;
   204  
   205  	// The packet is DNS: Do a second pass to extract all the information we need
   206  	switch (h_proto) {
   207  	case ETH_P_IP:
   208  		event.af = AF_INET;
   209  		event.daddr_v4 = load_word(
   210  			skb, ETH_HLEN + offsetof(struct iphdr, daddr));
   211  		event.saddr_v4 = load_word(
   212  			skb, ETH_HLEN + offsetof(struct iphdr, saddr));
   213  		// load_word converts from network to host endianness. Convert back to
   214  		// network endianness because inet_ntop() requires it.
   215  		event.daddr_v4 = bpf_htonl(event.daddr_v4);
   216  		event.saddr_v4 = bpf_htonl(event.saddr_v4);
   217  		break;
   218  	case ETH_P_IPV6:
   219  		event.af = AF_INET6;
   220  		if (bpf_skb_load_bytes(
   221  			    skb, ETH_HLEN + offsetof(struct ipv6hdr, saddr),
   222  			    &event.saddr_v6, sizeof(event.saddr_v6)))
   223  			return 0;
   224  		if (bpf_skb_load_bytes(
   225  			    skb, ETH_HLEN + offsetof(struct ipv6hdr, daddr),
   226  			    &event.daddr_v6, sizeof(event.daddr_v6)))
   227  			return 0;
   228  		break;
   229  	}
   230  
   231  	// Enrich event with process metadata
   232  	struct sockets_value *skb_val = gadget_socket_lookup(skb);
   233  	if (skb_val != NULL) {
   234  		event.mount_ns_id = skb_val->mntns;
   235  		event.pid = skb_val->pid_tgid >> 32;
   236  		event.tid = (__u32)skb_val->pid_tgid;
   237  		__builtin_memcpy(&event.task, skb_val->task,
   238  				 sizeof(event.task));
   239  		event.uid = (__u32)skb_val->uid_gid;
   240  		event.gid = (__u32)(skb_val->uid_gid >> 32);
   241  	}
   242  
   243  	// Calculate latency:
   244  	//
   245  	// Track the latency from when a query is sent from a container
   246  	// to when a response to the query is received by that same container.
   247  	//
   248  	// * On DNS query sent from a container namespace (qr == DNS_QR_QUERY and pkt_type == OUTGOING),
   249  	//   store the query timestamp in a map.
   250  	//
   251  	// * On DNS response received in the same container namespace (qr == DNS_QR_RESP and pkt_type == HOST)
   252  	//   retrieve/delete the query timestamp and set the latency field on the event.
   253  	//
   254  	// A garbage collection thread running in userspace periodically scans for keys with old timestamps
   255  	// to free space occupied by queries that never receive a response.
   256  	//
   257  	// Skip this if skb_val == NULL (gadget_socket_lookup did not set pid_tgid we use in the query key)
   258  	// or if event->timestamp == 0 (kernels before 5.8 don't support bpf_ktime_get_boot_ns, and the patched
   259  	// version IG injects always returns zero).
   260  	if (skb_val != NULL && event.timestamp > 0) {
   261  		union dnsflags flags;
   262  		flags.flags = load_half(skb, dns_off + offsetof(struct dnshdr,
   263  								flags));
   264  		id = load_half(skb, dns_off + offsetof(struct dnshdr, id));
   265  		__u8 qr = flags.qr;
   266  
   267  		struct query_key_t query_key = {
   268  			.pid_tgid = skb_val->pid_tgid,
   269  			.id = id,
   270  		};
   271  		if (qr == DNS_QR_QUERY && event.pkt_type == PACKET_OUTGOING) {
   272  			bpf_map_update_elem(&query_map, &query_key,
   273  					    &event.timestamp, BPF_NOEXIST);
   274  		} else if (flags.qr == DNS_QR_RESP &&
   275  			   event.pkt_type == PACKET_HOST) {
   276  			__u64 *query_ts =
   277  				bpf_map_lookup_elem(&query_map, &query_key);
   278  			if (query_ts != NULL) {
   279  				// query ts should always be less than the event ts, but check anyway to be safe.
   280  				if (*query_ts < event.timestamp) {
   281  					event.latency_ns =
   282  						event.timestamp - *query_ts;
   283  				}
   284  				bpf_map_delete_elem(&query_map, &query_key);
   285  			}
   286  		}
   287  	}
   288  
   289  	__u64 skb_len = skb->len;
   290  	bpf_perf_event_output(skb, &events, skb_len << 32 | BPF_F_CURRENT_CPU,
   291  			      &event, sizeof(event));
   292  
   293  	return 0;
   294  }
   295  
   296  char _license[] SEC("license") = "GPL";