github.com/cilium/cilium@v1.16.2/bpf/lib/mcast.h

/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright Authors of Cilium */

#pragma once

#include <bpf/api.h>
#include <linux/ip.h>
#include <linux/igmp.h>

#include "bpf/ctx/skb.h"
#include "bpf/helpers.h"
#include "bpf/helpers_skb.h"
#include "lib/common.h"
#include "lib/drop.h"
#include "lib/eth.h"
#include "linux/bpf.h"
#include "lib/encap.h"

/* the structures below are defined outside of an IFDEF guard to satisfy the
 * enterprise_bpf_alignchecker.c requirement
 */

/* mcast_subscriber flags */
enum {
	/* indicates subscriber is remote and ifindex is the exit interface */
	MCAST_SUB_F_REMOTE = (1U << 0)
};

/* 32bit big endian multicast group address for use with ipv4 protocol */
typedef __be32 mcast_group_v4;

/* structure to describe a local or remote subscriber of a multicast group
 * for the ipv4 protocol.
 */
struct mcast_subscriber_v4 {
	/* source address of the subscriber, big endian */
	__be32 saddr;
	/* local ifindex of subscriber, or exit interface if subscriber is remote */
	__u32 ifindex;
	/* reserved */
	__u16 pad1;
	/* reserved */
	__u8  pad2;
	/* flags for further subscriber description */
	__u8  flags;
};

#ifdef ENABLE_MULTICAST

#define MCAST_MAX_GROUP 1024
#define MCAST_MAX_SUBSCRIBERS 1024
/* used to bound iteration of group records within an igmpv3 membership report */
#define MCAST_MAX_GREC 24

/* The multicast group map is a nested hash of maps.
 * The outer map is keyed by a 'mcast_group_v4' multicast group address.
 * The inner value is a hash map of 'mcast_subscriber_v4' structures keyed
 * by their IPv4 source address in big endian format.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	__type(key, mcast_group_v4);
	__type(value, __u32);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, MCAST_MAX_GROUP);
	__uint(map_flags, CONDITIONAL_PREALLOC);
	/* Multicast group subscribers inner map definition */
	__array(values, struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(key_size, sizeof(__be32));
		__uint(value_size, sizeof(struct mcast_subscriber_v4));
		__uint(max_entries, MCAST_MAX_SUBSCRIBERS);
		__uint(map_flags, CONDITIONAL_PREALLOC);
	});
} cilium_mcast_group_outer_v4_map __section_maps_btf;
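
/* Illustrative sketch of the nested layout (addresses are examples only, not
 * normative):
 *
 *   cilium_mcast_group_outer_v4_map
 *     key: 239.10.0.5 (mcast_group_v4)  -> value: inner BPF_MAP_TYPE_HASH
 *                                            key:   10.0.1.2 (saddr, __be32)
 *                                            value: struct mcast_subscriber_v4
 *
 * The datapath only looks up groups and adds/removes subscribers; the
 * per-group inner maps are expected to be provisioned by userspace, which is
 * why the handlers below treat a missing group as "not configured".
 */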

/* lookup a subscriber map for the given ipv4 multicast group
 * returns a void pointer to an inner subscriber map if one exists
 */
static __always_inline void *mcast_lookup_subscriber_map(__be32 *group)
{
	return map_lookup_elem(&cilium_mcast_group_outer_v4_map, group);
}

/* returns 1 if the ip4 header is followed by an IGMP payload, 0 if not */
static __always_inline bool mcast_ipv4_is_igmp(const struct iphdr *ip4)
{
	if (ip4->protocol == IPPROTO_IGMP)
		return 1;
	return 0;
}

/* returns the IGMP type for a given IGMP message.
 * a call to 'mcast_ipv4_is_igmp' must be made prior to this call to ensure an
 * igmp message follows the ipv4 header
 */
static __always_inline __s32 mcast_ipv4_igmp_type(const struct iphdr *ip4,
						  const void *data,
						  const void *data_end)
{
	const struct igmphdr *hdr;
	int ip_len = ip4->ihl * 4;

	if (data + ETH_HLEN + ip_len + sizeof(struct igmphdr) > data_end)
		return DROP_INVALID;

	hdr = data + ETH_HLEN + ip_len;
	return hdr->type;
}

/* add a subscriber to a subscriber map.
 * returns 1 on success or DROP_INVALID on error
 */
static __always_inline __s32 mcast_ipv4_add_subscriber(void *map,
						       struct mcast_subscriber_v4 *sub)
{
	if (map_update_elem(map, &sub->saddr, sub, BPF_ANY) != 0)
		return DROP_INVALID;
	return 1;
}

/* remove a subscriber from a subscriber map */
static __always_inline void mcast_ipv4_remove_subscriber(void *map,
							 struct mcast_subscriber_v4 *sub)
{
	map_delete_elem(map, &sub->saddr);
}

static __always_inline __s32 mcast_ipv4_handle_v3_membership_report(void *ctx,
								    void *group_map,
								    const struct iphdr *ip4,
								    const void *data,
								    const void *data_end)
{
	struct mcast_subscriber_v4 subscriber = {
		.saddr = ip4->saddr,
		.ifindex = ctx_get_ingress_ifindex(ctx)
	};
	const struct igmpv3_report *rep;
	const struct igmpv3_grec *rec;
	int ip_len = ip4->ihl * 4;
	__s32 subscribed = 0;
	void *sub_map = 0;
	__u16 ngrec = 0;
	__u32 i = 0;

	if (data + ETH_HLEN + ip_len + sizeof(struct igmpv3_report) > data_end)
		return DROP_INVALID;

	rep = data + ETH_HLEN + ip_len;

	ngrec = bpf_ntohs(rep->ngrec);

	if (ngrec > MCAST_MAX_GREC)
		return DROP_INVALID;

	/* start a bounded loop which exits when we hit the total number of
	 * group records in the membership report.
	 *
	 * add our subscriber into each group advertised in the report.
	 */
#pragma unroll
	for (i = 0; i < MCAST_MAX_GREC; i++) {
		/* Wrap this in an if, instead of breaking out of the loop,
		 * so unroll has a constant number of iterations.
		 *
		 * Compiler was not happy with a continue; statement and the
		 * wrap is necessary.
		 *
		 * remove this when Cilium's min supported kernel version is
		 * >= 5.3 with support for bounded loops.
		 */
		if (i < ngrec) {
			rec = &rep->grec[i];

			/* verifier seems to only be happy with a packet bounds check
			 * per iteration
			 */
			if ((void *)rec + sizeof(struct igmpv3_grec) > data_end)
				return DROP_INVALID;

			/* lookup user configured multicast group */
			sub_map = map_lookup_elem(group_map, &rec->grec_mca);
			if (!sub_map)
				continue;

			/* note:
			 * the datapath currently assumes that no source addresses are
			 * present in the exclude message, indicating a 'join from all
			 * sources' message
			 */
			if (rec->grec_type == IGMPV3_CHANGE_TO_EXCLUDE) {
				subscribed = mcast_ipv4_add_subscriber(sub_map, &subscriber);
				if (subscribed != 1)
					return DROP_INVALID;
				continue;
			}

			/* note:
			 * the datapath currently assumes that no source addresses are
			 * present in the include message, indicating a 'leave from all
			 * sources' message
			 */
			if (rec->grec_type == IGMPV3_CHANGE_TO_INCLUDE)
				mcast_ipv4_remove_subscriber(sub_map, &subscriber);
		}
	}
	if (subscribed)
		return DROP_IGMP_SUBSCRIBED;

	return DROP_IGMP_HANDLED;
}
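
/* For reference, the group-record semantics assumed above follow RFC 3376:
 * an IGMPv2-style "join" arrives as a record of type IGMPV3_CHANGE_TO_EXCLUDE
 * with an empty source list ("receive from all sources"), while a "leave"
 * arrives as IGMPV3_CHANGE_TO_INCLUDE with an empty source list ("receive
 * from no sources"). Source-filtered reports are not handled here.
 */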

static __always_inline __s32 mcast_ipv4_handle_v2_membership_report(void *ctx,
								    void *group_map,
								    const struct iphdr *ip4,
								    const void *data,
								    const void *data_end)
{
	struct mcast_subscriber_v4 subscriber = {
		.saddr = ip4->saddr,
		.ifindex = ctx_get_ingress_ifindex(ctx)
	};
	int ip_len = ip4->ihl * 4;
	const struct igmphdr *hdr;
	void *sub_map = 0;

	if (data + ETH_HLEN + ip_len + sizeof(struct igmphdr) > data_end)
		return DROP_INVALID;

	hdr = data + ETH_HLEN + ip_len;

	if (hdr->type != IGMPV2_HOST_MEMBERSHIP_REPORT)
		return DROP_INVALID;

	/* lookup user configured multicast group */
	sub_map = map_lookup_elem(group_map, &hdr->group);
	if (!sub_map)
		return DROP_IGMP_HANDLED;

	if (mcast_ipv4_add_subscriber(sub_map, &subscriber) == 1)
		return DROP_IGMP_SUBSCRIBED;

	return DROP_IGMP_HANDLED;
}

static __always_inline __s32 mcast_ipv4_handle_igmp_leave(void *group_map,
							  const struct iphdr *ip4,
							  const void *data,
							  const void *data_end)
{
	struct mcast_subscriber_v4 subscriber = {
		.saddr = ip4->saddr,
	};
	int ip_len = ip4->ihl * 4;
	const struct igmphdr *hdr;
	void *sub_map = 0;

	if (data + ETH_HLEN + ip_len + sizeof(struct igmphdr) > data_end)
		return DROP_INVALID;

	hdr = data + ETH_HLEN + ip_len;

	if (hdr->type != IGMP_HOST_LEAVE_MESSAGE)
		return DROP_INVALID;

	/* lookup user configured multicast group */
	sub_map = map_lookup_elem(group_map, &hdr->group);
	if (!sub_map)
		return DROP_IGMP_HANDLED;

	mcast_ipv4_remove_subscriber(sub_map, &subscriber);

	return DROP_IGMP_HANDLED;
}

/* ipv4 igmp handler which dispatches to specific igmp message handlers */
static __always_inline __s32 mcast_ipv4_handle_igmp(void *ctx,
						    struct iphdr *ip4,
						    void *data,
						    void *data_end)
{
	__s32 igmp_type = mcast_ipv4_igmp_type(ip4, data, data_end);

	if (igmp_type < 0)
		return igmp_type;

	switch (igmp_type) {
	case IGMPV3_HOST_MEMBERSHIP_REPORT:
		return mcast_ipv4_handle_v3_membership_report(ctx,
							      &cilium_mcast_group_outer_v4_map,
							      ip4,
							      data,
							      data_end);
	case IGMPV2_HOST_MEMBERSHIP_REPORT:
		return mcast_ipv4_handle_v2_membership_report(ctx,
							      &cilium_mcast_group_outer_v4_map,
							      ip4,
							      data,
							      data_end);
	case IGMP_HOST_LEAVE_MESSAGE:
		return mcast_ipv4_handle_igmp_leave(&cilium_mcast_group_outer_v4_map,
						    ip4,
						    data,
						    data_end);
	}

	return DROP_IGMP_HANDLED;
}
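
/* Hypothetical caller sketch (illustrative only, not part of this header):
 * a caller would typically gate on the packet carrying IGMP before reading
 * the type and dispatching, consistent with the 'mcast_ipv4_is_igmp'
 * precondition documented above, roughly:
 *
 *	if (mcast_ipv4_is_igmp(ip4))
 *		return mcast_ipv4_handle_igmp(ctx, ip4, data, data_end);
 */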

/* encodes a multicast mac address given an ipv4 group address
 * results are in big endian format and written directly into 'mac'
 */
static __always_inline void mcast_encode_ipv4_mac(union macaddr *mac,
						  const __u8 group[4])
{
	mac->addr[0] = 0x01;
	mac->addr[1] = 0x00;
	mac->addr[2] = 0x5E;
	mac->addr[3] = group[1] & 0x7F;
	mac->addr[4] = group[2];
	mac->addr[5] = group[3];
}
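
/* Worked example: group 239.1.2.3 (0xef010203) maps to the multicast MAC
 * 01:00:5e:01:02:03 -- the fixed 01:00:5e prefix followed by the low 23 bits
 * of the group address, since group[1] is masked with 0x7f.
 */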

/* callback data used for __mcast_ep_delivery */
struct _mcast_ep_delivery_ctx {
	void *ctx;
	__s32 ret;
};

/* performs packet replication and delivery for multicast traffic egressing
 * an endpoint.
 *
 * to be used as a callback function for bpf_for_each_map_elem
 *
 * callback functions must return 1 or 0 to pass the eBPF verifier: returning
 * 0 continues iterating over the map, while returning 1 stops the iteration
 * (used here to bail out early on errors).
 */
static long __mcast_ep_delivery(__maybe_unused void *sub_map,
				__maybe_unused const __u32 *key,
				const struct mcast_subscriber_v4 *sub,
				struct _mcast_ep_delivery_ctx *cb_ctx)
{
	int ret = 0;
	__u32 tunnel_id = WORLD_ID;
	__u8 from_overlay = 0;
	struct bpf_tunnel_key tun_key = {0};

	if (!cb_ctx || !sub)
		return 1;

	if (!cb_ctx->ctx)
		return 1;

	if (!sub->ifindex)
		return 1;

	from_overlay = (ctx_get_ingress_ifindex(cb_ctx->ctx) == ENCAP_IFINDEX);

	/* set tunnel key for remote delivery.
	 * this helper sets the tunnel metadata on the sk_buff, but only
	 * tunnel drivers will read it, therefore any local delivery will
	 * simply ignore it if present and deliver without an issue.
	 *
	 * if the ingress interface is set to our tunnel interface, do not
	 * perform remote delivery; this would cause a loop, since the sender's
	 * node already delivered to all remote nodes.
	 *
	 * checking ctx->ingress_ifindex is reliable since
	 * __netif_receive_skb_core sets the skb's input interface before
	 * calling ingress TC programs.
	 */
	if (sub->flags & MCAST_SUB_F_REMOTE) {
		if (from_overlay)
			return 0;

#ifdef ENABLE_ENCRYPTED_OVERLAY
		/* if encrypted overlay is enabled we'll mark the packet for
		 * encryption via the tunnel ID.
		 */
		tunnel_id = ENCRYPTED_OVERLAY_ID;
#endif /* ENABLE_ENCRYPTED_OVERLAY */
		tun_key.tunnel_id = tunnel_id;
		tun_key.remote_ipv4 = bpf_ntohl(sub->saddr);
		tun_key.tunnel_ttl = IPDEFTTL;

		ret = ctx_set_tunnel_key(cb_ctx->ctx,
					 &tun_key,
					 TUNNEL_KEY_WITHOUT_SRC_IP,
					 BPF_F_ZERO_CSUM_TX);

		if (ret < 0) {
			cb_ctx->ret = ret;
			return 1;
		}
	}

	ret = clone_redirect(cb_ctx->ctx, sub->ifindex, 0);
	if (ret != 0) {
		cb_ctx->ret = ret;
		return 1;
	}
	return 0;
}

/* tailcall to perform multicast packet replication and delivery.
 * when this call is entered we should already know that the packet is destined
 * for a multicast group and the multicast group exists in
 * cilium_mcast_group_outer_v4_map
 */
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_MULTICAST_EP_DELIVERY)
int tail_mcast_ep_delivery(struct __ctx_buff *ctx)
{
	struct _mcast_ep_delivery_ctx cb_ctx = {
		.ctx = ctx,
		.ret = 0
	};
	union macaddr mac = {0};
	void *data, *data_end;
	struct iphdr *ip4 = 0;
	void *sub_map = 0;

	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	sub_map = map_lookup_elem(&cilium_mcast_group_outer_v4_map, &ip4->daddr);
	if (!sub_map)
		return DROP_INVALID;

	mcast_encode_ipv4_mac(&mac, (__u8 *)&ip4->daddr);

	eth_store_daddr(ctx, &mac.addr[0], 0);

	for_each_map_elem(sub_map, __mcast_ep_delivery, &cb_ctx, 0);

	return send_drop_notify(ctx,
				UNKNOWN_ID,
				UNKNOWN_ID,
				TRACE_EP_ID_UNKNOWN,
				DROP_MULTICAST_HANDLED,
				CTX_ACT_DROP,
				METRIC_INGRESS);
}
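
/* Note on the delivery model: __mcast_ep_delivery clone-redirects a copy of
 * the packet to each subscriber, so after iteration the original packet is no
 * longer needed and is consumed here via send_drop_notify with the
 * DROP_MULTICAST_HANDLED reason, i.e. an intentional drop rather than an
 * error.
 */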

#endif /* ENABLE_MULTICAST */