github.com/cilium/cilium@v1.16.2/bpf/lib/egress_gateway.h (about)

     1  /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
     2  /* Copyright Authors of Cilium */
     3  
     4  #pragma once
     5  
     6  #include "lib/fib.h"
     7  #include "lib/identity.h"
     8  #include "lib/overloadable.h"
     9  
    10  #include "encap.h"
    11  
    12  #ifdef ENABLE_EGRESS_GATEWAY_COMMON
    13  
/* EGRESS_STATIC_PREFIX represents the size in bits of the static prefix part of
 * an egress policy key (i.e. the source IP).
 */
#define EGRESS_STATIC_PREFIX (sizeof(__be32) * 8)
/* Total LPM prefix length of a policy key: the static part plus PREFIX more
 * bits of the remaining part of the key.
 */
#define EGRESS_PREFIX_LEN(PREFIX) (EGRESS_STATIC_PREFIX + (PREFIX))
/* Prefix length used for datapath lookups: the static part plus a full 32
 * bits, i.e. every bit of both addresses in the key is significant.
 */
#define EGRESS_IPV4_PREFIX EGRESS_PREFIX_LEN(32)

/* The values below are special IPs in the 0.0.0.0/8 range. They are never
 * treated as real addresses; each one selects a specific case in the egress
 * gateway policy handling.
 */

/* Special values in the policy_entry->gateway_ip: */
/* No gateway is available for the policy: matching requests are dropped. */
#define EGRESS_GATEWAY_NO_GATEWAY (0)
/* The destination is excluded from the policy: the traffic is left alone. */
#define EGRESS_GATEWAY_EXCLUDED_CIDR bpf_htonl(1)

/* Special values in the policy_entry->egress_ip: */
#define EGRESS_GATEWAY_NO_EGRESS_IP (0)
    31  
    32  static __always_inline
    33  int egress_gw_fib_lookup_and_redirect(struct __ctx_buff *ctx, __be32 egress_ip, __be32 daddr,
    34  				      __s8 *ext_err)
    35  {
    36  	struct bpf_fib_lookup_padded fib_params = {};
    37  	int oif = 0;
    38  
    39  	*ext_err = (__s8)fib_lookup_v4(ctx, &fib_params, egress_ip, daddr, 0);
    40  
    41  	switch (*ext_err) {
    42  	case BPF_FIB_LKUP_RET_SUCCESS:
    43  		break;
    44  	case BPF_FIB_LKUP_RET_NO_NEIGH:
    45  		/* Don't redirect if we can't update the L2 DMAC: */
    46  		if (!neigh_resolver_available())
    47  			return CTX_ACT_OK;
    48  
    49  		/* Don't redirect without a valid target ifindex: */
    50  		if (!is_defined(HAVE_FIB_IFINDEX))
    51  			return CTX_ACT_OK;
    52  		break;
    53  	default:
    54  		return DROP_NO_FIB;
    55  	}
    56  
    57  	/* Skip redirect in to-netdev if we stay on the same iface: */
    58  	if (is_defined(IS_BPF_HOST) && fib_params.l.ifindex == ctx_get_ifindex(ctx))
    59  		return CTX_ACT_OK;
    60  
    61  	return fib_do_redirect(ctx, true, &fib_params, false, ext_err, &oif);
    62  }
    63  
    64  #ifdef ENABLE_EGRESS_GATEWAY
/* Egress gateway policy map: a longest-prefix-match trie keyed by
 * struct egress_gw_policy_key and holding struct egress_gw_policy_entry
 * values. The map is pinned by name and sized by EGRESS_POLICY_MAP_SIZE.
 */
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct egress_gw_policy_key);
	__type(value, struct egress_gw_policy_entry);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, EGRESS_POLICY_MAP_SIZE);
	/* Entries are allocated on demand rather than preallocated. */
	__uint(map_flags, BPF_F_NO_PREALLOC);
} EGRESS_POLICY_MAP __section_maps_btf;
    73  
    74  static __always_inline
    75  struct egress_gw_policy_entry *lookup_ip4_egress_gw_policy(__be32 saddr, __be32 daddr)
    76  {
    77  	struct egress_gw_policy_key key = {
    78  		.lpm_key = { EGRESS_IPV4_PREFIX, {} },
    79  		.saddr = saddr,
    80  		.daddr = daddr,
    81  	};
    82  	return map_lookup_elem(&EGRESS_POLICY_MAP, &key);
    83  }
    84  #endif /* ENABLE_EGRESS_GATEWAY */
    85  
    86  static __always_inline int
    87  egress_gw_request_needs_redirect(struct ipv4_ct_tuple *rtuple __maybe_unused,
    88  				 __be32 *gateway_ip __maybe_unused)
    89  {
    90  #if defined(ENABLE_EGRESS_GATEWAY)
    91  	struct egress_gw_policy_entry *egress_gw_policy;
    92  
    93  	egress_gw_policy = lookup_ip4_egress_gw_policy(ipv4_ct_reverse_tuple_saddr(rtuple),
    94  						       ipv4_ct_reverse_tuple_daddr(rtuple));
    95  	if (!egress_gw_policy)
    96  		return CTX_ACT_OK;
    97  
    98  	switch (egress_gw_policy->gateway_ip) {
    99  	case EGRESS_GATEWAY_NO_GATEWAY:
   100  		/* If no gateway is found, drop the packet. */
   101  		return DROP_NO_EGRESS_GATEWAY;
   102  	case EGRESS_GATEWAY_EXCLUDED_CIDR:
   103  		return CTX_ACT_OK;
   104  	}
   105  
   106  	*gateway_ip = egress_gw_policy->gateway_ip;
   107  	return CTX_ACT_REDIRECT;
   108  #else
   109  	return CTX_ACT_OK;
   110  #endif /* ENABLE_EGRESS_GATEWAY */
   111  }
   112  
   113  static __always_inline
   114  bool egress_gw_snat_needed(__be32 saddr __maybe_unused,
   115  			   __be32 daddr __maybe_unused,
   116  			   __be32 *snat_addr __maybe_unused)
   117  {
   118  #if defined(ENABLE_EGRESS_GATEWAY)
   119  	struct egress_gw_policy_entry *egress_gw_policy;
   120  
   121  	egress_gw_policy = lookup_ip4_egress_gw_policy(saddr, daddr);
   122  	if (!egress_gw_policy)
   123  		return false;
   124  
   125  	if (egress_gw_policy->gateway_ip == EGRESS_GATEWAY_NO_GATEWAY ||
   126  	    egress_gw_policy->gateway_ip == EGRESS_GATEWAY_EXCLUDED_CIDR)
   127  		return false;
   128  
   129  	*snat_addr = egress_gw_policy->egress_ip;
   130  	return true;
   131  #else
   132  	return false;
   133  #endif /* ENABLE_EGRESS_GATEWAY */
   134  }
   135  
   136  static __always_inline
   137  bool egress_gw_reply_matches_policy(struct iphdr *ip4 __maybe_unused)
   138  {
   139  #if defined(ENABLE_EGRESS_GATEWAY)
   140  	struct egress_gw_policy_entry *egress_policy;
   141  
   142  	/* Find a matching policy by looking up the reverse address tuple: */
   143  	egress_policy = lookup_ip4_egress_gw_policy(ip4->daddr, ip4->saddr);
   144  	if (!egress_policy)
   145  		return false;
   146  
   147  	if (egress_policy->gateway_ip == EGRESS_GATEWAY_NO_GATEWAY ||
   148  	    egress_policy->gateway_ip == EGRESS_GATEWAY_EXCLUDED_CIDR)
   149  		return false;
   150  
   151  	return true;
   152  #else
   153  	return false;
   154  #endif /* ENABLE_EGRESS_GATEWAY */
   155  }
   156  
   157  /** Match a packet against EGW policy map, and return the gateway's IP.
   158   * @arg rtuple		CT tuple for the packet
   159   * @arg ct_status	CT result, to identify egressing connections
   160   * @arg gateway_ip	returns the gateway node's IP
   161   *
   162   * Returns
   163   * * CTX_ACT_REDIRECT if a matching policy entry was found,
   164   * * CTX_ACT_OK if no EGW logic should be applied,
   165   * * DROP_* for error conditions.
   166   */
   167  static __always_inline int
   168  egress_gw_request_needs_redirect_hook(struct ipv4_ct_tuple *rtuple,
   169  				      enum ct_status ct_status,
   170  				      __be32 *gateway_ip)
   171  {
   172  #if defined(IS_BPF_LXC)
   173  	/* If the packet is a reply or is related, it means that outside
   174  	 * has initiated the connection, and so we should skip egress
   175  	 * gateway, since an egress policy is only matching connections
   176  	 * originating from a pod.
   177  	 */
   178  	if (ct_status == CT_REPLY || ct_status == CT_RELATED)
   179  		return CTX_ACT_OK;
   180  #else
   181  	/* We lookup CT in forward direction at to-netdev and expect to
   182  	 * get CT_ESTABLISHED for outbound connection as
   183  	 * from_container should have already created a CT entry.
   184  	 * If we get CT_NEW here, it's an indication that it's a reply
   185  	 * for inbound connection or host-level outbound connection.
   186  	 * We don't expect to receive any other ct_status here.
   187  	 */
   188  	if (ct_status != CT_ESTABLISHED)
   189  		return CTX_ACT_OK;
   190  #endif
   191  
   192  	return egress_gw_request_needs_redirect(rtuple, gateway_ip);
   193  }
   194  
   195  static __always_inline
   196  bool egress_gw_snat_needed_hook(__be32 saddr, __be32 daddr, __be32 *snat_addr)
   197  {
   198  	struct remote_endpoint_info *remote_ep;
   199  
   200  	remote_ep = lookup_ip4_remote_endpoint(daddr, 0);
   201  	/* If the packet is destined to an entity inside the cluster, either EP
   202  	 * or node, skip SNAT since only traffic leaving the cluster is supposed
   203  	 * to be masqueraded with an egress IP.
   204  	 */
   205  	if (remote_ep &&
   206  	    identity_is_cluster(remote_ep->sec_identity))
   207  		return false;
   208  
   209  	return egress_gw_snat_needed(saddr, daddr, snat_addr);
   210  }
   211  
   212  static __always_inline
   213  bool egress_gw_reply_needs_redirect_hook(struct iphdr *ip4, __u32 *tunnel_endpoint,
   214  					 __u32 *dst_sec_identity)
   215  {
   216  	if (egress_gw_reply_matches_policy(ip4)) {
   217  		struct remote_endpoint_info *info;
   218  
   219  		info = lookup_ip4_remote_endpoint(ip4->daddr, 0);
   220  		if (!info || info->tunnel_endpoint == 0)
   221  			return false;
   222  
   223  		*tunnel_endpoint = info->tunnel_endpoint;
   224  		*dst_sec_identity = info->sec_identity;
   225  
   226  		return true;
   227  	}
   228  
   229  	return false;
   230  }
   231  
   232  static __always_inline
   233  int egress_gw_handle_packet(struct __ctx_buff *ctx,
   234  			    struct ipv4_ct_tuple *tuple,
   235  			    enum ct_status ct_status,
   236  			    __u32 src_sec_identity, __u32 dst_sec_identity,
   237  			    const struct trace_ctx *trace)
   238  {
   239  	struct endpoint_info *gateway_node_ep;
   240  	__be32 gateway_ip = 0;
   241  	int ret;
   242  
   243  	/* If the packet is destined to an entity inside the cluster,
   244  	 * either EP or node, it should not be forwarded to an egress
   245  	 * gateway since only traffic leaving the cluster is supposed to
   246  	 * be masqueraded with an egress IP.
   247  	 */
   248  	if (identity_is_cluster(dst_sec_identity))
   249  		return CTX_ACT_OK;
   250  
   251  	ret = egress_gw_request_needs_redirect_hook(tuple, ct_status, &gateway_ip);
   252  	if (IS_ERR(ret))
   253  		return ret;
   254  
   255  	if (ret == CTX_ACT_OK)
   256  		return ret;
   257  
   258  	/* If the gateway node is the local node, then just let the
   259  	 * packet go through, as it will be SNATed later on by
   260  	 * handle_nat_fwd().
   261  	 */
   262  	gateway_node_ep = __lookup_ip4_endpoint(gateway_ip);
   263  	if (gateway_node_ep && (gateway_node_ep->flags & ENDPOINT_F_HOST))
   264  		return CTX_ACT_OK;
   265  
   266  	/* Send the packet to egress gateway node through a tunnel. */
   267  	return __encap_and_redirect_with_nodeid(ctx, 0, gateway_ip,
   268  						src_sec_identity, dst_sec_identity,
   269  						NOT_VTEP_DST, trace);
   270  }
   271  
   272  #endif /* ENABLE_EGRESS_GATEWAY_COMMON */