github.com/datadog/cilium@v1.6.12/bpf/lib/nodeport.h (about)

     1  /*
     2   *  Copyright (C) 2019 Authors of Cilium
     3   *
     4   *  This program is free software; you can redistribute it and/or modify
     5   *  it under the terms of the GNU General Public License as published by
     6   *  the Free Software Foundation; either version 2 of the License, or
     7   *  (at your option) any later version.
     8   *
     9   *  This program is distributed in the hope that it will be useful,
    10   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
    11   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    12   *  GNU General Public License for more details.
    13   *
    14   *  You should have received a copy of the GNU General Public License
    15   *  along with this program; if not, write to the Free Software
    16   *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    17   */
    18  
    19  #ifndef __NODEPORT_H_
    20  #define __NODEPORT_H_
    21  
    22  #include <bpf/api.h>
    23  
    24  #include "nat.h"
    25  #include "lb.h"
    26  #include "conntrack.h"
    27  #include "csum.h"
    28  #include "encap.h"
    29  
    30  #define CB_SRC_IDENTITY	0
    31  
    32  /* No nodeport on cilium_host interface. */
    33  #ifdef FROM_HOST
    34  # undef ENABLE_NODEPORT
    35  # undef ENABLE_MASQUERADE
    36  #endif
    37  
    38  #ifdef ENABLE_NODEPORT
    39  
    40  #ifdef ENABLE_IPV4
/* Neighbor cache for IPv4 node-port traffic: maps a peer's IPv4 address
 * to the last source MAC seen from it. Populated in nodeport_lb4() and
 * consulted by rev_nodeport_lb4() to rewrite L2 headers without a FIB
 * lookup. LRU-managed, pinned globally, sized like the SNAT table.
 */
struct bpf_elf_map __section_maps NODEPORT_NEIGH4 = {
	.type		= BPF_MAP_TYPE_LRU_HASH,
	.size_key	= sizeof(__be32),		// ipv4 addr
	.size_value	= sizeof(union macaddr),	// hw addr
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= SNAT_MAPPING_IPV4_SIZE,
};
    48  #endif /* ENABLE_IPV4 */
    49  
    50  #ifdef ENABLE_IPV6
/* IPv6 counterpart of NODEPORT_NEIGH4: peer IPv6 address -> last-seen
 * source MAC. Written in nodeport_lb6(), read in rev_nodeport_lb6().
 */
struct bpf_elf_map __section_maps NODEPORT_NEIGH6 = {
	.type		= BPF_MAP_TYPE_LRU_HASH,
	.size_key	= sizeof(union v6addr),		// ipv6 addr
	.size_value	= sizeof(union macaddr),	// hw addr
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= SNAT_MAPPING_IPV6_SIZE,
};
    58  #endif /* ENABLE_IPV6 */
    59  
    60  #endif /* ENABLE_NODEPORT */
    61  
    62  static inline void bpf_clear_nodeport(struct __sk_buff *skb)
    63  {
    64  #ifdef ENABLE_NODEPORT
    65  	skb->tc_index &= ~TC_INDEX_F_SKIP_NODEPORT;
    66  #endif
    67  }
    68  
    69  #ifdef ENABLE_NODEPORT
/* Test-and-clear helper: returns true when a prior tail call flagged the
 * packet (TC_INDEX_F_SKIP_NODEPORT) to bypass node-port handling, and
 * clears the flag so the request only applies once.
 * The volatile local forces the tc_index load to happen before the flag
 * is cleared on the next line.
 */
static inline bool __inline__ bpf_skip_nodeport(struct __sk_buff *skb)
{
	volatile __u32 tc_index = skb->tc_index;
	skb->tc_index &= ~TC_INDEX_F_SKIP_NODEPORT;
	return tc_index & TC_INDEX_F_SKIP_NODEPORT;
}
    76  #endif /* ENABLE_NODEPORT */
    77  
    78  #ifdef ENABLE_NODEPORT
    79  #ifdef ENABLE_IPV6
    80  static __always_inline bool nodeport_nat_ipv6_needed(struct __sk_buff *skb,
    81  						     union v6addr *addr, int dir)
    82  {
    83  	void *data, *data_end;
    84  	struct ipv6hdr *ip6;
    85  
    86  	if (!revalidate_data(skb, &data, &data_end, &ip6))
    87  		return false;
    88  	/* See nodeport_nat_ipv4_needed(). */
    89  	if (dir == NAT_DIR_EGRESS)
    90  		return !ipv6_addrcmp((union v6addr *)&ip6->saddr, addr);
    91  	else
    92  		return !ipv6_addrcmp((union v6addr *)&ip6->daddr, addr);
    93  	return false;
    94  }
    95  
/* Perform node-port SNAT on the skb in direction NDIR when the packet
 * matches ADDR (see nodeport_nat_ipv6_needed()); otherwise pass it
 * through. NAT_PUNT_TO_STACK is mapped to TC_ACT_OK so the kernel stack
 * handles the packet. NB: ADDR is evaluated more than once.
 */
#define NODEPORT_DO_NAT_IPV6(ADDR, NDIR)					\
	({									\
		struct ipv6_nat_target target = {				\
			.min_port = NODEPORT_PORT_MIN_NAT,			\
			.max_port = 65535,					\
		};								\
		ipv6_addr_copy(&target.addr, (ADDR));				\
		int ____ret = nodeport_nat_ipv6_needed(skb, (ADDR), (NDIR)) ?	\
			      snat_v6_process(skb, (NDIR), &target) : TC_ACT_OK;\
		if (____ret == NAT_PUNT_TO_STACK)				\
			____ret = TC_ACT_OK;					\
		____ret;							\
	})
   109  
/* Forward-path (egress) node-port SNAT for IPv6: translate the source
 * when it matches @addr. Returns a TC action or negative DROP_* code.
 */
static __always_inline int nodeport_nat_ipv6_fwd(struct __sk_buff *skb,
						 union v6addr *addr)
{
	return NODEPORT_DO_NAT_IPV6(addr, NAT_DIR_EGRESS);
}
   115  
/* Reverse-path (ingress) node-port SNAT for IPv6: undo the translation
 * when the destination matches @addr.
 */
static __always_inline int nodeport_nat_ipv6_rev(struct __sk_buff *skb,
						 union v6addr *addr)
{
	return NODEPORT_DO_NAT_IPV6(addr, NAT_DIR_INGRESS);
}
   121  
/* Tail-call program: apply IPv6 node-port SNAT (egress) or reverse SNAT
 * (ingress) and forward the packet. Direction comes from skb->cb[CB_NAT].
 * Egress packets leave either via the tunnel device (ENCAP_IFINDEX, when
 * the backend sits behind a tunnel endpoint) or via the native device
 * after a FIB lookup fills in the L2 addresses. Ingress packets that were
 * reverse-SNATed are handed to the LB rev-NAT tail call.
 */
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT)
int tail_nodeport_nat_ipv6(struct __sk_buff *skb)
{
	int ifindex = NATIVE_DEV_IFINDEX, ret, dir = skb->cb[CB_NAT];
	struct bpf_fib_lookup fib_params = {};
	struct ipv6_nat_target target = {
		.min_port = NODEPORT_PORT_MIN_NAT,
		.max_port = NODEPORT_PORT_MAX_NAT,
		.src_from_world = true,
	};
	void *data, *data_end;
	struct ipv6hdr *ip6;

	BPF_V6(target.addr, IPV6_NODEPORT);
#ifdef ENCAP_IFINDEX
	if (dir == NAT_DIR_EGRESS) {
		struct remote_endpoint_info *info;
		union v6addr *dst;

		if (!revalidate_data(skb, &data, &data_end, &ip6))
			return DROP_INVALID;

		dst = (union v6addr *)&ip6->daddr;
		info = ipcache_lookup6(&IPCACHE_MAP, dst, V6_CACHE_KEY_LEN);
		if (info != NULL && info->tunnel_endpoint != 0) {
			int ret = __encap_with_nodeid(skb, info->tunnel_endpoint,
						      SECLABEL, TRACE_PAYLOAD_LEN);
			if (ret)
				return ret;

			/* Remote backend behind a tunnel: SNAT to the router
			 * IP and send via the encap device instead.
			 */
			BPF_V6(target.addr, ROUTER_IP);
			ifindex = ENCAP_IFINDEX;

			/* fib lookup not necessary when going over tunnel.
			 * (fib_params is zero-initialized, so this writes
			 * all-zero MACs on purpose.)
			 */
			if (eth_store_daddr(skb, fib_params.dmac, 0) < 0)
				return DROP_WRITE_ERROR;
			if (eth_store_saddr(skb, fib_params.smac, 0) < 0)
				return DROP_WRITE_ERROR;
		}
	}
#endif
	ret = snat_v6_process(skb, dir, &target);
	if (IS_ERR(ret)) {
		/* In case of no mapping, recircle back to main path. SNAT is very
		 * expensive in terms of instructions (since we don't have BPF to
		 * BPF calls as we use tail calls) and complexity, hence this is
		 * done inside a tail call here.
		 */
		if (dir == NAT_DIR_INGRESS) {
			skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT;
			ep_tail_call(skb, CILIUM_CALL_IPV6_FROM_LXC);
			ret = DROP_MISSED_TAIL_CALL;
		}
		if (ret == NAT_PUNT_TO_STACK)
			ret = TC_ACT_OK;
		else
			goto drop_err;
	}

	skb->mark |= MARK_MAGIC_SNAT_DONE;
	if (dir == NAT_DIR_INGRESS) {
		/* Reverse SNAT done; continue with LB reverse NAT. */
		ep_tail_call(skb, CILIUM_CALL_IPV6_NODEPORT_REVNAT);
		ret = DROP_MISSED_TAIL_CALL;
		goto drop_err;
	}
#ifdef ENCAP_IFINDEX
	if (ifindex == ENCAP_IFINDEX)
		goto out_send;
#endif
	/* snat_v6_process() may have changed the packet; revalidate before
	 * reading the (possibly rewritten) addresses for the FIB lookup.
	 */
	if (!revalidate_data(skb, &data, &data_end, &ip6)) {
		ret = DROP_INVALID;
		goto drop_err;
	}

	fib_params.family = AF_INET6;
	fib_params.ifindex = ifindex;
	ipv6_addr_copy((union v6addr *) &fib_params.ipv6_src, (union v6addr *) &ip6->saddr);
	ipv6_addr_copy((union v6addr *) &fib_params.ipv6_dst, (union v6addr *) &ip6->daddr);

	ret = fib_lookup(skb, &fib_params, sizeof(fib_params),
			 BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT);
	if (ret != 0) {
		ret = DROP_NO_FIB;
		goto drop_err;
	}

	if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) {
		ret = DROP_WRITE_ERROR;
		goto drop_err;
	}
	if (eth_store_saddr(skb, fib_params.smac, 0) < 0) {
		ret = DROP_WRITE_ERROR;
		goto drop_err;
	}
	ifindex = fib_params.ifindex;
out_send:
	return redirect(ifindex, 0);
drop_err:
	return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT,
				      dir == NAT_DIR_INGRESS ?
				      METRIC_INGRESS : METRIC_EGRESS);
}
   224  
/* IPv6 node-port load-balancer entry. See nodeport_lb4() for the overall
 * flow: extract the service key, punt non-node-port ports to the NAT tail
 * call, translate to a backend via lb6_local(), create CT entries for new
 * flows, learn the sender's MAC into NODEPORT_NEIGH6, and either deliver
 * locally (TC_ACT_OK) or tail-call into egress SNAT for remote backends.
 */
static inline int nodeport_lb6(struct __sk_buff *skb, __u32 src_identity)
{
	int ret, l3_off = ETH_HLEN, l4_off, hdrlen;
	struct ipv6_ct_tuple tuple = {};
	void *data, *data_end;
	struct ipv6hdr *ip6;
	struct csum_offset csum_off = {};
	struct lb6_service_v2 *svc;
	struct lb6_key_v2 key = {};
	struct ct_state ct_state_new = {};
	struct ct_state ct_state = {};
	bool backend_local;
	__u32 monitor = 0;
	__u16 service_port;
	union macaddr smac;

	if (!revalidate_data(skb, &data, &data_end, &ip6))
		return DROP_INVALID;

	tuple.nexthdr = ip6->nexthdr;
	ipv6_addr_copy(&tuple.daddr, (union v6addr *) &ip6->daddr);
	ipv6_addr_copy(&tuple.saddr, (union v6addr *) &ip6->saddr);

	/* Skip extension headers to find the real L4 protocol/offset. */
	hdrlen = ipv6_hdrlen(skb, l3_off, &tuple.nexthdr);
	if (hdrlen < 0)
		return hdrlen;

	l4_off = l3_off + hdrlen;

	ret = lb6_extract_key_v2(skb, &tuple, l4_off, &key, &csum_off, CT_EGRESS);
	if (IS_ERR(ret)) {
		/* Unsupported L4 protocols pass through untouched. */
		if (ret == DROP_UNKNOWN_L4)
			return TC_ACT_OK;
		else
			return ret;
	}

	/* Ports outside the node-port range cannot be service traffic, but
	 * may be replies that need reverse SNAT: punt to the NAT tail call.
	 */
	service_port = bpf_ntohs(key.dport);
	if (service_port < NODEPORT_PORT_MIN ||
	    service_port > NODEPORT_PORT_MAX) {
		skb->cb[CB_NAT] = NAT_DIR_INGRESS;
		skb->cb[CB_SRC_IDENTITY] = src_identity;
		ep_tail_call(skb, CILIUM_CALL_IPV6_NODEPORT_NAT);
		return DROP_MISSED_TAIL_CALL;
	}

	ct_state_new.orig_dport = key.dport;

	/* DNAT to a selected backend; no matching service means plain
	 * host traffic on a node-port-range port.
	 */
	if ((svc = lb6_lookup_service_v2(skb, &key)) != NULL) {
		ret = lb6_local(get_ct_map6(&tuple), skb, l3_off, l4_off,
				&csum_off, &key, &tuple, svc, &ct_state_new);
		if (IS_ERR(ret))
			return ret;
	} else {
		return TC_ACT_OK;
	}

	ret = ct_lookup6(get_ct_map6(&tuple), &tuple, skb, l4_off, CT_EGRESS,
			 &ct_state, &monitor);
	if (ret < 0)
		return ret;
	if (!revalidate_data(skb, &data, &data_end, &ip6))
		return DROP_INVALID;

	backend_local = lookup_ip6_endpoint(ip6);

	switch (ret) {
	case CT_NEW:
		ct_state_new.src_sec_id = SECLABEL;
		ct_state_new.node_port = 1;
		ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_EGRESS,
				 &ct_state_new, false);
		if (IS_ERR(ret))
			return ret;
		if (backend_local) {
			/* Also create the ingress-direction entry so the
			 * local backend's reply is matched as CT_REPLY.
			 */
			ct_flip_tuple_dir6(&tuple);
			ct_state_new.rev_nat_index = 0;
			ret = ct_create6(get_ct_map6(&tuple), &tuple, skb,
					 CT_INGRESS, &ct_state_new, false);
			if (IS_ERR(ret))
				return ret;
		}
		break;

	case CT_ESTABLISHED:
	case CT_REPLY:
		break;

	default:
		return DROP_UNKNOWN_CT;
	}

	/* Learn the sender's MAC keyed by its IPv6 address so the reply
	 * path (rev_nodeport_lb6) can rewrite L2 without a FIB lookup.
	 */
	if (!revalidate_data(skb, &data, &data_end, &ip6))
		return DROP_INVALID;
	if (eth_load_saddr(skb, &smac.addr, 0) < 0)
		return DROP_INVALID;
	ret = map_update_elem(&NODEPORT_NEIGH6, &ip6->saddr, &smac, 0);
	if (ret < 0) {
		/* NOTE(review): returns the raw map error instead of a
		 * DROP_* code — confirm callers treat this as a drop.
		 */
		return ret;
	}

	if (!backend_local) {
		/* Remote backend: SNAT in the egress direction via tail call. */
		skb->cb[CB_NAT] = NAT_DIR_EGRESS;
		ep_tail_call(skb, CILIUM_CALL_IPV6_NODEPORT_NAT);
		return DROP_MISSED_TAIL_CALL;
	}

	return TC_ACT_OK;
}
   335  
/* Reverse node-port NAT for IPv6 replies; see the comment above
 * rev_nodeport_lb4(). On a CT_REPLY hit with node_port set, undoes the
 * LB translation, then resolves the L2 next hop via tunnel encap,
 * NODEPORT_NEIGH6, or a FIB lookup. Non-matching packets recirculate to
 * the main path (unless already recirculated). On success the caller
 * redirects to *ifindex.
 */
static inline int rev_nodeport_lb6(struct __sk_buff *skb, int *ifindex,
                                    union macaddr *mac)
{
	int ret, ret2, l3_off = ETH_HLEN, l4_off, hdrlen;
	struct ipv6_ct_tuple tuple = {};
	void *data, *data_end;
	struct ipv6hdr *ip6;
	struct csum_offset csum_off = {};
	struct ct_state ct_state = {};
	struct bpf_fib_lookup fib_params = {};
	union macaddr *dmac;
	__u32 monitor = 0;

	if (!revalidate_data(skb, &data, &data_end, &ip6))
		return DROP_INVALID;

	tuple.nexthdr = ip6->nexthdr;
	ipv6_addr_copy(&tuple.daddr, (union v6addr *) &ip6->daddr);
	ipv6_addr_copy(&tuple.saddr, (union v6addr *) &ip6->saddr);

	hdrlen = ipv6_hdrlen(skb, l3_off, &tuple.nexthdr);
	if (hdrlen < 0)
		return hdrlen;

	l4_off = l3_off + hdrlen;
	csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);

	ret = ct_lookup6(get_ct_map6(&tuple), &tuple, skb, l4_off, CT_INGRESS, &ct_state,
			 &monitor);

	if (ret == CT_REPLY && ct_state.node_port == 1 && ct_state.rev_nat_index != 0) {
		/* Reply from a node-port backend: restore the original
		 * service address/port.
		 */
		ret2 = lb6_rev_nat(skb, l4_off, &csum_off, ct_state.rev_nat_index,
				   &tuple, REV_NAT_F_TUPLE_SADDR);
		if (IS_ERR(ret2))
			return ret2;

		if (!revalidate_data(skb, &data, &data_end, &ip6))
			return DROP_INVALID;

		skb->mark |= MARK_MAGIC_SNAT_DONE;
#ifdef ENCAP_IFINDEX
		{
			union v6addr *dst = (union v6addr *)&ip6->daddr;
			struct remote_endpoint_info *info;

			info = ipcache_lookup6(&IPCACHE_MAP, dst, V6_CACHE_KEY_LEN);
			if (info != NULL && info->tunnel_endpoint != 0) {
				int ret = __encap_with_nodeid(skb, info->tunnel_endpoint,
							      SECLABEL, TRACE_PAYLOAD_LEN);
				if (ret)
					return ret;

				*ifindex = ENCAP_IFINDEX;

				/* fib lookup not necessary when going over tunnel.
				 * (fib_params is zeroed; zero MACs are intended.)
				 */
				if (eth_store_daddr(skb, fib_params.dmac, 0) < 0)
					return DROP_WRITE_ERROR;
				if (eth_store_saddr(skb, fib_params.smac, 0) < 0)
					return DROP_WRITE_ERROR;

				return TC_ACT_OK;
			}
		}
#endif

		/* Prefer the MAC learned in nodeport_lb6(); fall back to a
		 * FIB lookup when the neighbor entry has been evicted.
		 */
		dmac = map_lookup_elem(&NODEPORT_NEIGH6, &tuple.daddr);
		if (dmac) {
			if (eth_store_daddr(skb, &dmac->addr, 0) < 0)
				return DROP_WRITE_ERROR;
			if (eth_store_saddr(skb, &mac->addr, 0) < 0)
				return DROP_WRITE_ERROR;
		} else {
			fib_params.family = AF_INET6;
			fib_params.ifindex = *ifindex;

			ipv6_addr_copy((union v6addr *) &fib_params.ipv6_src, &tuple.saddr);
			ipv6_addr_copy((union v6addr *) &fib_params.ipv6_dst, &tuple.daddr);

			int rc = fib_lookup(skb, &fib_params, sizeof(fib_params),
					BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT);
			if (rc != 0)
				return DROP_NO_FIB;

			if (eth_store_daddr(skb, fib_params.dmac, 0) < 0)
				return DROP_WRITE_ERROR;
			if (eth_store_saddr(skb, fib_params.smac, 0) < 0)
				return DROP_WRITE_ERROR;
		}
	} else {
		/* Not a node-port reply: hand back to the main path once,
		 * flagging it to skip node-port handling next time.
		 */
		if (!(skb->tc_index & TC_INDEX_F_SKIP_RECIRCULATION)) {
			skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT;
			ep_tail_call(skb, CILIUM_CALL_IPV6_FROM_LXC);
			return DROP_MISSED_TAIL_CALL;
		}
	}

	return TC_ACT_OK;
}
   435  
/* Tail-call wrapper around rev_nodeport_lb6(): runs reverse NAT starting
 * from the native device/MAC, emits a drop notification on error, and
 * otherwise redirects to the interface chosen by rev_nodeport_lb6().
 */
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_REVNAT)
int tail_rev_nodeport_lb6(struct __sk_buff *skb)
{
	int ifindex = NATIVE_DEV_IFINDEX;
	union macaddr mac = NATIVE_DEV_MAC;
	int ret = 0;

	ret = rev_nodeport_lb6(skb, &ifindex, &mac);
	if (IS_ERR(ret))
		return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, METRIC_EGRESS);
	return redirect(ifindex, 0);
}
   448  #endif /* ENABLE_IPV6 */
   449  
   450  #ifdef ENABLE_IPV4
   451  static __always_inline bool nodeport_nat_ipv4_needed(struct __sk_buff *skb,
   452  						     __be32 addr, int dir)
   453  {
   454  	void *data, *data_end;
   455  	struct iphdr *ip4;
   456  
   457  	if (!revalidate_data(skb, &data, &data_end, &ip4))
   458  		return false;
   459  	/* Basic minimum is to only NAT when there is a potential of
   460  	 * overlapping tuples, e.g. applications in hostns reusing
   461  	 * source IPs we SNAT in node-port.
   462  	 */
   463  	if (dir == NAT_DIR_EGRESS)
   464  		return ip4->saddr == addr;
   465  	else
   466  		return ip4->daddr == addr;
   467  	return false;
   468  }
   469  
/* IPv4 twin of NODEPORT_DO_NAT_IPV6: SNAT the skb in direction NDIR when
 * it matches ADDR, otherwise pass through; NAT_PUNT_TO_STACK becomes
 * TC_ACT_OK. NB: ADDR is evaluated more than once.
 */
#define NODEPORT_DO_NAT_IPV4(ADDR, NDIR)					\
	({									\
		struct ipv4_nat_target target = {				\
			.min_port = NODEPORT_PORT_MIN_NAT,			\
			.max_port = 65535,					\
			.addr = (ADDR),						\
		};								\
		int ____ret = nodeport_nat_ipv4_needed(skb, (ADDR), (NDIR)) ?	\
			      snat_v4_process(skb, (NDIR), &target) : TC_ACT_OK;\
		if (____ret == NAT_PUNT_TO_STACK)				\
			____ret = TC_ACT_OK;					\
		____ret;							\
	})
   483  
/* Forward-path (egress) node-port SNAT for IPv4: translate the source
 * when it matches @addr. Returns a TC action or negative DROP_* code.
 */
static __always_inline int nodeport_nat_ipv4_fwd(struct __sk_buff *skb,
						 const __be32 addr)
{
	return NODEPORT_DO_NAT_IPV4(addr, NAT_DIR_EGRESS);
}
   489  
/* Reverse-path (ingress) node-port SNAT for IPv4: undo the translation
 * when the destination matches @addr.
 */
static __always_inline int nodeport_nat_ipv4_rev(struct __sk_buff *skb,
						 const __be32 addr)
{
	return NODEPORT_DO_NAT_IPV4(addr, NAT_DIR_INGRESS);
}
   495  
/* Tail-call program: apply IPv4 node-port SNAT (egress) or reverse SNAT
 * (ingress) and forward the packet; IPv4 twin of tail_nodeport_nat_ipv6().
 * Direction comes from skb->cb[CB_NAT]. Egress leaves via the tunnel
 * device (when the backend is behind a tunnel endpoint) or via the native
 * device after a FIB lookup; reverse-SNATed ingress packets continue in
 * the LB rev-NAT tail call.
 */
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT)
int tail_nodeport_nat_ipv4(struct __sk_buff *skb)
{
	int ifindex = NATIVE_DEV_IFINDEX, ret, dir = skb->cb[CB_NAT];
	struct bpf_fib_lookup fib_params = {};
	struct ipv4_nat_target target = {
		.min_port = NODEPORT_PORT_MIN_NAT,
		.max_port = NODEPORT_PORT_MAX_NAT,
		.src_from_world = true,
	};
	void *data, *data_end;
	struct iphdr *ip4;

	target.addr = IPV4_NODEPORT;
#ifdef ENCAP_IFINDEX
	if (dir == NAT_DIR_EGRESS) {
		struct remote_endpoint_info *info;

		if (!revalidate_data(skb, &data, &data_end, &ip4))
			return DROP_INVALID;

		info = ipcache_lookup4(&IPCACHE_MAP, ip4->daddr, V4_CACHE_KEY_LEN);
		if (info != NULL && info->tunnel_endpoint != 0) {
			int ret = __encap_with_nodeid(skb, info->tunnel_endpoint,
						      SECLABEL, TRACE_PAYLOAD_LEN);
			if (ret)
				return ret;

			/* Remote backend behind a tunnel: SNAT to the gateway
			 * IP and send via the encap device instead.
			 */
			target.addr = IPV4_GATEWAY;
			ifindex = ENCAP_IFINDEX;

			/* fib lookup not necessary when going over tunnel.
			 * (fib_params is zero-initialized, so this writes
			 * all-zero MACs on purpose.)
			 */
			if (eth_store_daddr(skb, fib_params.dmac, 0) < 0)
				return DROP_WRITE_ERROR;
			if (eth_store_saddr(skb, fib_params.smac, 0) < 0)
				return DROP_WRITE_ERROR;
		}
	}
#endif
	ret = snat_v4_process(skb, dir, &target);
	if (IS_ERR(ret)) {
		/* In case of no mapping, recircle back to main path. SNAT is very
		 * expensive in terms of instructions (since we don't have BPF to
		 * BPF calls as we use tail calls) and complexity, hence this is
		 * done inside a tail call here.
		 */
		if (dir == NAT_DIR_INGRESS) {
			skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT;
			ep_tail_call(skb, CILIUM_CALL_IPV4_FROM_LXC);
			ret = DROP_MISSED_TAIL_CALL;
		}
		if (ret == NAT_PUNT_TO_STACK)
			ret = TC_ACT_OK;
		else
			goto drop_err;
	}

	skb->mark |= MARK_MAGIC_SNAT_DONE;
	if (dir == NAT_DIR_INGRESS) {
		/* Reverse SNAT done; continue with LB reverse NAT. */
		ep_tail_call(skb, CILIUM_CALL_IPV4_NODEPORT_REVNAT);
		ret = DROP_MISSED_TAIL_CALL;
		goto drop_err;
	}
#ifdef ENCAP_IFINDEX
	if (ifindex == ENCAP_IFINDEX)
		goto out_send;
#endif
	/* snat_v4_process() may have changed the packet; revalidate before
	 * reading the (possibly rewritten) addresses for the FIB lookup.
	 */
	if (!revalidate_data(skb, &data, &data_end, &ip4)) {
		ret = DROP_INVALID;
		goto drop_err;
	}

	fib_params.family = AF_INET;
	fib_params.ifindex = ifindex;
	fib_params.ipv4_src = ip4->saddr;
	fib_params.ipv4_dst = ip4->daddr;

	ret = fib_lookup(skb, &fib_params, sizeof(fib_params),
			 BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT);
	if (ret != 0) {
		ret = DROP_NO_FIB;
		goto drop_err;
	}

	if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) {
		ret = DROP_WRITE_ERROR;
		goto drop_err;
	}
	if (eth_store_saddr(skb, fib_params.smac, 0) < 0) {
		ret = DROP_WRITE_ERROR;
		goto drop_err;
	}
	ifindex = fib_params.ifindex;
out_send:
	return redirect(ifindex, 0);
drop_err:
	return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT,
				      dir == NAT_DIR_INGRESS ?
				      METRIC_INGRESS : METRIC_EGRESS);
}
   596  
/* Main node-port entry point for host-external ingressing node-port traffic
 * which handles the case of: i) backend is local EP, ii) backend is remote EP,
 * iii) reply from remote backend EP.
 *
 * Flow: extract the service key, punt ports outside the node-port range
 * to the NAT tail call, DNAT to a backend via lb4_local(), create CT
 * entries for new flows, learn the sender's MAC into NODEPORT_NEIGH4,
 * then deliver locally (TC_ACT_OK) or tail-call egress SNAT for remote
 * backends.
 */
static inline int nodeport_lb4(struct __sk_buff *skb, __u32 src_identity)
{
	struct ipv4_ct_tuple tuple = {};
	void *data, *data_end;
	struct iphdr *ip4;
	int ret,  l3_off = ETH_HLEN, l4_off;
	struct csum_offset csum_off = {};
	struct lb4_service_v2 *svc;
	struct lb4_key_v2 key = {};
	struct ct_state ct_state_new = {};
	struct ct_state ct_state = {};
	bool backend_local;
	__u32 monitor = 0;
	__u16 service_port;
	union macaddr smac;

	if (!revalidate_data(skb, &data, &data_end, &ip4))
		return DROP_INVALID;

	tuple.nexthdr = ip4->protocol;
	tuple.daddr = ip4->daddr;
	tuple.saddr = ip4->saddr;

	l4_off = l3_off + ipv4_hdrlen(ip4);

	ret = lb4_extract_key_v2(skb, &tuple, l4_off, &key, &csum_off, CT_EGRESS);
	if (IS_ERR(ret)) {
		/* Unsupported L4 protocols pass through untouched. */
		if (ret == DROP_UNKNOWN_L4)
			return TC_ACT_OK;
		else
			return ret;
	}

	/* Ports outside the node-port range cannot be service traffic, but
	 * may be replies that need reverse SNAT: punt to the NAT tail call.
	 */
	service_port = bpf_ntohs(key.dport);
	if (service_port < NODEPORT_PORT_MIN ||
	    service_port > NODEPORT_PORT_MAX) {
		skb->cb[CB_NAT] = NAT_DIR_INGRESS;
		skb->cb[CB_SRC_IDENTITY] = src_identity;
		ep_tail_call(skb, CILIUM_CALL_IPV4_NODEPORT_NAT);
		return DROP_MISSED_TAIL_CALL;
	}

	ct_state_new.orig_dport = key.dport;

	/* DNAT to a selected backend; no matching service means plain
	 * host traffic on a node-port-range port.
	 */
	if ((svc = lb4_lookup_service_v2(skb, &key)) != NULL) {
		ret = lb4_local(get_ct_map4(&tuple), skb, l3_off, l4_off, &csum_off,
				&key, &tuple, svc, &ct_state_new, ip4->saddr);
		if (IS_ERR(ret))
			return ret;
	} else {
		return TC_ACT_OK;
	}

	ret = ct_lookup4(get_ct_map4(&tuple), &tuple, skb, l4_off, CT_EGRESS,
			 &ct_state, &monitor);
	if (ret < 0)
		return ret;
	if (!revalidate_data(skb, &data, &data_end, &ip4))
		return DROP_INVALID;

	backend_local = lookup_ip4_endpoint(ip4);

	switch (ret) {
	case CT_NEW:
		ct_state_new.src_sec_id = SECLABEL;
		ct_state_new.node_port = 1;
		ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_EGRESS,
				 &ct_state_new, false);
		if (IS_ERR(ret))
			return ret;
		if (backend_local) {
			/* Also create the ingress-direction entry so the
			 * local backend's reply is matched as CT_REPLY.
			 */
			ct_flip_tuple_dir4(&tuple);
			ct_state_new.rev_nat_index = 0;
			ret = ct_create4(get_ct_map4(&tuple), &tuple, skb,
					 CT_INGRESS, &ct_state_new, false);
			if (IS_ERR(ret))
				return ret;
		}
		break;

	case CT_ESTABLISHED:
	case CT_REPLY:
		break;

	default:
		return DROP_UNKNOWN_CT;
	}

	/* Learn the sender's MAC keyed by its IPv4 address so the reply
	 * path (rev_nodeport_lb4) can rewrite L2 without a FIB lookup.
	 */
	if (!revalidate_data(skb, &data, &data_end, &ip4))
		return DROP_INVALID;
	if (eth_load_saddr(skb, &smac.addr, 0) < 0)
		return DROP_INVALID;
	ret = map_update_elem(&NODEPORT_NEIGH4, &ip4->saddr, &smac, 0);
	if (ret < 0) {
		/* NOTE(review): returns the raw map error instead of a
		 * DROP_* code — confirm callers treat this as a drop.
		 */
		return ret;
	}

	if (!backend_local) {
		/* Remote backend: SNAT in the egress direction via tail call. */
		skb->cb[CB_NAT] = NAT_DIR_EGRESS;
		ep_tail_call(skb, CILIUM_CALL_IPV4_NODEPORT_NAT);
		return DROP_MISSED_TAIL_CALL;
	}

	return TC_ACT_OK;
}
   706  
/* Reverse NAT handling of node-port traffic for the case where the
 * backend i) was a local EP and bpf_lxc redirected to us, ii) was
 * a remote backend and we got here after reverse SNAT from the
 * tail_nodeport_nat_ipv4().
 *
 * CILIUM_CALL_IPV{4,6}_NODEPORT_REVNAT is plugged into CILIUM_MAP_CALLS
 * of the bpf_netdev, bpf_overlay and of the bpf_lxc.
 *
 * On a CT_REPLY hit with node_port set, undoes the LB translation and
 * resolves the L2 next hop via tunnel encap, NODEPORT_NEIGH4, or a FIB
 * lookup; otherwise recirculates the packet to the main path once.
 */
static inline int rev_nodeport_lb4(struct __sk_buff *skb, int *ifindex,
				   union macaddr *mac)
{
	struct ipv4_ct_tuple tuple = {};
	void *data, *data_end;
	struct iphdr *ip4;
	struct csum_offset csum_off = {};
	int ret, ret2, l3_off = ETH_HLEN, l4_off;
	struct ct_state ct_state = {};
	struct bpf_fib_lookup fib_params = {};
	union macaddr *dmac;
	__u32 monitor = 0;

	if (!revalidate_data(skb, &data, &data_end, &ip4))
		return DROP_INVALID;

	tuple.nexthdr = ip4->protocol;
	tuple.daddr = ip4->daddr;
	tuple.saddr = ip4->saddr;

	l4_off = l3_off + ipv4_hdrlen(ip4);
	csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);

	ret = ct_lookup4(get_ct_map4(&tuple), &tuple, skb, l4_off, CT_INGRESS, &ct_state,
			 &monitor);

	if (ret == CT_REPLY && ct_state.node_port == 1 && ct_state.rev_nat_index != 0) {
		/* Reply from a node-port backend: restore the original
		 * service address/port.
		 */
		ret2 = lb4_rev_nat(skb, l3_off, l4_off, &csum_off,
				   &ct_state, &tuple,
				   REV_NAT_F_TUPLE_SADDR);
		if (IS_ERR(ret2))
			return ret2;

		if (!revalidate_data(skb, &data, &data_end, &ip4))
			return DROP_INVALID;

		skb->mark |= MARK_MAGIC_SNAT_DONE;
#ifdef ENCAP_IFINDEX
		{
			struct remote_endpoint_info *info;

			info = ipcache_lookup4(&IPCACHE_MAP, ip4->daddr, V4_CACHE_KEY_LEN);
			if (info != NULL && info->tunnel_endpoint != 0) {
				int ret = __encap_with_nodeid(skb, info->tunnel_endpoint,
							      SECLABEL, TRACE_PAYLOAD_LEN);
				if (ret)
					return ret;

				*ifindex = ENCAP_IFINDEX;

				/* fib lookup not necessary when going over tunnel.
				 * (fib_params is zeroed; zero MACs are intended.)
				 */
				if (eth_store_daddr(skb, fib_params.dmac, 0) < 0)
					return DROP_WRITE_ERROR;
				if (eth_store_saddr(skb, fib_params.smac, 0) < 0)
					return DROP_WRITE_ERROR;

				return TC_ACT_OK;
			}
		}
#endif

		/* Prefer the MAC learned in nodeport_lb4(); fall back to a
		 * FIB lookup when the neighbor entry has been evicted.
		 */
		dmac = map_lookup_elem(&NODEPORT_NEIGH4, &ip4->daddr);
		if (dmac) {
		    if (eth_store_daddr(skb, &dmac->addr, 0) < 0)
			return DROP_WRITE_ERROR;
		    if (eth_store_saddr(skb, &mac->addr, 0) < 0)
			return DROP_WRITE_ERROR;
		} else {
		    fib_params.family = AF_INET;
		    fib_params.ifindex = *ifindex;

		    fib_params.ipv4_src = ip4->saddr;
		    fib_params.ipv4_dst = ip4->daddr;

		    int rc = fib_lookup(skb, &fib_params, sizeof(fib_params),
				BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT);
		    if (rc != 0)
			return DROP_NO_FIB;

		    if (eth_store_daddr(skb, fib_params.dmac, 0) < 0)
			return DROP_WRITE_ERROR;
		    if (eth_store_saddr(skb, fib_params.smac, 0) < 0)
			return DROP_WRITE_ERROR;
		}
	} else {
		/* Not a node-port reply: hand back to the main path once,
		 * flagging it to skip node-port handling next time.
		 */
		if (!(skb->tc_index & TC_INDEX_F_SKIP_RECIRCULATION)) {
			skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT;
			ep_tail_call(skb, CILIUM_CALL_IPV4_FROM_LXC);
			return DROP_MISSED_TAIL_CALL;
		}
	}

	return TC_ACT_OK;
}
   809  
/* Tail-call wrapper around rev_nodeport_lb4(): runs reverse NAT starting
 * from the native device/MAC, emits a drop notification on error, and
 * otherwise redirects to the interface chosen by rev_nodeport_lb4().
 */
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_REVNAT)
int tail_rev_nodeport_lb4(struct __sk_buff *skb)
{
	int ifindex = NATIVE_DEV_IFINDEX;
	union macaddr mac = NATIVE_DEV_MAC;
	int ret = 0;

	ret = rev_nodeport_lb4(skb, &ifindex, &mac);
	if (IS_ERR(ret))
		return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, METRIC_EGRESS);
	return redirect(ifindex, 0);
}
   822  #endif /* ENABLE_IPV4 */
   823  
/* Dispatch forward-path node-port SNAT by ethertype. @encap selects the
 * tunnel-facing NAT address (gateway / router IP) over the node-port
 * address when the datapath is compiled with ENCAP_IFINDEX. Packets with
 * unsupported ethertypes pass through as TC_ACT_OK.
 */
static __always_inline int nodeport_nat_fwd(struct __sk_buff *skb,
					    const bool encap)
{
	__u16 proto;

	if (!validate_ethertype(skb, &proto))
		return TC_ACT_OK;
	switch (proto) {
#ifdef ENABLE_IPV4
	case bpf_htons(ETH_P_IP): {
		__be32 addr;
#ifdef ENCAP_IFINDEX
		if (encap)
			addr = IPV4_GATEWAY;
		else
#endif
			addr = IPV4_NODEPORT;
		return nodeport_nat_ipv4_fwd(skb, addr);
	}
#endif /* ENABLE_IPV4 */
#ifdef ENABLE_IPV6
	case bpf_htons(ETH_P_IPV6): {
		union v6addr addr;
#ifdef ENCAP_IFINDEX
		if (encap)
			BPF_V6(addr, ROUTER_IP);
		else
#endif
			BPF_V6(addr, IPV6_NODEPORT);
		return nodeport_nat_ipv6_fwd(skb, &addr);
	}
#endif /* ENABLE_IPV6 */
	default:
		break;
	}
	return TC_ACT_OK;
}
   861  
/* Dispatch reverse-path node-port SNAT by ethertype; mirror image of
 * nodeport_nat_fwd(). The default branch carries compile-time sanity
 * checks on the port-range configuration (the build_bug_on()s emit no
 * runtime code). NB: "EPHERMERAL_MIN" is the spelling of the constant
 * as defined elsewhere in the project; do not "fix" it here.
 */
static __always_inline int nodeport_nat_rev(struct __sk_buff *skb,
					    const bool encap)
{
	__u16 proto;

	if (!validate_ethertype(skb, &proto))
		return TC_ACT_OK;
	switch (proto) {
#ifdef ENABLE_IPV4
	case bpf_htons(ETH_P_IP): {
		__be32 addr;
#ifdef ENCAP_IFINDEX
		if (encap)
			addr = IPV4_GATEWAY;
		else
#endif
			addr = IPV4_NODEPORT;
		return nodeport_nat_ipv4_rev(skb, addr);
	}
#endif /* ENABLE_IPV4 */
#ifdef ENABLE_IPV6
	case bpf_htons(ETH_P_IPV6): {
		union v6addr addr;
#ifdef ENCAP_IFINDEX
		if (encap)
			BPF_V6(addr, ROUTER_IP);
		else
#endif
			BPF_V6(addr, IPV6_NODEPORT);
		return nodeport_nat_ipv6_rev(skb, &addr);
	}
#endif /* ENABLE_IPV6 */
	default:
		/* Port ranges must be ordered: node-port < NAT range, and
		 * node-port range below the ephemeral range.
		 */
		build_bug_on(!(NODEPORT_PORT_MIN_NAT < NODEPORT_PORT_MAX_NAT));
		build_bug_on(!(NODEPORT_PORT_MIN     < NODEPORT_PORT_MAX));
		build_bug_on(!(NODEPORT_PORT_MAX     < NODEPORT_PORT_MIN_NAT));
		build_bug_on(!(NODEPORT_PORT_MAX     < EPHERMERAL_MIN));
		break;
	}
	return TC_ACT_OK;
}
   903  #endif /* ENABLE_NODEPORT */
   904  #endif /* __NODEPORT_H_ */