github.com/cilium/cilium@v1.16.2/bpf/lib/nodeport.h (about) 1 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ 2 /* Copyright Authors of Cilium */ 3 4 #pragma once 5 6 #include <bpf/ctx/ctx.h> 7 #include <bpf/api.h> 8 9 #include "bpf/compiler.h" 10 #include "tailcall.h" 11 #include "nat.h" 12 #include "edt.h" 13 #include "lb.h" 14 #include "common.h" 15 #include "overloadable.h" 16 #include "egress_gateway.h" 17 #include "eps.h" 18 #include "conntrack.h" 19 #include "csum.h" 20 #include "encap.h" 21 #include "identity.h" 22 #include "trace.h" 23 #include "ghash.h" 24 #include "pcap.h" 25 #include "host_firewall.h" 26 #include "stubs.h" 27 #include "proxy_hairpin.h" 28 #include "fib.h" 29 30 #define nodeport_nat_egress_ipv4_hook(ctx, ip4, info, tuple, l4_off, ext_err) CTX_ACT_OK 31 #define nodeport_rev_dnat_ingress_ipv4_hook(ctx, ip4, tuple, tunnel_endpoint, src_sec_identity, \ 32 dst_sec_identity) -1 33 34 #ifdef ENABLE_NODEPORT 35 /* The IPv6 extension should be 8-bytes aligned */ 36 struct dsr_opt_v6 { 37 struct ipv6_opt_hdr hdr; 38 __u8 opt_type; 39 __u8 opt_len; 40 union v6addr addr; 41 __be16 port; 42 __u16 pad; 43 }; 44 45 struct dsr_opt_v4 { 46 __u8 type; 47 __u8 len; 48 __u16 port; 49 __u32 addr; 50 }; 51 52 static __always_inline bool nodeport_uses_dsr(__u8 nexthdr __maybe_unused) 53 { 54 # if defined(ENABLE_DSR) && !defined(ENABLE_DSR_HYBRID) 55 return true; 56 # elif defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID) 57 if (nexthdr == IPPROTO_TCP) 58 return true; 59 return false; 60 # else 61 return false; 62 # endif 63 } 64 65 #ifdef HAVE_ENCAP 66 static __always_inline int 67 nodeport_add_tunnel_encap(struct __ctx_buff *ctx, __u32 src_ip, __be16 src_port, 68 __be32 dst_ip, __u32 src_sec_identity, __u32 dst_sec_identity, 69 enum trace_reason ct_reason, __u32 monitor, int *ifindex) 70 { 71 /* Let kernel choose the outer source ip */ 72 if (ctx_is_skb()) 73 src_ip = 0; 74 75 return __encap_with_nodeid(ctx, src_ip, src_port, dst_ip, 76 src_sec_identity, dst_sec_identity, NOT_VTEP_DST, 77 ct_reason, monitor, ifindex); 78 } 79 80 # if defined(ENABLE_DSR) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 81 static __always_inline int 82 nodeport_add_tunnel_encap_opt(struct __ctx_buff *ctx, __u32 src_ip, __be16 src_port, 83 __be32 dst_ip, __u32 src_sec_identity, __u32 dst_sec_identity, 84 void *opt, __u32 opt_len, enum trace_reason ct_reason, 85 __u32 monitor, int *ifindex) 86 { 87 /* Let kernel choose the outer source ip */ 88 if (ctx_is_skb()) 89 src_ip = 0; 90 91 return __encap_with_nodeid_opt(ctx, src_ip, src_port, dst_ip, 92 src_sec_identity, dst_sec_identity, NOT_VTEP_DST, 93 opt, opt_len, ct_reason, monitor, ifindex); 94 } 95 # endif 96 #endif /* HAVE_ENCAP */ 97 98 static __always_inline bool dsr_fail_needs_reply(int code __maybe_unused) 99 { 100 #ifdef ENABLE_DSR_ICMP_ERRORS 101 if (code == DROP_FRAG_NEEDED) 102 return true; 103 #endif 104 return false; 105 } 106 107 static __always_inline bool dsr_is_too_big(struct __ctx_buff *ctx __maybe_unused, 108 __u16 expanded_len __maybe_unused) 109 { 110 #ifdef ENABLE_DSR_ICMP_ERRORS 111 if (expanded_len > THIS_MTU) 112 return true; 113 #endif 114 return false; 115 } 116 117 static __always_inline int 118 nodeport_fib_lookup_and_redirect(struct __ctx_buff *ctx, 119 struct bpf_fib_lookup_padded *fib_params, 120 __s8 *ext_err) 121 { 122 int oif = NATIVE_DEV_IFINDEX; 123 int ret; 124 125 ret = fib_lookup(ctx, &fib_params->l, sizeof(fib_params->l), 0); 126 *ext_err = (__s8)ret; 127 128 switch (ret) { 129 case BPF_FIB_LKUP_RET_SUCCESS: 130 
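	/* Grouped with BPF_FIB_LKUP_RET_SUCCESS above: NO_NEIGH presumably only
	 * means the route resolved but the L2 neighbour entry is missing, which
	 * fib_do_redirect() below is expected to recover from (e.g. via the
	 * neighbour map or redirect_neigh()), so it is not treated as a lookup
	 * failure here.
	 */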
case BPF_FIB_LKUP_RET_NO_NEIGH: 131 if ((__u32)oif == fib_params->l.ifindex) 132 return CTX_ACT_OK; 133 134 return fib_do_redirect(ctx, true, fib_params, true, ext_err, &oif); 135 default: 136 return DROP_NO_FIB; 137 } 138 } 139 140 #ifdef ENABLE_IPV6 141 static __always_inline bool nodeport_uses_dsr6(const struct ipv6_ct_tuple *tuple) 142 { 143 return nodeport_uses_dsr(tuple->nexthdr); 144 } 145 146 static __always_inline bool 147 nodeport_has_nat_conflict_ipv6(const struct ipv6hdr *ip6 __maybe_unused, 148 struct ipv6_nat_target *target __maybe_unused) 149 { 150 #if defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY) 151 union v6addr router_ip; 152 153 BPF_V6(router_ip, ROUTER_IP); 154 if (ipv6_addr_equals((union v6addr *)&ip6->saddr, &router_ip)) { 155 ipv6_addr_copy(&target->addr, &router_ip); 156 target->needs_ct = true; 157 158 return true; 159 } 160 #endif /* TUNNEL_MODE && IS_BPF_OVERLAY */ 161 162 #if defined(IS_BPF_HOST) 163 const union v6addr dr_addr = IPV6_DIRECT_ROUTING; 164 __u32 dr_ifindex = DIRECT_ROUTING_DEV_IFINDEX; 165 166 /* See comment in nodeport_has_nat_conflict_ipv4(). */ 167 if (dr_ifindex == NATIVE_DEV_IFINDEX && 168 ipv6_addr_equals((union v6addr *)&ip6->saddr, &dr_addr)) { 169 ipv6_addr_copy(&target->addr, &dr_addr); 170 target->needs_ct = true; 171 172 return true; 173 } 174 #endif /* IS_BPF_HOST */ 175 176 return false; 177 } 178 179 static __always_inline int nodeport_snat_fwd_ipv6(struct __ctx_buff *ctx, 180 union v6addr *saddr, 181 struct trace_ctx *trace, 182 __s8 *ext_err) 183 { 184 struct ipv6_nat_target target = { 185 .min_port = NODEPORT_PORT_MIN_NAT, 186 .max_port = NODEPORT_PORT_MAX_NAT, 187 }; 188 struct ipv6_ct_tuple tuple = {}; 189 int hdrlen, l4_off, ret; 190 void *data, *data_end; 191 struct ipv6hdr *ip6; 192 193 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 194 return DROP_INVALID; 195 196 tuple.nexthdr = ip6->nexthdr; 197 hdrlen = ipv6_hdrlen(ctx, &tuple.nexthdr); 198 if (hdrlen < 0) 199 return hdrlen; 200 201 snat_v6_init_tuple(ip6, NAT_DIR_EGRESS, &tuple); 202 l4_off = ETH_HLEN + hdrlen; 203 204 if (lb_is_svc_proto(tuple.nexthdr) && 205 !nodeport_uses_dsr6(&tuple) && 206 nodeport_has_nat_conflict_ipv6(ip6, &target)) 207 goto apply_snat; 208 209 ret = snat_v6_needs_masquerade(ctx, &tuple, l4_off, &target); 210 if (IS_ERR(ret)) 211 goto out; 212 213 apply_snat: 214 ipv6_addr_copy(saddr, &tuple.saddr); 215 ret = snat_v6_nat(ctx, &tuple, l4_off, &target, trace, ext_err); 216 if (IS_ERR(ret)) 217 goto out; 218 219 /* See the equivalent v4 path for comment */ 220 if (is_defined(IS_BPF_HOST)) 221 ctx_snat_done_set(ctx); 222 223 out: 224 if (ret == NAT_PUNT_TO_STACK) 225 ret = CTX_ACT_OK; 226 227 return ret; 228 } 229 230 #ifdef ENABLE_DSR 231 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 232 static __always_inline void rss_gen_src6(union v6addr *src, 233 const union v6addr *client, 234 __be32 l4_hint) 235 { 236 __u32 bits = 128 - IPV6_RSS_PREFIX_BITS; 237 238 *src = (union v6addr)IPV6_RSS_PREFIX; 239 if (bits) { 240 __u32 todo; 241 242 if (bits > 96) { 243 todo = bits - 96; 244 src->p1 |= bpf_htonl(hash_32(client->p1 ^ l4_hint, todo)); 245 bits -= todo; 246 } 247 if (bits > 64) { 248 todo = bits - 64; 249 src->p2 |= bpf_htonl(hash_32(client->p2 ^ l4_hint, todo)); 250 bits -= todo; 251 } 252 if (bits > 32) { 253 todo = bits - 32; 254 src->p3 |= bpf_htonl(hash_32(client->p3 ^ l4_hint, todo)); 255 bits -= todo; 256 } 257 src->p4 |= bpf_htonl(hash_32(client->p4 ^ l4_hint, bits)); 258 } 259 } 260 261 static __always_inline int dsr_set_ipip6(struct __ctx_buff 
*ctx, 262 const struct ipv6hdr *ip6, 263 const union v6addr *backend_addr, 264 __be32 l4_hint, int *ohead) 265 { 266 __u16 payload_len = bpf_ntohs(ip6->payload_len) + sizeof(*ip6); 267 const int l3_off = ETH_HLEN; 268 union v6addr saddr; 269 struct { 270 __be16 payload_len; 271 __u8 nexthdr; 272 __u8 hop_limit; 273 } tp_new = { 274 .payload_len = bpf_htons(payload_len), 275 .nexthdr = IPPROTO_IPV6, 276 .hop_limit = IPDEFTTL, 277 }; 278 279 if (dsr_is_too_big(ctx, payload_len + sizeof(*ip6))) { 280 *ohead = sizeof(*ip6); 281 return DROP_FRAG_NEEDED; 282 } 283 284 rss_gen_src6(&saddr, (union v6addr *)&ip6->saddr, l4_hint); 285 286 if (ctx_adjust_hroom(ctx, sizeof(*ip6), BPF_ADJ_ROOM_NET, 287 ctx_adjust_hroom_flags())) 288 return DROP_INVALID; 289 if (ctx_store_bytes(ctx, l3_off + offsetof(struct ipv6hdr, payload_len), 290 &tp_new.payload_len, 4, 0) < 0) 291 return DROP_WRITE_ERROR; 292 if (ctx_store_bytes(ctx, l3_off + offsetof(struct ipv6hdr, daddr), 293 backend_addr, sizeof(ip6->daddr), 0) < 0) 294 return DROP_WRITE_ERROR; 295 if (ctx_store_bytes(ctx, l3_off + offsetof(struct ipv6hdr, saddr), 296 &saddr, sizeof(ip6->saddr), 0) < 0) 297 return DROP_WRITE_ERROR; 298 return 0; 299 } 300 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 301 static __always_inline int dsr_set_ext6(struct __ctx_buff *ctx, 302 struct ipv6hdr *ip6, 303 const union v6addr *svc_addr, 304 __be16 svc_port, int *ohead) 305 { 306 struct dsr_opt_v6 opt __align_stack_8 = {}; 307 __u16 payload_len = bpf_ntohs(ip6->payload_len) + sizeof(opt); 308 __u16 total_len = bpf_ntohs(ip6->payload_len) + sizeof(struct ipv6hdr) + sizeof(opt); 309 __u8 nexthdr = ip6->nexthdr; 310 int hdrlen; 311 312 /* The IPv6 extension should be 8-bytes aligned */ 313 build_bug_on((sizeof(struct dsr_opt_v6) % 8) != 0); 314 315 hdrlen = ipv6_hdrlen(ctx, &nexthdr); 316 if (hdrlen < 0) 317 return hdrlen; 318 319 /* See dsr_set_opt4(): */ 320 if (nexthdr == IPPROTO_TCP) { 321 union tcp_flags tcp_flags = { .value = 0 }; 322 323 if (l4_load_tcp_flags(ctx, ETH_HLEN + hdrlen, &tcp_flags) < 0) 324 return DROP_CT_INVALID_HDR; 325 326 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 327 return 0; 328 } 329 330 if (dsr_is_too_big(ctx, total_len)) { 331 *ohead = sizeof(opt); 332 return DROP_FRAG_NEEDED; 333 } 334 335 opt.hdr.nexthdr = ip6->nexthdr; 336 ip6->nexthdr = NEXTHDR_DEST; 337 ip6->payload_len = bpf_htons(payload_len); 338 339 opt.hdr.hdrlen = DSR_IPV6_EXT_LEN; 340 opt.opt_type = DSR_IPV6_OPT_TYPE; 341 opt.opt_len = DSR_IPV6_OPT_LEN; 342 ipv6_addr_copy_unaligned(&opt.addr, svc_addr); 343 opt.port = svc_port; 344 345 if (ctx_adjust_hroom(ctx, sizeof(opt), BPF_ADJ_ROOM_NET, 346 ctx_adjust_hroom_flags())) 347 return DROP_INVALID; 348 if (ctx_store_bytes(ctx, ETH_HLEN + sizeof(*ip6), &opt, 349 sizeof(opt), 0) < 0) 350 return DROP_INVALID; 351 return 0; 352 } 353 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 354 static __always_inline int encap_geneve_dsr_opt6(struct __ctx_buff *ctx, 355 struct ipv6hdr *ip6, 356 const union v6addr *svc_addr, 357 __be16 svc_port, 358 int *ifindex, int *ohead) 359 { 360 struct remote_endpoint_info *info; 361 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 362 struct geneve_dsr_opt6 gopt; 363 union v6addr *dst; 364 bool need_opt = true; 365 __u16 encap_len = sizeof(struct ipv6hdr) + sizeof(struct udphdr) + 366 sizeof(struct genevehdr) + ETH_HLEN; 367 __u16 payload_len = bpf_ntohs(ip6->payload_len) + sizeof(*ip6); 368 __u32 dst_sec_identity; 369 __be32 tunnel_endpoint; 370 __u16 total_len = 0; 371 __be16 src_port; 372 int l4_off, ret; 373 374 
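	/* Rough post-encap layout (a sketch; assumes the DSR option is needed,
	 * i.e. the first packet of a connection):
	 *
	 *   [outer IP: this node -> backend node ] } tunnel
	 *   [UDP][GENEVE + DSR opt: svc addr/port] } tunnel
	 *   [Ethernet][IPv6: client -> backend   ] } original packet
	 *
	 * Carrying the service address/port in a Geneve TLV lets the backend
	 * node create the DSR state needed to reverse-translate replies.
	 */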
build_bug_on((sizeof(gopt) % 4) != 0); 375 376 dst = (union v6addr *)&ip6->daddr; 377 info = lookup_ip6_remote_endpoint(dst, 0); 378 if (!info || info->tunnel_endpoint == 0) 379 return DROP_NO_TUNNEL_ENDPOINT; 380 381 tunnel_endpoint = info->tunnel_endpoint; 382 dst_sec_identity = info->sec_identity; 383 384 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 385 if (IS_ERR(ret)) 386 return ret; 387 388 src_port = tunnel_gen_src_port_v6(&tuple); 389 390 /* See encap_geneve_dsr_opt4(): */ 391 if (tuple.nexthdr == IPPROTO_TCP) { 392 union tcp_flags tcp_flags = { .value = 0 }; 393 394 if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0) 395 return DROP_CT_INVALID_HDR; 396 397 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 398 need_opt = false; 399 } 400 401 if (need_opt) { 402 encap_len += sizeof(struct geneve_dsr_opt6); 403 set_geneve_dsr_opt6(svc_port, svc_addr, &gopt); 404 } 405 406 total_len = encap_len + payload_len; 407 408 if (dsr_is_too_big(ctx, total_len)) { 409 *ohead = encap_len; 410 return DROP_FRAG_NEEDED; 411 } 412 413 if (need_opt) 414 return nodeport_add_tunnel_encap_opt(ctx, 415 IPV4_DIRECT_ROUTING, 416 src_port, 417 tunnel_endpoint, 418 WORLD_IPV6_ID, 419 dst_sec_identity, 420 &gopt, 421 sizeof(gopt), 422 (enum trace_reason)CT_NEW, 423 TRACE_PAYLOAD_LEN, 424 ifindex); 425 426 return nodeport_add_tunnel_encap(ctx, 427 IPV4_DIRECT_ROUTING, 428 src_port, 429 tunnel_endpoint, 430 WORLD_IPV6_ID, 431 dst_sec_identity, 432 (enum trace_reason)CT_NEW, 433 TRACE_PAYLOAD_LEN, 434 ifindex); 435 } 436 #endif /* DSR_ENCAP_MODE */ 437 438 static __always_inline int find_dsr_v6(struct __ctx_buff *ctx, __u8 nexthdr, 439 struct dsr_opt_v6 *dsr_opt, bool *found) 440 { 441 struct ipv6_opt_hdr opthdr __align_stack_8; 442 int i, len = sizeof(struct ipv6hdr); 443 __u8 nh = nexthdr; 444 445 #pragma unroll 446 for (i = 0; i < IPV6_MAX_HEADERS; i++) { 447 switch (nh) { 448 case NEXTHDR_NONE: 449 return DROP_INVALID_EXTHDR; 450 451 case NEXTHDR_FRAGMENT: 452 return DROP_FRAG_NOSUPPORT; 453 454 case NEXTHDR_HOP: 455 case NEXTHDR_ROUTING: 456 case NEXTHDR_AUTH: 457 case NEXTHDR_DEST: 458 if (ctx_load_bytes(ctx, ETH_HLEN + len, &opthdr, sizeof(opthdr)) < 0) 459 return DROP_INVALID; 460 461 if (nh == NEXTHDR_DEST && opthdr.hdrlen == DSR_IPV6_EXT_LEN) { 462 if (ctx_load_bytes(ctx, ETH_HLEN + len, dsr_opt, 463 sizeof(*dsr_opt)) < 0) 464 return DROP_INVALID; 465 if (dsr_opt->opt_type == DSR_IPV6_OPT_TYPE && 466 dsr_opt->opt_len == DSR_IPV6_OPT_LEN) { 467 *found = true; 468 return 0; 469 } 470 } 471 472 if (nh == NEXTHDR_AUTH) 473 len += ipv6_authlen(&opthdr); 474 else 475 len += ipv6_optlen(&opthdr); 476 477 nh = opthdr.nexthdr; 478 break; 479 480 default: 481 return 0; 482 } 483 } 484 485 /* Reached limit of supported extension headers */ 486 return DROP_INVALID_EXTHDR; 487 } 488 489 static __always_inline int 490 nodeport_extract_dsr_v6(struct __ctx_buff *ctx, 491 struct ipv6hdr *ip6 __maybe_unused, 492 const struct ipv6_ct_tuple *tuple, int l4_off, 493 union v6addr *addr, __be16 *port, bool *dsr) 494 { 495 struct ipv6_ct_tuple tmp = *tuple; 496 497 if (tuple->nexthdr == IPPROTO_TCP) { 498 union tcp_flags tcp_flags = {}; 499 500 if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0) 501 return DROP_CT_INVALID_HDR; 502 503 ipv6_ct_tuple_reverse(&tmp); 504 505 if (!(tcp_flags.value & TCP_FLAG_SYN)) { 506 *dsr = ct_has_dsr_egress_entry6(get_ct_map6(&tmp), &tmp); 507 *port = 0; 508 return 0; 509 } 510 } 511 512 #if defined(IS_BPF_OVERLAY) 513 { 514 struct geneve_dsr_opt6 gopt; 515 int ret = 
ctx_get_tunnel_opt(ctx, &gopt, sizeof(gopt));

		if (ret > 0) {
			if (gopt.hdr.opt_class == bpf_htons(DSR_GENEVE_OPT_CLASS) &&
			    gopt.hdr.type == DSR_GENEVE_OPT_TYPE) {
				*dsr = true;
				*port = gopt.port;
				ipv6_addr_copy_unaligned(addr,
							 (union v6addr *)&gopt.addr);
				return 0;
			}
		}
	}
#else
	{
		struct dsr_opt_v6 opt __align_stack_8 = {};
		int ret;

		ret = find_dsr_v6(ctx, ip6->nexthdr, &opt, dsr);
		if (ret != 0)
			return ret;

		if (*dsr) {
			*addr = opt.addr;
			*port = opt.port;
			return 0;
		}
	}
#endif

	/* SYN for a new connection that's not / no longer DSR.
	 * If it's reopened, avoid sending subsequent traffic down the DSR path.
	 */
	if (tuple->nexthdr == IPPROTO_TCP)
		ct_update_dsr(get_ct_map6(&tmp), &tmp, false);

	return 0;
}

static __always_inline struct ipv6_nat_entry *
nodeport_dsr_lookup_v6_nat_entry(const struct ipv6_ct_tuple *nat_tuple)
{
	return snat_v6_lookup(nat_tuple);
}

static __always_inline int dsr_reply_icmp6(struct __ctx_buff *ctx,
					   const struct ipv6hdr *ip6 __maybe_unused,
					   const union v6addr *svc_addr __maybe_unused,
					   __be16 dport __maybe_unused,
					   int code, int ohead __maybe_unused)
{
#ifdef ENABLE_DSR_ICMP_ERRORS
	const __s32 orig_dgram = 64, off = ETH_HLEN;
	__u8 orig_ipv6_hdr[orig_dgram];
	__be16 type = bpf_htons(ETH_P_IPV6);
	__u64 len_new = off + sizeof(*ip6) + orig_dgram;
	__u64 len_old = ctx_full_len(ctx);
	void *data_end = ctx_data_end(ctx);
	void *data = ctx_data(ctx);
	__u8 reason = (__u8)-code;
	__wsum wsum;
	union macaddr smac, dmac;
	struct icmp6hdr icmp __align_stack_8 = {
		.icmp6_type = ICMPV6_PKT_TOOBIG,
		.icmp6_mtu = bpf_htonl(THIS_MTU - ohead),
	};
	__u64 payload_len = sizeof(*ip6) + sizeof(icmp) + orig_dgram;
	struct ipv6hdr ip __align_stack_8 = {
		.version = 6,
		.priority = ip6->priority,
		.flow_lbl[0] = ip6->flow_lbl[0],
		.flow_lbl[1] = ip6->flow_lbl[1],
		.flow_lbl[2] = ip6->flow_lbl[2],
		.nexthdr = IPPROTO_ICMPV6,
		.hop_limit = IPDEFTTL,
		.saddr = ip6->daddr,
		.daddr = ip6->saddr,
		.payload_len = bpf_htons((__u16)payload_len),
	};
	struct ipv6hdr inner_ipv6_hdr __align_stack_8 = *ip6;
	__s32 l4_dport_offset;

	/* DSR changes the destination address from service ip to pod ip, and
	 * the destination port from service port to pod port. When responding
	 * with an ICMP error, it is necessary to restore the original ip and
	 * port.
601 */ 602 ipv6_addr_copy((union v6addr *)&inner_ipv6_hdr.daddr, svc_addr); 603 604 if (inner_ipv6_hdr.nexthdr == IPPROTO_UDP) 605 l4_dport_offset = UDP_DPORT_OFF; 606 else if (inner_ipv6_hdr.nexthdr == IPPROTO_TCP) 607 l4_dport_offset = TCP_DPORT_OFF; 608 else 609 goto drop_err; 610 611 if (ctx_load_bytes(ctx, off + sizeof(inner_ipv6_hdr), orig_ipv6_hdr, 612 sizeof(orig_ipv6_hdr)) < 0) 613 goto drop_err; 614 memcpy(orig_ipv6_hdr + l4_dport_offset, &dport, sizeof(dport)); 615 616 update_metrics(ctx_full_len(ctx), METRIC_EGRESS, reason); 617 618 if (eth_load_saddr(ctx, smac.addr, 0) < 0) 619 goto drop_err; 620 if (eth_load_daddr(ctx, dmac.addr, 0) < 0) 621 goto drop_err; 622 if (unlikely(data + len_new > data_end)) 623 goto drop_err; 624 625 wsum = ipv6_pseudohdr_checksum(&ip, IPPROTO_ICMPV6, 626 bpf_ntohs(ip.payload_len), 0); 627 icmp.icmp6_cksum = csum_fold(csum_diff(NULL, 0, orig_ipv6_hdr, sizeof(orig_ipv6_hdr), 628 csum_diff(NULL, 0, &inner_ipv6_hdr, 629 sizeof(inner_ipv6_hdr), 630 csum_diff(NULL, 0, &icmp, 631 sizeof(icmp), wsum)))); 632 633 if (ctx_adjust_troom(ctx, -(len_old - len_new)) < 0) 634 goto drop_err; 635 if (ctx_adjust_hroom(ctx, sizeof(ip) + sizeof(icmp), 636 BPF_ADJ_ROOM_NET, 637 ctx_adjust_hroom_flags()) < 0) 638 goto drop_err; 639 640 if (eth_store_daddr(ctx, smac.addr, 0) < 0) 641 goto drop_err; 642 if (eth_store_saddr(ctx, dmac.addr, 0) < 0) 643 goto drop_err; 644 if (ctx_store_bytes(ctx, ETH_ALEN * 2, &type, sizeof(type), 0) < 0) 645 goto drop_err; 646 if (ctx_store_bytes(ctx, off, &ip, sizeof(ip), 0) < 0) 647 goto drop_err; 648 if (ctx_store_bytes(ctx, off + sizeof(ip), &icmp, 649 sizeof(icmp), 0) < 0) 650 goto drop_err; 651 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp), &inner_ipv6_hdr, 652 sizeof(inner_ipv6_hdr), 0) < 0) 653 goto drop_err; 654 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp) + 655 sizeof(inner_ipv6_hdr) + l4_dport_offset, 656 &dport, sizeof(dport), 0) < 0) 657 goto drop_err; 658 659 return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0); 660 drop_err: 661 #endif 662 return send_drop_notify_error(ctx, UNKNOWN_ID, code, CTX_ACT_DROP, 663 METRIC_EGRESS); 664 } 665 666 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_DSR) 667 int tail_nodeport_ipv6_dsr(struct __ctx_buff *ctx) 668 { 669 struct bpf_fib_lookup_padded fib_params = { 670 .l = { 671 .family = AF_INET6, 672 .ifindex = ctx_get_ifindex(ctx), 673 }, 674 }; 675 int ret, oif = 0, ohead = 0; 676 void *data, *data_end; 677 struct ipv6hdr *ip6; 678 union v6addr addr; 679 __s8 ext_err = 0; 680 __be16 port; 681 682 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 683 ret = DROP_INVALID; 684 goto drop_err; 685 } 686 687 addr.p1 = ctx_load_meta(ctx, CB_ADDR_V6_1); 688 addr.p2 = ctx_load_meta(ctx, CB_ADDR_V6_2); 689 addr.p3 = ctx_load_meta(ctx, CB_ADDR_V6_3); 690 addr.p4 = ctx_load_meta(ctx, CB_ADDR_V6_4); 691 692 port = (__be16)ctx_load_meta(ctx, CB_PORT); 693 694 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 695 ret = dsr_set_ipip6(ctx, ip6, &addr, 696 ctx_load_meta(ctx, CB_HINT), &ohead); 697 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 698 ret = dsr_set_ext6(ctx, ip6, &addr, port, &ohead); 699 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 700 ret = encap_geneve_dsr_opt6(ctx, ip6, &addr, port, &oif, &ohead); 701 if (!IS_ERR(ret)) { 702 if (ret == CTX_ACT_REDIRECT && oif) { 703 cilium_capture_out(ctx); 704 return ctx_redirect(ctx, oif, 0); 705 } 706 707 fib_params.l.family = AF_INET; 708 } 709 #else 710 # error "Invalid load balancer DSR encapsulation mode!" 
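/* The alternatives handled above are the supported DSR encapsulation modes:
 * DSR_ENCAP_IPIP (IPv6-in-IPv6 with an RSS-friendly source address),
 * DSR_ENCAP_NONE (an IPv6 destination option carrying the service
 * address/port), and DSR_ENCAP_GENEVE (tunnel with a DSR TLV option).
 */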
711 #endif 712 if (IS_ERR(ret)) { 713 if (dsr_fail_needs_reply(ret)) 714 return dsr_reply_icmp6(ctx, ip6, &addr, port, ret, ohead); 715 goto drop_err; 716 } 717 718 if (fib_params.l.family == AF_INET) { 719 struct iphdr *ip4; 720 721 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 722 ret = DROP_INVALID; 723 goto drop_err; 724 } 725 726 fib_params.l.ipv4_src = ip4->saddr; 727 fib_params.l.ipv4_dst = ip4->daddr; 728 } else { 729 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 730 ret = DROP_INVALID; 731 goto drop_err; 732 } 733 734 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_src, 735 (union v6addr *)&ip6->saddr); 736 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_dst, 737 (union v6addr *)&ip6->daddr); 738 } 739 740 ret = fib_redirect(ctx, true, &fib_params, false, &ext_err, &oif); 741 if (fib_ok(ret)) { 742 cilium_capture_out(ctx); 743 return ret; 744 } 745 drop_err: 746 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 747 CTX_ACT_DROP, METRIC_EGRESS); 748 } 749 750 static __always_inline int 751 nodeport_dsr_ingress_ipv6(struct __ctx_buff *ctx, struct ipv6_ct_tuple *tuple, 752 int l4_off, union v6addr *addr, __be16 port, 753 __s8 *ext_err) 754 { 755 struct ct_state ct_state_new = {}; 756 __u32 monitor = 0; 757 int ret; 758 759 /* look up with SCOPE_FORWARD: */ 760 __ipv6_ct_tuple_reverse(tuple); 761 762 ret = ct_lazy_lookup6(get_ct_map6(tuple), tuple, ctx, l4_off, 763 CT_EGRESS, SCOPE_FORWARD, CT_ENTRY_DSR, 764 NULL, &monitor); 765 if (ret < 0) 766 return ret; 767 768 switch (ret) { 769 case CT_NEW: 770 create_ct: 771 if (port == 0) 772 return DROP_INVALID; 773 774 ct_state_new.src_sec_id = WORLD_IPV6_ID; 775 ct_state_new.dsr_internal = 1; 776 777 ret = ct_create6(get_ct_map6(tuple), NULL, tuple, ctx, 778 CT_EGRESS, &ct_state_new, ext_err); 779 if (!IS_ERR(ret)) 780 ret = snat_v6_create_dsr(tuple, addr, port, ext_err); 781 782 if (IS_ERR(ret)) 783 return ret; 784 break; 785 case CT_ESTABLISHED: 786 if (tuple->nexthdr == IPPROTO_TCP && port) 787 goto create_ct; 788 break; 789 default: 790 return DROP_UNKNOWN_CT; 791 } 792 793 return CTX_ACT_OK; 794 } 795 #endif /* ENABLE_DSR */ 796 797 static __always_inline struct lb6_reverse_nat * 798 nodeport_rev_dnat_get_info_ipv6(struct __ctx_buff *ctx, 799 struct ipv6_ct_tuple *tuple) 800 { 801 struct ipv6_nat_entry *dsr_entry __maybe_unused; 802 struct ipv6_ct_tuple dsr_tuple __maybe_unused; 803 __u16 rev_nat_index = 0; 804 805 if (!ct_has_nodeport_egress_entry6(get_ct_map6(tuple), tuple, 806 &rev_nat_index, is_defined(ENABLE_DSR))) 807 return NULL; 808 809 if (rev_nat_index) 810 return lb6_lookup_rev_nat_entry(ctx, rev_nat_index); 811 812 #ifdef ENABLE_DSR 813 dsr_tuple = *tuple; 814 815 dsr_tuple.flags = NAT_DIR_EGRESS; 816 dsr_tuple.sport = tuple->dport; 817 dsr_tuple.dport = tuple->sport; 818 819 dsr_entry = nodeport_dsr_lookup_v6_nat_entry(&dsr_tuple); 820 if (dsr_entry) 821 return &dsr_entry->nat_info; 822 #endif 823 824 return NULL; 825 } 826 827 #ifdef ENABLE_NAT_46X64_GATEWAY 828 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV46_RFC8215) 829 int tail_nat_ipv46(struct __ctx_buff *ctx) 830 { 831 int ret, oif = 0, l3_off = ETH_HLEN; 832 void *data, *data_end; 833 struct ipv6hdr *ip6; 834 struct iphdr *ip4; 835 __s8 ext_err = 0; 836 837 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 838 ret = DROP_INVALID; 839 goto drop_err; 840 } 841 if (nat46_rfc8215(ctx, ip4, l3_off)) { 842 ret = DROP_NAT46; 843 goto drop_err; 844 } 845 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 846 ret = DROP_INVALID; 847 
goto drop_err; 848 } 849 ret = fib_redirect_v6(ctx, l3_off, ip6, false, true, &ext_err, &oif); 850 if (fib_ok(ret)) { 851 cilium_capture_out(ctx); 852 return ret; 853 } 854 drop_err: 855 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 856 CTX_ACT_DROP, METRIC_EGRESS); 857 } 858 859 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV64_RFC8215) 860 int tail_nat_ipv64(struct __ctx_buff *ctx) 861 { 862 int ret, oif = 0, l3_off = ETH_HLEN; 863 void *data, *data_end; 864 struct ipv6hdr *ip6; 865 struct iphdr *ip4; 866 __s8 ext_err = 0; 867 868 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 869 ret = DROP_INVALID; 870 goto drop_err; 871 } 872 if (nat64_rfc8215(ctx, ip6)) { 873 ret = DROP_NAT64; 874 goto drop_err; 875 } 876 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 877 ret = DROP_INVALID; 878 goto drop_err; 879 } 880 ret = fib_redirect_v4(ctx, l3_off, ip4, false, true, &ext_err, &oif); 881 if (fib_ok(ret)) { 882 cilium_capture_out(ctx); 883 return ret; 884 } 885 drop_err: 886 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 887 CTX_ACT_DROP, METRIC_EGRESS); 888 } 889 #endif /* ENABLE_NAT_46X64_GATEWAY */ 890 891 static __always_inline int 892 nodeport_rev_dnat_ingress_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace, 893 __s8 *ext_err) 894 { 895 #ifdef ENABLE_NAT_46X64_GATEWAY 896 const bool nat_46x64_fib = nat46x64_cb_route(ctx); 897 #endif 898 struct bpf_fib_lookup_padded fib_params = { 899 .l = { 900 .family = AF_INET6, 901 .ifindex = ctx_get_ifindex(ctx), 902 }, 903 }; 904 int ret, l4_off; 905 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 906 struct ct_state ct_state = {}; 907 void *data, *data_end; 908 struct ipv6hdr *ip6; 909 __u32 tunnel_endpoint __maybe_unused = 0; 910 __u32 dst_sec_identity __maybe_unused = 0; 911 __be16 src_port __maybe_unused = 0; 912 bool allow_neigh_map = true; 913 int ifindex = 0; 914 915 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 916 return DROP_INVALID; 917 #ifdef ENABLE_NAT_46X64_GATEWAY 918 if (nat_46x64_fib) 919 goto fib_lookup; 920 #endif 921 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 922 if (ret < 0) { 923 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 924 goto out; 925 return ret; 926 } 927 928 ret = ct_lazy_lookup6(get_ct_map6(&tuple), &tuple, ctx, l4_off, 929 CT_INGRESS, SCOPE_REVERSE, CT_ENTRY_NODEPORT, 930 &ct_state, &trace->monitor); 931 if (ret == CT_REPLY) { 932 trace->reason = TRACE_REASON_CT_REPLY; 933 ret = ipv6_l3(ctx, ETH_HLEN, NULL, NULL, METRIC_EGRESS); 934 if (unlikely(ret != CTX_ACT_OK)) 935 return ret; 936 937 ret = lb6_rev_nat(ctx, l4_off, ct_state.rev_nat_index, 938 &tuple); 939 if (IS_ERR(ret)) 940 return ret; 941 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 942 return DROP_INVALID; 943 ctx_snat_done_set(ctx); 944 #ifndef HAVE_FIB_IFINDEX 945 ifindex = ct_state.ifindex; 946 #endif 947 #ifdef TUNNEL_MODE 948 { 949 union v6addr *dst = (union v6addr *)&ip6->daddr; 950 struct remote_endpoint_info *info; 951 952 info = lookup_ip6_remote_endpoint(dst, 0); 953 if (info && info->tunnel_endpoint && !info->flag_skip_tunnel) { 954 tunnel_endpoint = info->tunnel_endpoint; 955 dst_sec_identity = info->sec_identity; 956 goto encap_redirect; 957 } 958 } 959 #endif 960 961 goto fib_lookup; 962 } 963 out: 964 return CTX_ACT_OK; 965 966 #ifdef TUNNEL_MODE 967 encap_redirect: 968 src_port = tunnel_gen_src_port_v6(&tuple); 969 970 ret = nodeport_add_tunnel_encap(ctx, IPV4_DIRECT_ROUTING, src_port, 971 tunnel_endpoint, SECLABEL, dst_sec_identity, 972 
					trace->reason, trace->monitor, &ifindex);
	if (IS_ERR(ret))
		return ret;

	if (ret == CTX_ACT_REDIRECT && ifindex)
		return ctx_redirect(ctx, ifindex, 0);

	fib_params.l.ipv4_src = IPV4_DIRECT_ROUTING;
	fib_params.l.ipv4_dst = tunnel_endpoint;
	fib_params.l.family = AF_INET;

	/* neigh map doesn't contain DMACs for other nodes */
	allow_neigh_map = false;
	goto fib_redirect;
#endif

fib_lookup:
	if (is_v4_in_v6((union v6addr *)&ip6->saddr)) {
		struct iphdr *ip4;

		ret = lb6_to_lb4(ctx, ip6);
		if (ret < 0)
			return ret;

		if (!revalidate_data(ctx, &data, &data_end, &ip4))
			return DROP_INVALID;

		fib_params.l.ipv4_src = ip4->saddr;
		fib_params.l.ipv4_dst = ip4->daddr;
		fib_params.l.family = AF_INET;
	} else {
		ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_src,
			       (union v6addr *)&ip6->saddr);
		ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_dst,
			       (union v6addr *)&ip6->daddr);
	}

#ifdef TUNNEL_MODE
fib_redirect:
#endif
	return fib_redirect(ctx, true, &fib_params, allow_neigh_map, ext_err, &ifindex);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_REVNAT)
static __always_inline
int tail_nodeport_rev_dnat_ingress_ipv6(struct __ctx_buff *ctx)
{
	struct trace_ctx trace = {
		.reason = TRACE_REASON_CT_REPLY,
		.monitor = TRACE_PAYLOAD_LEN,
	};
	__s8 ext_err = 0;
	int ret = 0;

	ret = nodeport_rev_dnat_ingress_ipv6(ctx, &trace, &ext_err);
	if (IS_ERR(ret))
		goto drop;

	if (ret == CTX_ACT_OK) {
		if (is_defined(IS_BPF_LXC)) {
			ret = DROP_NAT_NO_MAPPING;
			goto drop;
		}

		ctx_skip_nodeport_set(ctx);
		ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_FROM_NETDEV, &ext_err);
		goto drop;
	}

#ifndef IS_BPF_LXC
	edt_set_aggregate(ctx, 0);
#endif
	cilium_capture_out(ctx);
	return ret;
drop:
	return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err,
					  CTX_ACT_DROP, METRIC_EGRESS);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS)
static __always_inline
int tail_nodeport_nat_ingress_ipv6(struct __ctx_buff *ctx)
{
	struct ipv6_nat_target target = {
		.min_port = NODEPORT_PORT_MIN_NAT,
		.max_port = NODEPORT_PORT_MAX_NAT,
	};
	struct trace_ctx trace = {
		.reason = TRACE_REASON_CT_REPLY,
		.monitor = TRACE_PAYLOAD_LEN,
	};
	__u32 src_id = 0;
	__s8 ext_err = 0;
	int ret;

	ret = snat_v6_rev_nat(ctx, &target, &trace, &ext_err);
	if (IS_ERR(ret)) {
		if (ret == NAT_PUNT_TO_STACK ||
		    /* DROP_NAT_NO_MAPPING is unwanted behavior in a
		     * rev-SNAT context. Let's continue passing it up
		     * to the host and revisit this later if needed.
		     */
		    ret == DROP_NAT_NO_MAPPING) {
			/* In case of no mapping, recircle back to the
			 * main path. SNAT is very expensive in terms
			 * of instructions and complexity. Consequently,
			 * this is done inside a tail call here (because
			 * we don't have BPF to BPF calls).
1082 */ 1083 goto recircle; 1084 } 1085 goto drop_err; 1086 } 1087 1088 ctx_snat_done_set(ctx); 1089 1090 #if !defined(ENABLE_DSR) || (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) 1091 1092 # if defined(ENABLE_HOST_FIREWALL) && defined(IS_BPF_HOST) 1093 ret = ipv6_host_policy_ingress(ctx, &src_id, &trace, &ext_err); 1094 if (IS_ERR(ret)) 1095 goto drop_err; 1096 1097 ctx_skip_host_fw_set(ctx); 1098 # endif 1099 1100 ret = invoke_traced_tailcall_if(__and(is_defined(ENABLE_HOST_FIREWALL), 1101 is_defined(IS_BPF_HOST)), 1102 CILIUM_CALL_IPV6_NODEPORT_REVNAT, 1103 nodeport_rev_dnat_ingress_ipv6, 1104 &trace, &ext_err); 1105 if (IS_ERR(ret)) 1106 goto drop_err; 1107 1108 if (ret == CTX_ACT_OK) 1109 goto recircle; 1110 1111 edt_set_aggregate(ctx, 0); 1112 cilium_capture_out(ctx); 1113 return ret; 1114 #endif 1115 1116 recircle: 1117 ctx_skip_nodeport_set(ctx); 1118 ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_FROM_NETDEV, &ext_err); 1119 1120 drop_err: 1121 return send_drop_notify_error_ext(ctx, src_id, ret, ext_err, CTX_ACT_DROP, 1122 METRIC_INGRESS); 1123 } 1124 1125 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS) 1126 static __always_inline 1127 int tail_nodeport_nat_egress_ipv6(struct __ctx_buff *ctx) 1128 { 1129 const bool nat_46x64 = nat46x64_cb_xlate(ctx); 1130 struct bpf_fib_lookup_padded fib_params = { 1131 .l = { 1132 .family = AF_INET6, 1133 .ifindex = ctx_get_ifindex(ctx), 1134 }, 1135 }; 1136 struct ipv6_nat_target target = { 1137 .min_port = NODEPORT_PORT_MIN_NAT, 1138 .max_port = NODEPORT_PORT_MAX_NAT, 1139 .addr = IPV6_DIRECT_ROUTING, 1140 }; 1141 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 1142 struct trace_ctx trace = { 1143 .reason = (enum trace_reason)CT_NEW, 1144 .monitor = TRACE_PAYLOAD_LEN, 1145 }; 1146 int ret, l4_off, oif = 0; 1147 void *data, *data_end; 1148 struct ipv6hdr *ip6; 1149 __s8 ext_err = 0; 1150 #ifdef TUNNEL_MODE 1151 struct remote_endpoint_info *info; 1152 __be32 tunnel_endpoint = 0; 1153 __u32 dst_sec_identity = 0; 1154 union v6addr *dst; 1155 #endif 1156 1157 if (nat_46x64) 1158 build_v4_in_v6(&target.addr, IPV4_DIRECT_ROUTING); 1159 1160 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 1161 ret = DROP_INVALID; 1162 goto drop_err; 1163 } 1164 1165 #ifdef TUNNEL_MODE 1166 dst = (union v6addr *)&ip6->daddr; 1167 info = lookup_ip6_remote_endpoint(dst, 0); 1168 if (info && info->tunnel_endpoint != 0 && !info->flag_skip_tunnel) { 1169 tunnel_endpoint = info->tunnel_endpoint; 1170 dst_sec_identity = info->sec_identity; 1171 1172 BPF_V6(target.addr, ROUTER_IP); 1173 } 1174 #endif 1175 1176 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 1177 if (IS_ERR(ret)) 1178 goto drop_err; 1179 1180 ipv6_ct_tuple_swap_ports(&tuple); 1181 tuple.flags = TUPLE_F_OUT; 1182 1183 ret = ipv6_l3(ctx, ETH_HLEN, NULL, NULL, METRIC_EGRESS); 1184 if (unlikely(ret != CTX_ACT_OK)) 1185 goto drop_err; 1186 1187 ret = __snat_v6_nat(ctx, &tuple, l4_off, true, &target, TCP_SPORT_OFF, 1188 &trace, &ext_err); 1189 if (IS_ERR(ret)) 1190 goto drop_err; 1191 1192 ctx_snat_done_set(ctx); 1193 1194 #ifdef TUNNEL_MODE 1195 if (tunnel_endpoint) { 1196 __be16 src_port; 1197 1198 #if __ctx_is == __ctx_skb 1199 { 1200 /* See the corresponding v4 path for details */ 1201 bool l2_hdr_required = false; 1202 1203 ret = maybe_add_l2_hdr(ctx, ENCAP_IFINDEX, &l2_hdr_required); 1204 if (ret != 0) 1205 goto drop_err; 1206 } 1207 #endif 1208 1209 src_port = tunnel_gen_src_port_v6(&tuple); 1210 1211 ret = nodeport_add_tunnel_encap(ctx, 1212 IPV4_DIRECT_ROUTING, 
1213 src_port, 1214 tunnel_endpoint, 1215 WORLD_IPV6_ID, 1216 dst_sec_identity, 1217 trace.reason, 1218 trace.monitor, 1219 &oif); 1220 if (IS_ERR(ret)) 1221 goto drop_err; 1222 1223 if (ret == CTX_ACT_REDIRECT && oif) { 1224 cilium_capture_out(ctx); 1225 return ctx_redirect(ctx, oif, 0); 1226 } 1227 1228 goto fib_ipv4; 1229 } 1230 #endif 1231 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 1232 ret = DROP_INVALID; 1233 goto drop_err; 1234 } 1235 if (nat_46x64) { 1236 struct iphdr *ip4; 1237 1238 ret = lb6_to_lb4(ctx, ip6); 1239 if (ret < 0) 1240 goto drop_err; 1241 1242 #ifdef TUNNEL_MODE 1243 fib_ipv4: 1244 #endif 1245 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 1246 ret = DROP_INVALID; 1247 goto drop_err; 1248 } 1249 fib_params.l.ipv4_src = ip4->saddr; 1250 fib_params.l.ipv4_dst = ip4->daddr; 1251 fib_params.l.family = AF_INET; 1252 } else { 1253 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_src, 1254 (union v6addr *)&ip6->saddr); 1255 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_dst, 1256 (union v6addr *)&ip6->daddr); 1257 } 1258 ret = fib_redirect(ctx, true, &fib_params, false, &ext_err, &oif); 1259 if (fib_ok(ret)) { 1260 cilium_capture_out(ctx); 1261 return ret; 1262 } 1263 drop_err: 1264 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 1265 CTX_ACT_DROP, METRIC_EGRESS); 1266 } 1267 1268 static __always_inline int nodeport_svc_lb6(struct __ctx_buff *ctx, 1269 struct ipv6_ct_tuple *tuple, 1270 struct lb6_service *svc, 1271 struct lb6_key *key, 1272 struct ipv6hdr *ip6, 1273 int l3_off, 1274 int l4_off, 1275 __u32 src_sec_identity __maybe_unused, 1276 __s8 *ext_err) 1277 { 1278 const bool skip_l3_xlate = DSR_ENCAP_MODE == DSR_ENCAP_IPIP; 1279 struct ct_state ct_state_svc = {}; 1280 bool backend_local; 1281 __u32 monitor = 0; 1282 int ret; 1283 1284 if (!lb6_src_range_ok(svc, (union v6addr *)&ip6->saddr)) 1285 return DROP_NOT_IN_SRC_RANGE; 1286 1287 if (!lb6_svc_is_routable(svc)) 1288 return DROP_IS_CLUSTER_IP; 1289 1290 #if defined(ENABLE_L7_LB) 1291 if (lb6_svc_is_l7loadbalancer(svc) && svc->l7_lb_proxy_port > 0) { 1292 if (ctx_is_xdp()) 1293 return CTX_ACT_OK; 1294 1295 send_trace_notify(ctx, TRACE_TO_PROXY, src_sec_identity, UNKNOWN_ID, 1296 bpf_ntohs((__u16)svc->l7_lb_proxy_port), 1297 NATIVE_DEV_IFINDEX, TRACE_REASON_POLICY, monitor); 1298 return ctx_redirect_to_proxy_hairpin_ipv6(ctx, 1299 (__be16)svc->l7_lb_proxy_port); 1300 } 1301 #endif 1302 ret = lb6_local(get_ct_map6(tuple), ctx, l3_off, l4_off, 1303 key, tuple, svc, &ct_state_svc, 1304 skip_l3_xlate, ext_err); 1305 1306 #ifdef SERVICE_NO_BACKEND_RESPONSE 1307 if (ret == DROP_NO_SERVICE) { 1308 edt_set_aggregate(ctx, 0); 1309 ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_NO_SERVICE, 1310 ext_err); 1311 } 1312 #endif 1313 1314 if (IS_ERR(ret)) 1315 return ret; 1316 1317 backend_local = __lookup_ip6_endpoint(&tuple->daddr); 1318 if (!backend_local && lb6_svc_is_hostport(svc)) 1319 return DROP_INVALID; 1320 if (backend_local || !nodeport_uses_dsr6(tuple)) { 1321 struct ct_state ct_state = {}; 1322 1323 /* lookup with SCOPE_FORWARD: */ 1324 __ipv6_ct_tuple_reverse(tuple); 1325 1326 /* only match CT entries that belong to the same service: */ 1327 ct_state.rev_nat_index = ct_state_svc.rev_nat_index; 1328 1329 ret = ct_lazy_lookup6(get_ct_map6(tuple), tuple, ctx, l4_off, 1330 CT_EGRESS, SCOPE_FORWARD, CT_ENTRY_NODEPORT, 1331 &ct_state, &monitor); 1332 if (ret < 0) 1333 return ret; 1334 1335 switch (ret) { 1336 case CT_NEW: 1337 ct_state.src_sec_id = WORLD_IPV6_ID; 1338 ct_state.node_port = 
1; 1339 #ifndef HAVE_FIB_IFINDEX 1340 ct_state.ifindex = (__u16)NATIVE_DEV_IFINDEX; 1341 #endif 1342 1343 ret = ct_create6(get_ct_map6(tuple), NULL, tuple, ctx, 1344 CT_EGRESS, &ct_state, ext_err); 1345 if (IS_ERR(ret)) 1346 return ret; 1347 break; 1348 case CT_ESTABLISHED: 1349 /* Note that we don't validate whether the matched CT entry 1350 * has identical values (eg. .ifindex) as set above. 1351 */ 1352 break; 1353 default: 1354 return DROP_UNKNOWN_CT; 1355 } 1356 1357 ret = neigh_record_ip6(ctx); 1358 if (ret < 0) 1359 return ret; 1360 if (backend_local) { 1361 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 1362 return CTX_ACT_OK; 1363 } 1364 } 1365 1366 /* TX request to remote backend: */ 1367 edt_set_aggregate(ctx, 0); 1368 if (nodeport_uses_dsr6(tuple)) { 1369 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 1370 ctx_store_meta(ctx, CB_HINT, 1371 ((__u32)tuple->sport << 16) | tuple->dport); 1372 ctx_store_meta(ctx, CB_ADDR_V6_1, tuple->daddr.p1); 1373 ctx_store_meta(ctx, CB_ADDR_V6_2, tuple->daddr.p2); 1374 ctx_store_meta(ctx, CB_ADDR_V6_3, tuple->daddr.p3); 1375 ctx_store_meta(ctx, CB_ADDR_V6_4, tuple->daddr.p4); 1376 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE || DSR_ENCAP_MODE == DSR_ENCAP_NONE 1377 ctx_store_meta(ctx, CB_PORT, key->dport); 1378 ctx_store_meta(ctx, CB_ADDR_V6_1, key->address.p1); 1379 ctx_store_meta(ctx, CB_ADDR_V6_2, key->address.p2); 1380 ctx_store_meta(ctx, CB_ADDR_V6_3, key->address.p3); 1381 ctx_store_meta(ctx, CB_ADDR_V6_4, key->address.p4); 1382 #endif /* DSR_ENCAP_MODE */ 1383 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_DSR, ext_err); 1384 } else { 1385 /* This code path is not only hit for NAT64, but also 1386 * for NAT46. For the latter we initially hit the IPv4 1387 * NodePort path, then migrate the request to IPv6 and 1388 * recirculate into the regular IPv6 NodePort path. So 1389 * we need to make sure to not NAT back to IPv4 for 1390 * IPv4-in-IPv6 converted addresses. 1391 */ 1392 ctx_store_meta(ctx, CB_NAT_46X64, 1393 !is_v4_in_v6(&key->address) && 1394 lb6_to_lb4_service(svc)); 1395 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS, 1396 ext_err); 1397 } 1398 } 1399 1400 /* See nodeport_lb4(). 
*/ 1401 static __always_inline int nodeport_lb6(struct __ctx_buff *ctx, 1402 struct ipv6hdr *ip6, 1403 __u32 src_sec_identity, 1404 __s8 *ext_err, 1405 bool __maybe_unused *dsr) 1406 { 1407 bool is_svc_proto __maybe_unused = true; 1408 int ret, l3_off = ETH_HLEN, l4_off; 1409 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 1410 struct lb6_service *svc; 1411 struct lb6_key key = {}; 1412 1413 cilium_capture_in(ctx); 1414 1415 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 1416 if (IS_ERR(ret)) { 1417 if (ret == DROP_UNSUPP_SERVICE_PROTO) { 1418 is_svc_proto = false; 1419 goto skip_service_lookup; 1420 } 1421 if (ret == DROP_UNKNOWN_L4) { 1422 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 1423 return CTX_ACT_OK; 1424 } 1425 return ret; 1426 } 1427 1428 lb6_fill_key(&key, &tuple); 1429 1430 svc = lb6_lookup_service(&key, false); 1431 if (svc) { 1432 return nodeport_svc_lb6(ctx, &tuple, svc, &key, ip6, l3_off, 1433 l4_off, src_sec_identity, ext_err); 1434 } else { 1435 skip_service_lookup: 1436 #ifdef ENABLE_NAT_46X64_GATEWAY 1437 if (is_v4_in_v6_rfc8215((union v6addr *)&ip6->daddr)) { 1438 ret = neigh_record_ip6(ctx); 1439 if (ret < 0) 1440 return ret; 1441 if (is_v4_in_v6_rfc8215((union v6addr *)&ip6->saddr)) 1442 return tail_call_internal(ctx, CILIUM_CALL_IPV64_RFC8215, 1443 ext_err); 1444 ctx_store_meta(ctx, CB_NAT_46X64, NAT46x64_MODE_XLATE); 1445 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS, 1446 ext_err); 1447 } 1448 #endif 1449 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 1450 1451 #ifdef ENABLE_DSR 1452 #if (defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE) || \ 1453 (!defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE != DSR_ENCAP_GENEVE) 1454 if (is_svc_proto && nodeport_uses_dsr6(&tuple)) { 1455 ret = nodeport_extract_dsr_v6(ctx, ip6, &tuple, l4_off, 1456 &key.address, 1457 &key.dport, dsr); 1458 if (IS_ERR(ret)) 1459 return ret; 1460 1461 if (*dsr) 1462 return nodeport_dsr_ingress_ipv6(ctx, &tuple, l4_off, 1463 &key.address, key.dport, 1464 ext_err); 1465 } 1466 #endif 1467 #endif /* ENABLE_DSR */ 1468 1469 #ifndef ENABLE_MASQUERADE_IPV6 1470 if (!is_svc_proto || nodeport_uses_dsr6(&tuple)) 1471 return CTX_ACT_OK; 1472 #endif /* ENABLE_MASQUERADE_IPV6 */ 1473 1474 ctx_store_meta(ctx, CB_NAT_46X64, 0); 1475 ctx_store_meta(ctx, CB_SRC_LABEL, src_sec_identity); 1476 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS, 1477 ext_err); 1478 } 1479 } 1480 1481 static __always_inline int 1482 nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, bool *snat_done, 1483 bool revdnat_only __maybe_unused, 1484 struct trace_ctx *trace, __s8 *ext_err __maybe_unused) 1485 { 1486 struct bpf_fib_lookup_padded fib_params __maybe_unused = {}; 1487 struct lb6_reverse_nat *nat_info; 1488 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 1489 void *data, *data_end; 1490 struct ipv6hdr *ip6; 1491 int ret, l4_off; 1492 1493 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 1494 return DROP_INVALID; 1495 1496 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 1497 if (ret < 0) { 1498 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 1499 return CTX_ACT_OK; 1500 return ret; 1501 } 1502 1503 nat_info = nodeport_rev_dnat_get_info_ipv6(ctx, &tuple); 1504 if (!nat_info) 1505 return CTX_ACT_OK; 1506 1507 #if defined(IS_BPF_HOST) && !defined(ENABLE_SKIP_FIB) 1508 if (revdnat_only) 1509 goto skip_fib; 1510 1511 fib_params.l.family = AF_INET6; 1512 fib_params.l.ifindex = ctx_get_ifindex(ctx); 1513 ipv6_addr_copy((union v6addr *)fib_params.l.ipv6_src, 
1514 &nat_info->address); 1515 ipv6_addr_copy((union v6addr *)fib_params.l.ipv6_dst, 1516 &tuple.daddr); 1517 1518 ret = nodeport_fib_lookup_and_redirect(ctx, &fib_params, ext_err); 1519 if (ret != CTX_ACT_OK) 1520 return ret; 1521 1522 skip_fib: 1523 #endif 1524 1525 ret = ct_lazy_lookup6(get_ct_map6(&tuple), &tuple, ctx, l4_off, CT_INGRESS, 1526 SCOPE_REVERSE, CT_ENTRY_NODEPORT | CT_ENTRY_DSR, 1527 NULL, &trace->monitor); 1528 if (ret == CT_REPLY) { 1529 trace->reason = TRACE_REASON_CT_REPLY; 1530 1531 ret = __lb6_rev_nat(ctx, l4_off, &tuple, nat_info); 1532 if (IS_ERR(ret)) 1533 return ret; 1534 1535 *snat_done = true; 1536 } 1537 1538 return CTX_ACT_OK; 1539 } 1540 1541 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD) 1542 int tail_handle_snat_fwd_ipv6(struct __ctx_buff *ctx) 1543 { 1544 struct trace_ctx trace = { 1545 .reason = TRACE_REASON_UNKNOWN, 1546 .monitor = 0, 1547 }; 1548 enum trace_point obs_point; 1549 union v6addr saddr = {}; 1550 int ret; 1551 __s8 ext_err = 0; 1552 1553 #ifdef IS_BPF_OVERLAY 1554 obs_point = TRACE_TO_OVERLAY; 1555 #else 1556 obs_point = TRACE_TO_NETWORK; 1557 #endif 1558 1559 ret = nodeport_snat_fwd_ipv6(ctx, &saddr, &trace, &ext_err); 1560 if (IS_ERR(ret)) 1561 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 1562 CTX_ACT_DROP, METRIC_EGRESS); 1563 1564 /* contrary to tail_handle_snat_fwd_ipv4, we don't check for 1565 * 1566 * ret == CTX_ACT_OK 1567 * 1568 * in order to emit the event, as egress gateway is not yet supported 1569 * for IPv6, and so it's not possible yet for masqueraded traffic to get 1570 * redirected to another interface 1571 */ 1572 send_trace_notify6(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID, &saddr, 1573 TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX, 1574 trace.reason, trace.monitor); 1575 1576 return ret; 1577 } 1578 1579 static __always_inline int 1580 __handle_nat_fwd_ipv6(struct __ctx_buff *ctx, bool revdnat_only, struct trace_ctx *trace, 1581 __s8 *ext_err) 1582 { 1583 bool snat_done = false; 1584 int ret; 1585 1586 ret = nodeport_rev_dnat_fwd_ipv6(ctx, &snat_done, revdnat_only, trace, ext_err); 1587 if (ret != CTX_ACT_OK || revdnat_only) 1588 return ret; 1589 1590 #if !defined(ENABLE_DSR) || \ 1591 (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ 1592 defined(ENABLE_MASQUERADE_IPV6) 1593 if (!snat_done) 1594 ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD, 1595 ext_err); 1596 #endif 1597 1598 if (is_defined(IS_BPF_HOST) && snat_done) 1599 ctx_snat_done_set(ctx); 1600 1601 return ret; 1602 } 1603 1604 static __always_inline int 1605 handle_nat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace, 1606 __s8 *ext_err) 1607 { 1608 __u32 cb_nat_flags = ctx_load_and_clear_meta(ctx, CB_NAT_FLAGS); 1609 bool revdnat_only = cb_nat_flags & CB_NAT_FLAGS_REVDNAT_ONLY; 1610 1611 return __handle_nat_fwd_ipv6(ctx, revdnat_only, trace, ext_err); 1612 } 1613 1614 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT_FWD) 1615 static __always_inline 1616 int tail_handle_nat_fwd_ipv6(struct __ctx_buff *ctx) 1617 { 1618 struct trace_ctx trace = { 1619 .reason = TRACE_REASON_UNKNOWN, 1620 .monitor = TRACE_PAYLOAD_LEN, 1621 }; 1622 int ret; 1623 enum trace_point obs_point; 1624 __s8 ext_err = 0; 1625 1626 #ifdef IS_BPF_OVERLAY 1627 obs_point = TRACE_TO_OVERLAY; 1628 #else 1629 obs_point = TRACE_TO_NETWORK; 1630 #endif 1631 1632 ret = handle_nat_fwd_ipv6(ctx, &trace, &ext_err); 1633 if (IS_ERR(ret)) 1634 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 1635 CTX_ACT_DROP, 
						  METRIC_EGRESS);

	if (ret == CTX_ACT_OK)
		send_trace_notify(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID,
				  TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX,
				  trace.reason, trace.monitor);

	return ret;
}
#endif /* ENABLE_IPV6 */

#ifdef ENABLE_IPV4
static __always_inline bool nodeport_uses_dsr4(const struct ipv4_ct_tuple *tuple)
{
	return nodeport_uses_dsr(tuple->nexthdr);
}

static __always_inline bool
nodeport_has_nat_conflict_ipv4(const struct iphdr *ip4 __maybe_unused,
			       struct ipv4_nat_target *target __maybe_unused)
{
#if defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY)
	if (ip4->saddr == IPV4_GATEWAY) {
		target->addr = IPV4_GATEWAY;
		target->needs_ct = true;

		return true;
	}
#endif /* TUNNEL_MODE && IS_BPF_OVERLAY */

#if defined(IS_BPF_HOST)
	__u32 dr_ifindex = DIRECT_ROUTING_DEV_IFINDEX;

	/* NATIVE_DEV_IFINDEX == DIRECT_ROUTING_DEV_IFINDEX cannot be moved into
	 * preprocessor, as the former is known only during load time (templating).
	 * This checks whether bpf_host is running on the direct routing device.
	 */
	if (dr_ifindex == NATIVE_DEV_IFINDEX &&
	    ip4->saddr == IPV4_DIRECT_ROUTING) {
		target->addr = IPV4_DIRECT_ROUTING;
		target->needs_ct = true;

		return true;
	}
#endif /* IS_BPF_HOST */

	return false;
}

static __always_inline int nodeport_snat_fwd_ipv4(struct __ctx_buff *ctx,
						  __u32 cluster_id __maybe_unused,
						  __be32 *saddr,
						  struct trace_ctx *trace,
						  __s8 *ext_err)
{
	struct ipv4_nat_target target = {
		.min_port = NODEPORT_PORT_MIN_NAT,
		.max_port = NODEPORT_PORT_MAX_NAT,
#if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)
		.cluster_id = cluster_id,
#endif
	};
	struct ipv4_ct_tuple tuple = {};
	void *data, *data_end;
	struct iphdr *ip4;
	int l4_off, ret;

	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	snat_v4_init_tuple(ip4, NAT_DIR_EGRESS, &tuple);
	l4_off = ETH_HLEN + ipv4_hdrlen(ip4);

	if (lb_is_svc_proto(tuple.nexthdr) &&
	    !nodeport_uses_dsr4(&tuple) &&
	    nodeport_has_nat_conflict_ipv4(ip4, &target))
		goto apply_snat;

	ret = snat_v4_needs_masquerade(ctx, &tuple, ip4, l4_off, &target);
	if (IS_ERR(ret))
		goto out;

#if defined(ENABLE_EGRESS_GATEWAY_COMMON) && defined(IS_BPF_HOST)
	if (target.egress_gateway) {
		/* Send packet to the correct egress interface, and SNAT it there. */
		ret = egress_gw_fib_lookup_and_redirect(ctx, target.addr,
							tuple.daddr, ext_err);
		if (ret != CTX_ACT_OK)
			return ret;

		if (!revalidate_data(ctx, &data, &data_end, &ip4))
			return DROP_INVALID;
	}
#endif

apply_snat:

	*saddr = tuple.saddr;
	ret = snat_v4_nat(ctx, &tuple, ip4, l4_off, ipv4_has_l4_header(ip4),
			  &target, trace, ext_err);
	if (IS_ERR(ret))
		goto out;

	/* If multiple netdevs process an outgoing packet, then this packet will
	 * be handled multiple times by the "to-netdev" section. This can lead
	 * to multiple SNATs. To prevent that, set the SNAT done flag.
	 *
	 * XDP doesn't need the flag (there's no egress prog that would utilize it),
	 * and for overlay traffic it makes no difference whether the inner packet
	 * was SNATed.
1745 */ 1746 if (is_defined(IS_BPF_HOST)) 1747 ctx_snat_done_set(ctx); 1748 1749 out: 1750 if (ret == NAT_PUNT_TO_STACK) 1751 ret = CTX_ACT_OK; 1752 1753 return ret; 1754 } 1755 1756 #ifdef ENABLE_DSR 1757 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 1758 static __always_inline __be32 rss_gen_src4(__be32 client, __be32 l4_hint) 1759 { 1760 const __u32 bits = 32 - IPV4_RSS_PREFIX_BITS; 1761 __be32 src = IPV4_RSS_PREFIX; 1762 1763 if (bits) 1764 src |= bpf_htonl(hash_32(client ^ l4_hint, bits)); 1765 return src; 1766 } 1767 1768 /* 1769 * Original packet: [clientIP:clientPort -> serviceIP:servicePort] } IP/L4 1770 * 1771 * After DSR IPIP: [rssSrcIP -> backendIP] } IP 1772 * [clientIP:clientPort -> serviceIP:servicePort] } IP/L4 1773 */ 1774 static __always_inline int dsr_set_ipip4(struct __ctx_buff *ctx, 1775 const struct iphdr *ip4, 1776 __be32 backend_addr, 1777 __be32 l4_hint, __be16 *ohead) 1778 { 1779 __u16 tot_len = bpf_ntohs(ip4->tot_len) + sizeof(*ip4); 1780 const int l3_off = ETH_HLEN; 1781 __be32 sum; 1782 struct { 1783 __be16 tot_len; 1784 __be16 id; 1785 __be16 frag_off; 1786 __u8 ttl; 1787 __u8 protocol; 1788 __be32 saddr; 1789 __be32 daddr; 1790 } tp_old = { 1791 .tot_len = ip4->tot_len, 1792 .ttl = ip4->ttl, 1793 .protocol = ip4->protocol, 1794 .saddr = ip4->saddr, 1795 .daddr = ip4->daddr, 1796 }, tp_new = { 1797 .tot_len = bpf_htons(tot_len), 1798 .ttl = IPDEFTTL, 1799 .protocol = IPPROTO_IPIP, 1800 .saddr = rss_gen_src4(ip4->saddr, l4_hint), 1801 .daddr = backend_addr, 1802 }; 1803 1804 if (dsr_is_too_big(ctx, tot_len)) { 1805 *ohead = sizeof(*ip4); 1806 return DROP_FRAG_NEEDED; 1807 } 1808 1809 if (ctx_adjust_hroom(ctx, sizeof(*ip4), BPF_ADJ_ROOM_NET, 1810 ctx_adjust_hroom_flags())) 1811 return DROP_INVALID; 1812 sum = csum_diff(&tp_old, 16, &tp_new, 16, 0); 1813 if (ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, tot_len), 1814 &tp_new.tot_len, 2, 0) < 0) 1815 return DROP_WRITE_ERROR; 1816 if (ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, ttl), 1817 &tp_new.ttl, 2, 0) < 0) 1818 return DROP_WRITE_ERROR; 1819 if (ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, saddr), 1820 &tp_new.saddr, 8, 0) < 0) 1821 return DROP_WRITE_ERROR; 1822 if (ipv4_csum_update_by_diff(ctx, l3_off, sum) < 0) 1823 return DROP_CSUM_L3; 1824 return 0; 1825 } 1826 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 1827 static __always_inline int dsr_set_opt4(struct __ctx_buff *ctx, 1828 struct iphdr *ip4, __be32 svc_addr, 1829 __be16 svc_port, __be16 *ohead) 1830 { 1831 __u32 iph_old, iph_new; 1832 struct dsr_opt_v4 opt; 1833 __u16 tot_len = bpf_ntohs(ip4->tot_len) + sizeof(opt); 1834 __be32 sum; 1835 1836 if (ip4->protocol == IPPROTO_TCP) { 1837 union tcp_flags tcp_flags = { .value = 0 }; 1838 1839 if (l4_load_tcp_flags(ctx, ETH_HLEN + ipv4_hdrlen(ip4), &tcp_flags) < 0) 1840 return DROP_CT_INVALID_HDR; 1841 1842 /* Setting the option is required only for the first packet 1843 * (SYN), in the case of TCP, as for further packets of the 1844 * same connection a remote node will use a NAT entry to 1845 * reverse xlate a reply. 
1846 */ 1847 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 1848 return 0; 1849 } 1850 1851 if (ipv4_hdrlen(ip4) + sizeof(opt) > sizeof(struct iphdr) + MAX_IPOPTLEN) 1852 return DROP_CT_INVALID_HDR; 1853 1854 if (dsr_is_too_big(ctx, tot_len)) { 1855 *ohead = sizeof(opt); 1856 return DROP_FRAG_NEEDED; 1857 } 1858 1859 iph_old = *(__u32 *)ip4; 1860 ip4->ihl += sizeof(opt) >> 2; 1861 ip4->tot_len = bpf_htons(tot_len); 1862 iph_new = *(__u32 *)ip4; 1863 1864 opt.type = DSR_IPV4_OPT_TYPE; 1865 opt.len = sizeof(opt); 1866 opt.port = bpf_htons(svc_port); 1867 opt.addr = bpf_htonl(svc_addr); 1868 1869 sum = csum_diff(&iph_old, 4, &iph_new, 4, 0); 1870 sum = csum_diff(NULL, 0, &opt, sizeof(opt), sum); 1871 1872 if (ctx_adjust_hroom(ctx, sizeof(opt), BPF_ADJ_ROOM_NET, 1873 ctx_adjust_hroom_flags())) 1874 return DROP_INVALID; 1875 1876 if (ctx_store_bytes(ctx, ETH_HLEN + sizeof(*ip4), 1877 &opt, sizeof(opt), 0) < 0) 1878 return DROP_INVALID; 1879 if (ipv4_csum_update_by_diff(ctx, ETH_HLEN, sum) < 0) 1880 return DROP_CSUM_L3; 1881 1882 return 0; 1883 } 1884 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 1885 static __always_inline int encap_geneve_dsr_opt4(struct __ctx_buff *ctx, int l3_off __maybe_unused, 1886 struct iphdr *ip4, __be32 svc_addr, 1887 __be16 svc_port, int *ifindex, __be16 *ohead) 1888 { 1889 struct remote_endpoint_info *info __maybe_unused; 1890 struct geneve_dsr_opt4 gopt; 1891 bool need_opt = true; 1892 __u16 encap_len = sizeof(struct iphdr) + sizeof(struct udphdr) + 1893 sizeof(struct genevehdr) + ETH_HLEN; 1894 __u16 total_len = bpf_ntohs(ip4->tot_len); 1895 __u32 src_sec_identity = WORLD_IPV4_ID; 1896 __u32 dst_sec_identity; 1897 __be32 tunnel_endpoint; 1898 __be16 src_port = 0; 1899 #if __ctx_is == __ctx_xdp 1900 bool has_encap = l3_off > ETH_HLEN; 1901 struct iphdr *outer_ip4 = ip4; 1902 void *data, *data_end; 1903 1904 build_bug_on((sizeof(gopt) % 4) != 0); 1905 1906 if (has_encap) { 1907 /* point at the inner IPv4 header */ 1908 if (!revalidate_data_l3_off(ctx, &data, &data_end, &ip4, encap_len + ETH_HLEN)) 1909 return DROP_INVALID; 1910 1911 encap_len = 0; 1912 } else { 1913 struct ipv4_ct_tuple tuple = {}; 1914 int l4_off, ret; 1915 1916 ret = lb4_extract_tuple(ctx, ip4, l3_off, &l4_off, &tuple); 1917 if (IS_ERR(ret)) 1918 return ret; 1919 1920 src_port = tunnel_gen_src_port_v4(&tuple); 1921 } 1922 #endif 1923 1924 #ifdef ENABLE_HIGH_SCALE_IPCACHE 1925 #ifdef IS_BPF_OVERLAY 1926 src_sec_identity = ctx_load_meta(ctx, CB_DSR_SRC_LABEL); 1927 #endif 1928 1929 tunnel_endpoint = ip4->daddr; 1930 dst_sec_identity = 0; 1931 #else 1932 info = lookup_ip4_remote_endpoint(ip4->daddr, 0); 1933 if (!info || info->tunnel_endpoint == 0) 1934 return DROP_NO_TUNNEL_ENDPOINT; 1935 1936 tunnel_endpoint = info->tunnel_endpoint; 1937 dst_sec_identity = info->sec_identity; 1938 #endif 1939 1940 if (ip4->protocol == IPPROTO_TCP) { 1941 union tcp_flags tcp_flags = { .value = 0 }; 1942 1943 if (l4_load_tcp_flags(ctx, l3_off + ipv4_hdrlen(ip4), &tcp_flags) < 0) 1944 return DROP_CT_INVALID_HDR; 1945 1946 /* The GENEVE option is required only for the first packet 1947 * (SYN), in the case of TCP, as for further packets of the 1948 * same connection a remote node will use a NAT entry to 1949 * reverse xlate a reply. 
1950 */ 1951 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 1952 need_opt = false; 1953 } 1954 1955 if (need_opt) { 1956 encap_len += sizeof(struct geneve_dsr_opt4); 1957 set_geneve_dsr_opt4(svc_port, svc_addr, &gopt); 1958 } 1959 1960 if (dsr_is_too_big(ctx, total_len + encap_len)) { 1961 *ohead = encap_len; 1962 return DROP_FRAG_NEEDED; 1963 } 1964 1965 #if __ctx_is == __ctx_xdp 1966 if (has_encap) { 1967 int outer_l4_off = ETH_HLEN + ipv4_hdrlen(outer_ip4); 1968 __be32 lb_ip = IPV4_DIRECT_ROUTING; 1969 __wsum sum = 0; 1970 1971 /* update outer_ip4 daddr and saddr: */ 1972 sum = csum_diff(&outer_ip4->daddr, 4, &tunnel_endpoint, 4, 0); 1973 if (ctx_store_bytes(ctx, ETH_HLEN + offsetof(struct iphdr, daddr), 1974 &tunnel_endpoint, 4, 0) < 0) 1975 return DROP_WRITE_ERROR; 1976 1977 sum = csum_diff(&outer_ip4->saddr, 4, &lb_ip, 4, sum); 1978 if (ctx_store_bytes(ctx, ETH_HLEN + offsetof(struct iphdr, saddr), 1979 &lb_ip, 4, 0) < 0) 1980 return DROP_WRITE_ERROR; 1981 1982 /* adjust outer_ip4->csum: */ 1983 if (ipv4_csum_update_by_diff(ctx, ETH_HLEN, sum) < 0) 1984 return DROP_CSUM_L3; 1985 1986 /* insert the GENEVE-DSR option: */ 1987 if (need_opt) { 1988 __be16 new_length; 1989 int ret; 1990 1991 /* update udp->len */ 1992 if (ctx_load_bytes(ctx, outer_l4_off + offsetof(struct udphdr, len), 1993 &new_length, sizeof(new_length)) < 0) 1994 return DROP_INVALID; 1995 1996 new_length = bpf_htons(bpf_ntohs(new_length) + sizeof(gopt)); 1997 1998 if (ctx_store_bytes(ctx, outer_l4_off + offsetof(struct udphdr, len), 1999 &new_length, sizeof(new_length), 0) < 0) 2000 return DROP_WRITE_ERROR; 2001 2002 /* update outer_ip4->tot_len */ 2003 new_length = bpf_htons(total_len + sizeof(gopt)); 2004 2005 if (ipv4_csum_update_by_value(ctx, ETH_HLEN, outer_ip4->tot_len, 2006 new_length, sizeof(new_length)) < 0) 2007 return DROP_CSUM_L3; 2008 2009 if (ctx_store_bytes(ctx, ETH_HLEN + offsetof(struct iphdr, tot_len), 2010 &new_length, sizeof(new_length), 0) < 0) 2011 return DROP_WRITE_ERROR; 2012 2013 ret = ctx_set_tunnel_opt(ctx, (__u8 *)&gopt, sizeof(gopt)); 2014 if (ret) 2015 return ret; 2016 } 2017 2018 return CTX_ACT_REDIRECT; 2019 } 2020 #endif 2021 2022 if (need_opt) 2023 return nodeport_add_tunnel_encap_opt(ctx, 2024 IPV4_DIRECT_ROUTING, 2025 src_port, 2026 tunnel_endpoint, 2027 src_sec_identity, 2028 dst_sec_identity, 2029 &gopt, 2030 sizeof(gopt), 2031 (enum trace_reason)CT_NEW, 2032 TRACE_PAYLOAD_LEN, 2033 ifindex); 2034 2035 return nodeport_add_tunnel_encap(ctx, 2036 IPV4_DIRECT_ROUTING, 2037 src_port, 2038 tunnel_endpoint, 2039 src_sec_identity, 2040 dst_sec_identity, 2041 (enum trace_reason)CT_NEW, 2042 TRACE_PAYLOAD_LEN, 2043 ifindex); 2044 } 2045 #endif /* DSR_ENCAP_MODE */ 2046 2047 static __always_inline int 2048 nodeport_extract_dsr_v4(struct __ctx_buff *ctx, 2049 const struct iphdr *ip4 __maybe_unused, 2050 const struct ipv4_ct_tuple *tuple, int l4_off, 2051 __be32 *addr, __be16 *port, bool *dsr) 2052 { 2053 struct ipv4_ct_tuple tmp = *tuple; 2054 2055 /* Parse DSR info from the packet, to get the addr/port of the 2056 * addressed service. We need this for RevDNATing the backend's replies. 2057 * 2058 * TCP connections have the DSR Option only in their SYN packet. 2059 * To identify that a non-SYN packet belongs to a DSR connection, 2060 * we need to check whether a corresponding CT entry with .dsr flag exists. 
2061 */ 2062 if (tuple->nexthdr == IPPROTO_TCP) { 2063 union tcp_flags tcp_flags = {}; 2064 2065 if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0) 2066 return DROP_CT_INVALID_HDR; 2067 2068 ipv4_ct_tuple_reverse(&tmp); 2069 2070 if (!(tcp_flags.value & TCP_FLAG_SYN)) { 2071 /* If the packet belongs to a tracked DSR connection, 2072 * trigger a CT update. 2073 * We don't have any DSR info to report back, and that's ok. 2074 */ 2075 *dsr = ct_has_dsr_egress_entry4(get_ct_map4(&tmp), &tmp); 2076 *port = 0; 2077 return 0; 2078 } 2079 } 2080 2081 #if defined(IS_BPF_OVERLAY) 2082 { 2083 struct geneve_dsr_opt4 gopt; 2084 int ret = 0; 2085 2086 ret = ctx_get_tunnel_opt(ctx, &gopt, sizeof(gopt)); 2087 2088 if (ret > 0) { 2089 if (gopt.hdr.opt_class == bpf_htons(DSR_GENEVE_OPT_CLASS) && 2090 gopt.hdr.type == DSR_GENEVE_OPT_TYPE) { 2091 *dsr = true; 2092 *port = gopt.port; 2093 *addr = gopt.addr; 2094 return 0; 2095 } 2096 } 2097 } 2098 #else 2099 /* Check whether IPv4 header contains a 64-bit option (IPv4 header 2100 * w/o option (5 x 32-bit words) + the DSR option (2 x 32-bit words)). 2101 */ 2102 if (ip4->ihl >= 0x7) { 2103 struct dsr_opt_v4 opt; 2104 2105 if (ctx_load_bytes(ctx, ETH_HLEN + sizeof(struct iphdr), 2106 &opt, sizeof(opt)) < 0) 2107 return DROP_INVALID; 2108 2109 if (opt.type == DSR_IPV4_OPT_TYPE && opt.len == sizeof(opt)) { 2110 *dsr = true; 2111 *addr = bpf_ntohl(opt.addr); 2112 *port = bpf_ntohs(opt.port); 2113 return 0; 2114 } 2115 } 2116 #endif 2117 2118 /* SYN for a new connection that's not / no longer DSR. 2119 * If it's reopened, avoid sending subsequent traffic down the DSR path. 2120 */ 2121 if (tuple->nexthdr == IPPROTO_TCP) 2122 ct_update_dsr(get_ct_map4(&tmp), &tmp, false); 2123 2124 return 0; 2125 } 2126 2127 static __always_inline struct ipv4_nat_entry * 2128 nodeport_dsr_lookup_v4_nat_entry(const struct ipv4_ct_tuple *nat_tuple) 2129 { 2130 return snat_v4_lookup(nat_tuple); 2131 } 2132 2133 static __always_inline int dsr_reply_icmp4(struct __ctx_buff *ctx, 2134 struct iphdr *ip4 __maybe_unused, 2135 __be32 svc_addr __maybe_unused, 2136 __be16 dport __maybe_unused, 2137 int code, __be16 ohead __maybe_unused) 2138 { 2139 #ifdef ENABLE_DSR_ICMP_ERRORS 2140 const __s32 orig_dgram = 8, off = ETH_HLEN; 2141 const __u32 l3_max = MAX_IPOPTLEN + sizeof(*ip4) + orig_dgram; 2142 __be16 type = bpf_htons(ETH_P_IP); 2143 __s32 len_new = off + ipv4_hdrlen(ip4) + orig_dgram; 2144 __s32 len_old = ctx_full_len(ctx); 2145 __u8 reason = (__u8)-code; 2146 __u8 tmp[l3_max]; 2147 union macaddr smac, dmac; 2148 struct icmphdr icmp __align_stack_8 = { 2149 .type = ICMP_DEST_UNREACH, 2150 .code = ICMP_FRAG_NEEDED, 2151 .un = { 2152 .frag = { 2153 .mtu = bpf_htons(THIS_MTU - ohead), 2154 }, 2155 }, 2156 }; 2157 __u64 tot_len = sizeof(struct iphdr) + ipv4_hdrlen(ip4) + sizeof(icmp) + orig_dgram; 2158 struct iphdr ip __align_stack_8 = { 2159 .ihl = sizeof(ip) >> 2, 2160 .version = IPVERSION, 2161 .ttl = IPDEFTTL, 2162 .tos = ip4->tos, 2163 .id = ip4->id, 2164 .protocol = IPPROTO_ICMP, 2165 .saddr = ip4->daddr, 2166 .daddr = ip4->saddr, 2167 .frag_off = bpf_htons(IP_DF), 2168 .tot_len = bpf_htons((__u16)tot_len), 2169 }; 2170 2171 struct iphdr inner_ip_hdr __align_stack_8 = *ip4; 2172 __s32 l4_dport_offset; 2173 2174 /* DSR changes the destination address from service ip to pod ip and 2175 * destination port from service port to pod port. While responding 2176 * back with an ICMP error, it is necessary to set them back to the 2177 * original ip and port. 2178 * We do recompute the whole checksum here.
Another way would be to 2179 * unfold checksum and then do the math adding the diff. 2180 */ 2181 inner_ip_hdr.daddr = svc_addr; 2182 inner_ip_hdr.check = 0; 2183 inner_ip_hdr.check = csum_fold(csum_diff(NULL, 0, &inner_ip_hdr, 2184 sizeof(inner_ip_hdr), 0)); 2185 2186 if (inner_ip_hdr.protocol == IPPROTO_UDP) 2187 l4_dport_offset = UDP_DPORT_OFF; 2188 else if (inner_ip_hdr.protocol == IPPROTO_TCP) 2189 l4_dport_offset = TCP_DPORT_OFF; 2190 2191 update_metrics(ctx_full_len(ctx), METRIC_EGRESS, reason); 2192 2193 if (eth_load_saddr(ctx, smac.addr, 0) < 0) 2194 goto drop_err; 2195 if (eth_load_daddr(ctx, dmac.addr, 0) < 0) 2196 goto drop_err; 2197 2198 ip.check = csum_fold(csum_diff(NULL, 0, &ip, sizeof(ip), 0)); 2199 2200 /* We use a workaround here in that we push zero-bytes into the 2201 * payload in order to support dynamic IPv4 header size. This 2202 * works given one's complement sum does not change. 2203 */ 2204 memset(tmp, 0, MAX_IPOPTLEN); 2205 if (ctx_store_bytes(ctx, len_new, tmp, MAX_IPOPTLEN, 0) < 0) 2206 goto drop_err; 2207 if (ctx_load_bytes(ctx, off, tmp, sizeof(tmp)) < 0) 2208 goto drop_err; 2209 2210 memcpy(tmp, &inner_ip_hdr, sizeof(inner_ip_hdr)); 2211 memcpy(tmp + sizeof(inner_ip_hdr) + l4_dport_offset, &dport, sizeof(dport)); 2212 2213 icmp.checksum = csum_fold(csum_diff(NULL, 0, tmp, sizeof(tmp), 2214 csum_diff(NULL, 0, &icmp, 2215 sizeof(icmp), 0))); 2216 2217 if (ctx_adjust_troom(ctx, -(len_old - len_new)) < 0) 2218 goto drop_err; 2219 if (ctx_adjust_hroom(ctx, sizeof(ip) + sizeof(icmp), 2220 BPF_ADJ_ROOM_NET, 2221 ctx_adjust_hroom_flags()) < 0) 2222 goto drop_err; 2223 2224 if (eth_store_daddr(ctx, smac.addr, 0) < 0) 2225 goto drop_err; 2226 if (eth_store_saddr(ctx, dmac.addr, 0) < 0) 2227 goto drop_err; 2228 if (ctx_store_bytes(ctx, ETH_ALEN * 2, &type, sizeof(type), 0) < 0) 2229 goto drop_err; 2230 if (ctx_store_bytes(ctx, off, &ip, sizeof(ip), 0) < 0) 2231 goto drop_err; 2232 if (ctx_store_bytes(ctx, off + sizeof(ip), &icmp, 2233 sizeof(icmp), 0) < 0) 2234 goto drop_err; 2235 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp), 2236 &inner_ip_hdr, sizeof(inner_ip_hdr), 0) < 0) 2237 goto drop_err; 2238 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp) 2239 + sizeof(inner_ip_hdr) + l4_dport_offset, 2240 &dport, sizeof(dport), 0) < 0) 2241 goto drop_err; 2242 2243 return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0); 2244 drop_err: 2245 #endif 2246 return send_drop_notify_error(ctx, UNKNOWN_ID, code, CTX_ACT_DROP, 2247 METRIC_EGRESS); 2248 } 2249 2250 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_DSR) 2251 int tail_nodeport_ipv4_dsr(struct __ctx_buff *ctx) 2252 { 2253 void *data, *data_end; 2254 struct iphdr *ip4; 2255 int ret, oif = 0; 2256 __be16 ohead = 0; 2257 __s8 ext_err = 0; 2258 __be32 addr; 2259 __be16 port; 2260 2261 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2262 ret = DROP_INVALID; 2263 goto drop_err; 2264 } 2265 addr = ctx_load_meta(ctx, CB_ADDR_V4); 2266 port = (__be16)ctx_load_meta(ctx, CB_PORT); 2267 2268 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 2269 ret = dsr_set_ipip4(ctx, ip4, 2270 addr, 2271 ctx_load_meta(ctx, CB_HINT), &ohead); 2272 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 2273 ret = dsr_set_opt4(ctx, ip4, 2274 addr, 2275 port, &ohead); 2276 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 2277 ret = encap_geneve_dsr_opt4(ctx, ctx_load_meta(ctx, CB_DSR_L3_OFF), 2278 ip4, addr, port, &oif, &ohead); 2279 if (!IS_ERR(ret)) { 2280 if (ret == CTX_ACT_REDIRECT && oif) { 2281 cilium_capture_out(ctx); 2282 return ctx_redirect(ctx, oif, 
0); 2283 } 2284 } 2285 #else 2286 # error "Invalid load balancer DSR encapsulation mode!" 2287 #endif 2288 if (IS_ERR(ret)) { 2289 if (dsr_fail_needs_reply(ret)) 2290 return dsr_reply_icmp4(ctx, ip4, addr, port, ret, ohead); 2291 goto drop_err; 2292 } 2293 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2294 ret = DROP_INVALID; 2295 goto drop_err; 2296 } 2297 ret = fib_redirect_v4(ctx, ETH_HLEN, ip4, true, false, &ext_err, &oif); 2298 if (fib_ok(ret)) { 2299 cilium_capture_out(ctx); 2300 return ret; 2301 } 2302 drop_err: 2303 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 2304 CTX_ACT_DROP, METRIC_EGRESS); 2305 } 2306 2307 static __always_inline int 2308 nodeport_dsr_ingress_ipv4(struct __ctx_buff *ctx, struct ipv4_ct_tuple *tuple, 2309 struct iphdr *ip4, bool has_l4_header, int l4_off, 2310 __be32 addr, __be16 port, __s8 *ext_err) 2311 { 2312 struct ct_state ct_state_new = {}; 2313 __u32 monitor = 0; 2314 int ret; 2315 2316 /* lookup with SCOPE_FORWARD: */ 2317 __ipv4_ct_tuple_reverse(tuple); 2318 2319 ret = ct_lazy_lookup4(get_ct_map4(tuple), tuple, ctx, ipv4_is_fragment(ip4), 2320 l4_off, has_l4_header, CT_EGRESS, SCOPE_FORWARD, 2321 CT_ENTRY_DSR, NULL, &monitor); 2322 if (ret < 0) 2323 return ret; 2324 2325 switch (ret) { 2326 case CT_NEW: 2327 create_ct: 2328 if (port == 0) 2329 /* Not expected at all - nodeport_extract_dsr_v4() said 2330 * there would be a CT entry! Without DSR info we can't 2331 * do anything smart here. 2332 */ 2333 return DROP_INVALID; 2334 2335 ct_state_new.src_sec_id = WORLD_IPV4_ID; 2336 ct_state_new.dsr_internal = 1; 2337 2338 ret = ct_create4(get_ct_map4(tuple), NULL, tuple, ctx, 2339 CT_EGRESS, &ct_state_new, ext_err); 2340 if (!IS_ERR(ret)) 2341 ret = snat_v4_create_dsr(tuple, addr, port, ext_err); 2342 2343 if (IS_ERR(ret)) 2344 return ret; 2345 break; 2346 case CT_ESTABLISHED: 2347 /* For TCP we only expect DSR info on the SYN, so CT_ESTABLISHED 2348 * is unexpected and we need to refresh the CT entry. 2349 * 2350 * Otherwise we tolerate DSR info on an established connection. 2351 * TODO: how do we know if we need to refresh the SNAT entry? 2352 */ 2353 if (tuple->nexthdr == IPPROTO_TCP && port) 2354 goto create_ct; 2355 break; 2356 default: 2357 return DROP_UNKNOWN_CT; 2358 } 2359 2360 return CTX_ACT_OK; 2361 } 2362 #endif /* ENABLE_DSR */ 2363 2364 static __always_inline struct lb4_reverse_nat * 2365 nodeport_rev_dnat_get_info_ipv4(struct __ctx_buff *ctx, 2366 struct ipv4_ct_tuple *tuple) 2367 { 2368 struct ipv4_nat_entry *dsr_entry __maybe_unused; 2369 struct ipv4_ct_tuple dsr_tuple __maybe_unused; 2370 __u16 rev_nat_index = 0; 2371 2372 if (!ct_has_nodeport_egress_entry4(get_ct_map4(tuple), tuple, 2373 &rev_nat_index, is_defined(ENABLE_DSR))) 2374 return NULL; 2375 2376 if (rev_nat_index) 2377 return lb4_lookup_rev_nat_entry(ctx, rev_nat_index); 2378 2379 #ifdef ENABLE_DSR 2380 dsr_tuple = *tuple; 2381 2382 dsr_tuple.flags = NAT_DIR_EGRESS; 2383 dsr_tuple.sport = tuple->dport; 2384 dsr_tuple.dport = tuple->sport; 2385 2386 dsr_entry = nodeport_dsr_lookup_v4_nat_entry(&dsr_tuple); 2387 if (dsr_entry) 2388 return &dsr_entry->nat_info; 2389 #endif 2390 2391 return NULL; 2392 } 2393 2394 /* Reverse NAT handling of node-port traffic for the case where the 2395 * backend i) was a local EP and bpf_lxc redirected to us, ii) was 2396 * a remote backend and we got here after reverse SNAT from the 2397 * tail_nodeport_nat_ingress_ipv4(). 2398 * 2399 * Also handles reverse NAT for return-path egress-gw traffic.
2400 * 2401 * CILIUM_CALL_IPV{4,6}_NODEPORT_REVNAT is plugged into CILIUM_MAP_CALLS 2402 * of the bpf_host, bpf_overlay and of the bpf_lxc. 2403 */ 2404 static __always_inline int 2405 nodeport_rev_dnat_ingress_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace, 2406 __s8 *ext_err) 2407 { 2408 struct bpf_fib_lookup_padded fib_params = { 2409 .l = { 2410 .family = AF_INET, 2411 .ifindex = ctx_get_ifindex(ctx), 2412 }, 2413 }; 2414 int ifindex = 0, ret, l3_off = ETH_HLEN, l4_off; 2415 struct ipv4_ct_tuple tuple = {}; 2416 struct ct_state ct_state = {}; 2417 void *data, *data_end; 2418 struct iphdr *ip4; 2419 __u32 tunnel_endpoint __maybe_unused = 0; 2420 __u32 dst_sec_identity __maybe_unused = 0; 2421 __u32 src_sec_identity __maybe_unused = SECLABEL; 2422 bool allow_neigh_map = true; 2423 bool check_revdnat = true; 2424 bool has_l4_header; 2425 2426 if (!revalidate_data(ctx, &data, &data_end, &ip4)) 2427 return DROP_INVALID; 2428 2429 has_l4_header = ipv4_has_l4_header(ip4); 2430 2431 ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple); 2432 if (ret < 0) { 2433 /* If it's not a SVC protocol, we don't need to check for RevDNAT: */ 2434 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 2435 check_revdnat = false; 2436 else 2437 return ret; 2438 } 2439 2440 #if defined(ENABLE_EGRESS_GATEWAY_COMMON) && !defined(IS_BPF_OVERLAY) 2441 /* The gateway node needs to manually steer any reply traffic 2442 * for a remote pod into the tunnel (to avoid iptables potentially 2443 * dropping or accidentally SNATing the packets). 2444 */ 2445 if (egress_gw_reply_needs_redirect_hook(ip4, &tunnel_endpoint, &dst_sec_identity)) { 2446 trace->reason = TRACE_REASON_CT_REPLY; 2447 goto redirect; 2448 } 2449 #endif /* ENABLE_EGRESS_GATEWAY_COMMON */ 2450 2451 if (!check_revdnat) 2452 goto out; 2453 2454 ret = nodeport_rev_dnat_ingress_ipv4_hook(ctx, ip4, &tuple, &tunnel_endpoint, 2455 &src_sec_identity, &dst_sec_identity); 2456 if (ret == CTX_ACT_OK) 2457 return ret; 2458 else if (ret == CTX_ACT_REDIRECT) 2459 goto redirect; 2460 2461 ret = ct_lazy_lookup4(get_ct_map4(&tuple), &tuple, ctx, ipv4_is_fragment(ip4), 2462 l4_off, has_l4_header, CT_INGRESS, SCOPE_REVERSE, 2463 CT_ENTRY_NODEPORT, &ct_state, &trace->monitor); 2464 if (ret == CT_REPLY) { 2465 trace->reason = TRACE_REASON_CT_REPLY; 2466 ret = lb4_rev_nat(ctx, l3_off, l4_off, ct_state.rev_nat_index, false, 2467 &tuple, has_l4_header); 2468 if (IS_ERR(ret)) 2469 return ret; 2470 if (!revalidate_data(ctx, &data, &data_end, &ip4)) 2471 return DROP_INVALID; 2472 ctx_snat_done_set(ctx); 2473 #ifndef HAVE_FIB_IFINDEX 2474 ifindex = ct_state.ifindex; 2475 #endif 2476 #if defined(TUNNEL_MODE) 2477 { 2478 struct remote_endpoint_info *info; 2479 2480 info = lookup_ip4_remote_endpoint(ip4->daddr, 0); 2481 if (info && info->tunnel_endpoint && !info->flag_skip_tunnel) { 2482 tunnel_endpoint = info->tunnel_endpoint; 2483 dst_sec_identity = info->sec_identity; 2484 } 2485 } 2486 #endif 2487 2488 goto redirect; 2489 } 2490 out: 2491 return CTX_ACT_OK; 2492 2493 redirect: 2494 fib_params.l.ipv4_src = ip4->saddr; 2495 fib_params.l.ipv4_dst = ip4->daddr; 2496 2497 ret = ipv4_l3(ctx, l3_off, NULL, NULL, ip4); 2498 if (unlikely(ret != CTX_ACT_OK)) 2499 return ret; 2500 2501 #if (defined(ENABLE_EGRESS_GATEWAY_COMMON) && !defined(IS_BPF_OVERLAY)) || defined(TUNNEL_MODE) 2502 if (tunnel_endpoint) { 2503 __be16 src_port = tunnel_gen_src_port_v4(&tuple); 2504 2505 ret = nodeport_add_tunnel_encap(ctx, IPV4_DIRECT_ROUTING, src_port, 2506 tunnel_endpoint, 
src_sec_identity, dst_sec_identity, 2507 trace->reason, trace->monitor, &ifindex); 2508 if (IS_ERR(ret)) 2509 return ret; 2510 2511 if (ret == CTX_ACT_REDIRECT && ifindex) 2512 return ctx_redirect(ctx, ifindex, 0); 2513 2514 fib_params.l.ipv4_src = IPV4_DIRECT_ROUTING; 2515 fib_params.l.ipv4_dst = tunnel_endpoint; 2516 2517 /* neigh map doesn't contain DMACs for other nodes */ 2518 allow_neigh_map = false; 2519 } 2520 #endif 2521 2522 return fib_redirect(ctx, true, &fib_params, allow_neigh_map, ext_err, &ifindex); 2523 } 2524 2525 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_REVNAT) 2526 static __always_inline 2527 int tail_nodeport_rev_dnat_ingress_ipv4(struct __ctx_buff *ctx) 2528 { 2529 struct trace_ctx trace = { 2530 .reason = TRACE_REASON_UNKNOWN, 2531 .monitor = TRACE_PAYLOAD_LEN, 2532 }; 2533 __s8 ext_err = 0; 2534 int ret = 0; 2535 2536 ret = nodeport_rev_dnat_ingress_ipv4(ctx, &trace, &ext_err); 2537 if (IS_ERR(ret)) 2538 goto drop_err; 2539 2540 if (ret == CTX_ACT_OK) { 2541 /* When called by bpf_lxc to handle a reply by a local backend, 2542 * the packet *must* be redirected. 2543 */ 2544 if (is_defined(IS_BPF_LXC)) { 2545 ret = DROP_NAT_NO_MAPPING; 2546 goto drop_err; 2547 } 2548 2549 ctx_skip_nodeport_set(ctx); 2550 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_FROM_NETDEV, &ext_err); 2551 goto drop_err; 2552 } 2553 2554 #ifndef IS_BPF_LXC 2555 edt_set_aggregate(ctx, 0); 2556 #endif 2557 cilium_capture_out(ctx); 2558 return ret; 2559 2560 drop_err: 2561 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 2562 CTX_ACT_DROP, METRIC_EGRESS); 2563 } 2564 2565 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS) 2566 static __always_inline 2567 int tail_nodeport_nat_ingress_ipv4(struct __ctx_buff *ctx) 2568 { 2569 struct ipv4_nat_target target = { 2570 .min_port = NODEPORT_PORT_MIN_NAT, 2571 .max_port = NODEPORT_PORT_MAX_NAT, 2572 }; 2573 struct trace_ctx trace = { 2574 .reason = TRACE_REASON_UNKNOWN, 2575 .monitor = TRACE_PAYLOAD_LEN, 2576 }; 2577 __u32 src_id = 0; 2578 __s8 ext_err = 0; 2579 int ret; 2580 2581 ret = snat_v4_rev_nat(ctx, &target, &trace, &ext_err); 2582 if (IS_ERR(ret)) { 2583 if (ret == NAT_PUNT_TO_STACK || 2584 /* DROP_NAT_NO_MAPPING is unwanted behavior in a 2585 * rev-SNAT context. Let's continue passing it up 2586 * to the host and revisit this later if 2587 * needed. 2588 */ 2589 ret == DROP_NAT_NO_MAPPING) { 2590 /* In case of no mapping, recircle back to 2591 * main path. SNAT is very expensive in terms 2592 * of instructions and 2593 * complexity. Consequently, this is done 2594 * inside a tail call here (because we don't 2595 * have BPF to BPF calls). 2596 */ 2597 goto recircle; 2598 } 2599 goto drop_err; 2600 } 2601 2602 ctx_snat_done_set(ctx); 2603 2604 /* At this point we know that a reverse SNAT mapping exists. 2605 * Otherwise, we would have tail-called back to 2606 * CALL_IPV4_FROM_NETDEV in the code above. 2607 */ 2608 #if !defined(ENABLE_DSR) || (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ 2609 (defined(ENABLE_EGRESS_GATEWAY_COMMON) && !defined(IS_BPF_OVERLAY)) 2610 2611 # if defined(ENABLE_HOST_FIREWALL) && defined(IS_BPF_HOST) 2612 ret = ipv4_host_policy_ingress(ctx, &src_id, &trace, &ext_err); 2613 if (IS_ERR(ret)) 2614 goto drop_err; 2615 2616 /* We don't want to enforce host policies a second time, 2617 * on recircle / after RevDNAT.
2618 */ 2619 ctx_skip_host_fw_set(ctx); 2620 # endif 2621 2622 /* If we're not in full DSR mode, reply traffic from remote backends 2623 * might pass back through the LB node and requires revDNAT. 2624 * 2625 * Also let nodeport_rev_dnat_ingress_ipv4() redirect EgressGW 2626 * reply traffic into tunnel (see there for details). 2627 */ 2628 ret = invoke_traced_tailcall_if(__and(is_defined(ENABLE_HOST_FIREWALL), 2629 is_defined(IS_BPF_HOST)), 2630 CILIUM_CALL_IPV4_NODEPORT_REVNAT, 2631 nodeport_rev_dnat_ingress_ipv4, 2632 &trace, &ext_err); 2633 if (IS_ERR(ret)) 2634 goto drop_err; 2635 2636 /* No redirect needed: */ 2637 if (ret == CTX_ACT_OK) 2638 goto recircle; 2639 2640 /* Redirected to egress interface: */ 2641 edt_set_aggregate(ctx, 0); 2642 cilium_capture_out(ctx); 2643 return ret; 2644 #endif 2645 2646 recircle: 2647 ctx_skip_nodeport_set(ctx); 2648 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_FROM_NETDEV, &ext_err); 2649 2650 drop_err: 2651 return send_drop_notify_error_ext(ctx, src_id, ret, ext_err, CTX_ACT_DROP, 2652 METRIC_INGRESS); 2653 } 2654 2655 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT_EGRESS) 2656 static __always_inline 2657 int tail_nodeport_nat_egress_ipv4(struct __ctx_buff *ctx) 2658 { 2659 struct bpf_fib_lookup_padded fib_params = { 2660 .l = { 2661 .family = AF_INET, 2662 .ifindex = ctx_get_ifindex(ctx), 2663 }, 2664 }; 2665 struct ipv4_nat_target target = { 2666 .min_port = NODEPORT_PORT_MIN_NAT, 2667 .max_port = NODEPORT_PORT_MAX_NAT, 2668 /* Unfortunately, the bpf_fib_lookup() is not able to set src IP addr. 2669 * So we need to assume that the direct routing device is going to be 2670 * used to fwd the NodePort request, thus SNAT-ing to its IP addr. 2671 * This will change once we have resolved GH#17158. 
2672 */ 2673 .addr = IPV4_DIRECT_ROUTING, 2674 }; 2675 struct ipv4_ct_tuple tuple = {}; 2676 struct trace_ctx trace = { 2677 .reason = (enum trace_reason)CT_NEW, 2678 .monitor = TRACE_PAYLOAD_LEN, 2679 }; 2680 int ret, l4_off, oif = 0; 2681 void *data, *data_end; 2682 bool has_l4_header; 2683 struct iphdr *ip4; 2684 __s8 ext_err = 0; 2685 __u32 dst_sec_identity __maybe_unused = 0; 2686 #ifdef TUNNEL_MODE 2687 __u32 src_sec_identity = ctx_load_meta(ctx, CB_SRC_LABEL); 2688 __u8 cluster_id __maybe_unused = (__u8)ctx_load_meta(ctx, CB_CLUSTER_ID_EGRESS); 2689 struct remote_endpoint_info *info; 2690 __be32 tunnel_endpoint = 0; 2691 #endif 2692 2693 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2694 ret = DROP_INVALID; 2695 goto drop_err; 2696 } 2697 2698 has_l4_header = ipv4_has_l4_header(ip4); 2699 2700 #ifdef TUNNEL_MODE 2701 info = lookup_ip4_remote_endpoint(ip4->daddr, cluster_id); 2702 if (info && info->tunnel_endpoint != 0 && !info->flag_skip_tunnel) { 2703 tunnel_endpoint = info->tunnel_endpoint; 2704 dst_sec_identity = info->sec_identity; 2705 2706 target.addr = IPV4_GATEWAY; 2707 #if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT) 2708 if (cluster_id && cluster_id != CLUSTER_ID) 2709 target.addr = IPV4_INTER_CLUSTER_SNAT; 2710 #endif 2711 } 2712 #endif 2713 2714 ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple); 2715 if (IS_ERR(ret)) 2716 goto drop_err; 2717 2718 /* Extracted ports are in flipped order, but SNAT wants them to 2719 * match the packet header: 2720 */ 2721 ipv4_ct_tuple_swap_ports(&tuple); 2722 tuple.flags = TUPLE_F_OUT; 2723 2724 ret = nodeport_nat_egress_ipv4_hook(ctx, ip4, dst_sec_identity, &tuple, l4_off, &ext_err); 2725 if (ret != CTX_ACT_OK) 2726 return ret; 2727 2728 ret = ipv4_l3(ctx, ETH_HLEN, NULL, NULL, ip4); 2729 if (unlikely(ret != CTX_ACT_OK)) 2730 goto drop_err; 2731 2732 ret = __snat_v4_nat(ctx, &tuple, ip4, has_l4_header, l4_off, 2733 true, &target, TCP_SPORT_OFF, &trace, &ext_err); 2734 if (IS_ERR(ret)) 2735 goto drop_err; 2736 2737 /* This is also needed for from-overlay, to avoid a second SNAT by 2738 * to-overlay or to-netdev. 2739 */ 2740 ctx_snat_done_set(ctx); 2741 2742 #ifdef TUNNEL_MODE 2743 if (tunnel_endpoint) { 2744 __be16 src_port; 2745 2746 #if __ctx_is == __ctx_skb 2747 { 2748 /* Append L2 hdr before redirecting to tunnel netdev. 2749 * Otherwise, the kernel will drop such a request in 2750 * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/net/core/filter.c?h=v6.7.4#n2147 2751 */ 2752 bool l2_hdr_required = false; 2753 2754 ret = maybe_add_l2_hdr(ctx, ENCAP_IFINDEX, &l2_hdr_required); 2755 if (ret != 0) 2756 goto drop_err; 2757 } 2758 #endif 2759 2760 src_port = tunnel_gen_src_port_v4(&tuple); 2761 2762 /* The request came from outside, so we need to 2763 * set the security id in the tunnel header to WORLD_ID. 2764 * Otherwise, the remote node will assume that the 2765 * request originated from a cluster node, which would 2766 * bypass any netpol that disallows LB requests from 2767 * outside.
2768 */ 2769 ret = nodeport_add_tunnel_encap(ctx, 2770 IPV4_DIRECT_ROUTING, 2771 src_port, 2772 tunnel_endpoint, 2773 src_sec_identity, 2774 dst_sec_identity, 2775 trace.reason, 2776 trace.monitor, 2777 &oif); 2778 if (IS_ERR(ret)) 2779 goto drop_err; 2780 2781 if (ret == CTX_ACT_REDIRECT && oif) { 2782 cilium_capture_out(ctx); 2783 return ctx_redirect(ctx, oif, 0); 2784 } 2785 } 2786 #endif 2787 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2788 ret = DROP_INVALID; 2789 goto drop_err; 2790 } 2791 2792 fib_params.l.ipv4_src = ip4->saddr; 2793 fib_params.l.ipv4_dst = ip4->daddr; 2794 2795 ret = fib_redirect(ctx, true, &fib_params, false, &ext_err, &oif); 2796 if (fib_ok(ret)) { 2797 cilium_capture_out(ctx); 2798 return ret; 2799 } 2800 drop_err: 2801 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 2802 CTX_ACT_DROP, METRIC_EGRESS); 2803 } 2804 2805 static __always_inline int nodeport_svc_lb4(struct __ctx_buff *ctx, 2806 struct ipv4_ct_tuple *tuple, 2807 struct lb4_service *svc, 2808 struct lb4_key *key, 2809 struct iphdr *ip4, 2810 int l3_off, 2811 bool has_l4_header, 2812 int l4_off, 2813 __u32 src_sec_identity, 2814 __s8 *ext_err) 2815 { 2816 const bool skip_l3_xlate = DSR_ENCAP_MODE == DSR_ENCAP_IPIP; 2817 bool is_fragment = ipv4_is_fragment(ip4); 2818 struct ct_state ct_state_svc = {}; 2819 __u32 cluster_id = 0; 2820 bool backend_local; 2821 __u32 monitor = 0; 2822 int ret; 2823 2824 if (!lb4_src_range_ok(svc, ip4->saddr)) 2825 return DROP_NOT_IN_SRC_RANGE; 2826 2827 if (!lb4_svc_is_routable(svc)) 2828 return DROP_IS_CLUSTER_IP; 2829 2830 #if defined(ENABLE_L7_LB) 2831 if (lb4_svc_is_l7loadbalancer(svc) && svc->l7_lb_proxy_port > 0) { 2832 /* We cannot redirect from the XDP layer to cilium_host. 2833 * Therefore, let bpf_host handle the L7 ingress 2834 * request. 2835 */ 2836 if (ctx_is_xdp()) 2837 return CTX_ACT_OK; 2838 2839 send_trace_notify(ctx, TRACE_TO_PROXY, src_sec_identity, UNKNOWN_ID, 2840 bpf_ntohs((__u16)svc->l7_lb_proxy_port), 2841 NATIVE_DEV_IFINDEX, TRACE_REASON_POLICY, monitor); 2842 return ctx_redirect_to_proxy_hairpin_ipv4(ctx, ip4, 2843 (__be16)svc->l7_lb_proxy_port); 2844 } 2845 #endif 2846 if (lb4_to_lb6_service(svc)) { 2847 ret = lb4_to_lb6(ctx, ip4, l3_off); 2848 if (!ret) 2849 return NAT_46X64_RECIRC; 2850 } else { 2851 ret = lb4_local(get_ct_map4(tuple), ctx, is_fragment, l3_off, l4_off, 2852 key, tuple, svc, &ct_state_svc, 2853 has_l4_header, skip_l3_xlate, &cluster_id, 2854 ext_err); 2855 #ifdef SERVICE_NO_BACKEND_RESPONSE 2856 if (ret == DROP_NO_SERVICE) { 2857 /* Packet is TX'ed back out, avoid EDT false-positives: */ 2858 edt_set_aggregate(ctx, 0); 2859 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_NO_SERVICE, 2860 ext_err); 2861 } 2862 #endif 2863 } 2864 if (IS_ERR(ret)) 2865 return ret; 2866 2867 backend_local = __lookup_ip4_endpoint(tuple->daddr); 2868 if (!backend_local && lb4_svc_is_hostport(svc)) 2869 return DROP_INVALID; 2870 /* A reply to a DSR packet is never seen on this node again, 2871 * hence there is no need to track it here. 2872 */ 2873 if (backend_local || !nodeport_uses_dsr4(tuple)) { 2874 struct ct_state ct_state = {}; 2875 2876 #if (defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)) 2877 if (src_sec_identity == 0) 2878 src_sec_identity = WORLD_IPV4_ID; 2879 2880 /* Before forwarding the identity, make sure it's not local, 2881 * as in that case the next hop wouldn't understand it.
2882 */ 2883 if (identity_is_local(src_sec_identity)) 2884 return DROP_INVALID_IDENTITY; 2885 2886 if (identity_is_host(src_sec_identity)) 2887 return DROP_INVALID_IDENTITY; 2888 #else 2889 src_sec_identity = WORLD_IPV4_ID; 2890 #endif 2891 2892 /* lookup with SCOPE_FORWARD: */ 2893 __ipv4_ct_tuple_reverse(tuple); 2894 2895 /* only match CT entries that belong to the same service: */ 2896 ct_state.rev_nat_index = ct_state_svc.rev_nat_index; 2897 2898 /* Cache is_fragment in advance, lb4_local may invalidate ip4. */ 2899 ret = ct_lazy_lookup4(get_ct_map4(tuple), tuple, ctx, is_fragment, 2900 l4_off, has_l4_header, CT_EGRESS, SCOPE_FORWARD, 2901 CT_ENTRY_NODEPORT, &ct_state, &monitor); 2902 if (ret < 0) 2903 return ret; 2904 2905 switch (ret) { 2906 case CT_NEW: 2907 ct_state.src_sec_id = src_sec_identity; 2908 ct_state.node_port = 1; 2909 #ifndef HAVE_FIB_IFINDEX 2910 ct_state.ifindex = (__u16)NATIVE_DEV_IFINDEX; 2911 #endif 2912 2913 ret = ct_create4(get_ct_map4(tuple), NULL, tuple, ctx, 2914 CT_EGRESS, &ct_state, ext_err); 2915 if (IS_ERR(ret)) 2916 return ret; 2917 break; 2918 case CT_ESTABLISHED: 2919 /* Note that we don't validate whether the matched CT entry 2920 * has identical values (eg. .ifindex) as set above. 2921 */ 2922 break; 2923 default: 2924 return DROP_UNKNOWN_CT; 2925 } 2926 2927 /* Neighbour tracking is needed for local backend until 2928 * https://github.com/cilium/cilium/issues/24062 is resolved. 2929 */ 2930 ret = neigh_record_ip4(ctx); 2931 if (ret < 0) 2932 return ret; 2933 if (backend_local) { 2934 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 2935 return CTX_ACT_OK; 2936 } 2937 } 2938 2939 /* TX request to remote backend: */ 2940 edt_set_aggregate(ctx, 0); 2941 if (nodeport_uses_dsr4(tuple)) { 2942 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 2943 ctx_store_meta(ctx, CB_HINT, 2944 ((__u32)tuple->sport << 16) | tuple->dport); 2945 ctx_store_meta(ctx, CB_ADDR_V4, tuple->daddr); 2946 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE || DSR_ENCAP_MODE == DSR_ENCAP_NONE 2947 ctx_store_meta(ctx, CB_PORT, key->dport); 2948 ctx_store_meta(ctx, CB_ADDR_V4, key->address); 2949 ctx_store_meta(ctx, CB_DSR_SRC_LABEL, src_sec_identity); 2950 ctx_store_meta(ctx, CB_DSR_L3_OFF, l3_off); 2951 #endif /* DSR_ENCAP_MODE */ 2952 return tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_DSR, ext_err); 2953 } 2954 2955 ctx_store_meta(ctx, CB_SRC_LABEL, src_sec_identity); 2956 ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id); 2957 return tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_NAT_EGRESS, 2958 ext_err); 2959 } 2960 2961 /* Main node-port entry point for host-external ingressing node-port traffic 2962 * which handles the case of: i) backend is local EP, ii) backend is remote EP, 2963 * iii) reply from remote backend EP. 
2964 */ 2965 static __always_inline int nodeport_lb4(struct __ctx_buff *ctx, 2966 struct iphdr *ip4, 2967 int l3_off, 2968 __u32 src_sec_identity, 2969 __s8 *ext_err, 2970 bool __maybe_unused *dsr) 2971 { 2972 bool has_l4_header = ipv4_has_l4_header(ip4); 2973 struct ipv4_ct_tuple tuple = {}; 2974 bool is_svc_proto = true; 2975 struct lb4_service *svc; 2976 struct lb4_key key = {}; 2977 int ret, l4_off; 2978 2979 cilium_capture_in(ctx); 2980 2981 ret = lb4_extract_tuple(ctx, ip4, l3_off, &l4_off, &tuple); 2982 if (IS_ERR(ret)) { 2983 if (ret == DROP_UNSUPP_SERVICE_PROTO) { 2984 is_svc_proto = false; 2985 goto skip_service_lookup; 2986 } 2987 if (ret == DROP_UNKNOWN_L4) { 2988 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 2989 return CTX_ACT_OK; 2990 } 2991 return ret; 2992 } 2993 2994 lb4_fill_key(&key, &tuple); 2995 2996 svc = lb4_lookup_service(&key, false); 2997 if (svc) { 2998 return nodeport_svc_lb4(ctx, &tuple, svc, &key, ip4, l3_off, 2999 has_l4_header, l4_off, 3000 src_sec_identity, ext_err); 3001 } else { 3002 skip_service_lookup: 3003 #ifdef ENABLE_NAT_46X64_GATEWAY 3004 if (ip4->daddr != IPV4_DIRECT_ROUTING) 3005 return tail_call_internal(ctx, CILIUM_CALL_IPV46_RFC8215, ext_err); 3006 #endif 3007 /* The packet is not destined to a service but it can be a reply 3008 * packet from a remote backend, in which case we need to perform 3009 * the reverse NAT. 3010 */ 3011 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 3012 3013 #ifdef ENABLE_DSR 3014 #if (defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE) || \ 3015 (!defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE != DSR_ENCAP_GENEVE) 3016 if (is_svc_proto && nodeport_uses_dsr4(&tuple)) { 3017 /* Check if packet has embedded DSR info, or belongs to 3018 * an established DSR connection: 3019 */ 3020 ret = nodeport_extract_dsr_v4(ctx, ip4, &tuple, 3021 l4_off, &key.address, 3022 &key.dport, dsr); 3023 if (IS_ERR(ret)) 3024 return ret; 3025 3026 if (*dsr) 3027 /* Packet continues on its way to local backend: */ 3028 return nodeport_dsr_ingress_ipv4(ctx, &tuple, ip4, 3029 has_l4_header, l4_off, 3030 key.address, key.dport, 3031 ext_err); 3032 } 3033 #endif 3034 #endif /* ENABLE_DSR */ 3035 3036 #ifndef ENABLE_MASQUERADE_IPV4 3037 /* When BPF-Masquerading is off, we can skip the revSNAT path via 3038 * CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS if: 3039 * - the packet is ICMP, or 3040 * - the packet is DSR-eligible (and thus not reply traffic by 3041 * a remote backend that would require revSNAT / revDNAT) 3042 */ 3043 if (!is_svc_proto || nodeport_uses_dsr4(&tuple)) 3044 return CTX_ACT_OK; 3045 #endif /* ENABLE_MASQUERADE_IPV4 */ 3046 3047 ctx_store_meta(ctx, CB_SRC_LABEL, src_sec_identity); 3048 /* For NAT64 we might see an IPv4 reply from the backend to 3049 * the LB entering this path. Thus, transform back to IPv6. 
3050 */ 3051 if (is_svc_proto && snat_v6_has_v4_match(&tuple)) { 3052 ret = lb4_to_lb6(ctx, ip4, l3_off); 3053 if (ret) 3054 return ret; 3055 ctx_store_meta(ctx, CB_NAT_46X64, 0); 3056 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS, 3057 ext_err); 3058 #ifdef ENABLE_NAT_46X64_GATEWAY 3059 } else if (is_svc_proto && 3060 snat_v6_has_v4_match_rfc8215(&tuple)) { 3061 ret = snat_remap_rfc8215(ctx, ip4, l3_off); 3062 if (ret) 3063 return ret; 3064 ctx_store_meta(ctx, CB_NAT_46X64, NAT46x64_MODE_ROUTE); 3065 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS, 3066 ext_err); 3067 #endif 3068 } 3069 3070 return tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS, ext_err); 3071 } 3072 } 3073 3074 static __always_inline int 3075 nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, bool *snat_done, 3076 bool revdnat_only __maybe_unused, 3077 struct trace_ctx *trace, __s8 *ext_err __maybe_unused) 3078 { 3079 struct bpf_fib_lookup_padded fib_params __maybe_unused = {}; 3080 int ret, l3_off = ETH_HLEN, l4_off; 3081 struct lb4_reverse_nat *nat_info; 3082 struct ipv4_ct_tuple tuple = {}; 3083 struct ct_state ct_state = {}; 3084 void *data, *data_end; 3085 bool has_l4_header, is_fragment; 3086 struct iphdr *ip4; 3087 3088 if (!revalidate_data(ctx, &data, &data_end, &ip4)) 3089 return DROP_INVALID; 3090 3091 has_l4_header = ipv4_has_l4_header(ip4); 3092 is_fragment = ipv4_is_fragment(ip4); 3093 3094 ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple); 3095 if (ret < 0) { 3096 /* If it's not a SVC protocol, we don't need to check for RevDNAT: */ 3097 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 3098 return CTX_ACT_OK; 3099 return ret; 3100 } 3101 3102 nat_info = nodeport_rev_dnat_get_info_ipv4(ctx, &tuple); 3103 if (!nat_info) 3104 return CTX_ACT_OK; 3105 3106 #if defined(IS_BPF_HOST) && !defined(ENABLE_SKIP_FIB) 3107 if (revdnat_only) 3108 goto skip_fib; 3109 3110 /* Perform FIB lookup with post-RevDNAT src IP, and redirect 3111 * packet to the correct egress interface: 3112 */ 3113 fib_params.l.family = AF_INET; 3114 fib_params.l.ifindex = ctx_get_ifindex(ctx); 3115 fib_params.l.ipv4_src = nat_info->address; 3116 fib_params.l.ipv4_dst = tuple.daddr; 3117 3118 ret = nodeport_fib_lookup_and_redirect(ctx, &fib_params, ext_err); 3119 if (ret != CTX_ACT_OK) 3120 return ret; 3121 3122 skip_fib: 3123 #endif 3124 3125 /* Cache is_fragment in advance, nodeport_fib_lookup_and_redirect may invalidate ip4. 
*/ 3126 ret = ct_lazy_lookup4(get_ct_map4(&tuple), &tuple, ctx, is_fragment, 3127 l4_off, has_l4_header, CT_INGRESS, SCOPE_REVERSE, 3128 CT_ENTRY_NODEPORT | CT_ENTRY_DSR, 3129 &ct_state, &trace->monitor); 3130 3131 /* nodeport_rev_dnat_get_info_ipv4() just checked that such a 3132 * CT entry exists: 3133 */ 3134 if (ret == CT_REPLY) { 3135 trace->reason = TRACE_REASON_CT_REPLY; 3136 3137 ret = __lb4_rev_nat(ctx, l3_off, l4_off, &tuple, 3138 nat_info, false, has_l4_header); 3139 if (IS_ERR(ret)) 3140 return ret; 3141 3142 *snat_done = true; 3143 3144 #ifdef ENABLE_DSR 3145 #if defined(ENABLE_HIGH_SCALE_IPCACHE) && \ 3146 defined(IS_BPF_OVERLAY) && \ 3147 DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 3148 /* For HS IPCache, we also need to revDNAT the OuterSrcIP: */ 3149 if (ct_state.dsr_internal) { 3150 struct bpf_tunnel_key key; 3151 3152 if (ctx_get_tunnel_key(ctx, &key, sizeof(key), 0) < 0) 3153 return DROP_NO_TUNNEL_KEY; 3154 3155 /* kernel returns addresses in flipped locations: */ 3156 key.remote_ipv4 = key.local_ipv4; 3157 key.local_ipv4 = bpf_ntohl(nat_info->address); 3158 3159 if (ctx_set_tunnel_key(ctx, &key, sizeof(key), 3160 BPF_F_ZERO_CSUM_TX) < 0) 3161 return DROP_WRITE_ERROR; 3162 } 3163 #endif 3164 #endif 3165 } 3166 3167 return CTX_ACT_OK; 3168 } 3169 3170 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD) 3171 int tail_handle_snat_fwd_ipv4(struct __ctx_buff *ctx) 3172 { 3173 __u32 cluster_id = ctx_load_and_clear_meta(ctx, CB_CLUSTER_ID_EGRESS); 3174 struct trace_ctx trace = { 3175 .reason = TRACE_REASON_UNKNOWN, 3176 .monitor = 0, 3177 }; 3178 enum trace_point obs_point; 3179 __be32 saddr = 0; 3180 int ret; 3181 __s8 ext_err = 0; 3182 3183 #ifdef IS_BPF_OVERLAY 3184 obs_point = TRACE_TO_OVERLAY; 3185 #else 3186 obs_point = TRACE_TO_NETWORK; 3187 #endif 3188 3189 ret = nodeport_snat_fwd_ipv4(ctx, cluster_id, &saddr, &trace, &ext_err); 3190 if (IS_ERR(ret)) 3191 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 3192 CTX_ACT_DROP, METRIC_EGRESS); 3193 3194 /* Don't emit a trace event if the packet has been redirected to another 3195 * interface. 3196 * This can happen for egress gateway traffic that needs to egress from 3197 * the interface to which the egress IP is assigned.
3198 */ 3199 if (ret == CTX_ACT_OK) 3200 send_trace_notify4(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID, saddr, 3201 TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX, 3202 trace.reason, trace.monitor); 3203 3204 return ret; 3205 } 3206 3207 static __always_inline int 3208 __handle_nat_fwd_ipv4(struct __ctx_buff *ctx, __u32 cluster_id __maybe_unused, 3209 bool revdnat_only, struct trace_ctx *trace, __s8 *ext_err) 3210 { 3211 bool snat_done = false; 3212 int ret; 3213 3214 ret = nodeport_rev_dnat_fwd_ipv4(ctx, &snat_done, revdnat_only, trace, ext_err); 3215 if (ret != CTX_ACT_OK || revdnat_only) 3216 return ret; 3217 3218 #if !defined(ENABLE_DSR) || \ 3219 (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ 3220 defined(ENABLE_MASQUERADE_IPV4) || \ 3221 (defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)) 3222 if (!snat_done) { 3223 ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id); 3224 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD, 3225 ext_err); 3226 } 3227 #endif 3228 3229 if (is_defined(IS_BPF_HOST) && snat_done) 3230 ctx_snat_done_set(ctx); 3231 3232 return ret; 3233 } 3234 3235 static __always_inline int 3236 handle_nat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace, 3237 __s8 *ext_err) 3238 { 3239 __u32 cb_nat_flags = ctx_load_and_clear_meta(ctx, CB_NAT_FLAGS); 3240 bool revdnat_only = cb_nat_flags & CB_NAT_FLAGS_REVDNAT_ONLY; 3241 __u32 cluster_id = ctx_load_and_clear_meta(ctx, CB_CLUSTER_ID_EGRESS); 3242 3243 return __handle_nat_fwd_ipv4(ctx, cluster_id, revdnat_only, trace, ext_err); 3244 } 3245 3246 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT_FWD) 3247 static __always_inline 3248 int tail_handle_nat_fwd_ipv4(struct __ctx_buff *ctx) 3249 { 3250 struct trace_ctx trace = { 3251 .reason = TRACE_REASON_UNKNOWN, 3252 .monitor = TRACE_PAYLOAD_LEN, 3253 }; 3254 int ret; 3255 enum trace_point obs_point; 3256 __s8 ext_err = 0; 3257 3258 #ifdef IS_BPF_OVERLAY 3259 obs_point = TRACE_TO_OVERLAY; 3260 #else 3261 obs_point = TRACE_TO_NETWORK; 3262 #endif 3263 3264 ret = handle_nat_fwd_ipv4(ctx, &trace, &ext_err); 3265 if (IS_ERR(ret)) 3266 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 3267 CTX_ACT_DROP, METRIC_EGRESS); 3268 3269 if (ret == CTX_ACT_OK) 3270 send_trace_notify(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID, 3271 TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX, 3272 trace.reason, trace.monitor); 3273 3274 return ret; 3275 } 3276 3277 #endif /* ENABLE_IPV4 */ 3278 3279 #ifdef ENABLE_HEALTH_CHECK 3280 static __always_inline int 3281 health_encap_v4(struct __ctx_buff *ctx, __u32 tunnel_ep, 3282 __u32 seclabel) 3283 { 3284 __u32 key_size = TUNNEL_KEY_WITHOUT_SRC_IP; 3285 struct bpf_tunnel_key key; 3286 3287 /* When encapsulating, a packet originating from the local 3288 * host is being considered as a packet from a remote node 3289 * as it is being received. 3290 */ 3291 memset(&key, 0, sizeof(key)); 3292 key.tunnel_id = get_tunnel_id(seclabel == HOST_ID ? 
LOCAL_NODE_ID : seclabel); 3293 key.remote_ipv4 = bpf_htonl(tunnel_ep); 3294 key.tunnel_ttl = IPDEFTTL; 3295 3296 if (unlikely(ctx_set_tunnel_key(ctx, &key, key_size, 3297 BPF_F_ZERO_CSUM_TX) < 0)) 3298 return DROP_WRITE_ERROR; 3299 return 0; 3300 } 3301 3302 static __always_inline int 3303 health_encap_v6(struct __ctx_buff *ctx, const union v6addr *tunnel_ep, 3304 __u32 seclabel) 3305 { 3306 __u32 key_size = TUNNEL_KEY_WITHOUT_SRC_IP; 3307 struct bpf_tunnel_key key; 3308 3309 memset(&key, 0, sizeof(key)); 3310 key.tunnel_id = get_tunnel_id(seclabel == HOST_ID ? LOCAL_NODE_ID : seclabel); 3311 key.remote_ipv6[0] = tunnel_ep->p1; 3312 key.remote_ipv6[1] = tunnel_ep->p2; 3313 key.remote_ipv6[2] = tunnel_ep->p3; 3314 key.remote_ipv6[3] = tunnel_ep->p4; 3315 key.tunnel_ttl = IPDEFTTL; 3316 3317 if (unlikely(ctx_set_tunnel_key(ctx, &key, key_size, 3318 BPF_F_ZERO_CSUM_TX | 3319 BPF_F_TUNINFO_IPV6) < 0)) 3320 return DROP_WRITE_ERROR; 3321 return 0; 3322 } 3323 3324 static __always_inline int 3325 lb_handle_health(struct __ctx_buff *ctx __maybe_unused, __be16 proto) 3326 { 3327 void *data __maybe_unused, *data_end __maybe_unused; 3328 __sock_cookie key __maybe_unused; 3329 int ret __maybe_unused; 3330 3331 if ((ctx->mark & MARK_MAGIC_HEALTH_IPIP_DONE) == 3332 MARK_MAGIC_HEALTH_IPIP_DONE) 3333 return CTX_ACT_OK; 3334 3335 switch (proto) { 3336 #if defined(ENABLE_IPV4) && DSR_ENCAP_MODE == DSR_ENCAP_IPIP 3337 case bpf_htons(ETH_P_IP): { 3338 struct lb4_health *val; 3339 3340 key = get_socket_cookie(ctx); 3341 val = map_lookup_elem(&LB4_HEALTH_MAP, &key); 3342 if (!val) 3343 return CTX_ACT_OK; 3344 ret = health_encap_v4(ctx, val->peer.address, 0); 3345 if (ret != 0) 3346 return ret; 3347 ctx->mark |= MARK_MAGIC_HEALTH_IPIP_DONE; 3348 return ctx_redirect(ctx, ENCAP4_IFINDEX, 0); 3349 } 3350 #endif 3351 #if defined(ENABLE_IPV6) && DSR_ENCAP_MODE == DSR_ENCAP_IPIP 3352 case bpf_htons(ETH_P_IPV6): { 3353 struct lb6_health *val; 3354 3355 key = get_socket_cookie(ctx); 3356 val = map_lookup_elem(&LB6_HEALTH_MAP, &key); 3357 if (!val) 3358 return CTX_ACT_OK; 3359 ret = health_encap_v6(ctx, &val->peer.address, 0); 3360 if (ret != 0) 3361 return ret; 3362 ctx->mark |= MARK_MAGIC_HEALTH_IPIP_DONE; 3363 return ctx_redirect(ctx, ENCAP6_IFINDEX, 0); 3364 } 3365 #endif 3366 default: 3367 return CTX_ACT_OK; 3368 } 3369 } 3370 #endif /* ENABLE_HEALTH_CHECK */ 3371 3372 /* handle_nat_fwd() handles revDNAT, fib_lookup_redirect, and bpf_snat for 3373 * nodeport. If revdnat_only is set to true, fib_lookup and bpf_snat are 3374 * skipped. The typical use case of handle_nat_fwd(revdnat_only=true) is for 3375 * handling reply traffic that requires revDNAT prior to wireguard/IPsec 3376 * encryption. 
3377 */ 3378 static __always_inline int 3379 handle_nat_fwd(struct __ctx_buff *ctx, __u32 cluster_id, __be16 proto, 3380 bool revdnat_only, struct trace_ctx *trace __maybe_unused, 3381 __s8 *ext_err __maybe_unused) 3382 { 3383 int ret = CTX_ACT_OK; 3384 __u32 cb_nat_flags = 0; 3385 3386 if (revdnat_only) 3387 cb_nat_flags |= CB_NAT_FLAGS_REVDNAT_ONLY; 3388 3389 ctx_store_meta(ctx, CB_NAT_FLAGS, cb_nat_flags); 3390 ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id); 3391 3392 switch (proto) { 3393 #ifdef ENABLE_IPV4 3394 case bpf_htons(ETH_P_IP): 3395 ret = invoke_traced_tailcall_if(__or4(__and(is_defined(ENABLE_IPV4), 3396 is_defined(ENABLE_IPV6)), 3397 __and(is_defined(ENABLE_HOST_FIREWALL), 3398 is_defined(IS_BPF_HOST)), 3399 __and(is_defined(ENABLE_CLUSTER_AWARE_ADDRESSING), 3400 is_defined(ENABLE_INTER_CLUSTER_SNAT)), 3401 __and(is_defined(ENABLE_EGRESS_GATEWAY_COMMON), 3402 is_defined(IS_BPF_HOST))), 3403 CILIUM_CALL_IPV4_NODEPORT_NAT_FWD, 3404 handle_nat_fwd_ipv4, trace, ext_err); 3405 break; 3406 #endif /* ENABLE_IPV4 */ 3407 #ifdef ENABLE_IPV6 3408 case bpf_htons(ETH_P_IPV6): 3409 ret = invoke_traced_tailcall_if(__or(__and(is_defined(ENABLE_IPV4), 3410 is_defined(ENABLE_IPV6)), 3411 __and(is_defined(ENABLE_HOST_FIREWALL), 3412 is_defined(IS_BPF_HOST))), 3413 CILIUM_CALL_IPV6_NODEPORT_NAT_FWD, 3414 handle_nat_fwd_ipv6, trace, ext_err); 3415 break; 3416 #endif /* ENABLE_IPV6 */ 3417 default: 3418 build_bug_on(!(NODEPORT_PORT_MIN_NAT < NODEPORT_PORT_MAX_NAT)); 3419 build_bug_on(!(NODEPORT_PORT_MIN < NODEPORT_PORT_MAX)); 3420 build_bug_on(!(NODEPORT_PORT_MAX < NODEPORT_PORT_MIN_NAT)); 3421 break; 3422 } 3423 return ret; 3424 } 3425 3426 #endif /* ENABLE_NODEPORT */
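Illustrative sketch, not from the Cilium tree: a minimal userspace C program showing the 8-byte IPv4 DSR option handling that dsr_set_opt4() and nodeport_extract_dsr_v4() perform on the datapath. The option layout mirrors struct dsr_opt_v4 above; the option type value and the helper names (csum, insert_dsr_opt, parse_dsr_opt) are local to the sketch, and the header checksum is recomputed from scratch here rather than incrementally via csum_diff()/ipv4_csum_update_by_diff() as in the BPF code.

#include <arpa/inet.h>
#include <netinet/ip.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DSR_OPT_TYPE_SKETCH 0x9a	/* stand-in value, not the real DSR_IPV4_OPT_TYPE */

struct dsr_opt_v4_sketch {		/* same shape as struct dsr_opt_v4 */
	uint8_t  type;
	uint8_t  len;
	uint16_t port;			/* network byte order on the wire */
	uint32_t addr;			/* network byte order on the wire */
};

/* RFC 1071 checksum over a byte range, chained via the sum argument. */
static uint16_t csum(const void *buf, size_t len, uint32_t sum)
{
	const uint8_t *p = buf;

	for (size_t i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Record the service VIP/port in the option, grow the header (ihl, tot_len)
 * and refresh the IPv4 header checksum so it also covers the option bytes.
 */
static void insert_dsr_opt(struct iphdr *ip, struct dsr_opt_v4_sketch *opt,
			   uint32_t svc_addr, uint16_t svc_port)
{
	opt->type = DSR_OPT_TYPE_SKETCH;
	opt->len  = sizeof(*opt);
	opt->port = htons(svc_port);
	opt->addr = htonl(svc_addr);

	ip->ihl     += sizeof(*opt) >> 2;
	ip->tot_len  = htons(ntohs(ip->tot_len) + sizeof(*opt));
	ip->check    = 0;
	ip->check    = htons(~csum(opt, sizeof(*opt), csum(ip, sizeof(*ip), 0)) & 0xffff);
}

/* Backend side: recover the service VIP/port the LB recorded in the option.
 * The header and option are separate objects here; on the wire the option
 * immediately follows the fixed 20-byte header.
 */
static bool parse_dsr_opt(const struct iphdr *ip,
			  const struct dsr_opt_v4_sketch *opt,
			  uint32_t *svc_addr, uint16_t *svc_port)
{
	if (ip->ihl < 7)	/* 5 words of header + 2 words of option */
		return false;
	if (opt->type != DSR_OPT_TYPE_SKETCH || opt->len != sizeof(*opt))
		return false;
	*svc_addr = ntohl(opt->addr);
	*svc_port = ntohs(opt->port);
	return true;
}

int main(void)
{
	struct iphdr ip = {
		.version = 4, .ihl = 5, .ttl = 64, .protocol = IPPROTO_TCP,
		.tot_len = htons(40),	/* 20B header + 20B TCP, no payload */
		.saddr = htonl(0xc0a80a01), .daddr = htonl(0x0a0000fe),
	};
	struct dsr_opt_v4_sketch opt;
	uint32_t addr;
	uint16_t port;

	insert_dsr_opt(&ip, &opt, 0x0af40001 /* service VIP */, 30080);
	if (parse_dsr_opt(&ip, &opt, &addr, &port))
		printf("svc=0x%08x:%u ihl=%u tot_len=%u check=0x%04x\n",
		       (unsigned)addr, (unsigned)port, (unsigned)ip.ihl,
		       (unsigned)ntohs(ip.tot_len), (unsigned)ntohs(ip.check));
	return 0;
}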
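Also illustrative, again outside the original header: a small userspace sketch of the ICMP "fragmentation needed" reply that dsr_reply_icmp4() builds when adding the DSR option or encap would push the packet over the MTU. It assembles a fresh IPv4 header plus an ICMP type 3 / code 4 header advertising a reduced MTU, followed by the original (inner) IPv4 header and the first 8 bytes of its datagram, and checksums the ICMP part over all of that. Buffer layout, the example MTU and the csum helper are assumptions of the sketch.

#include <arpa/inet.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* RFC 1071 checksum, as in the previous sketch. */
static uint16_t csum(const void *buf, size_t len, uint32_t sum)
{
	const uint8_t *p = buf;

	for (size_t i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Write [outer IPv4][ICMP frag-needed][inner IPv4][8B of original L4] into
 * out and return the number of bytes written.
 */
static size_t build_frag_needed(uint8_t *out, const struct iphdr *orig,
				const uint8_t *orig_l4, uint16_t mtu)
{
	size_t inner_len = sizeof(*orig) + 8;	/* quoted header + 8B of datagram */
	struct icmphdr icmp = {
		.type = ICMP_DEST_UNREACH,
		.code = ICMP_FRAG_NEEDED,
		.un.frag.mtu = htons(mtu),
	};
	struct iphdr ip = {
		.version = 4, .ihl = 5, .ttl = 64,
		.protocol = IPPROTO_ICMP,
		.frag_off = htons(IP_DF),
		.tot_len = htons(sizeof(ip) + sizeof(icmp) + inner_len),
		.saddr = orig->daddr,	/* reply goes back to the sender */
		.daddr = orig->saddr,
	};
	size_t off = 0;
	uint32_t sum;

	/* ICMP checksum covers the ICMP header plus the quoted packet. */
	sum = csum(&icmp, sizeof(icmp), 0);
	sum = csum(orig, sizeof(*orig), sum);
	sum = csum(orig_l4, 8, sum);
	icmp.checksum = htons(~sum & 0xffff);

	ip.check = htons(~csum(&ip, sizeof(ip), 0) & 0xffff);

	memcpy(out + off, &ip, sizeof(ip));     off += sizeof(ip);
	memcpy(out + off, &icmp, sizeof(icmp)); off += sizeof(icmp);
	memcpy(out + off, orig, sizeof(*orig)); off += sizeof(*orig);
	memcpy(out + off, orig_l4, 8);          off += 8;
	return off;
}

int main(void)
{
	struct iphdr orig = {
		.version = 4, .ihl = 5, .ttl = 64, .protocol = IPPROTO_TCP,
		.tot_len = htons(1480),
		.saddr = htonl(0xc0a80a01), .daddr = htonl(0x0a0000fe),
	};
	uint8_t l4[8] = { 0xab, 0xcd, 0x75, 0x30 };	/* sport/dport stub, rest zero */
	uint8_t pkt[128];
	size_t n = build_frag_needed(pkt, &orig, l4, 1400);

	printf("built %zu byte frag-needed reply\n", n);
	return 0;
}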
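Finally, the encap paths above derive the outer UDP source port from the connection tuple (tunnel_gen_src_port_v4()). The real helper lives elsewhere in the tree and its algorithm is not shown in this header; the sketch below is only a stand-in, using an arbitrary FNV-style hash, to illustrate why a flow-stable source port is useful: every packet of one connection keeps the same outer 5-tuple, so ECMP and RSS place the whole flow on one underlay path.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* 4-tuple plus protocol, mirroring what the datapath keeps in the CT tuple. */
struct flow4 {
	uint32_t saddr, daddr;	/* network byte order */
	uint16_t sport, dport;	/* network byte order */
	uint8_t  proto;
};

/* Word-wise FNV-1a over the tuple; any stable hash works for this purpose. */
static uint32_t flow_hash(const struct flow4 *f)
{
	uint32_t words[4] = { f->saddr, f->daddr,
			      (uint32_t)f->sport << 16 | f->dport, f->proto };
	uint32_t h = 2166136261u;

	for (size_t i = 0; i < 4; i++) {
		h ^= words[i];
		h *= 16777619u;
	}
	return h;
}

/* Map the hash into the dynamic port range 49152..65535 (an assumption of
 * this sketch, not necessarily the range the datapath uses).
 */
static uint16_t tunnel_src_port(const struct flow4 *f)
{
	return (uint16_t)(49152 + flow_hash(f) % 16384);
}

int main(void)
{
	struct flow4 f = {
		.saddr = htonl(0xc0a80a01), .daddr = htonl(0x0a0000fe),
		.sport = htons(33412), .dport = htons(30080),
		.proto = 6,
	};

	printf("outer UDP sport = %u\n", (unsigned)tunnel_src_port(&f));
	return 0;
}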