github.com/datadog/cilium@v1.6.12/bpf/lib/nat.h (about)

     1  /*
     2   *  Copyright (C) 2019 Authors of Cilium
     3   *
     4   *  This program is free software; you can redistribute it and/or modify
     5   *  it under the terms of the GNU General Public License as published by
     6   *  the Free Software Foundation; either version 2 of the License, or
     7   *  (at your option) any later version.
     8   *
     9   *  This program is distributed in the hope that it will be useful,
    10   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
    11   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    12   *  GNU General Public License for more details.
    13   *
    14   *  You should have received a copy of the GNU General Public License
    15   *  along with this program; if not, write to the Free Software
    16   *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    17   */
    18  /* Simple NAT engine in BPF. */
    19  #ifndef __LIB_NAT__
    20  #define __LIB_NAT__
    21  
    22  #include <linux/icmp.h>
    23  #include <linux/tcp.h>
    24  #include <linux/udp.h>
    25  #include <linux/ip.h>
    26  #include <linux/icmpv6.h>
    27  #include <linux/ipv6.h>
    28  
    29  #include "common.h"
    30  #include "drop.h"
    31  #include "signal.h"
    32  #include "conntrack.h"
    33  #include "conntrack_map.h"
    34  
    35  enum {
    36  	NAT_DIR_EGRESS  = TUPLE_F_OUT,
    37  	NAT_DIR_INGRESS = TUPLE_F_IN,
    38  };
    39  
    40  struct nat_entry {
    41  	__u64 created;
    42  	__u64 host_local;	/* Only single bit used. */
    43  	__u64 pad1;		/* Future use. */
    44  	__u64 pad2;		/* Future use. */
    45  };
    46  
    47  #define NAT_CONTINUE_XLATE 	0
    48  
    49  #ifdef HAVE_LRU_MAP_TYPE
    50  # define NAT_MAP_TYPE BPF_MAP_TYPE_LRU_HASH
    51  #else
    52  # define NAT_MAP_TYPE BPF_MAP_TYPE_HASH
    53  #endif
    54  
    55  #ifdef HAVE_LARGE_INSN_LIMIT
    56  # define SNAT_COLLISION_RETRIES		128
    57  # define SNAT_SIGNAL_THRES		64
    58  #else
    59  # if defined ENABLE_IPV4 && defined ENABLE_IPV6
    60  #  define SNAT_COLLISION_RETRIES	19
    61  # else
    62  #  define SNAT_COLLISION_RETRIES	20
    63  # endif
    64  # define SNAT_SIGNAL_THRES		10
    65  #endif
    66  
    67  static __always_inline __be16 __snat_clamp_port_range(__u16 start, __u16 end,
    68  						      __u16 val)
    69  {
    70  	return (val % (__u16)(end - start)) + start;
    71  }
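
A note on the clamp above: because the modulus is (end - start), the result always lands in [start, end - 1]; end itself is never produced, and callers must guarantee end > start. A small user-space sketch of the same arithmetic (ordinary C, not BPF; the values are made up):

	#include <stdint.h>
	#include <stdio.h>

	/* Same arithmetic as __snat_clamp_port_range(), in plain C. */
	static uint16_t clamp_port(uint16_t start, uint16_t end, uint16_t val)
	{
		return (uint16_t)(val % (uint16_t)(end - start)) + start;
	}

	int main(void)
	{
		printf("%u\n", clamp_port(1024, 65535, 0));     /* prints 1024 */
		printf("%u\n", clamp_port(1024, 65535, 64510)); /* prints 65534: 'end' is never produced */
		printf("%u\n", clamp_port(1024, 65535, 64511)); /* wraps back to 1024 */
		return 0;
	}
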
    72  
    73  static __always_inline void *__snat_lookup(void *map, void *tuple)
    74  {
    75  	return map_lookup_elem(map, tuple);
    76  }
    77  
    78  static __always_inline int __snat_update(void *map, void *otuple, void *ostate,
    79  					 void *rtuple, void *rstate)
    80  {
    81  	int ret = map_update_elem(map, rtuple, rstate, BPF_NOEXIST);
    82  	if (!ret) {
    83  		ret = map_update_elem(map, otuple, ostate, BPF_NOEXIST);
    84  		if (ret)
    85  			map_delete_elem(map, rtuple);
    86  	}
    87  	return ret;
    88  }
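
__snat_update() inserts the reverse entry first; since that entry is keyed on the translated address and port, the BPF_NOEXIST insert is what detects a collision with another flow, and the forward entry is only added once it succeeds, with the reverse entry removed again if the second insert fails. The same two-phase pattern expressed with libbpf's user-space map wrappers, as a sketch (the map fd and key/value layouts are assumptions):

	#include <bpf/bpf.h>
	#include <linux/bpf.h>

	/* Sketch only: mirrors the __snat_update() insert order from user space. */
	static int pairwise_update(int map_fd,
				   const void *okey, const void *oval,
				   const void *rkey, const void *rval)
	{
		int ret = bpf_map_update_elem(map_fd, rkey, rval, BPF_NOEXIST);

		if (!ret) {
			ret = bpf_map_update_elem(map_fd, okey, oval, BPF_NOEXIST);
			if (ret)
				bpf_map_delete_elem(map_fd, rkey); /* roll the reservation back */
		}
		return ret;
	}
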
    89  
    90  static __always_inline void __snat_delete(void *map, void *otuple,
    91  					  void *rtuple)
    92  {
    93  	map_delete_elem(map, otuple);
    94  	map_delete_elem(map, rtuple);
    95  }
    96  
    97  struct ipv4_nat_entry {
    98  	struct nat_entry common;
    99  	union {
   100  		struct {
   101  			__be32 to_saddr;
   102  			__be16 to_sport;
   103  		};
   104  		struct {
   105  			__be32 to_daddr;
   106  			__be16 to_dport;
   107  		};
   108  	};
   109  };
   110  
   111  struct ipv4_nat_target {
   112  	__be32 addr;
   113  	const __u16 min_port; /* host endianness */
   114  	const __u16 max_port; /* host endianness */
   115  	bool src_from_world;
   116  };
   117  
   118  #if defined ENABLE_IPV4 && (defined ENABLE_MASQUERADE || defined ENABLE_NODEPORT)
   119  struct bpf_elf_map __section_maps SNAT_MAPPING_IPV4 = {
   120  	.type		= NAT_MAP_TYPE,
   121  	.size_key	= sizeof(struct ipv4_ct_tuple),
   122  	.size_value	= sizeof(struct ipv4_nat_entry),
   123  	.pinning	= PIN_GLOBAL_NS,
   124  	.max_elem	= SNAT_MAPPING_IPV4_SIZE,
   125  #ifndef HAVE_LRU_MAP_TYPE
   126  	.flags		= CONDITIONAL_PREALLOC,
   127  #endif
   128  };
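
Because the map is pinned via PIN_GLOBAL_NS, it outlives the program and can be opened from user space for debugging; the tc/iproute2 loader normally places such pins under /sys/fs/bpf/tc/globals/. A minimal sketch, assuming a hypothetical pin name (the real one comes from the SNAT_MAPPING_IPV4 macro defined outside this header):

	#include <bpf/bpf.h>

	int open_snat_v4_map(void)
	{
		/* Hypothetical path; substitute the actual SNAT_MAPPING_IPV4 name.
		 * Keys are struct ipv4_ct_tuple, values struct ipv4_nat_entry. */
		return bpf_obj_get("/sys/fs/bpf/tc/globals/snat_mapping_v4");
	}
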
   129  
   130  static __always_inline
   131  struct ipv4_nat_entry *snat_v4_lookup(struct ipv4_ct_tuple *tuple)
   132  {
   133  	return __snat_lookup(&SNAT_MAPPING_IPV4, tuple);
   134  }
   135  
   136  static __always_inline int snat_v4_update(struct ipv4_ct_tuple *otuple,
   137  					  struct ipv4_nat_entry *ostate,
   138  					  struct ipv4_ct_tuple *rtuple,
   139  					  struct ipv4_nat_entry *rstate)
   140  {
   141  	return __snat_update(&SNAT_MAPPING_IPV4, otuple, ostate,
   142  			     rtuple, rstate);
   143  }
   144  
   145  static __always_inline void snat_v4_delete(struct ipv4_ct_tuple *otuple,
   146  					   struct ipv4_ct_tuple *rtuple)
   147  {
   148  	__snat_delete(&SNAT_MAPPING_IPV4, otuple, rtuple);
   149  }
   150  
   151  static __always_inline void snat_v4_swap_tuple(struct ipv4_ct_tuple *otuple,
   152  					       struct ipv4_ct_tuple *rtuple)
   153  {
   154  	__builtin_memset(rtuple, 0, sizeof(*rtuple));
   155  	rtuple->nexthdr = otuple->nexthdr;
   156  	rtuple->daddr = otuple->saddr;
   157  	rtuple->saddr = otuple->daddr;
   158  	rtuple->dport = otuple->sport;
   159  	rtuple->sport = otuple->dport;
   160  	rtuple->flags = otuple->flags == NAT_DIR_EGRESS ?
   161  			NAT_DIR_INGRESS : NAT_DIR_EGRESS;
   162  }
   163  
   164  static __always_inline int snat_v4_reverse_tuple(struct ipv4_ct_tuple *otuple,
   165  						 struct ipv4_ct_tuple *rtuple)
   166  {
   167  	struct ipv4_nat_entry *ostate;
   168  
   169  	ostate = snat_v4_lookup(otuple);
   170  	if (ostate) {
   171  		snat_v4_swap_tuple(otuple, rtuple);
   172  		rtuple->daddr = ostate->to_saddr;
   173  		rtuple->dport = ostate->to_sport;
   174  	}
   175  
   176  	return ostate ? 0 : -1;
   177  }
   178  
   179  static __always_inline void snat_v4_ct_canonicalize(struct ipv4_ct_tuple *otuple)
   180  {
   181  	__be32 addr = otuple->saddr;
   182  
   183  	otuple->flags = NAT_DIR_EGRESS;
   184  	/* Workaround #5848. */
   185  	otuple->saddr = otuple->daddr;
   186  	otuple->daddr = addr;
   187  }
   188  
   189  static __always_inline void snat_v4_delete_tuples(struct ipv4_ct_tuple *otuple)
   190  {
   191  	struct ipv4_ct_tuple rtuple;
   192  
   193  	if (otuple->flags & TUPLE_F_IN)
   194  		return;
   195  	snat_v4_ct_canonicalize(otuple);
   196  	if (!snat_v4_reverse_tuple(otuple, &rtuple))
   197  		snat_v4_delete(otuple, &rtuple);
   198  }
   199  
   200  static __always_inline int snat_v4_new_mapping(struct __sk_buff *skb,
   201  					       struct ipv4_ct_tuple *otuple,
   202  					       struct ipv4_nat_entry *ostate,
   203  					       const struct ipv4_nat_target *target)
   204  {
   205  	int ret = DROP_NAT_NO_MAPPING, retries;
   206  	struct ipv4_nat_entry rstate;
   207  	struct ipv4_ct_tuple rtuple;
   208  	__u16 port;
   209  
   210  	__builtin_memset(&rstate, 0, sizeof(rstate));
   211  	__builtin_memset(ostate, 0, sizeof(*ostate));
   212  
   213  	rstate.to_daddr = otuple->saddr;
   214  	rstate.to_dport = otuple->sport;
   215  
   216  	ostate->to_saddr = target->addr;
   217  
   218  	snat_v4_swap_tuple(otuple, &rtuple);
   219  	port = __snat_clamp_port_range(target->min_port,
   220  				       target->max_port,
   221  				       get_prandom_u32());
   222  
   223  	rtuple.dport = ostate->to_sport = bpf_htons(port);
   224  	rtuple.daddr = target->addr;
   225  
   226  	if (otuple->saddr == target->addr) {
   227  		ostate->common.host_local = 1;
   228  		rstate.common.host_local = ostate->common.host_local;
   229  	}
   230  
   231  #pragma unroll
   232  	for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) {
   233  		if (!snat_v4_lookup(&rtuple)) {
   234  			ostate->common.created = bpf_ktime_get_nsec();
   235  			rstate.common.created = ostate->common.created;
   236  
   237  			ret = snat_v4_update(otuple, ostate, &rtuple, &rstate);
   238  			if (!ret)
   239  				break;
   240  		}
   241  
   242  		port = __snat_clamp_port_range(target->min_port,
   243  					       target->max_port,
   244  					       port + 1);
   245  		rtuple.dport = ostate->to_sport = bpf_htons(port);
   246  	}
   247  
   248  	if (retries > SNAT_SIGNAL_THRES)
   249  		send_signal_nat_fill_up(skb, SIGNAL_NAT_PROTO_V4);
   250  	return !ret ? 0 : DROP_NAT_NO_MAPPING;
   251  }
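
On a port collision, the loop above probes linearly from the random starting point, re-clamping port + 1 so the candidate wraps around inside the configured range, and it notifies the agent via send_signal_nat_fill_up() once more than SNAT_SIGNAL_THRES attempts were needed. The candidate sequence can be reproduced in plain C, re-using the clamp arithmetic shown earlier (made-up, deliberately tiny range):

	#include <stdint.h>
	#include <stdio.h>

	static uint16_t clamp_port(uint16_t start, uint16_t end, uint16_t val)
	{
		return (uint16_t)(val % (uint16_t)(end - start)) + start;
	}

	int main(void)
	{
		uint16_t min = 1000, max = 1005;              /* tiny range for illustration */
		uint16_t port = clamp_port(min, max, 12345);  /* "random" start */
		int i;

		for (i = 0; i < 6; i++) {
			printf("candidate %d: %u\n", i, port);
			port = clamp_port(min, max, port + 1); /* next probe, wraps in-range */
		}
		return 0; /* candidates: 1000 1001 1002 1003 1004 1000 */
	}
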
   252  
   253  static __always_inline int snat_v4_track_local(struct __sk_buff *skb,
   254  					       struct ipv4_ct_tuple *tuple,
   255  					       struct ipv4_nat_entry *state,
   256  					       int dir, __u32 off,
   257  					       const struct ipv4_nat_target *target)
   258  {
   259  	struct ct_state ct_state;
   260  	struct ipv4_ct_tuple tmp;
   261  	bool needs_ct = false;
   262  	__u32 monitor = 0;
   263  	int ret, where;
   264  
   265  	if (state && state->common.host_local) {
   266  		needs_ct = true;
   267  	} else if (!state && dir == NAT_DIR_EGRESS) {
   268  		if (tuple->saddr == target->addr)
   269  			needs_ct = true;
   270  	}
   271  	if (!needs_ct)
   272  		return 0;
   273  
   274  	__builtin_memset(&ct_state, 0, sizeof(ct_state));
   275  	__builtin_memcpy(&tmp, tuple, sizeof(tmp));
   276  
   277  	where = dir == NAT_DIR_INGRESS ? CT_INGRESS : CT_EGRESS;
   278  
   279  	ret = ct_lookup4(get_ct_map4(&tmp), &tmp, skb, off, where,
   280  			 &ct_state, &monitor);
   281  	if (ret < 0) {
   282  		return ret;
   283  	} else if (ret == CT_NEW) {
   284  		ret = ct_create4(get_ct_map4(&tmp), &tmp, skb, where,
   285  				 &ct_state, false);
   286  		if (IS_ERR(ret))
   287  			return ret;
   288  	}
   289  
   290  	return 0;
   291  }
   292  
   293  static __always_inline int snat_v4_handle_mapping(struct __sk_buff *skb,
   294  						  struct ipv4_ct_tuple *tuple,
   295  						  struct ipv4_nat_entry **state,
   296  						  struct ipv4_nat_entry *tmp,
   297  						  int dir, __u32 off,
   298  						  const struct ipv4_nat_target *target)
   299  {
   300  	int ret;
   301  
   302  	*state = snat_v4_lookup(tuple);
   303  	ret = snat_v4_track_local(skb, tuple, *state, dir, off, target);
   304  	if (ret < 0)
   305  		return ret;
   306  	else if (*state)
   307  		return NAT_CONTINUE_XLATE;
   308  	else if (dir == NAT_DIR_INGRESS)
   309  		return tuple->nexthdr != IPPROTO_ICMP &&
   310  		       bpf_ntohs(tuple->dport) < target->min_port ?
   311  		       NAT_PUNT_TO_STACK : DROP_NAT_NO_MAPPING;
   312  	else
   313  		return snat_v4_new_mapping(skb, tuple, (*state = tmp), target);
   314  }
   315  
   316  static __always_inline int snat_v4_rewrite_egress(struct __sk_buff *skb,
   317  						  struct ipv4_ct_tuple *tuple,
   318  						  struct ipv4_nat_entry *state,
   319  						  __u32 off)
   320  {
   321  	struct csum_offset csum = {};
   322  	__be32 sum_l4 = 0, sum;
   323  	int ret;
   324  
   325  	if (state->to_saddr == tuple->saddr &&
   326  	    state->to_sport == tuple->sport)
   327  		return 0;
   328  	sum = csum_diff(&tuple->saddr, 4, &state->to_saddr, 4, 0);
   329  	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
   330  	if (state->to_sport != tuple->sport) {
   331  		switch (tuple->nexthdr) {
   332  		case IPPROTO_TCP:
   333  		case IPPROTO_UDP:
   334  			ret = l4_modify_port(skb, off,
   335  					     offsetof(struct tcphdr, source),
   336  					     &csum, state->to_sport,
   337  					     tuple->sport);
   338  			if (ret < 0)
   339  				return ret;
   340  			break;
   341  		case IPPROTO_ICMP: {
   342  			__be32 from, to;
   343  
   344  			if (skb_store_bytes(skb, off +
   345  					    offsetof(struct icmphdr, un.echo.id),
   346  					    &state->to_sport,
   347  					    sizeof(state->to_sport), 0) < 0)
   348  				return DROP_WRITE_ERROR;
   349  			from = tuple->sport;
   350  			to = state->to_sport;
   351  			sum_l4 = csum_diff(&from, 4, &to, 4, 0);
   352  			csum.offset = offsetof(struct icmphdr, checksum);
   353  			break;
   354  		}}
   355  	}
   356  	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct iphdr, saddr),
   357  			    &state->to_saddr, 4, 0) < 0)
   358  		return DROP_WRITE_ERROR;
   359  	if (l3_csum_replace(skb, ETH_HLEN + offsetof(struct iphdr, check),
   360  			    0, sum, 0) < 0)
   361  		return DROP_CSUM_L3;
   362  	if (tuple->nexthdr == IPPROTO_ICMP)
   363  		sum = sum_l4;
   364  	if (csum.offset &&
   365  	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
   366  		return DROP_CSUM_L4;
   367  	return 0;
   368  }
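
For TCP and UDP the pseudo-header covers the IP addresses, so the address diff (sum) must also be folded into the L4 checksum via csum_l4_replace() with BPF_F_PSEUDO_HDR, while the port change itself was already applied by l4_modify_port(). ICMP has no pseudo-header, which is why sum is swapped for the identifier-only diff (sum_l4) there. The helpers perform incremental updates in the style of RFC 1624; for reference, that update written out in plain C (the same arithmetic, not the kernel helpers themselves):

	#include <stdint.h>

	/* RFC 1624 incremental update of a 16-bit Internet checksum when one
	 * 16-bit word of the covered data changes from 'old' to 'new_val':
	 *   HC' = ~(~HC + ~m + m')
	 */
	static uint16_t csum_update16(uint16_t check, uint16_t old, uint16_t new_val)
	{
		uint32_t sum = (uint16_t)~check + (uint16_t)~old + new_val;

		sum = (sum & 0xffff) + (sum >> 16); /* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}
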
   369  
   370  static __always_inline int snat_v4_rewrite_ingress(struct __sk_buff *skb,
   371  						   struct ipv4_ct_tuple *tuple,
   372  						   struct ipv4_nat_entry *state,
   373  						   __u32 off)
   374  {
   375  	struct csum_offset csum = {};
   376  	__be32 sum_l4 = 0, sum;
   377  	int ret;
   378  
   379  	if (state->to_daddr == tuple->daddr &&
   380  	    state->to_dport == tuple->dport)
   381  		return 0;
   382  	sum = csum_diff(&tuple->daddr, 4, &state->to_daddr, 4, 0);
   383  	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
   384  	if (state->to_dport != tuple->dport) {
   385  		switch (tuple->nexthdr) {
   386  		case IPPROTO_TCP:
   387  		case IPPROTO_UDP:
   388  			ret = l4_modify_port(skb, off,
   389  					     offsetof(struct tcphdr, dest),
   390  					     &csum, state->to_dport,
   391  					     tuple->dport);
   392  			if (ret < 0)
   393  				return ret;
   394  			break;
   395  		case IPPROTO_ICMP: {
   396  			__be32 from, to;
   397  
   398  			if (skb_store_bytes(skb, off +
   399  					    offsetof(struct icmphdr, un.echo.id),
   400  					    &state->to_dport,
   401  					    sizeof(state->to_dport), 0) < 0)
   402  				return DROP_WRITE_ERROR;
   403  			from = tuple->dport;
   404  			to = state->to_dport;
   405  			sum_l4 = csum_diff(&from, 4, &to, 4, 0);
   406  			csum.offset = offsetof(struct icmphdr, checksum);
   407  			break;
   408  		}}
   409  	}
   410  	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct iphdr, daddr),
   411  			    &state->to_daddr, 4, 0) < 0)
   412  		return DROP_WRITE_ERROR;
   413  	if (l3_csum_replace(skb, ETH_HLEN + offsetof(struct iphdr, check),
   414  			    0, sum, 0) < 0)
   415  		return DROP_CSUM_L3;
   416  	if (tuple->nexthdr == IPPROTO_ICMP)
   417  		sum = sum_l4;
   418  	if (csum.offset &&
   419  	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
   420  		return DROP_CSUM_L4;
   421  	return 0;
   422  }
   423  
   424  static __always_inline bool snat_v4_can_skip(const struct ipv4_nat_target *target,
   425  					     const struct ipv4_ct_tuple *tuple, int dir)
   426  {
   427  	__u16 dport = bpf_ntohs(tuple->dport), sport = bpf_ntohs(tuple->sport);
   428  
   429  	if (dir == NAT_DIR_EGRESS && !target->src_from_world && sport < NAT_MIN_EGRESS)
   430  		return true;
   431  	if (dir == NAT_DIR_INGRESS && (dport < target->min_port || dport > target->max_port))
   432  		return true;
   433  	return false;
   434  }
   435  
   436  static __always_inline int snat_v4_process(struct __sk_buff *skb, int dir,
   437  					   const struct ipv4_nat_target *target)
   438  {
   439  	struct ipv4_nat_entry *state, tmp;
   440  	struct ipv4_ct_tuple tuple = {};
   441  	struct icmphdr icmphdr;
   442  	void *data, *data_end;
   443  	struct iphdr *ip4;
   444  	struct {
   445  		__be16 sport;
   446  		__be16 dport;
   447  	} l4hdr;
   448  	__u32 off;
   449  	int ret;
   450  
   451  	build_bug_on(sizeof(struct ipv4_nat_entry) > 64);
   452  
   453  	if (!revalidate_data(skb, &data, &data_end, &ip4))
   454  		return DROP_INVALID;
   455  
   456  	tuple.nexthdr = ip4->protocol;
   457  	tuple.daddr = ip4->daddr;
   458  	tuple.saddr = ip4->saddr;
   459  	tuple.flags = dir;
   460  	off = ((void *)ip4 - data) + ipv4_hdrlen(ip4);
   461  	switch (tuple.nexthdr) {
   462  	case IPPROTO_TCP:
   463  	case IPPROTO_UDP:
   464  		if (skb_load_bytes(skb, off, &l4hdr, sizeof(l4hdr)) < 0)
   465  			return DROP_INVALID;
   466  		tuple.dport = l4hdr.dport;
   467  		tuple.sport = l4hdr.sport;
   468  		break;
   469  	case IPPROTO_ICMP:
   470  		if (skb_load_bytes(skb, off, &icmphdr, sizeof(icmphdr)) < 0)
   471  			return DROP_INVALID;
   472  		if (icmphdr.type != ICMP_ECHO &&
   473  		    icmphdr.type != ICMP_ECHOREPLY)
   474  			return DROP_NAT_UNSUPP_PROTO;
   475  		if (dir == NAT_DIR_EGRESS) {
   476  			tuple.dport = 0;
   477  			tuple.sport = icmphdr.un.echo.id;
   478  		} else {
   479  			tuple.dport = icmphdr.un.echo.id;
   480  			tuple.sport = 0;
   481  		}
   482  		break;
   483  	default:
   484  		return DROP_NAT_UNSUPP_PROTO;
   485  	}
   486  
   487  	if (snat_v4_can_skip(target, &tuple, dir))
   488  		return NAT_PUNT_TO_STACK;
   489  	ret = snat_v4_handle_mapping(skb, &tuple, &state, &tmp, dir, off, target);
   490  	if (ret > 0)
   491  		return TC_ACT_OK;
   492  	if (ret < 0)
   493  		return ret;
   494  
   495  	return dir == NAT_DIR_EGRESS ?
   496  	       snat_v4_rewrite_egress(skb, &tuple, state, off) :
   497  	       snat_v4_rewrite_ingress(skb, &tuple, state, off);
   498  }
   499  #else
   500  static __always_inline int snat_v4_process(struct __sk_buff *skb, int dir,
   501  					   const struct ipv4_nat_target *target)
   502  {
   503  	return TC_ACT_OK;
   504  }
   505  
   506  static __always_inline void snat_v4_delete_tuples(struct ipv4_ct_tuple *tuple)
   507  {
   508  }
   509  #endif
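
Putting the v4 pieces together: a caller builds an ipv4_nat_target and hands it to snat_v4_process() along with the direction. A hedged sketch of such a caller follows (the section name and port bounds are made up; SNAT_IPV4_EXTERNAL is the node_config-style define already used by snat_process() at the end of this header):

	/* Sketch, not part of this header: reverse-translate ingress traffic
	 * destined to a NATed port range on the external address. */
	__section("example-snat-v4-ingress")
	int example_snat_v4_ingress(struct __sk_buff *skb)
	{
		struct ipv4_nat_target target = {
			.min_port = 30000, /* made-up bounds */
			.max_port = 32767,
			.addr     = SNAT_IPV4_EXTERNAL,
		};
		int ret = snat_v4_process(skb, NAT_DIR_INGRESS, &target);

		if (IS_ERR(ret))
			return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT,
						      NAT_DIR_INGRESS);
		return ret;
	}
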
   510  
   511  struct ipv6_nat_entry {
   512  	struct nat_entry common;
   513  	union {
   514  		struct {
   515  			union v6addr to_saddr;
   516  			__be16       to_sport;
   517  		};
   518  		struct {
   519  			union v6addr to_daddr;
   520  			__be16       to_dport;
   521  		};
   522  	};
   523  };
   524  
   525  struct ipv6_nat_target {
   526  	union v6addr addr;
   527  	const __u16 min_port; /* host endianness */
   528  	const __u16 max_port; /* host endianness */
   529  	bool src_from_world;
   530  };
   531  
   532  #if defined ENABLE_IPV6 && (defined ENABLE_MASQUERADE || defined ENABLE_NODEPORT)
   533  struct bpf_elf_map __section_maps SNAT_MAPPING_IPV6 = {
   534  	.type		= NAT_MAP_TYPE,
   535  	.size_key	= sizeof(struct ipv6_ct_tuple),
   536  	.size_value	= sizeof(struct ipv6_nat_entry),
   537  	.pinning	= PIN_GLOBAL_NS,
   538  	.max_elem	= SNAT_MAPPING_IPV6_SIZE,
   539  #ifndef HAVE_LRU_MAP_TYPE
   540  	.flags		= CONDITIONAL_PREALLOC,
   541  #endif
   542  };
   543  
   544  static __always_inline
   545  struct ipv6_nat_entry *snat_v6_lookup(struct ipv6_ct_tuple *tuple)
   546  {
   547  	return __snat_lookup(&SNAT_MAPPING_IPV6, tuple);
   548  }
   549  
   550  static __always_inline int snat_v6_update(struct ipv6_ct_tuple *otuple,
   551  					  struct ipv6_nat_entry *ostate,
   552  					  struct ipv6_ct_tuple *rtuple,
   553  					  struct ipv6_nat_entry *rstate)
   554  {
   555  	return __snat_update(&SNAT_MAPPING_IPV6, otuple, ostate,
   556  			     rtuple, rstate);
   557  }
   558  
   559  static __always_inline void snat_v6_delete(struct ipv6_ct_tuple *otuple,
   560  					   struct ipv6_ct_tuple *rtuple)
   561  {
   562  	__snat_delete(&SNAT_MAPPING_IPV6, otuple, rtuple);
   563  }
   564  
   565  static __always_inline void snat_v6_swap_tuple(struct ipv6_ct_tuple *otuple,
   566  					       struct ipv6_ct_tuple *rtuple)
   567  {
   568  	__builtin_memset(rtuple, 0, sizeof(*rtuple));
   569  	rtuple->nexthdr = otuple->nexthdr;
   570  	rtuple->daddr = otuple->saddr;
   571  	rtuple->saddr = otuple->daddr;
   572  	rtuple->dport = otuple->sport;
   573  	rtuple->sport = otuple->dport;
   574  	rtuple->flags = otuple->flags == NAT_DIR_EGRESS ?
   575  			NAT_DIR_INGRESS : NAT_DIR_EGRESS;
   576  }
   577  
   578  static __always_inline int snat_v6_reverse_tuple(struct ipv6_ct_tuple *otuple,
   579  						 struct ipv6_ct_tuple *rtuple)
   580  {
   581  	struct ipv6_nat_entry *ostate;
   582  
   583  	ostate = snat_v6_lookup(otuple);
   584  	if (ostate) {
   585  		snat_v6_swap_tuple(otuple, rtuple);
   586  		rtuple->daddr = ostate->to_saddr;
   587  		rtuple->dport = ostate->to_sport;
   588  	}
   589  
   590  	return ostate ? 0 : -1;
   591  }
   592  
   593  static __always_inline void snat_v6_ct_canonicalize(struct ipv6_ct_tuple *otuple)
   594  {
   595  	union v6addr addr = {};
   596  
   597  	otuple->flags = NAT_DIR_EGRESS;
   598  	/* Workaround #5848. */
   599  	ipv6_addr_copy(&addr, &otuple->saddr);
   600  	ipv6_addr_copy(&otuple->saddr, &otuple->daddr);
   601  	ipv6_addr_copy(&otuple->daddr, &addr);
   602  }
   603  
   604  static __always_inline void snat_v6_delete_tuples(struct ipv6_ct_tuple *otuple)
   605  {
   606  	struct ipv6_ct_tuple rtuple;
   607  
   608  	if (otuple->flags & TUPLE_F_IN)
   609  		return;
   610  	snat_v6_ct_canonicalize(otuple);
   611  	if (!snat_v6_reverse_tuple(otuple, &rtuple))
   612  		snat_v6_delete(otuple, &rtuple);
   613  }
   614  
   615  static __always_inline int snat_v6_new_mapping(struct __sk_buff *skb,
   616  					       struct ipv6_ct_tuple *otuple,
   617  					       struct ipv6_nat_entry *ostate,
   618  					       const struct ipv6_nat_target *target)
   619  {
   620  	int ret = DROP_NAT_NO_MAPPING, retries;
   621  	struct ipv6_nat_entry rstate;
   622  	struct ipv6_ct_tuple rtuple;
   623  	__u16 port;
   624  
   625  	__builtin_memset(&rstate, 0, sizeof(rstate));
   626  	__builtin_memset(ostate, 0, sizeof(*ostate));
   627  
   628  	rstate.to_daddr = otuple->saddr;
   629  	rstate.to_dport = otuple->sport;
   630  
   631  	ostate->to_saddr = target->addr;
   632  
   633  	snat_v6_swap_tuple(otuple, &rtuple);
   634  	port = __snat_clamp_port_range(target->min_port,
   635  				       target->max_port,
   636  				       get_prandom_u32());
   637  
   638  	rtuple.dport = ostate->to_sport = bpf_htons(port);
   639  	rtuple.daddr = target->addr;
   640  
   641  	if (!ipv6_addrcmp(&otuple->saddr, &rtuple.daddr)) {
   642  		ostate->common.host_local = 1;
   643  		rstate.common.host_local = ostate->common.host_local;
   644  	}
   645  
   646  #pragma unroll
   647  	for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) {
   648  		if (!snat_v6_lookup(&rtuple)) {
   649  			ostate->common.created = bpf_ktime_get_nsec();
   650  			rstate.common.created = ostate->common.created;
   651  
   652  			ret = snat_v6_update(otuple, ostate, &rtuple, &rstate);
   653  			if (!ret)
   654  				break;
   655  		}
   656  
   657  		port = __snat_clamp_port_range(target->min_port,
   658  					       target->max_port,
   659  					       port + 1);
   660  		rtuple.dport = ostate->to_sport = bpf_htons(port);
   661  	}
   662  
   663  	if (retries > SNAT_SIGNAL_THRES)
   664  		send_signal_nat_fill_up(skb, SIGNAL_NAT_PROTO_V6);
   665  	return !ret ? 0 : DROP_NAT_NO_MAPPING;
   666  }
   667  
   668  static __always_inline int snat_v6_track_local(struct __sk_buff *skb,
   669  					       struct ipv6_ct_tuple *tuple,
   670  					       struct ipv6_nat_entry *state,
   671  					       int dir, __u32 off,
   672  					       const struct ipv6_nat_target *target)
   673  {
   674  	struct ct_state ct_state;
   675  	struct ipv6_ct_tuple tmp;
   676  	bool needs_ct = false;
   677  	__u32 monitor = 0;
   678  	int ret, where;
   679  
   680  	if (state && state->common.host_local) {
   681  		needs_ct = true;
   682  	} else if (!state && dir == NAT_DIR_EGRESS) {
   683  		if (!ipv6_addrcmp(&tuple->saddr, (void *)&target->addr))
   684  			needs_ct = true;
   685  	}
   686  	if (!needs_ct)
   687  		return 0;
   688  
   689  	__builtin_memset(&ct_state, 0, sizeof(ct_state));
   690  	__builtin_memcpy(&tmp, tuple, sizeof(tmp));
   691  
   692  	where = dir == NAT_DIR_INGRESS ? CT_INGRESS : CT_EGRESS;
   693  
   694  	ret = ct_lookup6(get_ct_map6(&tmp), &tmp, skb, off, where,
   695  			 &ct_state, &monitor);
   696  	if (ret < 0) {
   697  		return ret;
   698  	} else if (ret == CT_NEW) {
   699  		ret = ct_create6(get_ct_map6(&tmp), &tmp, skb, where,
   700  				 &ct_state, false);
   701  		if (IS_ERR(ret))
   702  			return ret;
   703  	}
   704  
   705  	return 0;
   706  }
   707  
   708  static __always_inline int snat_v6_handle_mapping(struct __sk_buff *skb,
   709  						  struct ipv6_ct_tuple *tuple,
   710  						  struct ipv6_nat_entry **state,
   711  						  struct ipv6_nat_entry *tmp,
   712  						  int dir, __u32 off,
   713  						  const struct ipv6_nat_target *target)
   714  {
   715  	int ret;
   716  
   717  	*state = snat_v6_lookup(tuple);
   718  	ret = snat_v6_track_local(skb, tuple, *state, dir, off, target);
   719  	if (ret < 0)
   720  		return ret;
   721  	else if (*state)
   722  		return NAT_CONTINUE_XLATE;
   723  	else if (dir == NAT_DIR_INGRESS)
   724  		return tuple->nexthdr != IPPROTO_ICMPV6 &&
   725  		       bpf_ntohs(tuple->dport) < target->min_port ?
   726  		       NAT_PUNT_TO_STACK : DROP_NAT_NO_MAPPING;
   727  	else
   728  		return snat_v6_new_mapping(skb, tuple, (*state = tmp), target);
   729  }
   730  
   731  static __always_inline int snat_v6_rewrite_egress(struct __sk_buff *skb,
   732  						  struct ipv6_ct_tuple *tuple,
   733  						  struct ipv6_nat_entry *state,
   734  						  __u32 off)
   735  {
   736  	struct csum_offset csum = {};
   737  	__be32 sum;
   738  	int ret;
   739  
   740  	if (!ipv6_addrcmp(&state->to_saddr, &tuple->saddr) &&
   741  	    state->to_sport == tuple->sport)
   742  		return 0;
   743  	sum = csum_diff(&tuple->saddr, 16, &state->to_saddr, 16, 0);
   744  	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
   745  	if (state->to_sport != tuple->sport) {
   746  		switch (tuple->nexthdr) {
   747  		case IPPROTO_TCP:
   748  		case IPPROTO_UDP:
   749  			ret = l4_modify_port(skb, off, offsetof(struct tcphdr, source),
   750  					     &csum, state->to_sport, tuple->sport);
   751  			if (ret < 0)
   752  				return ret;
   753  			break;
   754  		case IPPROTO_ICMPV6: {
   755  			__be32 from, to;
   756  
   757  			if (skb_store_bytes(skb, off +
   758  					    offsetof(struct icmp6hdr,
   759  						     icmp6_dataun.u_echo.identifier),
   760  					    &state->to_sport,
   761  					    sizeof(state->to_sport), 0) < 0)
   762  				return DROP_WRITE_ERROR;
   763  			from = tuple->sport;
   764  			to = state->to_sport;
   765  			sum = csum_diff(&from, 4, &to, 4, sum);
   766  			break;
   767  		}}
   768  	}
   769  	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct ipv6hdr, saddr),
   770  			    &state->to_saddr, 16, 0) < 0)
   771  		return DROP_WRITE_ERROR;
   772  	if (csum.offset &&
   773  	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
   774  		return DROP_CSUM_L4;
   775  	return 0;
   776  }
   777  
   778  static __always_inline int snat_v6_rewrite_ingress(struct __sk_buff *skb,
   779  						   struct ipv6_ct_tuple *tuple,
   780  						   struct ipv6_nat_entry *state,
   781  						   __u32 off)
   782  {
   783  	struct csum_offset csum = {};
   784  	__be32 sum;
   785  	int ret;
   786  
   787  	if (!ipv6_addrcmp(&state->to_daddr, &tuple->daddr) &&
   788  	    state->to_dport == tuple->dport)
   789  		return 0;
   790  	sum = csum_diff(&tuple->daddr, 16, &state->to_daddr, 16, 0);
   791  	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
   792  	if (state->to_dport != tuple->dport) {
   793  		switch (tuple->nexthdr) {
   794  		case IPPROTO_TCP:
   795  		case IPPROTO_UDP:
   796  			ret = l4_modify_port(skb, off,
   797  					     offsetof(struct tcphdr, dest),
   798  					     &csum, state->to_dport,
   799  					     tuple->dport);
   800  			if (ret < 0)
   801  				return ret;
   802  			break;
   803  		case IPPROTO_ICMPV6: {
   804  			__be32 from, to;
   805  
   806  			if (skb_store_bytes(skb, off +
   807  					    offsetof(struct icmp6hdr,
   808  						     icmp6_dataun.u_echo.identifier),
   809  					    &state->to_dport,
   810  					    sizeof(state->to_dport), 0) < 0)
   811  				return DROP_WRITE_ERROR;
   812  			from = tuple->dport;
   813  			to = state->to_dport;
   814  			sum = csum_diff(&from, 4, &to, 4, sum);
   815  			break;
   816  		}}
   817  	}
   818  	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct ipv6hdr, daddr),
   819  			    &state->to_daddr, 16, 0) < 0)
   820  		return DROP_WRITE_ERROR;
   821  	if (csum.offset &&
   822  	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
   823  		return DROP_CSUM_L4;
   824  	return 0;
   825  }
   826  
   827  static __always_inline bool snat_v6_can_skip(const struct ipv6_nat_target *target,
   828  					     const struct ipv6_ct_tuple *tuple, int dir)
   829  {
   830  	__u16 dport = bpf_ntohs(tuple->dport), sport = bpf_ntohs(tuple->sport);
   831  
   832  	if (dir == NAT_DIR_EGRESS && !target->src_from_world && sport < NAT_MIN_EGRESS)
   833  		return true;
   834  	if (dir == NAT_DIR_INGRESS && (dport < target->min_port || dport > target->max_port))
   835  		return true;
   836  	return false;
   837  }
   838  
   839  static __always_inline int snat_v6_process(struct __sk_buff *skb, int dir,
   840  					   const struct ipv6_nat_target *target)
   841  {
   842  	struct ipv6_nat_entry *state, tmp;
   843  	struct ipv6_ct_tuple tuple = {};
   844  	struct icmp6hdr icmp6hdr;
   845  	void *data, *data_end;
   846  	struct ipv6hdr *ip6;
   847  	int ret, hdrlen;
   848  	struct {
   849  		__be16 sport;
   850  		__be16 dport;
   851  	} l4hdr;
   852  	__u8 nexthdr;
   853  	__u32 off;
   854  
   855  	build_bug_on(sizeof(struct ipv6_nat_entry) > 64);
   856  
   857  	if (!revalidate_data(skb, &data, &data_end, &ip6))
   858  		return DROP_INVALID;
   859  
   860  	nexthdr = ip6->nexthdr;
   861  	hdrlen = ipv6_hdrlen(skb, ETH_HLEN, &nexthdr);
   862  	if (hdrlen < 0)
   863  		return hdrlen;
   864  
   865  	tuple.nexthdr = nexthdr;
   866  	ipv6_addr_copy(&tuple.daddr, (union v6addr *)&ip6->daddr);
   867  	ipv6_addr_copy(&tuple.saddr, (union v6addr *)&ip6->saddr);
   868  	tuple.flags = dir;
   869  	off = ((void *)ip6 - data) + hdrlen;
   870  	switch (tuple.nexthdr) {
   871  	case IPPROTO_TCP:
   872  	case IPPROTO_UDP:
   873  		if (skb_load_bytes(skb, off, &l4hdr, sizeof(l4hdr)) < 0)
   874  			return DROP_INVALID;
   875  		tuple.dport = l4hdr.dport;
   876  		tuple.sport = l4hdr.sport;
   877  		break;
   878  	case IPPROTO_ICMPV6:
   879  		if (skb_load_bytes(skb, off, &icmp6hdr, sizeof(icmp6hdr)) < 0)
   880  			return DROP_INVALID;
   881  		/* Letting neighbor solicitation / advertisement pass through. */
   882  		if (icmp6hdr.icmp6_type == 135 || icmp6hdr.icmp6_type == 136)
   883  			return TC_ACT_OK;
   884  		if (icmp6hdr.icmp6_type != ICMPV6_ECHO_REQUEST &&
   885  		    icmp6hdr.icmp6_type != ICMPV6_ECHO_REPLY)
   886  			return DROP_NAT_UNSUPP_PROTO;
   887  		if (dir == NAT_DIR_EGRESS) {
   888  			tuple.dport = 0;
   889  			tuple.sport = icmp6hdr.icmp6_dataun.u_echo.identifier;
   890  		} else {
   891  			tuple.dport = icmp6hdr.icmp6_dataun.u_echo.identifier;
   892  			tuple.sport = 0;
   893  		}
   894  		break;
   895  	default:
   896  		return DROP_NAT_UNSUPP_PROTO;
   897  	}
   898  
   899  	if (snat_v6_can_skip(target, &tuple, dir))
   900  		return NAT_PUNT_TO_STACK;
   901  	ret = snat_v6_handle_mapping(skb, &tuple, &state, &tmp, dir, off, target);
   902  	if (ret > 0)
   903  		return TC_ACT_OK;
   904  	if (ret < 0)
   905  		return ret;
   906  
   907  	return dir == NAT_DIR_EGRESS ?
   908  	       snat_v6_rewrite_egress(skb, &tuple, state, off) :
   909  	       snat_v6_rewrite_ingress(skb, &tuple, state, off);
   910  }
   911  #else
   912  static __always_inline int snat_v6_process(struct __sk_buff *skb, int dir,
   913  					   const struct ipv6_nat_target *target)
   914  {
   915  	return TC_ACT_OK;
   916  }
   917  
   918  static __always_inline void snat_v6_delete_tuples(struct ipv6_ct_tuple *tuple)
   919  {
   920  }
   921  #endif
   922  
   923  #ifdef CONNTRACK
   924  static __always_inline void ct_delete4(void *map, struct ipv4_ct_tuple *tuple,
   925  				       struct __sk_buff *skb)
   926  {
   927  	int err;
   928  
   929  	if ((err = map_delete_elem(map, tuple)) < 0)
   930  		cilium_dbg(skb, DBG_ERROR_RET, BPF_FUNC_map_delete_elem, err);
   931  	else
   932  		snat_v4_delete_tuples(tuple);
   933  }
   934  
   935  static __always_inline void ct_delete6(void *map, struct ipv6_ct_tuple *tuple,
   936  				       struct __sk_buff *skb)
   937  {
   938  	int err;
   939  
   940  	if ((err = map_delete_elem(map, tuple)) < 0)
   941  		cilium_dbg(skb, DBG_ERROR_RET, BPF_FUNC_map_delete_elem, err);
   942  	else
   943  		snat_v6_delete_tuples(tuple);
   944  }
   945  #else
   946  static __always_inline void ct_delete4(void *map, struct ipv4_ct_tuple *tuple,
   947  				       struct __sk_buff *skb)
   948  {
   949  }
   950  
   951  static __always_inline void ct_delete6(void *map, struct ipv6_ct_tuple *tuple,
   952  				       struct __sk_buff *skb)
   953  {
   954  }
   955  #endif
   956  
   957  static __always_inline int snat_process(struct __sk_buff *skb, int dir)
   958  {
   959  	int ret = TC_ACT_OK;
   960  
   961  #ifdef ENABLE_MASQUERADE
   962  	switch (skb->protocol) {
   963  #ifdef ENABLE_IPV4
   964  	case bpf_htons(ETH_P_IP): {
   965  		struct ipv4_nat_target target = {
   966  			.min_port = SNAT_MAPPING_MIN_PORT,
   967  			.max_port = SNAT_MAPPING_MAX_PORT,
   968  			.addr  = SNAT_IPV4_EXTERNAL,
   969  		};
   970  		ret = snat_v4_process(skb, dir, &target);
   971  		break; }
   972  #endif
   973  #ifdef ENABLE_IPV6
   974  	case bpf_htons(ETH_P_IPV6): {
   975  		struct ipv6_nat_target target = {
   976  			.min_port = SNAT_MAPPING_MIN_PORT,
   977  			.max_port = SNAT_MAPPING_MAX_PORT,
   978  		};
   979  		BPF_V6(target.addr, SNAT_IPV6_EXTERNAL);
   980  		ret = snat_v6_process(skb, dir, &target);
   981  		break; }
   982  #endif
   983  	}
   984  	if (IS_ERR(ret))
   985  		return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, dir);
   986  #endif
   987  	return ret;
   988  }
   989  #endif /* __LIB_NAT__ */
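
Finally, snat_process() is the convenience entry point that picks the address family from skb->protocol and builds the masquerading target from node configuration. A minimal sketch of how an attachment point might call it (the section name is made up; the real callers live in the datapath programs outside this header):

	__section("example-to-netdev")
	int example_to_netdev(struct __sk_buff *skb)
	{
		/* Masquerade traffic leaving the node; snat_process() already
		 * emits drop notifications on error. */
		return snat_process(skb, NAT_DIR_EGRESS);
	}
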