github.com/datadog/cilium@v1.6.12/bpf/lib/lb.h (about)

     1  /*
     2   *  Copyright (C) 2016-2019 Authors of Cilium
     3   *
     4   *  This program is free software; you can redistribute it and/or modify
     5   *  it under the terms of the GNU General Public License as published by
     6   *  the Free Software Foundation; either version 2 of the License, or
     7   *  (at your option) any later version.
     8   *
     9   *  This program is distributed in the hope that it will be useful,
    10   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
    11   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    12   *  GNU General Public License for more details.
    13   *
    14   *  You should have received a copy of the GNU General Public License
    15   *  along with this program; if not, write to the Free Software
    16   *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    17   */
    18  
    19  /**
    20   * Configuration:
    21   * LB_L4: Include L4 matching and rewriting capabilities
    22   * LB_L3: Enable fallback to L3 LB entries
    23   *
    24   * Either LB_L4, LB_L3, or both need to be set to enable forward
     25   * translation. Reverse translation will always occur regardless
    26   * of the settings.
    27   */
    28  
    29  
    30  #ifndef __LB_H_
    31  #define __LB_H_
    32  
    33  #include "csum.h"
    34  #include "conntrack.h"
    35  
/* Upper bound on frontend entries for the round-robin sequence maps below */
#define CILIUM_LB_MAP_MAX_FE		256
    37  
#ifdef ENABLE_IPV6
/* Maps a reverse NAT index (__u16) to the original IPv6 service
 * address/port so replies can be translated back, see lb6_rev_nat(). */
struct bpf_elf_map __section_maps LB6_REVERSE_NAT_MAP = {
	.type		= BPF_MAP_TYPE_HASH,
	.size_key	= sizeof(__u16),
	.size_value	= sizeof(struct lb6_reverse_nat),
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= CILIUM_LB_MAP_MAX_ENTRIES,
	.flags		= CONDITIONAL_PREALLOC,
};

/* IPv6 service table: key->slave == 0 looks up the master entry,
 * key->slave > 0 selects a specific backend slot of that service. */
struct bpf_elf_map __section_maps LB6_SERVICES_MAP_V2 = {
	.type		= BPF_MAP_TYPE_HASH,
	.size_key	= sizeof(struct lb6_key_v2),
	.size_value	= sizeof(struct lb6_service_v2),
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= CILIUM_LB_MAP_MAX_ENTRIES,
	.flags		= CONDITIONAL_PREALLOC,
};

/* Per-frontend round-robin sequence state.
 * NOTE(review): not referenced elsewhere in this file — presumably
 * consumed by the agent/userspace; confirm before removing. */
struct bpf_elf_map __section_maps LB6_RR_SEQ_MAP_V2 = {
	.type           = BPF_MAP_TYPE_HASH,
	.size_key       = sizeof(struct lb6_key_v2),
	.size_value     = sizeof(struct lb_sequence),
	.pinning        = PIN_GLOBAL_NS,
	.max_elem       = CILIUM_LB_MAP_MAX_FE,
	.flags		= CONDITIONAL_PREALLOC,
};

/* Maps a backend id (__u16) to its lb6_backend entry,
 * see __lb6_lookup_backend(). */
struct bpf_elf_map __section_maps LB6_BACKEND_MAP = {
	.type           = BPF_MAP_TYPE_HASH,
	.size_key       = sizeof(__u16),
	.size_value     = sizeof(struct lb6_backend),
	.pinning        = PIN_GLOBAL_NS,
	.max_elem       = CILIUM_LB_MAP_MAX_ENTRIES,
	.flags          = CONDITIONAL_PREALLOC,
};

#endif /* ENABLE_IPV6 */
    76  
#ifdef ENABLE_IPV4
/* Maps a reverse NAT index (__u16) to the original IPv4 service
 * address/port so replies can be translated back, see lb4_rev_nat(). */
struct bpf_elf_map __section_maps LB4_REVERSE_NAT_MAP = {
	.type		= BPF_MAP_TYPE_HASH,
	.size_key	= sizeof(__u16),
	.size_value	= sizeof(struct lb4_reverse_nat),
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= CILIUM_LB_MAP_MAX_ENTRIES,
	.flags		= CONDITIONAL_PREALLOC,
};

/* IPv4 service table: key->slave == 0 looks up the master entry,
 * key->slave > 0 selects a specific backend slot of that service. */
struct bpf_elf_map __section_maps LB4_SERVICES_MAP_V2 = {
	.type		= BPF_MAP_TYPE_HASH,
	.size_key	= sizeof(struct lb4_key_v2),
	.size_value	= sizeof(struct lb4_service_v2),
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= CILIUM_LB_MAP_MAX_ENTRIES,
	.flags		= CONDITIONAL_PREALLOC,
};

/* Per-frontend round-robin sequence state.
 * NOTE(review): not referenced elsewhere in this file — presumably
 * consumed by the agent/userspace; confirm before removing. */
struct bpf_elf_map __section_maps LB4_RR_SEQ_MAP_V2 = {
	.type           = BPF_MAP_TYPE_HASH,
	.size_key       = sizeof(struct lb4_key_v2),
	.size_value     = sizeof(struct lb_sequence),
	.pinning        = PIN_GLOBAL_NS,
	.max_elem       = CILIUM_LB_MAP_MAX_FE,
	.flags		= CONDITIONAL_PREALLOC,
};

/* Maps a backend id (__u16) to its lb4_backend entry,
 * see __lb4_lookup_backend(). */
struct bpf_elf_map __section_maps LB4_BACKEND_MAP = {
	.type           = BPF_MAP_TYPE_HASH,
	.size_key       = sizeof(__u16),
	.size_value     = sizeof(struct lb4_backend),
	.pinning        = PIN_GLOBAL_NS,
	.max_elem       = CILIUM_LB_MAP_MAX_ENTRIES,
	.flags          = CONDITIONAL_PREALLOC,
};

#endif /* ENABLE_IPV4 */
   115  
   116  
/* Flag for __lb{4,6}_rev_nat(): also rewrite tuple->saddr with the NAT
 * address instead of reading the source address from the packet. */
#define REV_NAT_F_TUPLE_SADDR 1

/* LB debug messages compile to nothing unless LB_DEBUG is defined */
#ifdef LB_DEBUG
#define cilium_dbg_lb cilium_dbg
#else
#define cilium_dbg_lb(a, b, c, d)
#endif
   123  
/* Pick a random backend slot in [1, count] for an IPv6 service.
 * NOTE(review): assumes count > 0 — callers only reach this after a
 * service lookup that requires svc->count != 0; confirm for new callers. */
static inline int lb6_select_slave(__u16 count)
{
	/* Slave 0 is reserved for the master slot */
	return (get_prandom_u32() % count) + 1;
}
   129  
/* Pick a random backend slot in [1, count] for an IPv4 service.
 * NOTE(review): assumes count > 0 — callers only reach this after a
 * service lookup that requires svc->count != 0; confirm for new callers. */
static inline int lb4_select_slave(__u16 count)
{
	/* Slave 0 is reserved for the master slot */
	return (get_prandom_u32() % count) + 1;
}
   135  
   136  static inline int __inline__ extract_l4_port(struct __sk_buff *skb, __u8 nexthdr,
   137  					     int l4_off, __be16 *port)
   138  {
   139  	int ret;
   140  
   141  	switch (nexthdr) {
   142  	case IPPROTO_TCP:
   143  	case IPPROTO_UDP:
   144  		/* Port offsets for UDP and TCP are the same */
   145  		ret = l4_load_port(skb, l4_off + TCP_DPORT_OFF, port);
   146  		if (IS_ERR(ret))
   147  			return ret;
   148  		break;
   149  
   150  	case IPPROTO_ICMPV6:
   151  	case IPPROTO_ICMP:
   152  		break;
   153  
   154  	default:
   155  		/* Pass unknown L4 to stack */
   156  		return DROP_UNKNOWN_L4;
   157  	}
   158  
   159  	return 0;
   160  }
   161  
   162  static inline int __inline__ reverse_map_l4_port(struct __sk_buff *skb, __u8 nexthdr,
   163  						 __be16 port, int l4_off,
   164  						 struct csum_offset *csum_off)
   165  {
   166  	switch (nexthdr) {
   167  	case IPPROTO_TCP:
   168  	case IPPROTO_UDP:
   169  		if (port) {
   170  			__be16 old_port;
   171  			int ret;
   172  
   173  			/* Port offsets for UDP and TCP are the same */
   174  			ret = l4_load_port(skb, l4_off + TCP_SPORT_OFF, &old_port);
   175  			if (IS_ERR(ret))
   176  				return ret;
   177  
   178  			if (port != old_port) {
   179  				ret = l4_modify_port(skb, l4_off, TCP_SPORT_OFF,
   180  						     csum_off, port, old_port);
   181  				if (IS_ERR(ret))
   182  					return ret;
   183  			}
   184  		}
   185  		break;
   186  
   187  	case IPPROTO_ICMPV6:
   188  	case IPPROTO_ICMP:
   189  		break;
   190  
   191  	default:
   192  		return DROP_UNKNOWN_L4;
   193  	}
   194  
   195  	return 0;
   196  }
   197  
   198  #ifdef ENABLE_IPV6
/** Perform IPv6 reverse NAT using an already looked-up reverse NAT entry.
 * @arg skb		packet
 * @arg l4_off		offset to L4 header
 * @arg csum_off	offset to L4 checksum field and flags
 * @arg tuple		connection tuple; saddr is updated when
 *			REV_NAT_F_TUPLE_SADDR is set in @flags
 * @arg flags		REV_NAT_F_TUPLE_SADDR or 0
 * @arg nat		reverse NAT entry providing the original address
 *			and (optionally) port
 *
 * Restores the original source address (and source port, if nat->port is
 * set) in the packet and fixes up the L4 checksum.
 */
static inline int __inline__ __lb6_rev_nat(struct __sk_buff *skb, int l4_off,
					 struct csum_offset *csum_off,
					 struct ipv6_ct_tuple *tuple, int flags,
					 struct lb6_reverse_nat *nat)
{
	union v6addr old_saddr;
	union v6addr tmp;
	__u8 *new_saddr;
	__be32 sum;
	int ret;

	cilium_dbg_lb(skb, DBG_LB6_REVERSE_NAT, nat->address.p4, nat->port);

	/* Undo the L4 port translation first, if one was applied */
	if (nat->port) {
		ret = reverse_map_l4_port(skb, tuple->nexthdr, nat->port, l4_off, csum_off);
		if (IS_ERR(ret))
			return ret;
	}

	if (flags & REV_NAT_F_TUPLE_SADDR) {
		/* Caller wants the tuple to track the translated source */
		ipv6_addr_copy(&old_saddr, &tuple->saddr);
		ipv6_addr_copy(&tuple->saddr, &nat->address);
		new_saddr = tuple->saddr.addr;
	} else {
		/* Read the current source address straight from the packet */
		if (ipv6_load_saddr(skb, ETH_HLEN, &old_saddr) < 0)
			return DROP_INVALID;

		ipv6_addr_copy(&tmp, &nat->address);
		new_saddr = tmp.addr;
	}

	ret = ipv6_store_saddr(skb, new_saddr, ETH_HLEN);
	if (IS_ERR(ret))
		return DROP_WRITE_ERROR;

	/* IPv6 has no L3 checksum; only the pseudo-header part of the L4
	 * checksum must absorb the address change. */
	sum = csum_diff(old_saddr.addr, 16, new_saddr, 16, 0);
	if (csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;

	return 0;
}
   240  
   241  /** Perform IPv6 reverse NAT based on reverse NAT index
   242   * @arg skb		packet
   243   * @arg l4_off		offset to L4
   244   * @arg csum_off	offset to L4 checksum field
   245   * @arg csum_flags	checksum flags
   246   * @arg index		reverse NAT index
   247   * @arg tuple		tuple
   248   * @arg saddr_tuple	If set, tuple address will be updated with new source address
   249   */
   250  static inline int __inline__ lb6_rev_nat(struct __sk_buff *skb, int l4_off,
   251  					 struct csum_offset *csum_off, __u16 index,
   252  					 struct ipv6_ct_tuple *tuple, int flags)
   253  {
   254  	struct lb6_reverse_nat *nat;
   255  
   256  	cilium_dbg_lb(skb, DBG_LB6_REVERSE_NAT_LOOKUP, index, 0);
   257  	nat = map_lookup_elem(&LB6_REVERSE_NAT_MAP, &index);
   258  	if (nat == NULL)
   259  		return 0;
   260  
   261  	return __lb6_rev_nat(skb, l4_off, csum_off, tuple, flags, nat);
   262  }
   263  
   264  /** Extract IPv6 LB key from packet
   265   * @arg skb		Packet
   266   * @arg tuple		Tuple
   267   * @arg l4_off		Offset to L4 header
   268   * @arg key		Pointer to store LB key in
   269   * @arg csum_off	Pointer to store L4 checksum field offset and flags
   270   * @arg dir		Flow direction
   271   *
   272   * Expects the skb to be validated for direct packet access up to L4. Fills
   273   * lb6_key_v2 based on L4 nexthdr.
   274   *
   275   * Returns:
   276   *   - TC_ACT_OK on successful extraction
   277   *   - DROP_UNKNOWN_L4 if packet should be ignore (sent to stack)
   278   *   - Negative error code
   279   */
   280  static inline int __inline__ lb6_extract_key_v2(struct __sk_buff *skb,
   281  						struct ipv6_ct_tuple *tuple,
   282  						int l4_off,
   283  						struct lb6_key_v2 *key,
   284  						struct csum_offset *csum_off,
   285  						int dir)
   286  {
   287  	union v6addr *addr;
   288  	// FIXME(brb): set after adding support for different L4 protocols in LB
   289  	key->proto = 0;
   290  	addr = (dir == CT_INGRESS) ? &tuple->saddr : &tuple->daddr;
   291  	ipv6_addr_copy(&key->address, addr);
   292  	csum_l4_offset_and_flags(tuple->nexthdr, csum_off);
   293  
   294  #ifdef LB_L4
   295  	return extract_l4_port(skb, tuple->nexthdr, l4_off, &key->dport);
   296  #else
   297  	return 0;
   298  #endif
   299  }
   300  
   301  static inline
   302  struct lb6_service_v2 *__lb6_lookup_service_v2(struct lb6_key_v2 *key)
   303  {
   304  	key->slave = 0;
   305  #ifdef LB_L4
   306  	if (key->dport) {
   307  		struct lb6_service_v2 *svc;
   308  
   309  		/* FIXME: The verifier barks on these calls right now for some reason */
   310  		/* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */
   311  		svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
   312  		if (svc && svc->count != 0)
   313  			return svc;
   314  
   315  		key->dport = 0;
   316  	}
   317  #endif
   318  
   319  #ifdef LB_L3
   320  	if (1) {
   321  		struct lb6_service_v2 *svc;
   322  
   323  		/* FIXME: The verifier barks on these calls right now for some reason */
   324  		/* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */
   325  		svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
   326  		if (svc && svc->count != 0)
   327  			return svc;
   328  	}
   329  #endif
   330  	return NULL;
   331  }
   332  
   333  static inline
   334  struct lb6_service_v2 *lb6_lookup_service_v2(struct __sk_buff *skb,
   335  					     struct lb6_key_v2 *key)
   336  {
   337  	struct lb6_service_v2 *svc = __lb6_lookup_service_v2(key);
   338  
   339  
   340  	if (!svc)
   341  		cilium_dbg_lb(skb, DBG_LB6_LOOKUP_MASTER_FAIL, 0, 0);
   342  
   343  	return svc;
   344  }
   345  
/* Look up an IPv6 backend by id; returns NULL if it does not exist */
static inline struct lb6_backend *__lb6_lookup_backend(__u16 backend_id)
{
	return map_lookup_elem(&LB6_BACKEND_MAP, &backend_id);
}
   350  
   351  static inline struct lb6_backend *lb6_lookup_backend(struct __sk_buff *skb,
   352  						     __u16 backend_id)
   353  {
   354  	struct lb6_backend *backend;
   355  
   356  	backend = __lb6_lookup_backend(backend_id);
   357  	if (!backend) {
   358  		cilium_dbg_lb(skb, DBG_LB6_LOOKUP_BACKEND_FAIL, backend_id, 0);
   359  	}
   360  
   361  	return backend;
   362  }
   363  
/* Look up the service entry for the slave slot set in key->slave;
 * returns NULL if the slot does not exist */
static inline
struct lb6_service_v2 *__lb6_lookup_slave_v2(struct lb6_key_v2 *key)
{
	return map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
}
   369  
   370  static inline
   371  struct lb6_service_v2 *lb6_lookup_slave_v2(struct __sk_buff *skb,
   372  					   struct lb6_key_v2 *key, __u16 slave)
   373  {
   374  	struct lb6_service_v2 *svc;
   375  
   376  	key->slave = slave;
   377  	cilium_dbg_lb(skb, DBG_LB6_LOOKUP_SLAVE, key->slave, key->dport);
   378  	svc = __lb6_lookup_slave_v2(key);
   379  	if (svc != NULL) {
   380  		return svc;
   381  	}
   382  
   383  	cilium_dbg_lb(skb, DBG_LB6_LOOKUP_SLAVE_V2_FAIL, key->slave, key->dport);
   384  
   385  	return NULL;
   386  }
   387  
/** Rewrite the IPv6 destination address (and destination L4 port, when
 * LB_L4) to the selected backend.
 * @arg skb		packet
 * @arg new_dst		backend address written as new destination
 * @arg nexthdr		L4 protocol
 * @arg l3_off		offset to L3 header
 * @arg l4_off		offset to L4 header
 * @arg csum_off	L4 checksum offset/flags; may be NULL to skip the
 *			pseudo-header checksum update
 * @arg key		service key holding the original address/port
 * @arg svc		selected service entry (not read here)
 * @arg backend		backend providing the target L4 port
 */
static inline int __inline__ lb6_xlate_v2(struct __sk_buff *skb,
					  union v6addr *new_dst, __u8 nexthdr,
				          int l3_off, int l4_off,
					  struct csum_offset *csum_off,
					  struct lb6_key_v2 *key,
					  struct lb6_service_v2 *svc,
					  struct lb6_backend *backend)
{
	ipv6_store_daddr(skb, new_dst->addr, l3_off);

	/* IPv6 has no L3 checksum; only the L4 pseudo-header checksum
	 * needs to absorb the address rewrite. */
	if (csum_off) {
		__be32 sum = csum_diff(key->address.addr, 16, new_dst->addr, 16, 0);
		if (csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

#ifdef LB_L4
	/* Only translate the port if the backend defines one and it differs */
	if (backend->port && key->dport != backend->port &&
	    (nexthdr == IPPROTO_TCP || nexthdr == IPPROTO_UDP)) {
		__be16 tmp = backend->port;
		int ret;

		/* Port offsets for UDP and TCP are the same */
		ret = l4_modify_port(skb, l4_off, TCP_DPORT_OFF, csum_off, tmp, key->dport);
		if (IS_ERR(ret))
			return ret;
	}
#endif

	return TC_ACT_OK;
}
   419  
/** Forward-translate an IPv6 packet addressed to a local service.
 *
 * Looks up (or creates) the CT_SERVICE conntrack entry for the flow,
 * selects a backend for new flows, re-selects one when the recorded
 * reverse NAT index is stale or the backend disappeared, and finally
 * rewrites the packet towards the backend via lb6_xlate_v2().
 *
 * @arg map		conntrack map
 * @arg skb		packet
 * @arg l3_off		offset to L3 header
 * @arg l4_off		offset to L4 header
 * @arg csum_off	L4 checksum offset/flags
 * @arg key		service lookup key; slave field is overwritten
 * @arg tuple		connection tuple; daddr becomes the backend address
 * @arg svc_v2		master service entry for the key
 * @arg state		conntrack state, updated with backend/rev-NAT info
 *
 * Returns the lb6_xlate_v2() result, or DROP_NO_SERVICE on failure.
 */
static inline int __inline__ lb6_local(void *map, struct __sk_buff *skb,
				       int l3_off, int l4_off,
				       struct csum_offset *csum_off,
				       struct lb6_key_v2 *key,
				       struct ipv6_ct_tuple *tuple,
				       struct lb6_service_v2 *svc_v2,
				       struct ct_state *state)
{
	__u32 monitor; // Deliberately ignored; regular CT will determine monitoring.
	union v6addr *addr;
	__u8 flags = tuple->flags;
	struct lb6_backend *backend;
	struct lb6_service_v2 *slave_svc;
	int slave;
	int ret;

	/* See lb4_local comments re svc endpoint lookup process */

	ret = ct_lookup6(map, tuple, skb, l4_off, CT_SERVICE, state, &monitor);
	switch(ret) {
	case CT_NEW:
		/* New flow: pick a random backend slot and record it in CT */
		slave = lb6_select_slave(svc_v2->count);
		if ((slave_svc = lb6_lookup_slave_v2(skb, key, slave)) == NULL) {
			goto drop_no_service;
		}
		backend = lb6_lookup_backend(skb, slave_svc->backend_id);
		if (backend == NULL) {
			goto drop_no_service;
		}
		state->backend_id = slave_svc->backend_id;
		state->rev_nat_index = svc_v2->rev_nat_index;
		ret = ct_create6(map, tuple, skb, CT_SERVICE, state, false);
		/* Fail closed, if the conntrack entry create fails drop
		 * service lookup.
		 */
		if (IS_ERR(ret)) {
			goto drop_no_service;
		}
		goto update_state;
	case CT_ESTABLISHED:
	case CT_RELATED:
	case CT_REPLY:
		// See lb4_local comment
		if (state->rev_nat_index == 0) {
			state->rev_nat_index = svc_v2->rev_nat_index;
			ct_update6_rev_nat_index(map, tuple, state);
		}
		break;
	default:
		goto drop_no_service;
	}

	// See lb4_local comment
	if (state->rev_nat_index != svc_v2->rev_nat_index) {
		/* Stale CT entry from a different service: re-select */
		cilium_dbg_lb(skb, DBG_LB_STALE_CT, svc_v2->rev_nat_index,
			      state->rev_nat_index);
		slave = lb6_select_slave(svc_v2->count);
		if (!(slave_svc = lb6_lookup_slave_v2(skb, key, slave))) {
			goto drop_no_service;
		}
		state->backend_id = slave_svc->backend_id;
		ct_update6_backend_id(map, tuple, state);
		state->rev_nat_index = svc_v2->rev_nat_index;
		ct_update6_rev_nat_index(map, tuple, state);
	}
	/* If the lookup fails it means the user deleted the backend out from
	 * underneath us. To resolve this fall back to hash. If this is a TCP
	 * session we are likely to get a TCP RST.
	 */
	if (!(backend = lb6_lookup_backend(skb, state->backend_id))) {
		key->slave = 0;
		if (!(svc_v2 = lb6_lookup_service_v2(skb, key))) {
			goto drop_no_service;
		}
		slave = lb6_select_slave(svc_v2->count);
		if (!(slave_svc = lb6_lookup_slave_v2(skb, key, slave))) {
			goto drop_no_service;
		}
		backend = lb6_lookup_backend(skb, slave_svc->backend_id);
		if (backend == NULL) {
			goto drop_no_service;
		}
		state->backend_id = slave_svc->backend_id;
		ct_update6_backend_id(map, tuple, state);
	}

update_state:
	/* Restore flags so that SERVICE flag is only used when the
	 * service lookup happens and future lookups use EGRESS or INGRESS.
	 */
	tuple->flags = flags;
	ipv6_addr_copy(&tuple->daddr, &backend->address);
	addr = &tuple->daddr;
	state->rev_nat_index = svc_v2->rev_nat_index;

	return lb6_xlate_v2(skb, addr, tuple->nexthdr, l3_off, l4_off,
			 csum_off, key, svc_v2, backend);

drop_no_service:
	tuple->flags = flags;
	return DROP_NO_SERVICE;
}
   522  #endif /* ENABLE_IPV6 */
   523  
   524  #ifdef ENABLE_IPV4
/** Perform IPv4 reverse NAT using an already looked-up reverse NAT entry.
 * @arg skb		packet
 * @arg l3_off		offset to L3 header
 * @arg l4_off		offset to L4 header
 * @arg csum_off	offset to L4 checksum field and flags
 * @arg tuple		connection tuple; saddr is updated when
 *			REV_NAT_F_TUPLE_SADDR is set in @flags
 * @arg flags		REV_NAT_F_TUPLE_SADDR or 0
 * @arg nat		reverse NAT entry with the original address/port
 * @arg ct_state	conntrack state; the loopback flag triggers the
 *			extra destination rewrite for looped-back replies
 */
static inline int __inline__ __lb4_rev_nat(struct __sk_buff *skb, int l3_off, int l4_off,
					 struct csum_offset *csum_off,
					 struct ipv4_ct_tuple *tuple, int flags,
					 struct lb4_reverse_nat *nat,
					 struct ct_state *ct_state)
{
	__be32 old_sip, new_sip, sum = 0;
	int ret;

	cilium_dbg_lb(skb, DBG_LB4_REVERSE_NAT, nat->address, nat->port);

	/* Undo the L4 port translation first, if one was applied */
	if (nat->port) {
		ret = reverse_map_l4_port(skb, tuple->nexthdr, nat->port, l4_off, csum_off);
		if (IS_ERR(ret))
			return ret;
	}

	if (flags & REV_NAT_F_TUPLE_SADDR) {
		old_sip = tuple->saddr;
		tuple->saddr = new_sip = nat->address;
	} else {
		/* Read the current source address straight from the packet */
		ret = skb_load_bytes(skb, l3_off + offsetof(struct iphdr, saddr), &old_sip, 4);
		if (IS_ERR(ret))
			return ret;

		new_sip = nat->address;
	}

	if (ct_state->loopback) {
		/* The packet was looped back to the sending endpoint on the
		 * forward service translation. This implies that the original
		 * source address of the packet is the source address of the
		 * current packet. We therefore need to make the current source
		 * address the new destination address */
		__be32 old_dip;

		ret = skb_load_bytes(skb, l3_off + offsetof(struct iphdr, daddr), &old_dip, 4);
		if (IS_ERR(ret))
			return ret;

		cilium_dbg_lb(skb, DBG_LB4_LOOPBACK_SNAT_REV, old_dip, old_sip);

		ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, daddr), &old_sip, 4, 0);
		if (IS_ERR(ret))
			return DROP_WRITE_ERROR;

		/* Accumulate the daddr rewrite into the checksum diff */
		sum = csum_diff(&old_dip, 4, &old_sip, 4, 0);

		/* Update the tuple address which is representing the destination address */
		tuple->saddr = old_sip;
	}

        ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, saddr), &new_sip, 4, 0);
	if (IS_ERR(ret))
		return DROP_WRITE_ERROR;

	/* Fold the accumulated diff into the IPv4 header checksum and, via
	 * the pseudo header, into the L4 checksum if it has one. */
	sum = csum_diff(&old_sip, 4, &new_sip, 4, sum);
	if (l3_csum_replace(skb, l3_off + offsetof(struct iphdr, check), 0, sum, 0) < 0)
		return DROP_CSUM_L3;

	if (csum_off->offset &&
	    csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;

	return 0;
}
   591  
   592  
   593  /** Perform IPv4 reverse NAT based on reverse NAT index
   594   * @arg skb		packet
   595   * @arg l3_off		offset to L3
   596   * @arg l4_off		offset to L4
   597   * @arg csum_off	offset to L4 checksum field
   598   * @arg csum_flags	checksum flags
   599   * @arg index		reverse NAT index
   600   * @arg tuple		tuple
   601   */
   602  static inline int __inline__ lb4_rev_nat(struct __sk_buff *skb, int l3_off, int l4_off,
   603  					 struct csum_offset *csum_off,
   604  					 struct ct_state *ct_state,
   605  					 struct ipv4_ct_tuple *tuple, int flags)
   606  {
   607  	struct lb4_reverse_nat *nat;
   608  
   609  	cilium_dbg_lb(skb, DBG_LB4_REVERSE_NAT_LOOKUP, ct_state->rev_nat_index, 0);
   610  	nat = map_lookup_elem(&LB4_REVERSE_NAT_MAP, &ct_state->rev_nat_index);
   611  	if (nat == NULL)
   612  		return 0;
   613  
   614  	return __lb4_rev_nat(skb, l3_off, l4_off, csum_off, tuple, flags, nat,
   615  			     ct_state);
   616  }
   617  
   618  /** Extract IPv4 LB key from packet
   619   * @arg skb		Packet
   620   * @arg tuple		Tuple
   621   * @arg l4_off		Offset to L4 header
   622   * @arg key		Pointer to store LB key in
   623   * @arg csum_off	Pointer to store L4 checksum field offset  in
   624   * @arg dir		Flow direction
   625   *
   626   * Returns:
   627   *   - TC_ACT_OK on successful extraction
   628   *   - DROP_UNKNOWN_L4 if packet should be ignore (sent to stack)
   629   *   - Negative error code
   630   */
   631  static inline int __inline__ lb4_extract_key_v2(struct __sk_buff *skb,
   632  						struct ipv4_ct_tuple *tuple,
   633  						int l4_off,
   634  						struct lb4_key_v2 *key,
   635  						struct csum_offset *csum_off,
   636  						int dir)
   637  {
   638  	// FIXME: set after adding support for different L4 protocols in LB
   639  	key->proto = 0;
   640  	key->address = (dir == CT_INGRESS) ? tuple->saddr : tuple->daddr;
   641  	csum_l4_offset_and_flags(tuple->nexthdr, csum_off);
   642  
   643  #ifdef LB_L4
   644  	return extract_l4_port(skb, tuple->nexthdr, l4_off, &key->dport);
   645  #else
   646  	return 0;
   647  #endif
   648  }
   649  
   650  static inline
   651  struct lb4_service_v2 *__lb4_lookup_service_v2(struct lb4_key_v2 *key)
   652  {
   653  	key->slave = 0;
   654  #ifdef LB_L4
   655  	if (key->dport) {
   656  		struct lb4_service_v2 *svc;
   657  
   658  		/* FIXME: The verifier barks on these calls right now for some reason */
   659  		/* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */
   660  		svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
   661  		if (svc && svc->count != 0)
   662  			return svc;
   663  
   664  		key->dport = 0;
   665  	}
   666  #endif
   667  
   668  #ifdef LB_L3
   669  	if (1) {
   670  		struct lb4_service_v2 *svc;
   671  
   672  		/* FIXME: The verifier barks on these calls right now for some reason */
   673  		/* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */
   674  		svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
   675  		if (svc && svc->count != 0)
   676  			return svc;
   677  	}
   678  #endif
   679  	return NULL;
   680  }
   681  
   682  static inline
   683  struct lb4_service_v2 *lb4_lookup_service_v2(struct __sk_buff *skb,
   684  					     struct lb4_key_v2 *key)
   685  {
   686  	struct lb4_service_v2 *svc = __lb4_lookup_service_v2(key);
   687  
   688  	if (!svc)
   689  		cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER_FAIL, 0, 0);
   690  
   691  	return svc;
   692  }
   693  
/* Look up an IPv4 backend by id; returns NULL if it does not exist */
static inline struct lb4_backend *__lb4_lookup_backend(__u16 backend_id)
{
	return map_lookup_elem(&LB4_BACKEND_MAP, &backend_id);
}
   698  
   699  static inline struct lb4_backend *lb4_lookup_backend(struct __sk_buff *skb,
   700  						     __u16 backend_id)
   701  {
   702  	struct lb4_backend *backend;
   703  
   704  	backend = __lb4_lookup_backend(backend_id);
   705  	if (!backend) {
   706  		cilium_dbg_lb(skb, DBG_LB4_LOOKUP_BACKEND_FAIL, backend_id, 0);
   707  	}
   708  
   709  	return backend;
   710  }
   711  
/* Look up the service entry for the slave slot set in key->slave;
 * returns NULL if the slot does not exist */
static inline
struct lb4_service_v2 *__lb4_lookup_slave_v2(struct lb4_key_v2 *key)
{
	return map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
}
   717  
   718  static inline
   719  struct lb4_service_v2 *lb4_lookup_slave_v2(struct __sk_buff *skb,
   720  					   struct lb4_key_v2 *key, __u16 slave)
   721  {
   722  	struct lb4_service_v2 *svc;
   723  
   724  	key->slave = slave;
   725  	cilium_dbg_lb(skb, DBG_LB4_LOOKUP_SLAVE, key->slave, key->dport);
   726  	svc = __lb4_lookup_slave_v2(key);
   727  	if (svc != NULL) {
   728  		return svc;
   729  	}
   730  
   731  	cilium_dbg_lb(skb, DBG_LB4_LOOKUP_SLAVE_V2_FAIL, key->slave, key->dport);
   732  
   733  	return NULL;
   734  }
   735  
/** Rewrite the IPv4 destination address — and optionally the source
 * address (loopback SNAT) and destination L4 port — to the selected
 * backend.
 * @arg skb		packet
 * @arg new_daddr	backend address written as new destination
 * @arg new_saddr	if non-NULL and non-zero, source address to write
 * @arg old_saddr	previous source address, used for checksum fix-up
 * @arg nexthdr		L4 protocol
 * @arg l3_off		offset to L3 header
 * @arg l4_off		offset to L4 header
 * @arg csum_off	L4 checksum offset/flags
 * @arg key		service key holding the original address/port
 * @arg svc		selected service entry (not read here)
 * @arg backend		backend providing the target L4 port
 */
static inline int __inline__
lb4_xlate_v2(struct __sk_buff *skb, __be32 *new_daddr, __be32 *new_saddr,
	     __be32 *old_saddr, __u8 nexthdr, int l3_off, int l4_off,
	     struct csum_offset *csum_off, struct lb4_key_v2 *key,
	     struct lb4_service_v2 *svc, struct lb4_backend *backend)
{
	int ret;
	__be32 sum;

	ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, daddr), new_daddr, 4, 0);
	if (ret < 0)
		return DROP_WRITE_ERROR;

	sum = csum_diff(&key->address, 4, new_daddr, 4, 0);

	/* Loopback case: also rewrite the source address */
	if (new_saddr && *new_saddr) {
		cilium_dbg_lb(skb, DBG_LB4_LOOPBACK_SNAT, *old_saddr, *new_saddr);
		ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, saddr), new_saddr, 4, 0);
		if (ret < 0)
			return DROP_WRITE_ERROR;

		sum = csum_diff(old_saddr, 4, new_saddr, 4, sum);
	}

	/* Fold the accumulated diff into the IPv4 header checksum and, via
	 * the pseudo header, into the L4 checksum if it has one. */
	if (l3_csum_replace(skb, l3_off + offsetof(struct iphdr, check), 0, sum, 0) < 0)
		return DROP_CSUM_L3;

	if (csum_off->offset) {
		if (csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

#ifdef LB_L4
	/* Only translate the port if the backend defines one and it differs */
	if (backend->port && key->dport != backend->port &&
	    (nexthdr == IPPROTO_TCP || nexthdr == IPPROTO_UDP)) {
		__be16 tmp = backend->port;
		/* Port offsets for UDP and TCP are the same */
		ret = l4_modify_port(skb, l4_off, TCP_DPORT_OFF, csum_off, tmp, key->dport);
		if (IS_ERR(ret))
			return ret;
	}
#endif

	return TC_ACT_OK;
}
   781  
/** Forward-translate an IPv4 packet addressed to a local service.
 *
 * Looks up (or creates) the CT_SERVICE conntrack entry for the flow,
 * selects a backend for new flows, re-selects one when the recorded
 * reverse NAT index is stale or the backend disappeared, handles the
 * hairpin/loopback SNAT case, and finally rewrites the packet towards
 * the backend via lb4_xlate_v2().
 *
 * @arg map		conntrack map
 * @arg skb		packet
 * @arg l3_off		offset to L3 header
 * @arg l4_off		offset to L4 header
 * @arg csum_off	L4 checksum offset/flags
 * @arg key		service lookup key; slave field is overwritten
 * @arg tuple		connection tuple; daddr becomes the backend address
 *			(except in the loopback case)
 * @arg svc_v2		master service entry for the key
 * @arg state		conntrack state, updated with backend/rev-NAT info
 * @arg saddr		packet source address, used to detect hairpin flows
 *
 * Returns the lb4_xlate_v2() result, or DROP_NO_SERVICE on failure.
 */
static inline int __inline__ lb4_local(void *map, struct __sk_buff *skb,
				       int l3_off, int l4_off,
				       struct csum_offset *csum_off,
				       struct lb4_key_v2 *key,
				       struct ipv4_ct_tuple *tuple,
					struct lb4_service_v2 *svc_v2,
				       struct ct_state *state, __be32 saddr)
{
	__u32 monitor; // Deliberately ignored; regular CT will determine monitoring.
	__be32 new_saddr = 0, new_daddr;
	__u8 flags = tuple->flags;
	struct lb4_backend *backend;
	struct lb4_service_v2 *slave_svc;
	int slave;
	int ret;

	ret = ct_lookup4(map, tuple, skb, l4_off, CT_SERVICE, state, &monitor);
	switch(ret) {
	case CT_NEW:
		/* No CT entry has been found, so select a svc endpoint */
		slave = lb4_select_slave(svc_v2->count);
		if ((slave_svc = lb4_lookup_slave_v2(skb, key, slave)) == NULL) {
			goto drop_no_service;
		}
		backend = lb4_lookup_backend(skb, slave_svc->backend_id);
		if (backend == NULL) {
			goto drop_no_service;
		}
		state->backend_id = slave_svc->backend_id;
		state->rev_nat_index = svc_v2->rev_nat_index;
		ret = ct_create4(map, tuple, skb, CT_SERVICE, state, false);
		/* Fail closed, if the conntrack entry create fails drop
		 * service lookup.
		 */
		if (IS_ERR(ret)) {
			goto drop_no_service;
		}
		goto update_state;
	case CT_ESTABLISHED:
	case CT_RELATED:
	case CT_REPLY:
		// For backward-compatibility we need to update reverse NAT index
		// in the CT_SERVICE entry for old connections, as later in the code
		// we check whether the right backend is used. Having it set to 0
		// would trigger a new backend selection which would in many cases
		// would pick a different backend.
		if (unlikely(state->rev_nat_index == 0)) {
			state->rev_nat_index = svc_v2->rev_nat_index;
			ct_update4_rev_nat_index(map, tuple, state);
		}
		break;
	default:
		goto drop_no_service;
	}

	// If the CT_SERVICE entry is from a non-related connection (e.g.
	// endpoint has been removed, but its CT entries were not (it is
	// totally possible due to the bug in DumpReliablyWithCallback)),
	// then a wrong (=from unrelated service) backend can be selected.
	// To avoid this, check that reverse NAT indices match. If not,
	// select a new backend.
	if (state->rev_nat_index != svc_v2->rev_nat_index) {
		cilium_dbg_lb(skb, DBG_LB_STALE_CT, svc_v2->rev_nat_index,
			      state->rev_nat_index);
		slave = lb4_select_slave(svc_v2->count);
		if (!(slave_svc = lb4_lookup_slave_v2(skb, key, slave))) {
			goto drop_no_service;
		}
		state->backend_id = slave_svc->backend_id;
		ct_update4_backend_id(map, tuple, state);
		state->rev_nat_index = svc_v2->rev_nat_index;
		ct_update4_rev_nat_index(map, tuple, state);
	}
	/* If the lookup fails it means the user deleted the backend out from
	 * underneath us. To resolve this fall back to hash. If this is a TCP
	 * session we are likely to get a TCP RST.
	 */
	if (!(backend = lb4_lookup_backend(skb, state->backend_id))) {
		key->slave = 0;
		if (!(svc_v2 = lb4_lookup_service_v2(skb, key))) {
			goto drop_no_service;
		}
		slave = lb4_select_slave(svc_v2->count);
		if (!(slave_svc = lb4_lookup_slave_v2(skb, key, slave))) {
			goto drop_no_service;
		}
		backend = lb4_lookup_backend(skb, slave_svc->backend_id);
		if (backend == NULL) {
			goto drop_no_service;
		}
		state->backend_id = slave_svc->backend_id;
		ct_update4_backend_id(map, tuple, state);
	}

update_state:
	/* Restore flags so that SERVICE flag is only used when the
	 * service lookup happens and future lookups use EGRESS or INGRESS.
	 */
	tuple->flags = flags;
	state->rev_nat_index = svc_v2->rev_nat_index;
	state->addr = new_daddr = backend->address;

#ifndef DISABLE_LOOPBACK_LB
	/* Special loopback case: The origin endpoint has transmitted to a
	 * service which is being translated back to the source. This would
	 * result in a packet with identical source and destination address.
	 * Linux considers such packets as martian source and will drop unless
	 * received on a loopback device. Perform NAT on the source address
	 * to make it appear from an outside address.
	 */
	if (saddr == backend->address) {
		new_saddr = IPV4_LOOPBACK;
		state->loopback = 1;
		state->addr = new_saddr;
		state->svc_addr = saddr;
	}
#endif

	if (!state->loopback)
		tuple->daddr = backend->address;

	return lb4_xlate_v2(skb, &new_daddr, &new_saddr, &saddr,
			 tuple->nexthdr, l3_off, l4_off, csum_off, key,
			 svc_v2, backend);

drop_no_service:
		tuple->flags = flags;
		return DROP_NO_SERVICE;
}
   911  #endif /* ENABLE_IPV4 */
   912  
   913  #endif /* __LB_H_ */