github.com/cilium/cilium@v1.16.2/bpf/lib/lb.h

/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright Authors of Cilium */

#pragma once

#include "bpf/compiler.h"
#include "csum.h"
#include "conntrack.h"
#include "ipv4.h"
#include "hash.h"
#include "ids.h"
#include "nat_46x64.h"
#include "ratelimit.h"

#ifndef SKIP_CALLS_MAP
#include "drop.h"
#endif

#ifdef ENABLE_IPV6
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u16);
	__type(value, struct lb6_reverse_nat);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_REV_NAT_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB6_REVERSE_NAT_MAP __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct lb6_key);
	__type(value, struct lb6_service);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_SERVICE_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB6_SERVICES_MAP_V2 __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u32);
	__type(value, struct lb6_backend);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB6_BACKEND_MAP __section_maps_btf;

#ifdef ENABLE_SESSION_AFFINITY
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, struct lb6_affinity_key);
	__type(value, struct lb_affinity_val);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_AFFINITY_MAP_MAX_ENTRIES);
} LB6_AFFINITY_MAP __section_maps_btf;
#endif

#ifdef ENABLE_SRC_RANGE_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct lb6_src_range_key);
	__type(value, __u8);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, LB6_SRC_RANGE_MAP_SIZE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} LB6_SRC_RANGE_MAP __section_maps_btf;
#endif

#ifdef ENABLE_HEALTH_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, __sock_cookie);
	__type(value, struct lb6_health);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
} LB6_HEALTH_MAP __section_maps_btf;
#endif

#if LB_SELECTION == LB_SELECTION_MAGLEV
struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	__type(key, __u16);
	__type(value, __u32);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_MAGLEV_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
	/* Maglev inner map definition */
	__array(values, struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(key_size, sizeof(__u32));
		__uint(value_size, sizeof(__u32) * LB_MAGLEV_LUT_SIZE);
		__uint(max_entries, 1);
	});
} LB6_MAGLEV_MAP_OUTER __section_maps_btf;
#endif /* LB_SELECTION == LB_SELECTION_MAGLEV */
#endif /* ENABLE_IPV6 */

#ifdef ENABLE_IPV4
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u16);
	__type(value, struct lb4_reverse_nat);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_REV_NAT_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB4_REVERSE_NAT_MAP __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct lb4_key);
	__type(value, struct lb4_service);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_SERVICE_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB4_SERVICES_MAP_V2 __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u32);
	__type(value, struct lb4_backend);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB4_BACKEND_MAP __section_maps_btf;

#ifdef ENABLE_SESSION_AFFINITY
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, struct lb4_affinity_key);
	__type(value, struct lb_affinity_val);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_AFFINITY_MAP_MAX_ENTRIES);
} LB4_AFFINITY_MAP __section_maps_btf;
#endif

#ifdef ENABLE_SRC_RANGE_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct lb4_src_range_key);
	__type(value, __u8);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, LB4_SRC_RANGE_MAP_SIZE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} LB4_SRC_RANGE_MAP __section_maps_btf;
#endif

#ifdef ENABLE_HEALTH_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, __sock_cookie);
	__type(value, struct lb4_health);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
} LB4_HEALTH_MAP __section_maps_btf;
#endif

#if LB_SELECTION == LB_SELECTION_MAGLEV
struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	__type(key, __u16);
	__type(value, __u32);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_MAGLEV_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
	/* Maglev inner map definition */
	__array(values, struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(key_size, sizeof(__u32));
		__uint(value_size, sizeof(__u32) * LB_MAGLEV_LUT_SIZE);
		__uint(max_entries, 1);
	});
} LB4_MAGLEV_MAP_OUTER __section_maps_btf;
#endif /* LB_SELECTION == LB_SELECTION_MAGLEV */
#endif /* ENABLE_IPV4 */

#ifdef ENABLE_SESSION_AFFINITY
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct lb_affinity_match);
	__type(value, __u8);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_AFFINITY_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB_AFFINITY_MATCH_MAP __section_maps_btf;
#endif

#ifndef DSR_XLATE_MODE
# define DSR_XLATE_MODE 0
# define DSR_XLATE_FRONTEND 1
#endif
#ifdef LB_DEBUG
#define cilium_dbg_lb cilium_dbg
#else
#define cilium_dbg_lb(a, b, c, d)
#endif

#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
#include "act.h"
#endif
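/* Illustrative sketch only (not part of this header's API): how the maps
 * above relate to each other for IPv4, using helpers defined further down
 * in this file. The service entry at backend_slot 0 describes the frontend
 * and carries the backend count, slots 1..count resolve to backend IDs, and
 * LB4_BACKEND_MAP maps a backend ID to the backend's address and port.
 * example_resolve_slot() is a hypothetical name used for illustration.
 */
#if 0
static __always_inline struct lb4_backend *
example_resolve_slot(struct __ctx_buff *ctx, struct lb4_key *key, __u16 slot)
{
	/* The slot lookup returns a service entry whose backend_id is set. */
	struct lb4_service *entry = lb4_lookup_backend_slot(ctx, key, slot);

	return entry ? lb4_lookup_backend(ctx, entry->backend_id) : NULL;
}
#endif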

static __always_inline bool lb_is_svc_proto(__u8 proto)
{
	switch (proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		return true;
	default:
		return false;
	}
}

static __always_inline
bool lb4_svc_is_loadbalancer(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_LOADBALANCER;
}

static __always_inline
bool lb6_svc_is_loadbalancer(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_LOADBALANCER;
}

static __always_inline
bool lb4_svc_is_nodeport(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_NODEPORT
	return svc->flags & SVC_FLAG_NODEPORT;
#else
	return false;
#endif /* ENABLE_NODEPORT */
}

static __always_inline
bool lb6_svc_is_nodeport(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_NODEPORT
	return svc->flags & SVC_FLAG_NODEPORT;
#else
	return false;
#endif /* ENABLE_NODEPORT */
}

static __always_inline
bool lb4_svc_is_external_ip(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_EXTERNAL_IP;
}

static __always_inline
bool lb6_svc_is_external_ip(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_EXTERNAL_IP;
}

static __always_inline
bool lb4_svc_is_hostport(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_HOSTPORT;
}

static __always_inline
bool lb6_svc_is_hostport(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_HOSTPORT;
}

static __always_inline
bool lb4_svc_is_loopback(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags2 & SVC_FLAG_LOOPBACK;
}

static __always_inline
bool lb6_svc_is_loopback(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags2 & SVC_FLAG_LOOPBACK;
}

static __always_inline
bool lb4_svc_has_src_range_check(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	return svc->flags & SVC_FLAG_SOURCE_RANGE;
#else
	return false;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline
bool lb6_svc_has_src_range_check(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	return svc->flags & SVC_FLAG_SOURCE_RANGE;
#else
	return false;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline bool lb_skip_l4_dnat(void)
{
	return DSR_XLATE_MODE == DSR_XLATE_FRONTEND;
}

static __always_inline
bool lb4_svc_is_two_scopes(const struct lb4_service *svc)
{
	return svc->flags2 & SVC_FLAG_TWO_SCOPES;
}

static __always_inline
bool lb6_svc_is_two_scopes(const struct lb6_service *svc)
{
	return svc->flags2 & SVC_FLAG_TWO_SCOPES;
}

static __always_inline
bool lb4_svc_is_affinity(const struct lb4_service *svc)
{
	return svc->flags & SVC_FLAG_AFFINITY;
}

static __always_inline
bool lb6_svc_is_affinity(const struct lb6_service *svc)
{
	return svc->flags & SVC_FLAG_AFFINITY;
}

static __always_inline bool __lb_svc_is_routable(__u8 flags)
{
	return (flags & SVC_FLAG_ROUTABLE) != 0;
}

static __always_inline
bool lb4_svc_is_routable(const struct lb4_service *svc)
{
	return __lb_svc_is_routable(svc->flags);
}

static __always_inline
bool lb6_svc_is_routable(const struct lb6_service *svc)
{
	return __lb_svc_is_routable(svc->flags);
}

#ifdef ENABLE_LOCAL_REDIRECT_POLICY
static __always_inline
bool lb4_svc_is_localredirect(const struct lb4_service *svc)
{
	return svc->flags2 & SVC_FLAG_LOCALREDIRECT;
}

static __always_inline
bool lb6_svc_is_localredirect(const struct lb6_service *svc)
{
	return svc->flags2 & SVC_FLAG_LOCALREDIRECT;
}
#endif /* ENABLE_LOCAL_REDIRECT_POLICY */

static __always_inline
bool lb4_svc_is_l7loadbalancer(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_L7_LB
	return svc->flags2 & SVC_FLAG_L7LOADBALANCER;
#else
	return false;
#endif
}

static __always_inline
bool lb6_svc_is_l7loadbalancer(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_L7_LB
	return svc->flags2 & SVC_FLAG_L7LOADBALANCER;
#else
	return false;
#endif
}

static __always_inline int reverse_map_l4_port(struct __ctx_buff *ctx, __u8 nexthdr,
					       __be16 old_port, __be16 port, int l4_off,
					       struct csum_offset *csum_off)
{
	switch (nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		if (port) {
			int ret;

			if (port != old_port) {
#ifdef ENABLE_SCTP
				/* This will change the SCTP checksum, which we cannot fix right now.
				 * This will likely need kernel changes before we can remove this.
				 */
				if (nexthdr == IPPROTO_SCTP)
					return DROP_CSUM_L4;
#endif /* ENABLE_SCTP */
				ret = l4_modify_port(ctx, l4_off, TCP_SPORT_OFF,
						     csum_off, port, old_port);
				if (IS_ERR(ret))
					return ret;
			}
		}
		break;

	case IPPROTO_ICMPV6:
	case IPPROTO_ICMP:
		return CTX_ACT_OK;

	default:
		return DROP_UNKNOWN_L4;
	}

	return 0;
}

static __always_inline int
lb_l4_xlate(struct __ctx_buff *ctx, __u8 nexthdr __maybe_unused, int l4_off,
	    struct csum_offset *csum_off, __be16 dport, __be16 backend_port)
{
	if (likely(backend_port) && dport != backend_port) {
		int ret;

#ifdef ENABLE_SCTP
		/* This will change the SCTP checksum, which we cannot fix right now.
		 * This will likely need kernel changes before we can remove this.
		 */
		if (nexthdr == IPPROTO_SCTP)
			return DROP_CSUM_L4;
#endif /* ENABLE_SCTP */

		/* Port offsets for UDP and TCP are the same */
		ret = l4_modify_port(ctx, l4_off, TCP_DPORT_OFF, csum_off,
				     backend_port, dport);
		if (IS_ERR(ret))
			return ret;
	}

	return CTX_ACT_OK;
}
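/* Usage sketch (illustrative only): translating the service port to the
 * backend port on a TCP packet. csum_l4_offset_and_flags() locates the L4
 * checksum field so that lb_l4_xlate() can fix it up along with the port.
 * example_dnat_port() is a hypothetical helper name, not part of this file.
 */
#if 0
static __always_inline int
example_dnat_port(struct __ctx_buff *ctx, int l4_off,
		  __be16 svc_port, __be16 backend_port)
{
	struct csum_offset csum_off = {};

	csum_l4_offset_and_flags(IPPROTO_TCP, &csum_off);

	/* No-op if backend_port is 0 or already equals svc_port. */
	return lb_l4_xlate(ctx, IPPROTO_TCP, l4_off, &csum_off,
			   svc_port, backend_port);
}
#endif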
#ifdef ENABLE_IPV6
static __always_inline int __lb6_rev_nat(struct __ctx_buff *ctx, int l4_off,
					 struct ipv6_ct_tuple *tuple,
					 struct lb6_reverse_nat *nat)
{
	struct csum_offset csum_off = {};
	union v6addr old_saddr __align_stack_8;
	__be32 sum;
	int ret;

	cilium_dbg_lb(ctx, DBG_LB6_REVERSE_NAT, nat->address.p4, nat->port);

	csum_l4_offset_and_flags(tuple->nexthdr, &csum_off);

	if (nat->port) {
		ret = reverse_map_l4_port(ctx, tuple->nexthdr, tuple->dport,
					  nat->port, l4_off, &csum_off);
		if (IS_ERR(ret))
			return ret;
	}

	ipv6_addr_copy(&old_saddr, &tuple->saddr);
	ipv6_addr_copy(&tuple->saddr, &nat->address);

	ret = ipv6_store_saddr(ctx, nat->address.addr, ETH_HLEN);
	if (IS_ERR(ret))
		return DROP_WRITE_ERROR;

	sum = csum_diff(old_saddr.addr, 16, nat->address.addr, 16, 0);
	if (csum_off.offset &&
	    csum_l4_replace(ctx, l4_off, &csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;

	return 0;
}

static __always_inline struct lb6_reverse_nat *
lb6_lookup_rev_nat_entry(struct __ctx_buff *ctx __maybe_unused, __u16 index)
{
	cilium_dbg_lb(ctx, DBG_LB6_REVERSE_NAT_LOOKUP, index, 0);

	return map_lookup_elem(&LB6_REVERSE_NAT_MAP, &index);
}

/** Perform IPv6 reverse NAT based on reverse NAT index
 * @arg ctx		packet
 * @arg l4_off		offset to L4
 * @arg index		reverse NAT index
 * @arg tuple		tuple
 */
static __always_inline int lb6_rev_nat(struct __ctx_buff *ctx, int l4_off,
				       __u16 index, struct ipv6_ct_tuple *tuple)
{
	struct lb6_reverse_nat *nat;

	nat = lb6_lookup_rev_nat_entry(ctx, index);
	if (nat == NULL)
		return 0;

	return __lb6_rev_nat(ctx, l4_off, tuple, nat);
}

static __always_inline void
lb6_fill_key(struct lb6_key *key, struct ipv6_ct_tuple *tuple)
{
	/* FIXME: set after adding support for different L4 protocols in LB */
	key->proto = 0;
	ipv6_addr_copy(&key->address, &tuple->daddr);
	key->dport = tuple->sport;
}

/** Extract IPv6 CT tuple from packet
 * @arg ctx		Packet
 * @arg ip6		Pointer to L3 header
 * @arg l3_off		Offset to L3 header
 * @arg l4_off		Offset to L4 header
 * @arg tuple		CT tuple
 *
 * Expects the ctx to be validated for direct packet access up to L4.
 *
 * Returns:
 *   - CTX_ACT_OK on successful extraction
 *   - DROP_UNKNOWN_L4 if packet should be ignored (sent to stack)
 *   - Negative error code
 */
static __always_inline int
lb6_extract_tuple(struct __ctx_buff *ctx, struct ipv6hdr *ip6, int l3_off,
		  int *l4_off, struct ipv6_ct_tuple *tuple)
{
	int ret;

	tuple->nexthdr = ip6->nexthdr;
	ipv6_addr_copy(&tuple->daddr, (union v6addr *)&ip6->daddr);
	ipv6_addr_copy(&tuple->saddr, (union v6addr *)&ip6->saddr);

	ret = ipv6_hdrlen_offset(ctx, &tuple->nexthdr, l3_off);
	if (ret < 0) {
		/* Make sure *l4_off is always initialized on return, because
		 * Clang can spill it from a register to the stack even in error
		 * flows where this value is no longer used, and this pattern is
		 * rejected by the verifier.
		 * Use a prominent value (-1) to highlight any potential misuse.
		 */
		*l4_off = -1;
		return ret;
	}

	*l4_off = l3_off + ret;

	switch (tuple->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		if (l4_load_ports(ctx, *l4_off, &tuple->dport) < 0)
			return DROP_CT_INVALID_HDR;
		return 0;
	case IPPROTO_ICMPV6:
		return DROP_UNSUPP_SERVICE_PROTO;
	default:
		return DROP_UNKNOWN_L4;
	}
}

static __always_inline
bool lb6_src_range_ok(const struct lb6_service *svc __maybe_unused,
		      const union v6addr *saddr __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	struct lb6_src_range_key key;

	if (!lb6_svc_has_src_range_check(svc))
		return true;

	key = (typeof(key)) {
		.lpm_key = { SRC_RANGE_STATIC_PREFIX(key), {} },
		.rev_nat_id = svc->rev_nat_index,
		.addr = *saddr,
	};

	if (map_lookup_elem(&LB6_SRC_RANGE_MAP, &key))
		return true;

	return false;
#else
	return true;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline bool
lb6_to_lb4_service(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	return svc->flags2 & SVC_FLAG_NAT_46X64;
#else
	return false;
#endif
}

static __always_inline
struct lb6_service *lb6_lookup_service(struct lb6_key *key,
				       const bool scope_switch)
{
	struct lb6_service *svc;

	key->scope = LB_LOOKUP_SCOPE_EXT;
	key->backend_slot = 0;
	svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
	if (svc) {
		if (!scope_switch || !lb6_svc_is_two_scopes(svc))
			return svc;
		key->scope = LB_LOOKUP_SCOPE_INT;
		svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
	}

	return svc;
}
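/* Usage sketch (illustrative only): how the IPv6 helpers above typically
 * compose on the request path. The surrounding program and the hypothetical
 * helper name example_lb6_service_lookup() are assumptions; actual callers
 * (e.g. in bpf_lxc or the nodeport code) add more checks around this.
 */
#if 0
static __always_inline struct lb6_service *
example_lb6_service_lookup(struct __ctx_buff *ctx, struct ipv6hdr *ip6,
			   struct ipv6_ct_tuple *tuple, struct lb6_key *key,
			   int *l4_off)
{
	int ret;

	ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, l4_off, tuple);
	if (ret < 0)
		return NULL;

	lb6_fill_key(key, tuple);

	/* scope_switch=false: only the external lookup scope is consulted. */
	return lb6_lookup_service(key, false);
}
#endif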

static __always_inline struct lb6_backend *__lb6_lookup_backend(__u32 backend_id)
{
	return map_lookup_elem(&LB6_BACKEND_MAP, &backend_id);
}

static __always_inline struct lb6_backend *
lb6_lookup_backend(struct __ctx_buff *ctx __maybe_unused, __u32 backend_id)
{
	struct lb6_backend *backend;

	backend = __lb6_lookup_backend(backend_id);
	if (!backend)
		cilium_dbg_lb(ctx, DBG_LB6_LOOKUP_BACKEND_FAIL, backend_id, 0);

	return backend;
}

static __always_inline
struct lb6_service *__lb6_lookup_backend_slot(struct lb6_key *key)
{
	return map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
}

static __always_inline
struct lb6_service *lb6_lookup_backend_slot(struct __ctx_buff *ctx __maybe_unused,
					    struct lb6_key *key, __u16 slot)
{
	struct lb6_service *svc;

	key->backend_slot = slot;
	cilium_dbg_lb(ctx, DBG_LB6_LOOKUP_BACKEND_SLOT, key->backend_slot, key->dport);
	svc = __lb6_lookup_backend_slot(key);
	if (svc)
		return svc;

	cilium_dbg_lb(ctx, DBG_LB6_LOOKUP_BACKEND_SLOT_V2_FAIL, key->backend_slot, key->dport);

	return NULL;
}

/* Backend slot 0 is always reserved for the service frontend. */
#if LB_SELECTION == LB_SELECTION_RANDOM
static __always_inline __u32
lb6_select_backend_id(struct __ctx_buff *ctx,
		      struct lb6_key *key,
		      const struct ipv6_ct_tuple *tuple __maybe_unused,
		      const struct lb6_service *svc)
{
	__u16 slot = (get_prandom_u32() % svc->count) + 1;
	struct lb6_service *be = lb6_lookup_backend_slot(ctx, key, slot);

	return be ? be->backend_id : 0;
}
#elif LB_SELECTION == LB_SELECTION_MAGLEV
static __always_inline __u32
lb6_select_backend_id(struct __ctx_buff *ctx __maybe_unused,
		      struct lb6_key *key __maybe_unused,
		      const struct ipv6_ct_tuple *tuple,
		      const struct lb6_service *svc)
{
	__u32 zero = 0, index = svc->rev_nat_index;
	__u32 *backend_ids;
	void *maglev_lut;

	maglev_lut = map_lookup_elem(&LB6_MAGLEV_MAP_OUTER, &index);
	if (unlikely(!maglev_lut))
		return 0;

	backend_ids = map_lookup_elem(maglev_lut, &zero);
	if (unlikely(!backend_ids))
		return 0;

	index = hash_from_tuple_v6(tuple) % LB_MAGLEV_LUT_SIZE;
	return map_array_get_32(backend_ids, index, (LB_MAGLEV_LUT_SIZE - 1) << 2);
}
#elif LB_SELECTION == LB_SELECTION_FIRST
/* Backend selection for tests that always chooses first slot. */
static __always_inline __u32
lb6_select_backend_id(struct __ctx_buff *ctx,
		      struct lb6_key *key,
		      const struct ipv6_ct_tuple *tuple __maybe_unused,
		      const struct lb6_service *svc)
{
	struct lb6_service *be = lb6_lookup_backend_slot(ctx, key, 1);

	return be ? be->backend_id : 0;
}
#else
# error "Invalid load balancer backend selection algorithm!"
#endif /* LB_SELECTION */
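/* Illustrative sketch only: a plain-C rendering of the Maglev selection
 * above, assuming a hypothetical, verifier-unconstrained array of
 * LB_MAGLEV_LUT_SIZE backend IDs. In the real code map_array_get_32()
 * performs the same indexed read, bounded by the byte offset of the last
 * valid entry, i.e. (LB_MAGLEV_LUT_SIZE - 1) << 2.
 */
#if 0
static __always_inline __u32
example_maglev_pick(const __u32 *lut, __u32 hash)
{
	__u32 slot = hash % LB_MAGLEV_LUT_SIZE;

	return lut[slot]; /* backend ID; 0 means no backend selected */
}
#endif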

static __always_inline int lb6_xlate(struct __ctx_buff *ctx, __u8 nexthdr,
				     int l3_off, int l4_off,
				     const struct lb6_key *key,
				     const struct lb6_backend *backend,
				     const bool skip_l3_xlate)
{
	const union v6addr *new_dst = &backend->address;
	struct csum_offset csum_off = {};

	csum_l4_offset_and_flags(nexthdr, &csum_off);

	if (skip_l3_xlate)
		goto l4_xlate;

	if (ipv6_store_daddr(ctx, new_dst->addr, l3_off) < 0)
		return DROP_WRITE_ERROR;
	if (csum_off.offset) {
		__be32 sum = csum_diff(key->address.addr, 16, new_dst->addr,
				       16, 0);

		if (csum_l4_replace(ctx, l4_off, &csum_off, 0, sum,
				    BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

l4_xlate:
	return lb_l4_xlate(ctx, nexthdr, l4_off, &csum_off, key->dport,
			   backend->port);
}

#ifdef ENABLE_SESSION_AFFINITY
static __always_inline __u32
__lb6_affinity_backend_id(const struct lb6_service *svc, bool netns_cookie,
			  union lb6_affinity_client_id *id)
{
	struct lb6_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
	};
	struct lb_affinity_val *val;

	if (netns_cookie)
		key.client_id.client_cookie = id->client_cookie;
	else
		ipv6_addr_copy_unaligned(&key.client_id.client_ip, &id->client_ip);

	val = map_lookup_elem(&LB6_AFFINITY_MAP, &key);
	if (val != NULL) {
		__u32 now = bpf_mono_now();
		struct lb_affinity_match match = {
			.rev_nat_id = svc->rev_nat_index,
			.backend_id = val->backend_id,
		};

		if (READ_ONCE(val->last_used) +
		    bpf_sec_to_mono(svc->affinity_timeout) <= now) {
			map_delete_elem(&LB6_AFFINITY_MAP, &key);
			return 0;
		}

		if (!map_lookup_elem(&LB_AFFINITY_MATCH_MAP, &match)) {
			map_delete_elem(&LB6_AFFINITY_MAP, &key);
			return 0;
		}

		WRITE_ONCE(val->last_used, now);
		return val->backend_id;
	}

	return 0;
}

static __always_inline __u32
lb6_affinity_backend_id_by_addr(const struct lb6_service *svc,
				union lb6_affinity_client_id *id)
{
	return __lb6_affinity_backend_id(svc, false, id);
}

static __always_inline void
__lb6_update_affinity(const struct lb6_service *svc, bool netns_cookie,
		      union lb6_affinity_client_id *id, __u32 backend_id)
{
	__u32 now = bpf_mono_now();
	struct lb6_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
	};
	struct lb_affinity_val val = {
		.backend_id = backend_id,
		.last_used = now,
	};

	if (netns_cookie)
		key.client_id.client_cookie = id->client_cookie;
	else
		ipv6_addr_copy_unaligned(&key.client_id.client_ip, &id->client_ip);

	map_update_elem(&LB6_AFFINITY_MAP, &key, &val, 0);
}

static __always_inline void
lb6_update_affinity_by_addr(const struct lb6_service *svc,
			    union lb6_affinity_client_id *id, __u32 backend_id)
{
	__lb6_update_affinity(svc, false, id, backend_id);
}
#endif /* ENABLE_SESSION_AFFINITY */
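/* Illustrative sketch only: the expiry rule applied by the affinity lookups
 * above. An entry stays sticky for [last_used, last_used + affinity_timeout)
 * seconds; once that window has passed, the entry is deleted and a fresh
 * backend is selected. example_affinity_expired() is a hypothetical name.
 */
#if 0
static __always_inline bool
example_affinity_expired(__u32 last_used, __u32 affinity_timeout_sec)
{
	return last_used + bpf_sec_to_mono(affinity_timeout_sec) <= bpf_mono_now();
}
#endif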

static __always_inline __u32
lb6_affinity_backend_id_by_netns(const struct lb6_service *svc __maybe_unused,
				 union lb6_affinity_client_id *id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	return __lb6_affinity_backend_id(svc, true, id);
#else
	return 0;
#endif
}

static __always_inline void
lb6_update_affinity_by_netns(const struct lb6_service *svc __maybe_unused,
			     union lb6_affinity_client_id *id __maybe_unused,
			     __u32 backend_id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	__lb6_update_affinity(svc, true, id, backend_id);
#endif
}

static __always_inline int
lb6_to_lb4(struct __ctx_buff *ctx __maybe_unused,
	   const struct ipv6hdr *ip6 __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	__be32 src4, dst4;

	build_v4_from_v6((const union v6addr *)&ip6->saddr, &src4);
	build_v4_from_v6((const union v6addr *)&ip6->daddr, &dst4);

	return ipv6_to_ipv4(ctx, src4, dst4);
#else
	return DROP_NAT_46X64_DISABLED;
#endif
}

static __always_inline int lb6_local(const void *map, struct __ctx_buff *ctx,
				     int l3_off, int l4_off,
				     struct lb6_key *key,
				     struct ipv6_ct_tuple *tuple,
				     const struct lb6_service *svc,
				     struct ct_state *state,
				     const bool skip_l3_xlate,
				     __s8 *ext_err)
{
	__u32 monitor; /* Deliberately ignored; regular CT will determine monitoring. */
	__u8 flags = tuple->flags;
	struct lb6_backend *backend;
	__u32 backend_id = 0;
	int ret;
#ifdef ENABLE_SESSION_AFFINITY
	union lb6_affinity_client_id client_id;

	ipv6_addr_copy(&client_id.client_ip, &tuple->saddr);
#endif

	state->rev_nat_index = svc->rev_nat_index;

	/* See lb4_local comments re svc endpoint lookup process */
	ret = ct_lazy_lookup6(map, tuple, ctx, l4_off, CT_SERVICE,
			      SCOPE_REVERSE, CT_ENTRY_SVC, state, &monitor);
	if (ret < 0)
		goto drop_err;

	switch (ret) {
	case CT_NEW:
		if (unlikely(svc->count == 0))
			goto no_service;

#ifdef ENABLE_SESSION_AFFINITY
		if (lb6_svc_is_affinity(svc)) {
			backend_id = lb6_affinity_backend_id_by_addr(svc, &client_id);
			if (backend_id != 0) {
				backend = lb6_lookup_backend(ctx, backend_id);
				if (backend == NULL)
					backend_id = 0;
			}
		}
#endif
		if (backend_id == 0) {
			backend_id = lb6_select_backend_id(ctx, key, tuple, svc);
			backend = lb6_lookup_backend(ctx, backend_id);
			if (backend == NULL)
				goto no_service;
		}

		state->backend_id = backend_id;

		ret = ct_create6(map, NULL, tuple, ctx, CT_SERVICE, state, ext_err);
		/* Fail closed, if the conntrack entry create fails drop
		 * service lookup.
		 */
		if (IS_ERR(ret))
			goto drop_err;

#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		_lb_act_conn_open(state->rev_nat_index, backend->zone);
#endif

		break;
	case CT_REPLY:
		backend_id = state->backend_id;

		/* If the lookup fails it means the user deleted the backend out from
		 * underneath us. To resolve this fall back to hash. If this is a TCP
		 * session we are likely to get a TCP RST.
		 */
		backend = lb6_lookup_backend(ctx, backend_id);
#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		if (state->closing && backend)
			_lb_act_conn_closed(svc->rev_nat_index, backend->zone);
#endif
		if (unlikely(!backend || backend->flags != BE_STATE_ACTIVE)) {
			/* Drain existing connections, but redirect new ones to only
			 * active backends.
			 */
			if (backend && !state->syn)
				break;

			if (unlikely(svc->count == 0))
				goto no_service;

			backend_id = lb6_select_backend_id(ctx, key, tuple, svc);
			backend = lb6_lookup_backend(ctx, backend_id);
			if (!backend)
				goto no_service;

			state->rev_nat_index = svc->rev_nat_index;
			ct_update_svc_entry(map, tuple, backend_id, svc->rev_nat_index);
		}

		break;
	default:
		ret = DROP_UNKNOWN_CT;
		goto drop_err;
	}

	/* Restore flags so that SERVICE flag is only used when the service
	 * lookup happens and future lookups use EGRESS or INGRESS.
	 */
	tuple->flags = flags;
#ifdef ENABLE_SESSION_AFFINITY
	if (lb6_svc_is_affinity(svc))
		lb6_update_affinity_by_addr(svc, &client_id, backend_id);
#endif

	ipv6_addr_copy(&tuple->daddr, &backend->address);

	if (lb_skip_l4_dnat())
		return CTX_ACT_OK;

	if (likely(backend->port))
		tuple->sport = backend->port;

	return lb6_xlate(ctx, tuple->nexthdr, l3_off, l4_off,
			 key, backend, skip_l3_xlate);
no_service:
	ret = DROP_NO_SERVICE;
drop_err:
	tuple->flags = flags;
	return ret;
}

/* lb6_ctx_store_state() stores per packet load balancing state to be picked
 * up on the continuation tail call.
 * Note that the IP headers are already xlated and the tuple is re-initialized
 * from the xlated headers before restoring state.
 * NOTE: if lb_skip_l4_dnat() this is not the case as xlate is skipped. We
 * lose the updated tuple daddr in that case.
 */
static __always_inline void lb6_ctx_store_state(struct __ctx_buff *ctx,
						const struct ct_state *state,
						__u16 proxy_port)
{
	ctx_store_meta(ctx, CB_PROXY_MAGIC, (__u32)proxy_port << 16);
	ctx_store_meta(ctx, CB_CT_STATE, (__u32)state->rev_nat_index);
}

/* lb6_ctx_restore_state() restores per packet load balancing state from the
 * previous tail call.
 * tuple->flags does not need to be restored, as it will be reinitialized from
 * the packet.
 */
static __always_inline void lb6_ctx_restore_state(struct __ctx_buff *ctx,
						  struct ct_state *state,
						  __u16 *proxy_port)
{
	state->rev_nat_index = (__u16)ctx_load_and_clear_meta(ctx, CB_CT_STATE);

	/* No loopback support for IPv6, see lb6_local() above. */

	*proxy_port = ctx_load_and_clear_meta(ctx, CB_PROXY_MAGIC) >> 16;
}
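/* Usage sketch (illustrative only): the store/restore pair above brackets a
 * tail call. The hypothetical example_lb6_state_handoff() below only shows
 * the calling convention; in practice the two calls live in different
 * programs, with the tail call in between.
 */
#if 0
static __always_inline void
example_lb6_state_handoff(struct __ctx_buff *ctx, struct ct_state *state,
			  __u16 proxy_port)
{
	/* In the program that performed the service translation: */
	lb6_ctx_store_state(ctx, state, proxy_port);

	/* In the continuation program, after the tail call: */
	lb6_ctx_restore_state(ctx, state, &proxy_port);
}
#endif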

#else

/* Stubs for v4-in-v6 socket cgroup hook case when only v4 is enabled to avoid
 * additional map management.
 */
static __always_inline
struct lb6_service *lb6_lookup_service(struct lb6_key *key __maybe_unused,
				       const bool scope_switch __maybe_unused)
{
	return NULL;
}

static __always_inline
struct lb6_service *__lb6_lookup_backend_slot(struct lb6_key *key __maybe_unused)
{
	return NULL;
}

static __always_inline struct lb6_backend *
__lb6_lookup_backend(__u16 backend_id __maybe_unused)
{
	return NULL;
}

static __always_inline bool
lb6_to_lb4_service(const struct lb6_service *svc __maybe_unused)
{
	return false;
}
#endif /* ENABLE_IPV6 */

#ifdef ENABLE_IPV4
static __always_inline int __lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int l4_off,
					 struct ipv4_ct_tuple *tuple,
					 const struct lb4_reverse_nat *nat,
					 bool loopback __maybe_unused, bool has_l4_header)
{
	__be32 old_sip = tuple->saddr, sum = 0;
	int ret;

	cilium_dbg_lb(ctx, DBG_LB4_REVERSE_NAT, nat->address, nat->port);

	tuple->saddr = nat->address;

#ifndef DISABLE_LOOPBACK_LB
	if (loopback) {
		/* The packet was looped back to the sending endpoint on the
		 * forward service translation. This implies that the original
		 * source address of the packet is the source address of the
		 * current packet. We therefore need to make the current source
		 * address the new destination address.
		 */
		__be32 old_dip = tuple->daddr;

		cilium_dbg_lb(ctx, DBG_LB4_LOOPBACK_SNAT_REV, old_dip, old_sip);

		ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, daddr), &old_sip, 4, 0);
		if (IS_ERR(ret))
			return DROP_WRITE_ERROR;

		sum = csum_diff(&old_dip, 4, &old_sip, 4, 0);

		/* Update the tuple address which is representing the destination address */
		tuple->saddr = old_sip;
	}
#endif

	ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, saddr),
			      &nat->address, 4, 0);
	if (IS_ERR(ret))
		return DROP_WRITE_ERROR;

	sum = csum_diff(&old_sip, 4, &nat->address, 4, sum);
	if (ipv4_csum_update_by_diff(ctx, l3_off, sum) < 0)
		return DROP_CSUM_L3;

	if (has_l4_header) {
		struct csum_offset csum_off = {};

		csum_l4_offset_and_flags(tuple->nexthdr, &csum_off);

		if (nat->port) {
			/* We expect to only handle replies. Thus the extracted CT tuple
			 * will have the packet's source port in .dport.
			 */
			ret = reverse_map_l4_port(ctx, tuple->nexthdr, tuple->dport,
						  nat->port, l4_off, &csum_off);
			if (IS_ERR(ret))
				return ret;
		}

		if (csum_off.offset &&
		    csum_l4_replace(ctx, l4_off, &csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

	return 0;
}

static __always_inline struct lb4_reverse_nat *
lb4_lookup_rev_nat_entry(struct __ctx_buff *ctx __maybe_unused, __u16 index)
{
	cilium_dbg_lb(ctx, DBG_LB4_REVERSE_NAT_LOOKUP, index, 0);

	return map_lookup_elem(&LB4_REVERSE_NAT_MAP, &index);
}

/** Perform IPv4 reverse NAT based on reverse NAT index
 * @arg ctx		packet
 * @arg l3_off		offset to L3
 * @arg l4_off		offset to L4
 * @arg index		reverse NAT index
 * @arg loopback	loopback connection
 * @arg tuple		tuple
 */
static __always_inline int lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int l4_off,
				       __u16 index, bool loopback,
				       struct ipv4_ct_tuple *tuple, bool has_l4_header)
{
	struct lb4_reverse_nat *nat;

	nat = lb4_lookup_rev_nat_entry(ctx, index);
	if (nat == NULL)
		return 0;

	return __lb4_rev_nat(ctx, l3_off, l4_off, tuple, nat,
			     loopback, has_l4_header);
}
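/* Usage sketch (illustrative only): reverse-translating a reply once the
 * conntrack lookup has recovered the service's rev_nat_index, for example
 * after lb4_ctx_restore_state() on a continuation tail call. The helper
 * name example_lb4_reply_rev_nat() is hypothetical.
 */
#if 0
static __always_inline int
example_lb4_reply_rev_nat(struct __ctx_buff *ctx, int l3_off, int l4_off,
			  struct ipv4_ct_tuple *tuple, __u16 rev_nat_index,
			  bool has_l4_header)
{
	/* false: this reply does not belong to a loopback connection. */
	return lb4_rev_nat(ctx, l3_off, l4_off, rev_nat_index, false,
			   tuple, has_l4_header);
}
#endif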

static __always_inline void
lb4_fill_key(struct lb4_key *key, const struct ipv4_ct_tuple *tuple)
{
	/* FIXME: set after adding support for different L4 protocols in LB */
	key->proto = 0;
	key->address = tuple->daddr;
	/* CT tuple has ports in reverse order: */
	key->dport = tuple->sport;
}

/** Extract IPv4 CT tuple from packet
 * @arg ctx		Packet
 * @arg ip4		Pointer to L3 header
 * @arg l3_off		Offset to L3 header
 * @arg l4_off		Offset to L4 header
 * @arg tuple		CT tuple
 *
 * Returns:
 *   - CTX_ACT_OK on successful extraction
 *   - DROP_UNKNOWN_L4 if packet should be ignored (sent to stack)
 *   - Negative error code
 */
static __always_inline int
lb4_extract_tuple(struct __ctx_buff *ctx, struct iphdr *ip4, int l3_off, int *l4_off,
		  struct ipv4_ct_tuple *tuple)
{
	int ret;

	tuple->nexthdr = ip4->protocol;
	tuple->daddr = ip4->daddr;
	tuple->saddr = ip4->saddr;

	*l4_off = l3_off + ipv4_hdrlen(ip4);

	switch (tuple->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		ret = ipv4_load_l4_ports(ctx, ip4, *l4_off, CT_EGRESS,
					 &tuple->dport, NULL);

		if (IS_ERR(ret))
			return ret;
		return 0;
	case IPPROTO_ICMP:
		return DROP_UNSUPP_SERVICE_PROTO;
	default:
		return DROP_UNKNOWN_L4;
	}
}

static __always_inline
bool lb4_src_range_ok(const struct lb4_service *svc __maybe_unused,
		      __u32 saddr __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	struct lb4_src_range_key key;

	if (!lb4_svc_has_src_range_check(svc))
		return true;

	key = (typeof(key)) {
		.lpm_key = { SRC_RANGE_STATIC_PREFIX(key), {} },
		.rev_nat_id = svc->rev_nat_index,
		.addr = saddr,
	};

	if (map_lookup_elem(&LB4_SRC_RANGE_MAP, &key))
		return true;

	return false;
#else
	return true;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline bool
lb4_to_lb6_service(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	return svc->flags2 & SVC_FLAG_NAT_46X64;
#else
	return false;
#endif
}

static __always_inline
struct lb4_service *lb4_lookup_service(struct lb4_key *key,
				       const bool scope_switch)
{
	struct lb4_service *svc;

	key->scope = LB_LOOKUP_SCOPE_EXT;
	key->backend_slot = 0;
	svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
	if (svc) {
		if (!scope_switch || !lb4_svc_is_two_scopes(svc))
			return svc;
		key->scope = LB_LOOKUP_SCOPE_INT;
		svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
	}

	return svc;
}

static __always_inline struct lb4_backend *__lb4_lookup_backend(__u32 backend_id)
{
	return map_lookup_elem(&LB4_BACKEND_MAP, &backend_id);
}

static __always_inline struct lb4_backend *
lb4_lookup_backend(struct __ctx_buff *ctx __maybe_unused, __u32 backend_id)
{
	struct lb4_backend *backend;

	backend = __lb4_lookup_backend(backend_id);
	if (!backend)
		cilium_dbg_lb(ctx, DBG_LB4_LOOKUP_BACKEND_FAIL, backend_id, 0);

	return backend;
}

static __always_inline
struct lb4_service *__lb4_lookup_backend_slot(struct lb4_key *key)
{
	return map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
}

static __always_inline
struct lb4_service *lb4_lookup_backend_slot(struct __ctx_buff *ctx __maybe_unused,
					    struct lb4_key *key, __u16 slot)
{
	struct lb4_service *svc;

	key->backend_slot = slot;
	cilium_dbg_lb(ctx, DBG_LB4_LOOKUP_BACKEND_SLOT, key->backend_slot, key->dport);
	svc = __lb4_lookup_backend_slot(key);
	if (svc)
		return svc;

	cilium_dbg_lb(ctx, DBG_LB4_LOOKUP_BACKEND_SLOT_V2_FAIL, key->backend_slot, key->dport);

	return NULL;
}

/* Backend slot 0 is always reserved for the service frontend. */
#if LB_SELECTION == LB_SELECTION_RANDOM
static __always_inline __u32
lb4_select_backend_id(struct __ctx_buff *ctx,
		      struct lb4_key *key,
		      const struct ipv4_ct_tuple *tuple __maybe_unused,
		      const struct lb4_service *svc)
{
	__u16 slot = (get_prandom_u32() % svc->count) + 1;
	struct lb4_service *be = lb4_lookup_backend_slot(ctx, key, slot);

	return be ? be->backend_id : 0;
}
#elif LB_SELECTION == LB_SELECTION_MAGLEV
static __always_inline __u32
lb4_select_backend_id(struct __ctx_buff *ctx __maybe_unused,
		      struct lb4_key *key __maybe_unused,
		      const struct ipv4_ct_tuple *tuple,
		      const struct lb4_service *svc)
{
	__u32 zero = 0, index = svc->rev_nat_index;
	__u32 *backend_ids;
	void *maglev_lut;

	maglev_lut = map_lookup_elem(&LB4_MAGLEV_MAP_OUTER, &index);
	if (unlikely(!maglev_lut))
		return 0;

	backend_ids = map_lookup_elem(maglev_lut, &zero);
	if (unlikely(!backend_ids))
		return 0;

	index = hash_from_tuple_v4(tuple) % LB_MAGLEV_LUT_SIZE;
	return map_array_get_32(backend_ids, index, (LB_MAGLEV_LUT_SIZE - 1) << 2);
}
#elif LB_SELECTION == LB_SELECTION_FIRST
/* Backend selection for tests that always chooses first slot. */
static __always_inline __u32
lb4_select_backend_id(struct __ctx_buff *ctx,
		      struct lb4_key *key,
		      const struct ipv4_ct_tuple *tuple __maybe_unused,
		      const struct lb4_service *svc)
{
	struct lb4_service *be = lb4_lookup_backend_slot(ctx, key, 1);

	return be ? be->backend_id : 0;
}
#else
# error "Invalid load balancer backend selection algorithm!"
#endif /* LB_SELECTION */
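/* Illustrative note: with LB_SELECTION_RANDOM the slot is drawn from
 * 1..svc->count because slot 0 is the frontend entry itself; a service with
 * count == 3 spreads new connections over backend slots 1, 2 and 3. The
 * helper below is a hypothetical restatement of that computation and
 * assumes count > 0, which the callers in this file already guarantee.
 */
#if 0
static __always_inline __u16
example_random_slot(__u16 count)
{
	return (__u16)(get_prandom_u32() % count) + 1;
}
#endif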

static __always_inline int
lb4_xlate(struct __ctx_buff *ctx, __be32 *new_saddr __maybe_unused,
	  __be32 *old_saddr __maybe_unused, __u8 nexthdr __maybe_unused, int l3_off,
	  int l4_off, struct lb4_key *key,
	  const struct lb4_backend *backend __maybe_unused, bool has_l4_header,
	  const bool skip_l3_xlate)
{
	const __be32 *new_daddr = &backend->address;
	struct csum_offset csum_off = {};
	__be32 sum;
	int ret;

	if (has_l4_header)
		csum_l4_offset_and_flags(nexthdr, &csum_off);

	if (skip_l3_xlate)
		goto l4_xlate;

	ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, daddr),
			      new_daddr, 4, 0);
	if (ret < 0)
		return DROP_WRITE_ERROR;

	sum = csum_diff(&key->address, 4, new_daddr, 4, 0);
#ifndef DISABLE_LOOPBACK_LB
	if (new_saddr && *new_saddr) {
		cilium_dbg_lb(ctx, DBG_LB4_LOOPBACK_SNAT, *old_saddr, *new_saddr);

		ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, saddr),
				      new_saddr, 4, 0);
		if (ret < 0)
			return DROP_WRITE_ERROR;

		sum = csum_diff(old_saddr, 4, new_saddr, 4, sum);
	}
#endif /* DISABLE_LOOPBACK_LB */
	if (ipv4_csum_update_by_diff(ctx, l3_off, sum) < 0)
		return DROP_CSUM_L3;
	if (csum_off.offset) {
		if (csum_l4_replace(ctx, l4_off, &csum_off, 0, sum,
				    BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

l4_xlate:
	return has_l4_header ? lb_l4_xlate(ctx, nexthdr, l4_off, &csum_off,
					   key->dport, backend->port) :
			       CTX_ACT_OK;
}

#ifdef ENABLE_SESSION_AFFINITY
static __always_inline __u32
__lb4_affinity_backend_id(const struct lb4_service *svc, bool netns_cookie,
			  const union lb4_affinity_client_id *id)
{
	struct lb4_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
		.client_id = *id,
	};
	struct lb_affinity_val *val;

	val = map_lookup_elem(&LB4_AFFINITY_MAP, &key);
	if (val != NULL) {
		__u32 now = bpf_mono_now();
		struct lb_affinity_match match = {
			.rev_nat_id = svc->rev_nat_index,
			.backend_id = val->backend_id,
		};

		/* We have seconds granularity for timing values here.
		 * To ensure that session affinity timeout works properly we don't include
		 * the upper bound from the time range.
		 * Session is sticky for range [current, last_used + affinity_timeout)
		 */
		if (READ_ONCE(val->last_used) +
		    bpf_sec_to_mono(svc->affinity_timeout) <= now) {
			map_delete_elem(&LB4_AFFINITY_MAP, &key);
			return 0;
		}

		if (!map_lookup_elem(&LB_AFFINITY_MATCH_MAP, &match)) {
			map_delete_elem(&LB4_AFFINITY_MAP, &key);
			return 0;
		}

		WRITE_ONCE(val->last_used, now);
		return val->backend_id;
	}

	return 0;
}

static __always_inline __u32
lb4_affinity_backend_id_by_addr(const struct lb4_service *svc,
				union lb4_affinity_client_id *id)
{
	return __lb4_affinity_backend_id(svc, false, id);
}

static __always_inline void
__lb4_update_affinity(const struct lb4_service *svc, bool netns_cookie,
		      const union lb4_affinity_client_id *id,
		      __u32 backend_id)
{
	__u32 now = bpf_mono_now();
	struct lb4_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
		.client_id = *id,
	};
	struct lb_affinity_val val = {
		.backend_id = backend_id,
		.last_used = now,
	};

	map_update_elem(&LB4_AFFINITY_MAP, &key, &val, 0);
}

static __always_inline void
lb4_update_affinity_by_addr(const struct lb4_service *svc,
			    union lb4_affinity_client_id *id, __u32 backend_id)
{
	__lb4_update_affinity(svc, false, id, backend_id);
}
#endif /* ENABLE_SESSION_AFFINITY */

static __always_inline __u32
lb4_affinity_backend_id_by_netns(const struct lb4_service *svc __maybe_unused,
				 union lb4_affinity_client_id *id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	return __lb4_affinity_backend_id(svc, true, id);
#else
	return 0;
#endif
}

static __always_inline void
lb4_update_affinity_by_netns(const struct lb4_service *svc __maybe_unused,
			     union lb4_affinity_client_id *id __maybe_unused,
			     __u32 backend_id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	__lb4_update_affinity(svc, true, id, backend_id);
#endif
}

static __always_inline int
lb4_to_lb6(struct __ctx_buff *ctx __maybe_unused,
	   const struct iphdr *ip4 __maybe_unused,
	   int l3_off __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	union v6addr src6, dst6;

	build_v4_in_v6(&src6, ip4->saddr);
	build_v4_in_v6(&dst6, ip4->daddr);

	return ipv4_to_ipv6(ctx, l3_off, &src6, &dst6);
#else
	return DROP_NAT_46X64_DISABLED;
#endif
}

static __always_inline int lb4_local(const void *map, struct __ctx_buff *ctx,
				     bool is_fragment, int l3_off, int l4_off,
				     struct lb4_key *key,
				     struct ipv4_ct_tuple *tuple,
				     const struct lb4_service *svc,
				     struct ct_state *state,
				     bool has_l4_header,
				     const bool skip_l3_xlate,
				     __u32 *cluster_id __maybe_unused,
				     __s8 *ext_err)
{
	__u32 monitor; /* Deliberately ignored; regular CT will determine monitoring. */
	__be32 saddr = tuple->saddr;
	__u8 flags = tuple->flags;
	struct lb4_backend *backend;
	__u32 backend_id = 0;
	__be32 new_saddr = 0;
	int ret;
#ifdef ENABLE_SESSION_AFFINITY
	union lb4_affinity_client_id client_id = {
		.client_ip = saddr,
	};
#endif

	state->rev_nat_index = svc->rev_nat_index;

	ret = ct_lazy_lookup4(map, tuple, ctx, is_fragment, l4_off, has_l4_header,
			      CT_SERVICE, SCOPE_REVERSE, CT_ENTRY_SVC, state, &monitor);
	if (ret < 0)
		goto drop_err;

	switch (ret) {
	case CT_NEW:
		if (unlikely(svc->count == 0))
			goto no_service;

#ifdef ENABLE_SESSION_AFFINITY
		if (lb4_svc_is_affinity(svc)) {
			backend_id = lb4_affinity_backend_id_by_addr(svc, &client_id);
			if (backend_id != 0) {
				backend = lb4_lookup_backend(ctx, backend_id);
				if (backend == NULL)
					backend_id = 0;
			}
		}
#endif
		if (backend_id == 0) {
			/* No CT entry has been found, so select a svc endpoint */
			backend_id = lb4_select_backend_id(ctx, key, tuple, svc);
			backend = lb4_lookup_backend(ctx, backend_id);
			if (backend == NULL)
				goto no_service;
		}

		state->backend_id = backend_id;

		ret = ct_create4(map, NULL, tuple, ctx, CT_SERVICE, state, ext_err);
		/* Fail closed, if the conntrack entry create fails drop
		 * service lookup.
		 */
		if (IS_ERR(ret))
			goto drop_err;

#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		_lb_act_conn_open(state->rev_nat_index, backend->zone);
#endif

		break;
	case CT_REPLY:
		backend_id = state->backend_id;

		/* If the lookup fails it means the user deleted the backend out from
		 * underneath us. To resolve this fall back to hash. If this is a TCP
		 * session we are likely to get a TCP RST.
		 */
		backend = lb4_lookup_backend(ctx, backend_id);
#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		if (state->closing && backend)
			_lb_act_conn_closed(svc->rev_nat_index, backend->zone);
#endif
		if (unlikely(!backend || backend->flags != BE_STATE_ACTIVE)) {
			/* Drain existing connections, but redirect new ones to only
			 * active backends.
			 */
			if (backend && !state->syn)
				break;

			if (unlikely(svc->count == 0))
				goto no_service;

			backend_id = lb4_select_backend_id(ctx, key, tuple, svc);
			backend = lb4_lookup_backend(ctx, backend_id);
			if (!backend)
				goto no_service;

			state->rev_nat_index = svc->rev_nat_index;
			ct_update_svc_entry(map, tuple, backend_id, svc->rev_nat_index);
		}

		break;
	default:
		ret = DROP_UNKNOWN_CT;
		goto drop_err;
	}

#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
	*cluster_id = backend->cluster_id;
#endif

	/* Restore flags so that SERVICE flag is only used when the service
	 * lookup happens and future lookups use EGRESS or INGRESS.
	 */
	tuple->flags = flags;
#ifdef ENABLE_SESSION_AFFINITY
	if (lb4_svc_is_affinity(svc))
		lb4_update_affinity_by_addr(svc, &client_id, backend_id);
#endif
#ifndef DISABLE_LOOPBACK_LB
	/* Special loopback case: The origin endpoint has transmitted to a
	 * service which is being translated back to the source. This would
	 * result in a packet with identical source and destination address.
	 * Linux considers such packets as martian source and will drop unless
	 * received on a loopback device. Perform NAT on the source address
	 * to make it appear from an outside address.
	 */
	if (saddr == backend->address) {
		new_saddr = IPV4_LOOPBACK;
		state->loopback = 1;
	}

	if (!state->loopback)
#endif
		tuple->daddr = backend->address;

	if (lb_skip_l4_dnat())
		return CTX_ACT_OK;

	/* CT tuple contains ports in reverse order: */
	if (likely(backend->port))
		tuple->sport = backend->port;

	return lb4_xlate(ctx, &new_saddr, &saddr,
			 tuple->nexthdr, l3_off, l4_off, key,
			 backend, has_l4_header, skip_l3_xlate);
no_service:
	ret = DROP_NO_SERVICE;
drop_err:
	tuple->flags = flags;
	return ret;
}
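/* Usage sketch (illustrative only): the canonical IPv4 forward path through
 * this file, from tuple extraction to lb4_local(). Names such as
 * example_lb4_forward() and the surrounding error handling are assumptions;
 * real callers also handle fragments, L7 redirection and NAT 46x64 cases.
 */
#if 0
static __always_inline int
example_lb4_forward(const void *ct_map, struct __ctx_buff *ctx,
		    struct iphdr *ip4, __s8 *ext_err)
{
	struct ipv4_ct_tuple tuple = {};
	struct ct_state ct_state = {};
	struct lb4_service *svc;
	struct lb4_key key = {};
	__u32 cluster_id = 0;
	int l4_off, ret;

	ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple);
	if (ret < 0)
		return ret;

	lb4_fill_key(&key, &tuple);

	svc = lb4_lookup_service(&key, false);
	if (!svc)
		return CTX_ACT_OK;	/* not a service VIP:port */

	if (!lb4_src_range_ok(svc, ip4->saddr))
		return DROP_NO_SERVICE;	/* real callers use a dedicated drop reason */

	return lb4_local(ct_map, ctx, false /* is_fragment */, ETH_HLEN, l4_off,
			 &key, &tuple, svc, &ct_state, true /* has_l4_header */,
			 false /* skip_l3_xlate */, &cluster_id, ext_err);
}
#endif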

/* lb4_ctx_store_state() stores per packet load balancing state to be picked
 * up on the continuation tail call.
 * Note that the IP headers are already xlated and the tuple is re-initialized
 * from the xlated headers before restoring state.
 * NOTE: if lb_skip_l4_dnat() this is not the case as xlate is skipped. We
 * lose the updated tuple daddr in that case.
 */
static __always_inline void lb4_ctx_store_state(struct __ctx_buff *ctx,
						const struct ct_state *state,
						__u16 proxy_port, __u32 cluster_id)
{
	ctx_store_meta(ctx, CB_PROXY_MAGIC, (__u32)proxy_port << 16);
	ctx_store_meta(ctx, CB_CT_STATE, (__u32)state->rev_nat_index << 16 |
#ifndef DISABLE_LOOPBACK_LB
		       state->loopback);
#else
		       0);
#endif
	ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id);
}

/* lb4_ctx_restore_state() restores per packet load balancing state from the
 * previous tail call.
 * tuple->flags does not need to be restored, as it will be reinitialized from
 * the packet.
 */
static __always_inline void
lb4_ctx_restore_state(struct __ctx_buff *ctx, struct ct_state *state,
		      __u16 *proxy_port, __u32 *cluster_id __maybe_unused)
{
	__u32 meta = ctx_load_and_clear_meta(ctx, CB_CT_STATE);
#ifndef DISABLE_LOOPBACK_LB
	if (meta & 1)
		state->loopback = 1;
#endif
	state->rev_nat_index = meta >> 16;

	*proxy_port = ctx_load_and_clear_meta(ctx, CB_PROXY_MAGIC) >> 16;

#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
	*cluster_id = ctx_load_and_clear_meta(ctx, CB_CLUSTER_ID_EGRESS);
#endif
}

/* Because we use tail calls and this file is included in bpf_sock.h */
#ifndef SKIP_CALLS_MAP
#ifdef SERVICE_NO_BACKEND_RESPONSE

#define ICMP_PACKET_MAX_SAMPLE_SIZE 64

static __always_inline
__wsum icmp_wsum_accumulate(void *data_start, void *data_end, int sample_len);

static __always_inline
int __tail_no_service_ipv4(struct __ctx_buff *ctx)
{
	void *data, *data_end;
	struct ethhdr *ethhdr;
	struct iphdr *ip4;
	struct icmphdr *icmphdr;
	union macaddr smac = {};
	union macaddr dmac = {};
	__be32 saddr;
	__be32 daddr;
	__u8 tos;
	__wsum csum;
	int sample_len;
	int ret;
	const int inner_offset = sizeof(struct ethhdr) + sizeof(struct iphdr) +
				 sizeof(struct icmphdr);

	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	/* copy the incoming src and dest IPs and mac addresses to the stack.
	 * the pointers will not be valid after adding headroom.
	 */

	if (eth_load_saddr(ctx, smac.addr, 0) < 0)
		return DROP_INVALID;

	if (eth_load_daddr(ctx, dmac.addr, 0) < 0)
		return DROP_INVALID;

	saddr = ip4->saddr;
	daddr = ip4->daddr;
	tos = ip4->tos;

	/* Resize to ethernet header + 64 bytes or less */
	sample_len = ctx_full_len(ctx);
	if (sample_len > ICMP_PACKET_MAX_SAMPLE_SIZE)
		sample_len = ICMP_PACKET_MAX_SAMPLE_SIZE;
	ctx_adjust_troom(ctx, sample_len + sizeof(struct ethhdr) - ctx_full_len(ctx));

	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Calculate the checksum of the ICMP sample */
	csum = icmp_wsum_accumulate(data + sizeof(struct ethhdr), data_end, sample_len);

	/* We need to insert a IPv4 and ICMP header before the original packet.
	 * Make that room.
	 */

#if __ctx_is == __ctx_xdp
	ret = xdp_adjust_head(ctx, 0 - (int)(sizeof(struct iphdr) + sizeof(struct icmphdr)));
#else
	ret = skb_adjust_room(ctx, sizeof(struct iphdr) + sizeof(struct icmphdr),
			      BPF_ADJ_ROOM_MAC, 0);
#endif

	if (ret < 0)
		return DROP_INVALID;

	/* changing size invalidates pointers, so we need to re-fetch them. */
	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Bound check all 3 headers at once. */
	if (data + inner_offset > data_end)
		return DROP_INVALID;

	/* Write reversed eth header, ready for egress */
	ethhdr = data;
	memcpy(ethhdr->h_dest, smac.addr, sizeof(smac.addr));
	memcpy(ethhdr->h_source, dmac.addr, sizeof(dmac.addr));
	ethhdr->h_proto = bpf_htons(ETH_P_IP);

	/* Write reversed ip header, ready for egress */
	ip4 = data + sizeof(struct ethhdr);
	ip4->version = 4;
	ip4->ihl = sizeof(struct iphdr) >> 2;
	ip4->tos = tos;
	ip4->tot_len = bpf_htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
				 (__u16)sample_len);
	ip4->id = 0;
	ip4->frag_off = 0;
	ip4->ttl = IPDEFTTL;
	ip4->protocol = IPPROTO_ICMP;
	ip4->check = 0;
	ip4->daddr = saddr;
	ip4->saddr = daddr;
	ip4->check = csum_fold(csum_diff(ip4, 0, ip4, sizeof(struct iphdr), 0));

	/* Write reversed icmp header */
	icmphdr = data + sizeof(struct ethhdr) + sizeof(struct iphdr);
	icmphdr->type = ICMP_DEST_UNREACH;
	icmphdr->code = ICMP_PORT_UNREACH;
	icmphdr->checksum = 0;
	icmphdr->un.gateway = 0;

	/* Add ICMP header checksum to sum of its body */
	csum += csum_diff(icmphdr, 0, icmphdr, sizeof(struct icmphdr), 0);
	icmphdr->checksum = csum_fold(csum);

	/* Redirect ICMP to the interface we received it on. */
	cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY,
			   ctx_get_ifindex(ctx));
	return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NO_SERVICE)
int tail_no_service_ipv4(struct __ctx_buff *ctx)
{
	__u32 src_sec_identity = ctx_load_meta(ctx, CB_SRC_LABEL);
	int ret;

	ret = __tail_no_service_ipv4(ctx);
	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, src_sec_identity, ret,
					      CTX_ACT_DROP, METRIC_INGRESS);

	return ret;
}
#endif /* SERVICE_NO_BACKEND_RESPONSE */
#endif /* SKIP_CALLS_MAP */

#endif /* ENABLE_IPV4 */

#ifdef ENABLE_IPV6

/* Because we use tail calls and this file is included in bpf_sock.h */
#ifndef SKIP_CALLS_MAP
#ifdef SERVICE_NO_BACKEND_RESPONSE

#define ICMPV6_PACKET_MAX_SAMPLE_SIZE 1280 - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)

static __always_inline
__wsum icmp_wsum_accumulate(void *data_start, void *data_end, int sample_len);

/* The IPv6 pseudo-header */
struct ipv6_pseudo_header_t {
	union {
		struct header {
			struct in6_addr src_ip;
			struct in6_addr dst_ip;
			__be32 top_level_length;
			__u8 zero[3];
			__u8 next_header;
		} __packed fields;
		__u16 words[20];
	};
};

static __always_inline
int __tail_no_service_ipv6(struct __ctx_buff *ctx)
{
	void *data, *data_end;
	struct ethhdr *ethhdr;
	struct ipv6hdr *ip6;
	struct icmp6hdr *icmphdr;
	struct ipv6_pseudo_header_t pseudo_header;
	union macaddr smac = {};
	union macaddr dmac = {};
	struct in6_addr saddr;
	struct in6_addr daddr;
	struct ratelimit_key rkey = {};
	/* Rate limit to 100 ICMPv6 replies per second, burstable to 1000 responses/s */
	struct ratelimit_settings settings = {
		.bucket_size = 1000,
		.tokens_per_topup = 100,
		.topup_interval_ns = NSEC_PER_SEC,
	};
	__wsum csum;
	__u64 sample_len;
	int i;
	int ret;
	const int inner_offset = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) +
				 sizeof(struct icmp6hdr);

	rkey.netdev_idx = ctx_get_ifindex(ctx);
	if (!ratelimit_check_and_take(&rkey, &settings))
		return DROP_RATE_LIMITED;

	if (!revalidate_data(ctx, &data, &data_end, &ip6))
		return DROP_INVALID;

	/* copy the incoming src and dest IPs and mac addresses to the stack.
	 * the pointers will not be valid after adding headroom.
	 */

	if (eth_load_saddr(ctx, smac.addr, 0) < 0)
		return DROP_INVALID;

	if (eth_load_daddr(ctx, dmac.addr, 0) < 0)
		return DROP_INVALID;

	memcpy(&saddr, &ip6->saddr, sizeof(struct in6_addr));
	memcpy(&daddr, &ip6->daddr, sizeof(struct in6_addr));

	/* Resize to min MTU - IPv6 hdr + ICMPv6 hdr */
	sample_len = ctx_full_len(ctx);
	if (sample_len > (__u64)ICMPV6_PACKET_MAX_SAMPLE_SIZE)
		sample_len = ICMPV6_PACKET_MAX_SAMPLE_SIZE;
	ctx_adjust_troom(ctx, sample_len + sizeof(struct ethhdr) - ctx_full_len(ctx));

	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Calculate the unfolded checksum of the ICMPv6 sample */
	csum = icmp_wsum_accumulate(data + sizeof(struct ethhdr), data_end, sample_len);

	/* We need to insert a IPv6 and ICMPv6 header before the original packet.
	 * Make that room.
	 */

#if __ctx_is == __ctx_xdp
	ret = xdp_adjust_head(ctx, 0 - (int)(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr)));
#else
	ret = skb_adjust_room(ctx, sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr),
			      BPF_ADJ_ROOM_MAC, 0);
#endif

	if (ret < 0)
		return DROP_INVALID;

	/* changing size invalidates pointers, so we need to re-fetch them. */
	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Bound check all 3 headers at once. */
	if (data + inner_offset > data_end)
		return DROP_INVALID;

	/* Write reversed eth header, ready for egress */
	ethhdr = data;
	memcpy(ethhdr->h_dest, smac.addr, sizeof(smac.addr));
	memcpy(ethhdr->h_source, dmac.addr, sizeof(dmac.addr));
	ethhdr->h_proto = bpf_htons(ETH_P_IPV6);

	/* Write reversed ip header, ready for egress */
	ip6 = data + sizeof(struct ethhdr);
	ip6->version = 6;
	ip6->priority = 0;
	ip6->flow_lbl[0] = 0;
	ip6->flow_lbl[1] = 0;
	ip6->flow_lbl[2] = 0;
	ip6->payload_len = bpf_htons(sizeof(struct icmp6hdr) + (__u16)sample_len);
	ip6->nexthdr = IPPROTO_ICMPV6;
	ip6->hop_limit = IPDEFTTL;
	memcpy(&ip6->daddr, &saddr, sizeof(struct in6_addr));
	memcpy(&ip6->saddr, &daddr, sizeof(struct in6_addr));

	/* Write reversed icmp header */
	icmphdr = data + sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
	icmphdr->icmp6_type = ICMPV6_DEST_UNREACH;
	icmphdr->icmp6_code = ICMPV6_PORT_UNREACH;
	icmphdr->icmp6_cksum = 0;
	icmphdr->icmp6_dataun.un_data32[0] = 0;

	/* Add the ICMP header to the checksum (only type and code are non-zero) */
	csum += ((__u16)icmphdr->icmp6_code) << 8 | (__u16)icmphdr->icmp6_type;

	/* Fill pseudo header */
	memcpy(&pseudo_header.fields.src_ip, &ip6->saddr, sizeof(struct in6_addr));
	memcpy(&pseudo_header.fields.dst_ip, &ip6->daddr, sizeof(struct in6_addr));
	pseudo_header.fields.top_level_length = bpf_htonl(sizeof(struct icmp6hdr) +
							  (__u32)sample_len);
	__bpf_memzero(pseudo_header.fields.zero, sizeof(pseudo_header.fields.zero));
	pseudo_header.fields.next_header = IPPROTO_ICMPV6;

#pragma unroll
	for (i = 0; i < (int)(sizeof(pseudo_header.words) / sizeof(__u16)); i++)
		csum += pseudo_header.words[i];

	icmphdr->icmp6_cksum = csum_fold(csum);

	/* Redirect ICMP to the interface we received it on. */
	cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY,
			   ctx_get_ifindex(ctx));
	return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NO_SERVICE)
int tail_no_service_ipv6(struct __ctx_buff *ctx)
{
	__u32 src_sec_identity = ctx_load_meta(ctx, CB_SRC_LABEL);
	int ret;

	ret = __tail_no_service_ipv6(ctx);
	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, src_sec_identity, ret,
					      CTX_ACT_DROP, METRIC_INGRESS);

	return ret;
}
#endif /* SERVICE_NO_BACKEND_RESPONSE */
#endif /* SKIP_CALLS_MAP */
#endif /* ENABLE_IPV6 */

#ifdef SERVICE_NO_BACKEND_RESPONSE

static __always_inline
__wsum icmp_wsum_accumulate(void *data_start, void *data_end, int sample_len)
{
	/* Unrolled loop to calculate the checksum of the ICMP sample
	 * Done manually because the compiler refuses with #pragma unroll
	 */
	__wsum wsum = 0;

#define body(i) if ((i) > sample_len) \
			return wsum; \
		if (data_start + (i) + sizeof(__u16) > data_end) { \
			if (data_start + (i) + sizeof(__u8) <= data_end)\
				wsum += *(__u8 *)(data_start + (i)); \
			return wsum; \
		} \
		wsum += *(__u16 *)(data_start + (i));

#define body4(i) body(i)\
		body(i + 2) \
		body(i + 4) \
		body(i + 6)

#define body16(i) body4(i)\
		body4(i + 8) \
		body4(i + 16) \
		body4(i + 24)

#define body128(i) body16(i)\
		body16(i + 32) \
		body16(i + 64) \
		body16(i + 96)

	body128(0)
	body128(256)
	body128(512)
	body128(768)
	body128(1024)

	return wsum;
}

#endif /* SERVICE_NO_BACKEND_RESPONSE */

/* sock_local_cookie retrieves the socket cookie for the
 * passed socket structure.
 */
static __always_inline __maybe_unused
__sock_cookie sock_local_cookie(struct bpf_sock_addr *ctx)
{
#ifdef HAVE_SOCKET_COOKIE
	/* prandom() breaks down on UDP, hence preference is on
	 * socket cookie as built-in selector. On older kernels,
	 * get_socket_cookie() provides a unique per netns cookie
	 * for the life-time of the socket. For newer kernels this
	 * is fixed to be a unique system _global_ cookie. Older
	 * kernels could have a cookie collision when two pods with
	 * different netns talk to same service backend, but that
	 * is fine since we always reverse translate to the same
	 * service IP/port pair. The only case that could happen
	 * for older kernels is that we have a cookie collision
	 * where one pod talks to the service IP/port and the
	 * other pod talks to that same specific backend IP/port
	 * directly _w/o_ going over service IP/port. Then the
	 * reverse sock addr is translated to the service IP/port.
	 * With a global socket cookie this collision cannot take
	 * place. There, only the even more unlikely case could
	 * happen where the same UDP socket talks first to the
	 * service and then to the same selected backend IP/port
	 * directly which can be considered negligible.
	 */
	return get_socket_cookie(ctx);
#else
	return ctx->protocol == IPPROTO_TCP ? get_prandom_u32() : 0;
#endif
}
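/* Illustrative sketch only (assumes ENABLE_HEALTH_CHECK): the socket cookie
 * returned by sock_local_cookie() matches the key type of the
 * LB{4,6}_HEALTH_MAP definitions at the top of this file, so a health-check
 * datapath could look up its per-socket state as shown below. The helper
 * name example_health_lookup() is hypothetical.
 */
#if 0
static __always_inline struct lb4_health *
example_health_lookup(struct bpf_sock_addr *ctx)
{
	__sock_cookie key = sock_local_cookie(ctx);

	return map_lookup_elem(&LB4_HEALTH_MAP, &key);
}
#endif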