github.com/cilium/cilium@v1.16.2/bpf/lib/common.h (about)

     1  /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
     2  /* Copyright Authors of Cilium */
     3  
     4  #pragma once
     5  
     6  #include <bpf/ctx/ctx.h>
     7  #include <bpf/api.h>
     8  
     9  #include <linux/if_ether.h>
    10  #include <linux/ipv6.h>
    11  #include <linux/in.h>
    12  #include <linux/socket.h>
    13  
    14  #include "endian.h"
    15  #include "eth.h"
    16  #include "mono.h"
    17  #include "config.h"
    18  #include "tunnel.h"
    19  
    20  #include "source_info.h"
    21  
    /* Fallback values, used only when nothing has defined these names yet. */
    #if !defined(AF_INET)
    # define AF_INET 2
    #endif

    #if !defined(AF_INET6)
    # define AF_INET6 10
    #endif

    #if !defined(IP_DF)
    # define IP_DF 0x4000
    #endif

    #if !defined(EVENT_SOURCE)
    # define EVENT_SOURCE 0
    #endif

    /* Without a fine-grained per-device THIS_MTU, fall back to the
     * generically detected MTU.
     */
    #if !defined(THIS_MTU)
    # define THIS_MTU MTU
    #endif

    /* CONDITIONAL_PREALLOC is 0 when PREALLOCATE_MAPS is defined and
     * BPF_F_NO_PREALLOC otherwise.
     */
    #ifndef PREALLOCATE_MAPS
    # define CONDITIONAL_PREALLOC BPF_F_NO_PREALLOC
    #else
    # define CONDITIONAL_PREALLOC 0
    #endif

    #ifdef ENABLE_EGRESS_GATEWAY
    # define ENABLE_EGRESS_GATEWAY_COMMON
    #endif

    #if defined(ENCAP_IFINDEX) || defined(ENABLE_EGRESS_GATEWAY_COMMON) || \
        (defined(ENABLE_DSR) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE)
    # define HAVE_ENCAP 1

    /* NOT_VTEP_DST is what callers hand to an encapsulation function when the
     * tunnel destination is not a VTEP.
     */
    # define NOT_VTEP_DST 0
    #endif
    64  
    /* XFER_PKT_* flag bits that get transferred from XDP to SKB. */
    enum {
    	XFER_PKT_NO_SVC    = 0x1, /* Skip upper service handling. */
    	XFER_UNUSED        = 0x2,
    	XFER_PKT_SNAT_DONE = 0x4, /* SNAT is done */
    };

    /* Selector for ctx_get_xfer(), to be used after XDP called ctx_move_xfer(). */
    enum {
    	XFER_FLAGS = 0, /* XFER_PKT_* */
    };
    76  
    /* FIB errors from BPF neighbor map. */
    #define BPF_FIB_MAP_NO_NEIGH 100

    /* CILIUM_CALL_* identifiers. Several names share one value on purpose
     * (the *_FROM_NETDEV / *_FROM_OVERLAY / *_TO_HOST_POLICY_ONLY aliases).
     * CILIUM_CALL_SIZE is one more than the highest identifier.
     */
    #define CILIUM_CALL_DROP_NOTIFY                  1
    #define CILIUM_CALL_ERROR_NOTIFY                 2
    /*
     * Value 3 is a gap in the numbering sequence, created by #24921.
     * It can be reused for a new macro in the future, but caution is needed
     * when backporting changes as it may conflict with older versions of the
     * code.
     */
    #define CILIUM_CALL_HANDLE_ICMP6_NS              4
    #define CILIUM_CALL_SEND_ICMP6_TIME_EXCEEDED     5
    #define CILIUM_CALL_ARP                          6
    #define CILIUM_CALL_IPV4_FROM_LXC                7
    #define CILIUM_CALL_IPV4_FROM_NETDEV             CILIUM_CALL_IPV4_FROM_LXC
    #define CILIUM_CALL_IPV4_FROM_OVERLAY            CILIUM_CALL_IPV4_FROM_LXC
    #define CILIUM_CALL_IPV46_RFC8215                8
    #define CILIUM_CALL_IPV64_RFC8215                9
    #define CILIUM_CALL_IPV6_FROM_LXC                10
    #define CILIUM_CALL_IPV6_FROM_NETDEV             CILIUM_CALL_IPV6_FROM_LXC
    #define CILIUM_CALL_IPV6_FROM_OVERLAY            CILIUM_CALL_IPV6_FROM_LXC
    #define CILIUM_CALL_IPV4_TO_LXC_POLICY_ONLY      11
    #define CILIUM_CALL_IPV4_TO_HOST_POLICY_ONLY     CILIUM_CALL_IPV4_TO_LXC_POLICY_ONLY
    #define CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY      12
    #define CILIUM_CALL_IPV6_TO_HOST_POLICY_ONLY     CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY
    #define CILIUM_CALL_IPV4_TO_ENDPOINT             13
    #define CILIUM_CALL_IPV6_TO_ENDPOINT             14
    #define CILIUM_CALL_IPV4_NODEPORT_NAT_EGRESS     15
    #define CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS     16
    #define CILIUM_CALL_IPV4_NODEPORT_REVNAT         17
    #define CILIUM_CALL_IPV6_NODEPORT_REVNAT         18
    #define CILIUM_CALL_IPV4_NODEPORT_NAT_FWD        19
    #define CILIUM_CALL_IPV4_NODEPORT_DSR            20
    #define CILIUM_CALL_IPV6_NODEPORT_DSR            21
    #define CILIUM_CALL_IPV4_FROM_HOST               22
    #define CILIUM_CALL_IPV6_FROM_HOST               23
    #define CILIUM_CALL_IPV6_NODEPORT_NAT_FWD        24
    #define CILIUM_CALL_IPV4_FROM_LXC_CONT           25
    #define CILIUM_CALL_IPV6_FROM_LXC_CONT           26
    #define CILIUM_CALL_IPV4_CT_INGRESS              27
    #define CILIUM_CALL_IPV4_CT_INGRESS_POLICY_ONLY  28
    #define CILIUM_CALL_IPV4_CT_EGRESS               29
    #define CILIUM_CALL_IPV6_CT_INGRESS              30
    #define CILIUM_CALL_IPV6_CT_INGRESS_POLICY_ONLY  31
    #define CILIUM_CALL_IPV6_CT_EGRESS               32
    #define CILIUM_CALL_SRV6_ENCAP                   33
    #define CILIUM_CALL_SRV6_DECAP                   34
    /* Unused: CILIUM_CALL_SRV6_REPLY                35 */
    #define CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS    36
    #define CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS    37
    #define CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD       38
    #define CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD       39
    /* Unused: CILIUM_CALL_IPV4_NODEPORT_DSR_INGRESS 40
     * Unused: CILIUM_CALL_IPV6_NODEPORT_DSR_INGRESS 41
     */
    #define CILIUM_CALL_IPV4_INTER_CLUSTER_REVSNAT   42
    #define CILIUM_CALL_IPV4_CONT_FROM_HOST          43
    #define CILIUM_CALL_IPV4_CONT_FROM_NETDEV        44
    #define CILIUM_CALL_IPV6_CONT_FROM_HOST          45
    #define CILIUM_CALL_IPV6_CONT_FROM_NETDEV        46
    #define CILIUM_CALL_IPV4_NO_SERVICE              47
    #define CILIUM_CALL_IPV6_NO_SERVICE              48
    #define CILIUM_CALL_MULTICAST_EP_DELIVERY        49
    #define CILIUM_CALL_SIZE                         50
   141  
   /* 64-bit integer type; struct endpoint_info uses it for its MAC fields. */
   typedef __u64 mac_t;

   /* 16 bytes of address storage with three overlapping views:
    * four 32-bit words (p1..p4), two 64-bit words (d1, d2), or raw bytes.
    */
   union v6addr {
   	struct {
   		__u32 p1, p2, p3, p4;
   	};
   	struct {
   		__u64 d1, d2;
   	};
   	__u8 addr[16];
   } __packed;
   157  
   158  static __always_inline bool validate_ethertype_l2_off(struct __ctx_buff *ctx,
   159  						      int l2_off, __u16 *proto)
   160  {
   161  	const __u64 tot_len = l2_off + ETH_HLEN;
   162  	void *data_end = ctx_data_end(ctx);
   163  	void *data = ctx_data(ctx);
   164  	struct ethhdr *eth;
   165  
   166  	if (ETH_HLEN == 0) {
   167  		/* The packet is received on L2-less device. Determine L3
   168  		 * protocol from skb->protocol.
   169  		 */
   170  		*proto = ctx_get_protocol(ctx);
   171  		return true;
   172  	}
   173  
   174  	if (data + tot_len > data_end)
   175  		return false;
   176  
   177  	eth = data + l2_off;
   178  
   179  	*proto = eth->h_proto;
   180  
   181  	return eth_is_supported_ethertype(*proto);
   182  }
   183  
   /* Same as validate_ethertype_l2_off() with the Ethernet header at offset 0. */
   static __always_inline bool validate_ethertype(struct __ctx_buff *ctx,
   					       __u16 *proto)
   {
   	const int l2_off = 0;

   	return validate_ethertype_l2_off(ctx, l2_off, proto);
   }
   189  
   190  static __always_inline __maybe_unused bool
   191  ____revalidate_data_pull(struct __ctx_buff *ctx, void **data_, void **data_end_,
   192  			 void **l3, const __u32 l3_len, const bool pull,
   193  			 __u32 l3_off)
   194  {
   195  	const __u64 tot_len = l3_off + l3_len;
   196  	void *data_end;
   197  	void *data;
   198  
   199  	/* Verifier workaround, do this unconditionally: invalid size of register spill. */
   200  	if (pull)
   201  		ctx_pull_data(ctx, tot_len);
   202  	data_end = ctx_data_end(ctx);
   203  	data = ctx_data(ctx);
   204  	if (data + tot_len > data_end)
   205  		return false;
   206  
   207  	/* Verifier workaround: pointer arithmetic on pkt_end prohibited. */
   208  	*data_ = data;
   209  	*data_end_ = data_end;
   210  
   211  	*l3 = data + l3_off;
   212  	return true;
   213  }
   214  
   215  static __always_inline __maybe_unused bool
   216  __revalidate_data_pull(struct __ctx_buff *ctx, void **data, void **data_end,
   217  		       void **l3, const __u32 l3_off, const __u32 l3_len,
   218  		       const bool pull)
   219  {
   220  	return ____revalidate_data_pull(ctx, data, data_end, l3, l3_len, pull,
   221  					l3_off);
   222  }
   223  
   /* Map an identity to the value used as tunnel ID.
    *
    * With both ENABLE_IPV4 and ENABLE_IPV6 defined, WORLD_IPV4_ID and
    * WORLD_IPV6_ID are both folded into WORLD_ID. Every other identity, and
    * every identity in any other build configuration, is returned unchanged.
    */
   static __always_inline __u32 get_tunnel_id(__u32 identity)
   {
   #if defined ENABLE_IPV4 && defined ENABLE_IPV6
   	if (identity == WORLD_IPV4_ID)
   		return WORLD_ID;
   	if (identity == WORLD_IPV6_ID)
   		return WORLD_ID;
   #endif
   	return identity;
   }
   232  
   233  static __always_inline __u32 get_id_from_tunnel_id(__u32 tunnel_id, __u16 proto  __maybe_unused)
   234  {
   235  #if defined ENABLE_IPV4 && defined ENABLE_IPV6
   236  	if (tunnel_id == WORLD_ID) {
   237  		switch (proto) {
   238  		case bpf_htons(ETH_P_IP):
   239  			return WORLD_IPV4_ID;
   240  		case bpf_htons(ETH_P_IPV6):
   241  			return WORLD_IPV6_ID;
   242  		}
   243  	}
   244  #endif
   245  	return tunnel_id;
   246  }
   247  
   /* revalidate_data_pull() initializes the provided pointers from the ctx and
    * ensures that the data is pulled in for access. Should be used the first
    * time that the ctx data is accessed, subsequent calls can be made to
    * revalidate_data() which is cheaper.
    * Returns true if 'ctx' is long enough for an IP header of the provided type,
    * false otherwise.
    *
    * 'ip' must be a pointer to the header pointer (e.g. &ip4): it is written
    * through and sizeof(**ip) gives the required header length. The parameter
    * is parenthesized so that any expression of that type can be passed.
    */
   #define revalidate_data_pull(ctx, data, data_end, ip)			\
   	__revalidate_data_pull(ctx, data, data_end, (void **)(ip), ETH_HLEN, sizeof(**(ip)), true)

   /* Like revalidate_data() but with the L3 header at 'l3_off' instead of
    * ETH_HLEN. Does not pull data.
    */
   #define revalidate_data_l3_off(ctx, data, data_end, ip, l3_off)		\
   	__revalidate_data_pull(ctx, data, data_end, (void **)(ip), l3_off, sizeof(**(ip)), false)

   /* revalidate_data() initializes the provided pointers from the ctx.
    * Returns true if 'ctx' is long enough for an IP header of the provided type,
    * false otherwise.
    */
   #define revalidate_data(ctx, data, data_end, ip)			\
   	revalidate_data_l3_off(ctx, data, data_end, ip, ETH_HLEN)
   267  
   /* Macros for working with L3 cilium defined IPV6 addresses.
    *
    * BPF_V6(dst, ...) passes its trailing arguments to fetch_ipv6() and stores
    * the two resulting values into dst.d1 and dst.d2. BPF_V6_1 is an extra
    * expansion step so that the fetch_ipv6() result is expanded before
    * BPF_V6_2 splits it into a1 and a2.
    *
    * dst must be an lvalue with d1/d2 members (e.g. union v6addr). All
    * parameters are parenthesized so that expressions such as *ptr are
    * accepted for dst.
    */
   #define BPF_V6(dst, ...)	BPF_V6_1(dst, fetch_ipv6(__VA_ARGS__))
   #define BPF_V6_1(dst, ...)	BPF_V6_2(dst, __VA_ARGS__)
   #define BPF_V6_2(dst, a1, a2)		\
   	({					\
   		(dst).d1 = (a1);		\
   		(dst).d2 = (a2);		\
   	})
   276  
   277  #define ENDPOINT_KEY_IPV4 1
   278  #define ENDPOINT_KEY_IPV6 2
   279  
   280  /* Structure representing an IPv4 or IPv6 address, being used as the key
   281   * for the endpoints map.
   282   */
   283  struct endpoint_key {
   284  	union {
   285  		struct {
   286  			__u32		ip4;
   287  			__u32		pad1;
   288  			__u32		pad2;
   289  			__u32		pad3;
   290  		};
   291  		union v6addr	ip6;
   292  	};
   293  	__u8 family;
   294  	__u8 key;
   295  	__u16 cluster_id;
   296  } __packed;
   297  
   298  struct tunnel_key {
   299  	union {
   300  		struct {
   301  			__u32		ip4;
   302  			__u32		pad1;
   303  			__u32		pad2;
   304  			__u32		pad3;
   305  		};
   306  		union v6addr	ip6;
   307  	};
   308  	__u8 family;
   309  	__u8 pad;
   310  	__u16 cluster_id;
   311  } __packed;
   312  
   313  struct tunnel_value {
   314  	union {
   315  		struct {
   316  			__u32		ip4;
   317  			__u32		pad1;
   318  			__u32		pad2;
   319  			__u32		pad3;
   320  		};
   321  		union v6addr	ip6;
   322  	};
   323  	__u8 family;
   324  	__u8 key;
   325  	__u16 pad;
   326  } __packed;
   327  
   328  #define ENDPOINT_F_HOST		1 /* Special endpoint representing local host */
   329  
   330  /* Value of endpoint map */
   331  struct endpoint_info {
   332  	__u32		ifindex;
   333  	__u16		unused; /* used to be sec_label, no longer used */
   334  	__u16		lxc_id;
   335  	__u32		flags;
   336  	mac_t		mac;
   337  	mac_t		node_mac;
   338  	__u32		sec_id;
   339  	__u32		pad[3];
   340  };
   341  
   /* Wrapper around a single 64-bit id. */
   struct edt_id {
   	__u64 id;
   };

   /* Three 64-bit fields (bps, t_last, t_horizon_drop) followed by four
    * 64-bit padding words.
    */
   struct edt_info {
   	__u64 bps;
   	__u64 t_last;
   	__u64 t_horizon_drop;
   	__u64 pad[4];
   };
   352  
   /* Per remote endpoint data: security identity, tunnel endpoint, a key
    * byte and a one-bit flag_skip_tunnel. The remaining bits are padding.
    */
   struct remote_endpoint_info {
   	__u32 sec_identity;
   	__u32 tunnel_endpoint;
   	__u16 pad;
   	__u8  key;
   	__u8  flag_skip_tunnel:1;
   	__u8  pad2:7;
   };
   361  
   /*
    * Longest-prefix match map lookup only matches the number of bits from the
    * beginning of the key stored in the map indicated by the 'lpm_key' field in
    * the same stored map key, not including the 'lpm_key' field itself. Note that
    * the 'lpm_key' value passed in the lookup function argument needs to be a
    * "full prefix" (POLICY_FULL_PREFIX defined below).
    *
    * 'sec_label' has to be wildcarded independently of 'protocol' and 'dport',
    * so that is done explicitly with a separate lookup where 'sec_label' is
    * zero. 'protocol' and 'dport' can rely on the longest-prefix match because
    * they sit at the end of the key in this specific order, which allows
    * exactly the wildcard patterns that are needed:
    *  - 'protocol' is either fully wildcarded or not wildcarded at all, and it
    *    can only be wildcarded if 'dport' is also fully wildcarded;
    *  - 'dport' can be partially wildcarded, but only when 'protocol' is fully
    *    specified.
    * This follows the logic that the destination port is a property of a
    * transport protocol and can not be specified without also specifying the
    * protocol.
    */
   struct policy_key {
   	struct bpf_lpm_trie_key lpm_key;
   	__u32 sec_label;
   	__u8  egress:1;
   	__u8  pad:7;
   	__u8  protocol; /* can be wildcarded if 'dport' is fully wildcarded */
   	__u16 dport;    /* can be wildcarded with CIDR-like prefix */
   };

   /* POLICY_FULL_PREFIX gets full prefix length of policy_key: the number of
    * bits in the key, not counting the leading lpm_key.
    */
   #define POLICY_FULL_PREFIX \
   	(8 * (sizeof(struct policy_key) - sizeof(struct bpf_lpm_trie_key)))
   393  
   /* Policy entry: proxy port, flag bits, auth type, and packet/byte
    * counters.
    */
   struct policy_entry {
   	__be16 proxy_port;
   	__u8   deny:1;
   	__u8   wildcard_protocol:1; /* protocol is fully wildcarded */
   	__u8   wildcard_dport:1;    /* dport is fully wildcarded */
   	__u8   pad:5;
   	__u8   auth_type;
   	__u16  pad1;
   	__u16  pad2;
   	__u64  packets;
   	__u64  bytes;
   };
   406  
   /* Key made of a local/remote security label pair, a remote node ID and
    * an auth type.
    */
   struct auth_key {
   	__u32 local_sec_label;
   	__u32 remote_sec_label;
   	__u16 remote_node_id; /* zero for local node */
   	__u8  auth_type;
   	__u8  pad;
   };

   /* expiration is Unix epoch time in unit nanosecond/2^9 (ns/512). */
   struct auth_info {
   	__u64 expiration;
   };
   419  
   /*
    * Runtime configuration items for the datapath.
    */
   enum {
   	/* Index to Unix time offset in 512 ns units */
   	RUNTIME_CONFIG_UTIME_OFFSET   = 0,
   	/* Last monotonic time, periodically set by the agent to
   	 * tell the datapath it is still updating maps
   	 */
   	RUNTIME_CONFIG_AGENT_LIVENESS = 1,
   };
   430  
   /* Metrics key: reason, direction, and the source location (line/file). */
   struct metrics_key {
   	__u8  reason;      /* 0: forwarded, >0 dropped */
   	__u8  dir:2;       /* 1: ingress 2: egress */
   	__u8  pad:6;
   	__u16 line;        /* __MAGIC_LINE__ */
   	__u8  file;        /* __MAGIC_FILE__, needs to fit __source_file_name_to_id */
   	__u8  reserved[3]; /* reserved for future extension */
   };

   /* Metrics value: a count and a byte total. */
   struct metrics_value {
   	__u64 count;
   	__u64 bytes;
   };
   445  
   /* LPM trie key: the lpm_key header followed by source and destination
    * address.
    */
   struct egress_gw_policy_key {
   	struct bpf_lpm_trie_key lpm_key;
   	__u32 saddr;
   	__u32 daddr;
   };

   /* Value paired with egress_gw_policy_key: egress IP and gateway IP. */
   struct egress_gw_policy_entry {
   	__u32 egress_ip;
   	__u32 gateway_ip;
   };
   456  
   457  struct srv6_vrf_key4 {
   458  	struct bpf_lpm_trie_key lpm;
   459  	__u32 src_ip;
   460  	__u32 dst_cidr;
   461  };
   462  
   463  struct srv6_vrf_key6 {
   464  	struct bpf_lpm_trie_key lpm;
   465  	union v6addr src_ip;
   466  	union v6addr dst_cidr;
   467  };
   468  
   469  struct srv6_policy_key4 {
   470  	struct bpf_lpm_trie_key lpm;
   471  	__u32 vrf_id;
   472  	__u32 dst_cidr;
   473  };
   474  
   475  struct srv6_policy_key6 {
   476  	struct bpf_lpm_trie_key lpm;
   477  	__u32 vrf_id;
   478  	union v6addr dst_cidr;
   479  };
   480  
   /* VTEP key: a single 32-bit IP. */
   struct vtep_key {
   	__u32 vtep_ip;
   };

   /* VTEP value: a MAC in a 64-bit field plus the tunnel endpoint. */
   struct vtep_value {
   	__u64 vtep_mac;
   	__u32 tunnel_endpoint;
   };
   489  
   490  struct node_key {
   491  	__u16 pad1;
   492  	__u8 pad2;
   493  	__u8 family;
   494  	union {
   495  		struct {
   496  			__u32 ip4;
   497  			__u32 pad4;
   498  			__u32 pad5;
   499  			__u32 pad6;
   500  		};
   501  		union v6addr ip6;
   502  	};
   503  };
   504  
   /* Policy direction. */
   enum {
   	POLICY_INGRESS = 1,
   	POLICY_EGRESS  = 2,
   };

   /* Policy match kinds. */
   enum {
   	POLICY_MATCH_NONE       = 0,
   	POLICY_MATCH_L3_ONLY    = 1,
   	POLICY_MATCH_L3_L4      = 2,
   	POLICY_MATCH_L4_ONLY    = 3,
   	POLICY_MATCH_ALL        = 4,
   	POLICY_MATCH_L3_PROTO   = 5,
   	POLICY_MATCH_PROTO_ONLY = 6,
   };

   /* Capture direction. */
   enum {
   	CAPTURE_INGRESS = 1,
   	CAPTURE_EGRESS  = 2,
   };

   /* Notification types, numbered consecutively from 0. */
   enum {
   	CILIUM_NOTIFY_UNSPEC         = 0,
   	CILIUM_NOTIFY_DROP           = 1,
   	CILIUM_NOTIFY_DBG_MSG        = 2,
   	CILIUM_NOTIFY_DBG_CAPTURE    = 3,
   	CILIUM_NOTIFY_TRACE          = 4,
   	CILIUM_NOTIFY_POLICY_VERDICT = 5,
   	CILIUM_NOTIFY_CAPTURE        = 6,
   	CILIUM_NOTIFY_TRACE_SOCK     = 7,
   };
   535  
   /* Field list for the start of a notification struct; expand it inside the
    * struct definition.
    */
   #define NOTIFY_COMMON_HDR \
   	__u8  type;    \
   	__u8  subtype; \
   	__u16 source;  \
   	__u32 hash;

   /* NOTIFY_COMMON_HDR followed by the capture-specific fields. */
   #define NOTIFY_CAPTURE_HDR \
   	NOTIFY_COMMON_HDR \
   	__u32 len_orig; /* Length of original packet */ \
   	__u16 len_cap;  /* Length of captured bytes */ \
   	__u16 version;  /* Capture header version */

   /* Designated initializers for the NOTIFY_COMMON_HDR fields. The expansion
    * refers to a variable named 'ctx', which must be in scope at the use site.
    */
   #define __notify_common_hdr(t, s) \
   	.type    = (t), \
   	.subtype = (s), \
   	.source  = EVENT_SOURCE, \
   	.hash    = get_hash_recalc(ctx)

   /* Designated initializers for the capture-specific fields. */
   #define __notify_pktcap_hdr(o, c) \
   	.len_orig = (o), \
   	.len_cap  = (c), \
   	.version  = NOTIFY_CAPTURE_VER

   /* Capture notifications version. Must be incremented when format changes. */
   #define NOTIFY_CAPTURE_VER 1

   #if !defined(TRACE_PAYLOAD_LEN)
   # define TRACE_PAYLOAD_LEN 128ULL
   #endif

   #if !defined(BPF_F_PSEUDO_HDR)
   # define BPF_F_PSEUDO_HDR (1ULL << 4)
   #endif
   569  
   /* True when x is negative or equals CTX_ACT_DROP. The argument is
    * parenthesized so that expressions such as 'ret & mask' are compared as
    * a whole. Note: x is expanded twice, so avoid arguments with side effects.
    */
   #define IS_ERR(x) (unlikely(((x) < 0) || ((x) == CTX_ACT_DROP)))
   571  
   /* Return value to indicate that proxy redirection is required */
   #define POLICY_ACT_PROXY_REDIRECT (1 << 16)

   /* Cilium error codes, must NOT overlap with TC return codes.
    * These also serve as drop reasons for metrics,
    * where reason > 0 corresponds to -(DROP_*)
    *
    * These are shared with pkg/monitor/api/drop.go and api/v1/flow/flow.proto.
    * When modifying any of the below, those files should also be updated.
    */
   #define DROP_UNUSED1                 -130 /* unused */
   #define DROP_UNUSED2                 -131 /* unused */
   #define DROP_INVALID_SIP             -132
   #define DROP_POLICY                  -133
   #define DROP_INVALID                 -134
   #define DROP_CT_INVALID_HDR          -135
   #define DROP_FRAG_NEEDED             -136
   #define DROP_CT_UNKNOWN_PROTO        -137
   #define DROP_UNUSED4                 -138 /* unused */
   #define DROP_UNKNOWN_L3              -139
   #define DROP_MISSED_TAIL_CALL        -140
   #define DROP_WRITE_ERROR             -141
   #define DROP_UNKNOWN_L4              -142
   #define DROP_UNKNOWN_ICMP_CODE       -143
   #define DROP_UNKNOWN_ICMP_TYPE       -144
   #define DROP_UNKNOWN_ICMP6_CODE      -145
   #define DROP_UNKNOWN_ICMP6_TYPE      -146
   #define DROP_NO_TUNNEL_KEY           -147
   #define DROP_UNUSED5                 -148 /* unused */
   #define DROP_UNUSED6                 -149 /* unused */
   #define DROP_UNKNOWN_TARGET          -150
   #define DROP_UNROUTABLE              -151
   #define DROP_UNUSED7                 -152 /* unused */
   #define DROP_CSUM_L3                 -153
   #define DROP_CSUM_L4                 -154
   #define DROP_CT_CREATE_FAILED        -155
   #define DROP_INVALID_EXTHDR          -156
   #define DROP_FRAG_NOSUPPORT          -157
   #define DROP_NO_SERVICE              -158
   #define DROP_UNSUPP_SERVICE_PROTO    -159
   #define DROP_NO_TUNNEL_ENDPOINT      -160
   #define DROP_NAT_46X64_DISABLED      -161
   #define DROP_EDT_HORIZON             -162
   #define DROP_UNKNOWN_CT              -163
   #define DROP_HOST_UNREACHABLE        -164
   #define DROP_NO_CONFIG               -165
   #define DROP_UNSUPPORTED_L2          -166
   #define DROP_NAT_NO_MAPPING          -167
   #define DROP_NAT_UNSUPP_PROTO        -168
   #define DROP_NO_FIB                  -169
   #define DROP_ENCAP_PROHIBITED        -170
   #define DROP_INVALID_IDENTITY        -171
   #define DROP_UNKNOWN_SENDER          -172
   #define DROP_NAT_NOT_NEEDED          -173 /* Mapped as drop code, though drop not necessary. */
   #define DROP_IS_CLUSTER_IP           -174
   #define DROP_FRAG_NOT_FOUND          -175
   #define DROP_FORBIDDEN_ICMP6         -176
   #define DROP_NOT_IN_SRC_RANGE        -177
   #define DROP_PROXY_LOOKUP_FAILED     -178
   #define DROP_PROXY_SET_FAILED        -179
   #define DROP_PROXY_UNKNOWN_PROTO     -180
   #define DROP_POLICY_DENY             -181
   #define DROP_VLAN_FILTERED           -182
   #define DROP_INVALID_VNI             -183
   #define DROP_INVALID_TC_BUFFER       -184
   #define DROP_NO_SID                  -185
   #define DROP_MISSING_SRV6_STATE      -186 /* unused */
   #define DROP_NAT46                   -187
   #define DROP_NAT64                   -188
   #define DROP_POLICY_AUTH_REQUIRED    -189
   #define DROP_CT_NO_MAP_FOUND         -190
   #define DROP_SNAT_NO_MAP_FOUND       -191
   #define DROP_INVALID_CLUSTER_ID      -192
   #define DROP_DSR_ENCAP_UNSUPP_PROTO  -193
   #define DROP_NO_EGRESS_GATEWAY       -194
   #define DROP_UNENCRYPTED_TRAFFIC     -195
   #define DROP_TTL_EXCEEDED            -196
   #define DROP_NO_NODE_ID              -197
   #define DROP_RATE_LIMITED            -198
   #define DROP_IGMP_HANDLED            -199
   #define DROP_IGMP_SUBSCRIBED         -200
   #define DROP_MULTICAST_HANDLED       -201
   #define DROP_HOST_NOT_READY          -202
   #define DROP_EP_NOT_READY            -203
   #define DROP_NO_EGRESS_IP            -204

   /* NAT result codes, expressed through the codes above where they exist. */
   #define NAT_PUNT_TO_STACK            DROP_NAT_NOT_NEEDED
   #define NAT_NEEDED                   CTX_ACT_OK
   #define NAT_46X64_RECIRC             100
   661  
   /* Cilium metrics reasons for forwarding packets and other stats.
    * If reason is larger than below then this is a drop reason and
    * value corresponds to -(DROP_*), see above.
    *
    * These are shared with pkg/monitor/api/drop.go.
    * When modifying any of the below, those files should also be updated.
    */
   #define REASON_FORWARDED             0
   #define REASON_PLAINTEXT             3
   #define REASON_DECRYPT               4
   #define REASON_LB_NO_BACKEND_SLOT    5
   #define REASON_LB_NO_BACKEND         6
   #define REASON_LB_REVNAT_UPDATE      7
   #define REASON_LB_REVNAT_STALE       8
   #define REASON_FRAG_PACKET           9
   #define REASON_FRAG_PACKET_UPDATE    10
   #define REASON_MISSED_CUSTOM_CALL    11

   /* Lookup scope for externalTrafficPolicy=Local */
   #define LB_LOOKUP_SCOPE_EXT          0
   #define LB_LOOKUP_SCOPE_INT          1
   683  
   /* Cilium metrics direction for dropping/forwarding packet */
   enum metric_dir {
   	METRIC_INGRESS = 1,
   	METRIC_EGRESS  = 2,
   	METRIC_SERVICE = 3
   } __packed;
   690  
   /* Magic ctx->mark identifies a packet's origin and encryption status.
    *
    * The upper 16 bits plus lower 8 bits (e.g. mask 0XFFFF00FF) contain the
    * packet's security identity. The lower/upper halves are swapped to recover
    * the identity.
    *
    * In case of MARK_MAGIC_PROXY_EGRESS_EPID the upper 16 bits carry the Endpoint
    * ID instead of the security identity and the lower 8 bits will be zeroes.
    *
    * The 4 bits at 0X0F00 provide
    *  - the magic marker values which indicate whether the packet is coming from
    *    an ingress or egress proxy, a local process and its current encryption
    *    status.
    *
    * The 4 bits at 0xF000 provide
    *  - the key index to use for encryption when multiple keys are in-flight.
    *    In the IPsec case this becomes the SPI on the wire.
    */
   #define MARK_MAGIC_HOST_MASK          0x0F00
   #define MARK_MAGIC_PROXY_TO_WORLD     0x0800
   #define MARK_MAGIC_PROXY_EGRESS_EPID  0x0900 /* mark carries source endpoint ID */
   #define MARK_MAGIC_PROXY_INGRESS      0x0A00
   #define MARK_MAGIC_PROXY_EGRESS       0x0B00
   #define MARK_MAGIC_HOST               0x0C00
   #define MARK_MAGIC_DECRYPT            0x0D00
   /* Used to identify encrypted overlay traffic post decryption.
    * Therefore, the SPI bit can be reused to not steal an additional magic
    * mark value.
    */
   #define MARK_MAGIC_DECRYPTED_OVERLAY  0x1D00
   #define MARK_MAGIC_ENCRYPT            0x0E00
   #define MARK_MAGIC_IDENTITY           0x0F00 /* mark carries identity */
   #define MARK_MAGIC_TO_PROXY           0x0200
   #define MARK_MAGIC_SNAT_DONE          0x0300
   #define MARK_MAGIC_OVERLAY            0x0400
   #define MARK_MAGIC_EGW_DONE           0x0500

   #define MARK_MAGIC_KEY_MASK           0xFF00


   /* The mark is used to indicate that the WireGuard tunnel device is done
    * encrypting a packet. The MSB invades the Kubernetes mark "space" which is
    * fine, as it's not used by K8s. See pkg/datapath/linux/linux_defaults/mark.go
    * for more details.
    */
   #define MARK_MAGIC_WG_ENCRYPTED       0x1E00

   /* MARK_MAGIC_HEALTH_IPIP_DONE can overlap with MARK_MAGIC_SNAT_DONE: the two
    * are mutually exclusive, given the former is only used under DSR. Used to
    * push health probe packets to the ipip tunnel device & to avoid looping
    * back.
    */
   #define MARK_MAGIC_HEALTH_IPIP_DONE   MARK_MAGIC_SNAT_DONE

   /* MARK_MAGIC_HEALTH can overlap with MARK_MAGIC_DECRYPT with both being
    * mutually exclusive. Note, MARK_MAGIC_HEALTH is user-facing UAPI for LB!
    */
   #define MARK_MAGIC_HEALTH             MARK_MAGIC_DECRYPT

   /* MARK_MAGIC_CLUSTER_ID shouldn't interfere with MARK_MAGIC_TO_PROXY. The
    * lower 8 bits carry cluster_id, and when extended via the
    * 'max-connected-clusters' option, the upper 16 bits may also be used for
    * cluster_id, starting at the most significant bit.
    */
   #define MARK_MAGIC_CLUSTER_ID         MARK_MAGIC_TO_PROXY
   754  
   /* IPv4 option used to carry service addr and port for DSR.
    *
    * Copy = 1 (option is copied to each fragment)
    * Class = 0 (control option)
    * Number = 26 (not used according to [1])
    * Len = 8 (option type (1) + option len (1) + addr (4) + port (2))
    *
    * [1]: https://www.iana.org/assignments/ip-parameters/ip-parameters.xhtml
    */
   #define DSR_IPV4_OPT_TYPE (IPOPT_COPY | 0x1a)

   /* IPv6 option type of Destination Option used to carry service IPv6 addr and
    * port for DSR.
    *
    * 0b00		- "skip over this option and continue processing the header"
    *     0	- "Option Data does not change en-route"
    *      11011   - Unassigned [1]
    *
    * [1]:  https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#ipv6-parameters-2
    */
   #define DSR_IPV6_OPT_TYPE 0x1B
   /* Both lengths are derived from sizeof(struct dsr_opt_v6): the option
    * length is that size minus 4 bytes, and the extension length is that size
    * minus 8 bytes, counted in 8-byte units.
    */
   #define DSR_IPV6_OPT_LEN  (sizeof(struct dsr_opt_v6) - 4)
   #define DSR_IPV6_EXT_LEN  ((sizeof(struct dsr_opt_v6) - 8) / 8)
   778  
   /* encrypt_config is the current encryption context on the node: a single
    * key byte.
    */
   struct encrypt_config {
   	__u8 encrypt_key;
   } __packed;
   783  
   784  /**
   785   * or_encrypt_key - mask and shift key into encryption format
   786   */
   787  static __always_inline __u32 or_encrypt_key(__u8 key)
   788  {
   789  	return (((__u32)key & 0x0F) << 12) | MARK_MAGIC_ENCRYPT;
   790  }
   791  
   /*
    * ctx->tc_index uses
    *
    * cilium_host @egress
    *   bpf_host -> bpf_lxc
    *
    * Each flag below is a distinct single bit.
    */
   #define TC_INDEX_F_FROM_INGRESS_PROXY  0x01
   #define TC_INDEX_F_FROM_EGRESS_PROXY   0x02
   #define TC_INDEX_F_SKIP_NODEPORT       0x04
   #define TC_INDEX_F_UNUSED              0x08
   #define TC_INDEX_F_SKIP_HOST_FIREWALL  0x10

   #define CB_NAT_FLAGS_REVDNAT_ONLY      (1 << 0)
   805  
   /*
    * For use in ctx_{load,store}_meta(), which operates on sk_buff->cb or
    * the cilium_xdp_scratch pad.
    * The verifier only exposes the first 5 slots in cb[], so this enum
    * only contains 5 entries. Aliases are added to the slots to re-use
    * them under different names in different parts of the datapath.
    * Take care to not clobber slots used by other functions in the same
    * code path.
    */
   /* ctx_{load,store}_meta() usage: */
   enum {
   	CB_SRC_LABEL = 0,
   	CB_IFINDEX   = 1,
   	CB_POLICY    = 2,
   	CB_3         = 3,
   	CB_CT_STATE  = 4,
   };

   /* Aliases of slot CB_SRC_LABEL, all non-overlapping. */
   #define CB_PORT               CB_SRC_LABEL
   #define CB_HINT               CB_SRC_LABEL
   #define CB_PROXY_MAGIC        CB_SRC_LABEL
   #define CB_ENCRYPT_MAGIC      CB_SRC_LABEL
   #define CB_DST_ENDPOINT_ID    CB_SRC_LABEL
   #define CB_SRV6_SID_1         CB_SRC_LABEL

   /* Aliases of slot CB_IFINDEX, all non-overlapping. */
   #define CB_NAT_46X64          CB_IFINDEX
   #define CB_ADDR_V4            CB_IFINDEX
   #define CB_ADDR_V6_1          CB_IFINDEX
   #define CB_IPCACHE_SRC_LABEL  CB_IFINDEX
   #define CB_SRV6_SID_2         CB_IFINDEX
   #define CB_CLUSTER_ID_EGRESS  CB_IFINDEX
   #define CB_HSIPC_ADDR_V4      CB_IFINDEX
   #define CB_TRACED             CB_IFINDEX

   /* Aliases of slot CB_POLICY, all non-overlapping. */
   #define CB_ADDR_V6_2          CB_POLICY
   #define CB_SRV6_SID_3         CB_POLICY
   #define CB_CLUSTER_ID_INGRESS CB_POLICY
   #define CB_HSIPC_PORT         CB_POLICY
   #define CB_DSR_SRC_LABEL      CB_POLICY
   #define CB_NAT_FLAGS          CB_POLICY

   /* Aliases of slot CB_3, all non-overlapping. */
   #define CB_ADDR_V6_3          CB_3
   #define CB_FROM_HOST          CB_3
   #define CB_SRV6_SID_4         CB_3
   #define CB_DSR_L3_OFF         CB_3

   /* Aliases of slot CB_CT_STATE, all non-overlapping. */
   #define CB_ADDR_V6_4          CB_CT_STATE
   #define CB_ENCRYPT_IDENTITY   CB_CT_STATE /* Not used by xfrm. */
   #define CB_ENCRYPT_DST        CB_CT_STATE /* Not used by xfrm.
   					     * Can be removed in v1.15.
   					     */
   #define CB_CUSTOM_CALLS       CB_CT_STATE
   #define CB_SRV6_VRF_ID        CB_CT_STATE
   #define CB_FROM_TUNNEL        CB_CT_STATE
   858  
   859  /* Magic values for CB_FROM_HOST.
   860   * CB_FROM_HOST overlaps with CB_NAT46_STATE, so this value must be distinct
   861   * from any in enum NAT46 below!
         *
         * NOTE(review): neither CB_NAT46_STATE nor an enum NAT46 is visible in the
         * remainder of this header. The aliases declared next to CB_FROM_HOST on
         * slot CB_3 are CB_ADDR_V6_3, CB_SRV6_SID_4 and CB_DSR_L3_OFF. Confirm
         * whether the overlap described above still exists or the note is stale.
   862   */
   863  #define FROM_HOST_L7_LB 0xFACADE42
   864  
   865  #define TUPLE_F_OUT		0	/* Outgoing flow; value 0, i.e. no bit set */
   866  #define TUPLE_F_IN		1	/* Incoming flow; bit 0 */
   867  #define TUPLE_F_RELATED		2	/* Flow represents related packets; bit 1 */
   868  #define TUPLE_F_SERVICE		4	/* Flow represents packets to service; bit 2 */
   869  
   870  enum ct_dir {	/* Direction selector for ct (presumably connection-tracking) code - confirm */
   871  	CT_EGRESS,	/* = 0 */
   872  	CT_INGRESS,	/* = 1 */
   873  	CT_SERVICE,	/* = 2 */
   874  } __packed;
   875  
   876  #ifdef ENABLE_NODEPORT	/* NAT_MIN_EGRESS depends on whether NodePort is compiled in */
   877  #define NAT_MIN_EGRESS		NODEPORT_PORT_MIN_NAT
   878  #else	/* !ENABLE_NODEPORT */
   879  #define NAT_MIN_EGRESS		EPHEMERAL_MIN
   880  #endif	/* ENABLE_NODEPORT */
   881  
   882  enum ct_status {	/* Status values for a ct entry/lookup; numbered in declaration order */
   883  	CT_NEW,		/* = 0 */
   884  	CT_ESTABLISHED,	/* = 1 */
   885  	CT_REPLY,	/* = 2 */
   886  	CT_RELATED,	/* = 3 */
   887  } __packed;
   888  
   889  /* Service flags (lb{4,6}_service->flags).
         * Each enumerator is a single-bit mask. Bits 0-7 are all assigned here,
         * which is the full width of the __u8 flags member of struct
         * lb{4,6}_service.
         */
   890  enum {
   891  	SVC_FLAG_EXTERNAL_IP  = (1 << 0),  /* External IPs */
   892  	SVC_FLAG_NODEPORT     = (1 << 1),  /* NodePort service */
   893  	SVC_FLAG_EXT_LOCAL_SCOPE = (1 << 2), /* externalTrafficPolicy=Local */
   894  	SVC_FLAG_HOSTPORT     = (1 << 3),  /* hostPort forwarding */
   895  	SVC_FLAG_AFFINITY     = (1 << 4),  /* sessionAffinity=clientIP */
   896  	SVC_FLAG_LOADBALANCER = (1 << 5),  /* LoadBalancer service */
   897  	SVC_FLAG_ROUTABLE     = (1 << 6),  /* Not a surrogate/ClusterIP entry */
   898  	SVC_FLAG_SOURCE_RANGE = (1 << 7),  /* Check LoadBalancer source range */
   899  };
   900  
   901  /* Service flags (lb{4,6}_service->flags2).
         * Each enumerator is a single-bit mask. Bits 0-5 are assigned here; bits
         * 6-7 of the __u8 flags2 member are not assigned by this enum.
         */
   902  enum {
   903  	SVC_FLAG_LOCALREDIRECT  = (1 << 0),  /* local redirect */
   904  	SVC_FLAG_NAT_46X64      = (1 << 1),  /* NAT-46/64 entry */
   905  	SVC_FLAG_L7LOADBALANCER = (1 << 2),  /* tproxy redirect to local l7 loadbalancer */
   906  	SVC_FLAG_LOOPBACK       = (1 << 3),  /* hostport with a loopback hostIP */
   907  	SVC_FLAG_INT_LOCAL_SCOPE = (1 << 4), /* internalTrafficPolicy=Local */
   908  	SVC_FLAG_TWO_SCOPES     = (1 << 5),  /* two sets of backends are used for external/internal connections */
   909  };
   910  
   911  /* Backend states (lb{4,6}_backend->flags).
         * These are consecutive values (0..3), not single-bit masks.
         */
   912  enum {
   913  	BE_STATE_ACTIVE		= 0,
   914  	BE_STATE_TERMINATING,	/* = 1 */
   915  	BE_STATE_QUARANTINED,	/* = 2 */
   916  	BE_STATE_MAINTENANCE,	/* = 3 */
   917  };
   918  
   919  struct ipv6_ct_tuple {	/* IPv6 flavour of the ct tuple; same member order as struct ipv4_ct_tuple */
   920  	/* Address fields are reversed, i.e.,
   921  	 * these field names are correct for reply direction traffic.
   922  	 */
   923  	union v6addr	daddr;
   924  	union v6addr	saddr;
   925  	/* The order of dport+sport must not be changed!
   926  	 * These field names are correct for original direction traffic.
   927  	 */
   928  	__be16		dport;
   929  	__be16		sport;
   930  	__u8		nexthdr;
   931  	__u8		flags;	/* presumably TUPLE_F_* values - confirm against users */
   932  } __packed;	/* packed: no padding is inserted between or after members */
   933  
   934  struct ipv4_ct_tuple {	/* IPv4 flavour of the ct tuple; same member order as struct ipv6_ct_tuple */
   935  	/* Address fields are reversed, i.e.,
   936  	 * these field names are correct for reply direction traffic.
   937  	 */
   938  	__be32		daddr;
   939  	__be32		saddr;
   940  	/* The order of dport+sport must not be changed!
   941  	 * These field names are correct for original direction traffic.
   942  	 */
   943  	__be16		dport;
   944  	__be16		sport;
   945  	__u8		nexthdr;
   946  	__u8		flags;	/* presumably TUPLE_F_* values - confirm against users */
   947  } __packed;	/* packed: no padding is inserted between or after members */
   948  
   949  struct ct_entry {	/* Per-connection ct state record */
   950  	__u64 reserved0;	/* unused since v1.16 */
   951  	__u64 backend_id;
   952  	__u64 packets;
   953  	__u64 bytes;
   954  	__u32 lifetime;
        	/* The bit-fields below add up to exactly 16 bits (11 single-bit
        	 * members plus reserved3:5), i.e. one full __u16.
        	 */
   955  	__u16 rx_closing:1,
   956  	      tx_closing:1,
   957  	      reserved1:1,	/* unused since v1.12 */
   958  	      lb_loopback:1,
   959  	      seen_non_syn:1,
   960  	      node_port:1,
   961  	      proxy_redirect:1,	/* Connection is redirected to a proxy */
   962  	      dsr_internal:1,	/* DSR is k8s service related, cluster internal */
   963  	      from_l7lb:1,	/* Connection is originated from an L7 LB proxy */
   964  	      reserved2:1,	/* unused since v1.14 */
   965  	      from_tunnel:1,	/* Connection is over tunnel */
   966  	      reserved3:5;
   967  	__u16 rev_nat_index;
   968  	/* In the kernel ifindex is u32, so we need to check in cilium-agent
   969  	 * that ifindex of a NodePort device is <= MAX(u16).
   970  	 * Unused when HAVE_FIB_IFINDEX is available.
   971  	 */
   972  	__u16 ifindex;
   973  
   974  	/* *x_flags_seen represents the OR of all TCP flags seen for the
   975  	 * transmit/receive direction of this entry.
   976  	 */
   977  	__u8  tx_flags_seen;
   978  	__u8  rx_flags_seen;
   979  
   980  	__u32 src_sec_id; /* Used from userspace proxies, do not change offset! */
   981  
   982  	/* last_*x_report is a timestamp of the last time a monitor
   983  	 * notification was sent for the transmit/receive direction.
   984  	 */
   985  	__u32 last_tx_report;
   986  	__u32 last_rx_report;
   987  };
   988  
   989  struct lb6_key {	/* IPv6 service lookup key; mirrors struct lb4_key apart from the address type */
   990  	union v6addr address;	/* Service virtual IPv6 address */
   991  	__be16 dport;		/* L4 port filter, if unset, all ports apply */
   992  	__u16 backend_slot;	/* Backend iterator, 0 indicates the svc frontend */
   993  	__u8 proto;		/* L4 protocol, currently not used (set to 0) */
   994  	__u8 scope;		/* LB_LOOKUP_SCOPE_* for externalTrafficPolicy=Local */
   995  	__u8 pad[2];		/* Explicit padding; presumably kept zeroed by writers - confirm */
   996  };
   997  
   998  /* See lb4_service comments */
   999  struct lb6_service {
  1000  	union {
  1001  		__u32 backend_id;	/* Backend ID in lb6_backends */
  1002  		__u32 affinity_timeout;	/* In seconds, only for svc frontend */
  1003  		__u32 l7_lb_proxy_port;	/* In host byte order, only when flags2 & SVC_FLAG_L7LOADBALANCER */
  1004  	};
  1005  	__u16 count;
  1006  	__u16 rev_nat_index;
  1007  	__u8 flags;		/* SVC_FLAG_* bits of the first service flags enum */
  1008  	__u8 flags2;		/* SVC_FLAG_* bits of the second service flags enum */
  1009  	__u8 pad[2];		/* Explicit padding */
  1010  };
  1011  
  1012  /* See lb4_backend comments */
  1013  struct lb6_backend {
  1014  	union v6addr address;
  1015  	__be16 port;
  1016  	__u8 proto;
  1017  	__u8 flags;
  1018  	__u16 cluster_id;	/* With this field, we can distinguish two
  1019  				 * backends that have the same IP address
  1020  				 * but belong to different clusters.
  1021  				 */
  1022  	__u8 zone;
  1023  	__u8 pad;		/* Explicit padding */
  1024  };
  1025  
  1026  struct lb6_health {	/* Wraps a single struct lb6_backend */
  1027  	struct lb6_backend peer;
  1028  };
  1029  
  1030  struct lb6_reverse_nat {	/* IPv6 address + port pair; IPv6 twin of struct lb4_reverse_nat */
  1031  	union v6addr address;
  1032  	__be16 port;
  1033  } __packed;	/* packed: no tail padding after port */
  1034  
  1035  struct ipv6_revnat_tuple {	/* Socket cookie + IPv6 address + port; IPv6 twin of struct ipv4_revnat_tuple */
  1036  	__sock_cookie cookie;
  1037  	union v6addr address;
  1038  	__be16 port;
  1039  	__u16 pad;		/* Explicit padding */
  1040  };
  1041  
  1042  struct ipv6_revnat_entry {	/* IPv6 address + port + reverse NAT index; IPv6 twin of struct ipv4_revnat_entry */
  1043  	union v6addr address;
  1044  	__be16 port;
  1045  	__u16 rev_nat_index;
  1046  };
  1047  
  1048  struct lb4_key {	/* IPv4 service lookup key; mirrors struct lb6_key apart from the address type */
  1049  	__be32 address;		/* Service virtual IPv4 address */
  1050  	__be16 dport;		/* L4 port filter, if unset, all ports apply */
  1051  	__u16 backend_slot;	/* Backend iterator, 0 indicates the svc frontend */
  1052  	__u8 proto;		/* L4 protocol, currently not used (set to 0) */
  1053  	__u8 scope;		/* LB_LOOKUP_SCOPE_* for externalTrafficPolicy=Local */
  1054  	__u8 pad[2];		/* Explicit padding; presumably kept zeroed by writers - confirm */
  1055  };
  1056  
  1057  struct lb4_service {	/* IPv4 service record; the leading union holds one of three __u32 meanings */
  1058  	union {
  1059  		__u32 backend_id;	/* Backend ID in lb4_backends */
  1060  		__u32 affinity_timeout;	/* In seconds, only for svc frontend */
  1061  		__u32 l7_lb_proxy_port;	/* In host byte order, only when flags2 & SVC_FLAG_L7LOADBALANCER */
  1062  	};
  1063  	/* For the service frontend, count denotes number of service backend
  1064  	 * slots (otherwise zero).
  1065  	 */
  1066  	__u16 count;
  1067  	__u16 rev_nat_index;	/* Reverse NAT ID in lb4_reverse_nat */
  1068  	__u8 flags;		/* SVC_FLAG_* bits of the first service flags enum */
  1069  	__u8 flags2;		/* SVC_FLAG_* bits of the second service flags enum */
  1070  	__u8  pad[2];		/* Explicit padding */
  1071  };
  1072  
  1073  struct lb4_backend {	/* IPv4 backend record; mirrors struct lb6_backend apart from the address type */
  1074  	__be32 address;		/* Service endpoint IPv4 address */
  1075  	__be16 port;		/* L4 port filter */
  1076  	__u8 proto;		/* L4 protocol, currently not used (set to 0) */
  1077  	__u8 flags;
  1078  	__u16 cluster_id;	/* With this field, we can distinguish two
  1079  				 * backends that have the same IP address
  1080  				 * but belong to different clusters.
  1081  				 */
  1082  	__u8 zone;
  1083  	__u8 pad;		/* Explicit padding */
  1084  };
  1085  
  1086  struct lb4_health {	/* Wraps a single struct lb4_backend */
  1087  	struct lb4_backend peer;
  1088  };
  1089  
  1090  struct lb4_reverse_nat {	/* IPv4 address + port pair; IPv4 twin of struct lb6_reverse_nat */
  1091  	__be32 address;
  1092  	__be16 port;
  1093  } __packed;	/* packed: no tail padding after port */
  1094  
  1095  struct ipv4_revnat_tuple {	/* Socket cookie + IPv4 address + port; IPv4 twin of struct ipv6_revnat_tuple */
  1096  	__sock_cookie cookie;
  1097  	__be32 address;
  1098  	__be16 port;
  1099  	__u16 pad;		/* Explicit padding */
  1100  };
  1101  
  1102  struct ipv4_revnat_entry {	/* IPv4 address + port + reverse NAT index; IPv4 twin of struct ipv6_revnat_entry */
  1103  	__be32 address;
  1104  	__be16 port;
  1105  	__u16 rev_nat_index;
  1106  };
  1107  
  1108  union lb4_affinity_client_id {	/* Client identity: either an IPv4 address or a netns cookie share this storage */
  1109  	__u32 client_ip;
  1110  	__net_cookie client_cookie;
  1111  } __packed;
  1112  
  1113  struct lb4_affinity_key {	/* Affinity key: client identity plus reverse NAT id */
  1114  	union lb4_affinity_client_id client_id;
  1115  	__u16 rev_nat_id;
  1116  	__u8 netns_cookie:1,	/* presumably set when client_id holds client_cookie rather than client_ip - confirm */
  1117  	     reserved:7;	/* remaining 7 bits of this byte */
  1118  	__u8 pad1;		/* Explicit padding */
  1119  	__u32 pad2;		/* Explicit padding */
  1120  } __packed;
  1121  
  1122  union lb6_affinity_client_id {	/* Client identity: either an IPv6 address or a netns cookie share this storage */
  1123  	union v6addr client_ip;
  1124  	__net_cookie client_cookie;
  1125  } __packed;
  1126  
  1127  struct lb6_affinity_key {	/* Affinity key: client identity plus reverse NAT id */
  1128  	union lb6_affinity_client_id client_id;
  1129  	__u16 rev_nat_id;
  1130  	__u8 netns_cookie:1,	/* presumably set when client_id holds client_cookie rather than client_ip - confirm */
  1131  	     reserved:7;	/* remaining 7 bits of this byte */
  1132  	__u8 pad1;		/* Explicit padding */
  1133  	__u32 pad2;		/* Explicit padding */
  1134  } __packed;
  1135  
  1136  struct lb_affinity_val {	/* Affinity value: a timestamp-like field plus the chosen backend id */
  1137  	__u64 last_used;	/* NOTE(review): unit/clock source not visible here - confirm */
  1138  	__u32 backend_id;
  1139  	__u32 pad;		/* Explicit padding */
  1140  } __packed;
  1141  
  1142  struct lb_affinity_match {	/* Pairs a backend id with a reverse NAT id */
  1143  	__u32 backend_id;
  1144  	__u16 rev_nat_id;
  1145  	__u16 pad;		/* Explicit padding */
  1146  } __packed;
  1147  
  1148  struct ct_state {	/* ct state record; two members depend on build-time macros, see below */
  1149  	__u16 rev_nat_index;
        	/* The first bit keeps its position in either branch; only the member
        	 * name differs depending on DISABLE_LOOPBACK_LB. The bit-fields add up
        	 * to 16 bits (9 single-bit members plus reserved:7).
        	 */
  1150  #ifndef DISABLE_LOOPBACK_LB
  1151  	__u16 loopback:1,
  1152  #else
  1153  	__u16 loopback_disabled:1,
  1154  #endif
  1155  	      node_port:1,
  1156  	      dsr_internal:1,   /* DSR is k8s service related, cluster internal */
  1157  	      syn:1,
  1158  	      proxy_redirect:1,	/* Connection is redirected to a proxy */
  1159  	      from_l7lb:1,	/* Connection is originated from an L7 LB proxy */
  1160  	      reserved1:1,	/* Was auth_required, not used in production anywhere */
  1161  	      from_tunnel:1,	/* Connection is from tunnel */
  1162  		  closing:1,
  1163  	      reserved:7;
  1164  	__u32 src_sec_id;
        	/* ifindex only exists when HAVE_FIB_IFINDEX is not defined. */
  1165  #ifndef HAVE_FIB_IFINDEX
  1166  	__u16 ifindex;
  1167  #endif
  1168  	__u32 backend_id;	/* Backend ID in lb4_backends */
  1169  };
  1170  
  1171  static __always_inline bool ct_state_is_from_l7lb(const struct ct_state *ct_state __maybe_unused)
  1172  {
        	/* Report the from_l7lb bit of ct_state.
        	 *
        	 * With ENABLE_L7_LB defined this returns ct_state->from_l7lb. Without
        	 * it the function returns false and never dereferences ct_state,
        	 * which is why the parameter is marked __maybe_unused.
        	 */
  1173  #ifdef ENABLE_L7_LB
  1174  	return ct_state->from_l7lb;
  1175  #else
  1176  	return false;
  1177  #endif
  1178  }
  1179  
  1180  #define SRC_RANGE_STATIC_PREFIX(STRUCT)		\
  1181  	(8 * (sizeof(STRUCT) - sizeof(struct bpf_lpm_trie_key)))	/* bits: size of STRUCT minus the size of struct bpf_lpm_trie_key, times 8 */
  1182  
  1183  struct lb4_src_range_key {	/* LPM trie key: header, then rev_nat_id + pad, then the IPv4 address */
  1184  	struct bpf_lpm_trie_key lpm_key;	/* must stay the first member (LPM key header) */
  1185  	__u16 rev_nat_id;
  1186  	__u16 pad;		/* Explicit padding */
  1187  	__u32 addr;
  1188  };
  1189  
  1190  struct lb6_src_range_key {	/* LPM trie key: header, then rev_nat_id + pad, then the IPv6 address */
  1191  	struct bpf_lpm_trie_key lpm_key;	/* must stay the first member (LPM key header) */
  1192  	__u16 rev_nat_id;
  1193  	__u16 pad;		/* Explicit padding */
  1194  	union v6addr addr;
  1195  };
  1196  
  1197  static __always_inline int redirect_ep(struct __ctx_buff *ctx __maybe_unused,
  1198  				       int ifindex __maybe_unused,
  1199  				       bool needs_backlog __maybe_unused,
  1200  				       bool from_tunnel)
  1201  {
        	/* Redirect ctx to the device identified by ifindex.
        	 *
        	 * ctx:           packet context passed on to the redirect helpers
        	 * ifindex:       interface index handed to the redirect helpers
        	 * needs_backlog: when true, always take the plain ctx_redirect() path
        	 * from_tunnel:   only consulted on the ctx_redirect_peer() path, where
        	 *                it triggers a packet type change to PACKET_HOST
        	 *
        	 * Returns whatever ctx_redirect() or ctx_redirect_peer() returns.
        	 */

  1202  	/* Going via CPU backlog queue (aka needs_backlog) is required
  1203  	 * whenever we cannot do a fast ingress -> ingress switch but
  1204  	 * instead need an ingress -> egress netns traversal or vice
  1205  	 * versa.
  1206  	 *
  1207  	 * This is also the case if BPF host routing is disabled, or if
  1208  	 * we are currently on egress which is indicated by ingress_ifindex
  1209  	 * being 0. The latter is cleared upon skb scrubbing.
  1210  	 *
  1211  	 * In case of netkit, we're on the egress side and need a regular
  1212  	 * redirect to the peer device's ifindex. In case of veth we're
  1213  	 * on ingress and need a redirect peer to get to the target. Both
  1214  	 * only traverse the CPU backlog queue once. In case of phys ->
  1215  	 * Pod, the ingress_ifindex is > 0 and in both device types we
  1216  	 * do want a redirect peer into the target Pod's netns.
  1217  	 */
  1218  	if (needs_backlog || !is_defined(ENABLE_HOST_ROUTING) ||
  1219  	    ctx_get_ingress_ifindex(ctx) == 0) {
  1220  		return ctx_redirect(ctx, ifindex, 0);
  1221  	}
  1222  
  1223  	/* When coming from overlay, we need to set packet type
  1224  	 * to HOST as otherwise we might get dropped in IP layer.
        	 * The return value of ctx_change_type() is not checked here.
  1225  	 */
  1226  	if (from_tunnel)
  1227  		ctx_change_type(ctx, PACKET_HOST);
  1228  
  1229  	return ctx_redirect_peer(ctx, ifindex, 0);
  1230  }
  1231  
  1232  static __always_inline __u64 ctx_adjust_hroom_flags(void)
  1233  {
        	/* Flags value selected at build time: BPF_F_ADJ_ROOM_NO_CSUM_RESET
        	 * when HAVE_CSUM_LEVEL is defined, otherwise 0 (no flags).
        	 */
  1234  #ifdef HAVE_CSUM_LEVEL
  1235  	return BPF_F_ADJ_ROOM_NO_CSUM_RESET;
  1236  #else
  1237  	return 0;
  1238  #endif
  1239  }
  1240  
  1241  struct lpm_v4_key {	/* LPM trie key header followed by 4 address bytes */
  1242  	struct bpf_lpm_trie_key lpm;
  1243  	__u8 addr[4];
  1244  };
  1245  
  1246  struct lpm_v6_key {	/* LPM trie key header followed by 16 address bytes */
  1247  	struct bpf_lpm_trie_key lpm;
  1248  	__u8 addr[16];
  1249  };
  1250  
  1251  struct lpm_val {	/* Single-byte value type; per the note below it carries no meaning yet */
  1252  	/* Just dummy for now. */
  1253  	__u8 flags;
  1254  };
  1255  
  1256  struct skip_lb4_key {	/* Key made of netns cookie + IPv4 service address + port */
  1257  	__u64 netns_cookie;     /* Source pod netns cookie */
  1258  	__u32 address;          /* Destination service virtual IPv4 address */
  1259  	__u16 port;             /* Destination service virtual layer4 port */
  1260  	__u16 pad;              /* Explicit padding */
  1261  };
  1262  
  1263  struct skip_lb6_key {	/* Key made of netns cookie + IPv6 service address + port */
  1264  	__u64 netns_cookie;     /* Source pod netns cookie */
  1265  	union v6addr address;   /* Destination service virtual IPv6 address */
  1266  	__u32 pad;              /* Explicit padding; note it sits between address and port */
  1267  	__u16 port;             /* Destination service virtual layer4 port */
  1268  	__u16 pad2;             /* Explicit padding */
  1269  };
  1270  
  1271  /* Older kernels don't support the larger tunnel key structure and we don't
  1272   * need it since we only want to retrieve the tunnel ID anyway.
         *
         * The macro evaluates to the byte offset of local_ipv4 within struct
         * bpf_tunnel_key, i.e. the length of the structure up to, but not
         * including, that member. Presumably passed as the size argument when
         * reading the tunnel key - confirm at the call sites.
  1273   */
  1274  #define TUNNEL_KEY_WITHOUT_SRC_IP offsetof(struct bpf_tunnel_key, local_ipv4)
  1275  
  1276  #include "overloadable.h"