github.com/cilium/cilium@v1.16.2/bpf/lib/proxy.h (about)

     1  /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
     2  /* Copyright Authors of Cilium */
     3  
     4  #pragma once
     5  
     6  #include "conntrack.h"
     7  
     8  #if !(__ctx_is == __ctx_skb)
     9  #error "Proxy redirection is only supported from skb context"
    10  #endif
    11  
    12  #ifdef ENABLE_TPROXY
    13  static __always_inline int
    14  assign_socket_tcp(struct __ctx_buff *ctx,
    15  		  struct bpf_sock_tuple *tuple, __u32 len, bool established)
    16  {
    17  	int result = DROP_PROXY_LOOKUP_FAILED;
    18  	struct bpf_sock *sk;
    19  	__u32 dbg_ctx;
    20  
    21  	sk = skc_lookup_tcp(ctx, tuple, len, BPF_F_CURRENT_NETNS, 0);
    22  	if (!sk)
    23  		goto out;
    24  
    25  	if (established && sk->state == BPF_TCP_TIME_WAIT)
    26  		goto release;
    27  	if (established && sk->state == BPF_TCP_LISTEN)
    28  		goto release;
    29  
    30  	dbg_ctx = READ_ONCE(sk)->family << 16 | ctx->protocol;
    31  	result = sk_assign(ctx, sk, 0);
    32  	cilium_dbg(ctx, DBG_SK_ASSIGN, -result, dbg_ctx);
    33  	if (result == 0)
    34  		result = CTX_ACT_OK;
    35  	else
    36  		result = DROP_PROXY_SET_FAILED;
    37  release:
    38  	sk_release(sk);
    39  out:
    40  	return result;
    41  }
    42  
    43  static __always_inline int
    44  assign_socket_udp(struct __ctx_buff *ctx,
    45  		  struct bpf_sock_tuple *tuple, __u32 len,
    46  		  bool established __maybe_unused)
    47  {
    48  	int result = DROP_PROXY_LOOKUP_FAILED;
    49  	struct bpf_sock *sk;
    50  	__u32 dbg_ctx;
    51  
    52  	sk = sk_lookup_udp(ctx, tuple, len, BPF_F_CURRENT_NETNS, 0);
    53  	if (!sk)
    54  		goto out;
    55  
    56  	dbg_ctx = READ_ONCE(sk)->family << 16 | ctx->protocol;
    57  	result = sk_assign(ctx, sk, 0);
    58  	cilium_dbg(ctx, DBG_SK_ASSIGN, -result, dbg_ctx);
    59  	if (result == 0)
    60  		result = CTX_ACT_OK;
    61  	else
    62  		result = DROP_PROXY_SET_FAILED;
    63  	sk_release(sk);
    64  out:
    65  	return result;
    66  }
    67  
    68  static __always_inline int
    69  assign_socket(struct __ctx_buff *ctx,
    70  	      struct bpf_sock_tuple *tuple, __u32 len,
    71  	      __u8 nexthdr, bool established)
    72  {
    73  	/* Workaround: While the below functions are nearly identical in C
    74  	 * implementation, the 'struct bpf_sock *' has a different verifier
    75  	 * pointer type, which means we can't fold these implementations
    76  	 * together.
    77  	 */
    78  	switch (nexthdr) {
    79  	case IPPROTO_TCP:
    80  		return assign_socket_tcp(ctx, tuple, len, established);
    81  	case IPPROTO_UDP:
    82  		return assign_socket_udp(ctx, tuple, len, established);
    83  	}
    84  	return DROP_PROXY_UNKNOWN_PROTO;
    85  }
    86  
    87  /**
    88   * combine_ports joins the specified ports in a manner consistent with
    89   * pkg/monitor/dataapth_debug.go to report the ports ino monitor messages.
    90   */
    91  static __always_inline __u32
    92  combine_ports(__u16 dport, __u16 sport)
    93  {
    94  	return (bpf_ntohs(dport) << 16) | bpf_ntohs(sport);
    95  }
    96  
#define CTX_REDIRECT_FN(NAME, CT_TUPLE_TYPE, SK_FIELD,				\
			DBG_LOOKUP_CODE, DADDR_DBG, SADDR_DBG)			\
/**										\
 * ctx_redirect_to_proxy_ingress4 / ctx_redirect_to_proxy_ingress6		\
 * @ctx			pointer to program context				\
 * @tuple		pointer to *scratch buffer* with packet tuple		\
 * @proxy_port		port to redirect traffic towards			\
 * @tproxy_addr		preferred bind address for the tproxy socket lookup	\
 *										\
 * Prefetch the proxy socket and associate with the ctx. Must be run on tc	\
 * ingress. Will modify 'tuple'!						\
 *										\
 * Tries, in order: an established local socket matching the packet, a		\
 * tproxy socket bound to tproxy_addr:proxy_port, and finally a tproxy		\
 * socket bound to the wildcard address. Returns CTX_ACT_OK when a socket	\
 * was assigned, otherwise a DROP_* reason from assign_socket().		\
 */										\
static __always_inline int							\
NAME(struct __ctx_buff *ctx, const CT_TUPLE_TYPE * ct_tuple,			\
     __be16 proxy_port, void *tproxy_addr)					\
{										\
	struct bpf_sock_tuple *tuple = (struct bpf_sock_tuple *)ct_tuple;	\
	__u8 nexthdr = ct_tuple->nexthdr;					\
	__u32 len = sizeof(tuple->SK_FIELD);					\
	__u16 port;								\
	int result;								\
										\
	/* The provided 'ct_tuple' is in the internal Cilium format, which	\
	 * reverses the source/destination ports as compared with the actual	\
	 * packet contents. 'bpf_sock_tuple' in the eBPF API needs these to	\
	 * match normal packet ordering to successfully look up the		\
	 * corresponding socket. So, swap them here.				\
	 */									\
	port = tuple->SK_FIELD.sport;						\
	tuple->SK_FIELD.sport = tuple->SK_FIELD.dport;				\
	tuple->SK_FIELD.dport = port;						\
										\
	/* Look for established socket locally first */				\
	cilium_dbg3(ctx, DBG_LOOKUP_CODE,					\
		    tuple->SK_FIELD.SADDR_DBG, tuple->SK_FIELD.DADDR_DBG,	\
		    combine_ports(tuple->SK_FIELD.dport, tuple->SK_FIELD.sport));	\
	result = assign_socket(ctx, tuple, len, nexthdr, true);			\
	if (result == CTX_ACT_OK)						\
		goto out;							\
										\
	/* if there's no established connection, locate the tproxy socket on the tproxy_addr IP */ \
	tuple->SK_FIELD.dport = proxy_port;	 \
	tuple->SK_FIELD.sport = 0;	\
	memcpy(&tuple->SK_FIELD.daddr, tproxy_addr, sizeof(tuple->SK_FIELD.daddr)); \
	memset(&tuple->SK_FIELD.saddr, 0, sizeof(tuple->SK_FIELD.saddr));	\
	cilium_dbg3(ctx, DBG_LOOKUP_CODE,					\
		    tuple->SK_FIELD.SADDR_DBG, tuple->SK_FIELD.DADDR_DBG,	\
		    combine_ports(tuple->SK_FIELD.dport, tuple->SK_FIELD.sport));	\
	result = assign_socket(ctx, tuple, len, nexthdr, false);		\
	if (result == CTX_ACT_OK)						\
		goto out;	\
										\
	/* if there's no tproxy socket on tproxy_addr look for one bound to all interfaces */ \
	memset(&tuple->SK_FIELD.daddr, 0, sizeof(tuple->SK_FIELD.daddr));	\
	cilium_dbg3(ctx, DBG_LOOKUP_CODE,					\
		    tuple->SK_FIELD.SADDR_DBG, tuple->SK_FIELD.DADDR_DBG,	\
		    combine_ports(tuple->SK_FIELD.dport, tuple->SK_FIELD.sport));	\
	result = assign_socket(ctx, tuple, len, nexthdr, false);		\
										\
out:										\
	return result;								\
}
   158  
#ifdef ENABLE_IPV4
/* Instantiate ctx_redirect_to_proxy_ingress4() operating on the 'ipv4'
 * member of 'struct bpf_sock_tuple'; debug events carry the full 32-bit
 * addresses.
 */
CTX_REDIRECT_FN(ctx_redirect_to_proxy_ingress4, struct ipv4_ct_tuple, ipv4,
		DBG_SK_LOOKUP4, daddr, saddr)
#endif
#ifdef ENABLE_IPV6
/* Instantiate ctx_redirect_to_proxy_ingress6() operating on the 'ipv6'
 * member; debug events carry only word 3 of each IPv6 address.
 */
CTX_REDIRECT_FN(ctx_redirect_to_proxy_ingress6, struct ipv6_ct_tuple, ipv6,
		DBG_SK_LOOKUP6, daddr[3], saddr[3])
#endif
#undef CTX_REDIRECT_FN
#endif /* ENABLE_TPROXY */
   169  
   170  /**
   171   * __ctx_redirect_to_proxy configures the ctx with the proxy mark and proxy
   172   * port number to ensure that the stack redirects the packet into the proxy.
   173   *
   174   * It is called from both ingress and egress side of endpoint devices.
   175   *
   176   * In regular veth mode:
   177   * * To apply egress policy, the egressing endpoint configures the mark,
   178   *   which returns CTX_ACT_OK to pass the packet to the stack in the context
   179   *   of the source device (stack ingress).
   180   * * To apply ingress policy, the egressing endpoint or netdev program tail
   181   *   calls into the policy program which configures the mark here, which
   182   *   returns CTX_ACT_OK to pass the packet to the stack in the context of the
   183   *   source device (netdev or egress endpoint device, stack ingress).
   184   *
   185   * In chaining mode with bridged endpoint devices:
   186   * * To apply egress policy, the egressing endpoint configures the mark,
   187   *   which is propagated via ctx_store_meta() in the caller. The redirect() call
   188   *   here redirects the packet to the ingress TC filter configured on the bridge
   189   *   master device.
   190   * * To apply ingress policy, the stack transmits the packet into the bridge
   191   *   master device which tail calls into the policy program for the ingress
   192   *   endpoint, which configures mark and cb[] as described for the egress path.
   193   *   The redirect() call here redirects the packet to the ingress TC filter
   194   *   configured on the bridge master device.
   195   * * In both cases for bridged endpoint devices, the bridge master device has
   196   *   a BPF program configured upon ingress to transfer the cb[] to the mark
   197   *   before passing the traffic up to the stack towards the proxy.
   198   */
   199  static __always_inline int
   200  __ctx_redirect_to_proxy(struct __ctx_buff *ctx, void *tuple __maybe_unused,
   201  			__be16 proxy_port, bool from_host __maybe_unused,
   202  			bool ipv4 __maybe_unused)
   203  {
   204  	int result __maybe_unused = CTX_ACT_OK;
   205  
   206  #ifdef ENABLE_TPROXY
   207  	if (!from_host)
   208  		ctx->mark |= MARK_MAGIC_TO_PROXY;
   209  	else
   210  #endif
   211  		ctx->mark = MARK_MAGIC_TO_PROXY | proxy_port << 16;
   212  
   213  	cilium_dbg_capture(ctx, DBG_CAPTURE_PROXY_PRE, proxy_port);
   214  
   215  #ifdef ENABLE_TPROXY
   216  	if (proxy_port && !from_host) {
   217  #ifdef ENABLE_IPV4
   218  		if (ipv4) {
   219  			__be32 ipv4_localhost = bpf_htonl(INADDR_LOOPBACK);
   220  
   221  			result =
   222  			ctx_redirect_to_proxy_ingress4(ctx, tuple, proxy_port, &ipv4_localhost);
   223  		}
   224  #endif /* ENABLE_IPV4 */
   225  #ifdef ENABLE_IPV6
   226  		if (!ipv4) {
   227  			union v6addr ipv6_localhost = { .addr[15] = 1,};
   228  
   229  			result =
   230  			ctx_redirect_to_proxy_ingress6(ctx, tuple, proxy_port, &ipv6_localhost);
   231  		}
   232  #endif /* ENABLE_IPV6 */
   233  	}
   234  #endif /* ENABLE_TPROXY */
   235  	ctx_change_type(ctx, PACKET_HOST); /* Required for ingress packets from overlay */
   236  	return result;
   237  }
   238  
#ifdef ENABLE_IPV4
/**
 * ctx_redirect_to_proxy4 - IPv4 entry point for __ctx_redirect_to_proxy().
 * @ctx		pointer to program context
 * @tuple	pointer to scratch buffer with the packet tuple
 * @proxy_port	port to redirect traffic towards
 * @from_host	passed through to select mark/socket-assignment behavior
 */
static __always_inline int
ctx_redirect_to_proxy4(struct __ctx_buff *ctx, void *tuple __maybe_unused,
		       __be16 proxy_port, bool from_host __maybe_unused)
{
	return __ctx_redirect_to_proxy(ctx, tuple, proxy_port, from_host, true);
}
#endif /* ENABLE_IPV4 */
   247  
#ifdef ENABLE_IPV6
/**
 * ctx_redirect_to_proxy6 - IPv6 entry point for __ctx_redirect_to_proxy().
 * @ctx		pointer to program context
 * @tuple	pointer to scratch buffer with the packet tuple
 * @proxy_port	port to redirect traffic towards
 * @from_host	passed through to select mark/socket-assignment behavior
 */
static __always_inline int
ctx_redirect_to_proxy6(struct __ctx_buff *ctx, void *tuple __maybe_unused,
		       __be16 proxy_port, bool from_host __maybe_unused)
{
	return __ctx_redirect_to_proxy(ctx, tuple, proxy_port, from_host, false);
}
#endif /* ENABLE_IPV6 */
   256  
#ifdef ENABLE_TPROXY
#define IP_TUPLE_EXTRACT_FN(NAME, PREFIX)				\
/**									\
 * extract_tuple4 / extract_tuple6					\
 *									\
 * Extracts the packet 5-tuple into 'tuple'.				\
 *									\
 * Returns CTX_ACT_OK on success, otherwise propagates the error	\
 * from the underlying tuple-extraction helper.				\
 *									\
 * Note that it doesn't fully initialize 'tuple' as the directionality	\
 * bit is unused in the proxy paths.					\
 */									\
static __always_inline int						\
NAME(struct __ctx_buff *ctx, struct PREFIX ## _ct_tuple *tuple)		\
{									\
	int err;							\
									\
	err = PREFIX ## _extract_tuple(ctx, tuple);			\
	if (err != CTX_ACT_OK)						\
		return err;						\
									\
	__ ## PREFIX ## _ct_tuple_reverse(tuple);			\
									\
	return CTX_ACT_OK;						\
}

#ifdef ENABLE_IPV4
/* Instantiates extract_tuple4() for IPv4 packets. */
IP_TUPLE_EXTRACT_FN(extract_tuple4, ipv4)
#endif /* ENABLE_IPV4 */
#ifdef ENABLE_IPV6
/* Instantiates extract_tuple6() for IPv6 packets. */
IP_TUPLE_EXTRACT_FN(extract_tuple6, ipv6)
#endif /* ENABLE_IPV6 */
#endif /* ENABLE_TPROXY */
   288  
   289  /**
   290   * ctx_redirect_to_proxy_first() applies changes to the context to forward
   291   * the packet towards the proxy. It is designed to run as the first function
   292   * that accesses the context from the current BPF program.
   293   */
   294  static __always_inline int
   295  ctx_redirect_to_proxy_first(struct __ctx_buff *ctx, __be16 proxy_port)
   296  {
   297  	int ret = CTX_ACT_OK;
   298  #if defined(ENABLE_TPROXY)
   299  	__u16 proto;
   300  #ifdef ENABLE_IPV4
   301  	__be32 ipv4_localhost = bpf_htonl(INADDR_LOOPBACK);
   302  #endif
   303  #ifdef ENABLE_IPV6
   304  	union v6addr ipv6_localhost = { .addr[15] = 1,};
   305  #endif
   306  
   307  	/**
   308  	 * For reply traffic to egress proxy for a local endpoint, we skip the
   309  	 * policy & proxy_port lookup and just hairpin & rely on local stack
   310  	 * routing via ctx->mark to ensure that the return traffic reaches the
   311  	 * proxy. This is only relevant for endpoint-routes mode but we don't
   312  	 * have a macro for this so the logic applies unconditionally here.
   313  	 * See ct_state.proxy_redirect usage in bpf_lxc.c for more info.
   314  	 */
   315  	if (!proxy_port)
   316  		goto mark;
   317  
   318  	if (!validate_ethertype(ctx, &proto))
   319  		return DROP_UNSUPPORTED_L2;
   320  
   321  	ret = DROP_UNKNOWN_L3;
   322  	switch (proto) {
   323  #ifdef ENABLE_IPV6
   324  	case bpf_htons(ETH_P_IPV6): {
   325  		struct ipv6_ct_tuple tuple;
   326  
   327  		ret = extract_tuple6(ctx, &tuple);
   328  		if (ret < 0)
   329  			return ret;
   330  		ret = ctx_redirect_to_proxy_ingress6(ctx, &tuple, proxy_port, &ipv6_localhost);
   331  		break;
   332  	}
   333  #endif /* ENABLE_IPV6 */
   334  #ifdef ENABLE_IPV4
   335  	case bpf_htons(ETH_P_IP): {
   336  		struct ipv4_ct_tuple tuple;
   337  
   338  		ret = extract_tuple4(ctx, &tuple);
   339  		if (ret < 0)
   340  			return ret;
   341  
   342  		ret = ctx_redirect_to_proxy_ingress4(ctx, &tuple, proxy_port, &ipv4_localhost);
   343  		break;
   344  	}
   345  #endif /* ENABLE_IPV4 */
   346  	default:
   347  		goto out;
   348  	}
   349  #endif /* ENABLE_TPROXY */
   350  
   351  mark: __maybe_unused;
   352  	cilium_dbg_capture(ctx, DBG_CAPTURE_PROXY_POST, proxy_port);
   353  	ctx->mark = MARK_MAGIC_TO_PROXY | (proxy_port << 16);
   354  	ctx_change_type(ctx, PACKET_HOST);
   355  
   356  out: __maybe_unused;
   357  	return ret;
   358  }
   359  
   360  /**
   361   * tc_index_from_ingress_proxy - returns true if packet originates from ingress proxy
   362   */
   363  static __always_inline bool tc_index_from_ingress_proxy(struct __ctx_buff *ctx)
   364  {
   365  	volatile __u32 tc_index = ctx->tc_index;
   366  #ifdef DEBUG
   367  	if (tc_index & TC_INDEX_F_FROM_INGRESS_PROXY)
   368  		cilium_dbg(ctx, DBG_SKIP_PROXY, tc_index, 0);
   369  #endif
   370  
   371  	return tc_index & TC_INDEX_F_FROM_INGRESS_PROXY;
   372  }
   373  
   374  /**
   375   * tc_index_from_egress_proxy - returns true if packet originates from egress proxy
   376   */
   377  static __always_inline bool tc_index_from_egress_proxy(struct __ctx_buff *ctx)
   378  {
   379  	volatile __u32 tc_index = ctx->tc_index;
   380  #ifdef DEBUG
   381  	if (tc_index & TC_INDEX_F_FROM_EGRESS_PROXY)
   382  		cilium_dbg(ctx, DBG_SKIP_PROXY, tc_index, 0);
   383  #endif
   384  
   385  	return tc_index & TC_INDEX_F_FROM_EGRESS_PROXY;
   386  }