github.com/cilium/cilium@v1.16.2/bpf/lib/proxy.h

/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright Authors of Cilium */

#pragma once

#include "conntrack.h"

#if !(__ctx_is == __ctx_skb)
#error "Proxy redirection is only supported from skb context"
#endif

#ifdef ENABLE_TPROXY
static __always_inline int
assign_socket_tcp(struct __ctx_buff *ctx,
		  struct bpf_sock_tuple *tuple, __u32 len, bool established)
{
	int result = DROP_PROXY_LOOKUP_FAILED;
	struct bpf_sock *sk;
	__u32 dbg_ctx;

	sk = skc_lookup_tcp(ctx, tuple, len, BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		goto out;

	/* When an established connection is expected, don't assign sockets
	 * in TIME_WAIT or LISTEN state; release them instead.
	 */
	if (established && sk->state == BPF_TCP_TIME_WAIT)
		goto release;
	if (established && sk->state == BPF_TCP_LISTEN)
		goto release;

	dbg_ctx = READ_ONCE(sk)->family << 16 | ctx->protocol;
	result = sk_assign(ctx, sk, 0);
	cilium_dbg(ctx, DBG_SK_ASSIGN, -result, dbg_ctx);
	if (result == 0)
		result = CTX_ACT_OK;
	else
		result = DROP_PROXY_SET_FAILED;
release:
	sk_release(sk);
out:
	return result;
}

static __always_inline int
assign_socket_udp(struct __ctx_buff *ctx,
		  struct bpf_sock_tuple *tuple, __u32 len,
		  bool established __maybe_unused)
{
	int result = DROP_PROXY_LOOKUP_FAILED;
	struct bpf_sock *sk;
	__u32 dbg_ctx;

	sk = sk_lookup_udp(ctx, tuple, len, BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		goto out;

	dbg_ctx = READ_ONCE(sk)->family << 16 | ctx->protocol;
	result = sk_assign(ctx, sk, 0);
	cilium_dbg(ctx, DBG_SK_ASSIGN, -result, dbg_ctx);
	if (result == 0)
		result = CTX_ACT_OK;
	else
		result = DROP_PROXY_SET_FAILED;
	sk_release(sk);
out:
	return result;
}

static __always_inline int
assign_socket(struct __ctx_buff *ctx,
	      struct bpf_sock_tuple *tuple, __u32 len,
	      __u8 nexthdr, bool established)
{
	/* Workaround: While the functions above are nearly identical in
	 * their C implementation, 'struct bpf_sock *' has a different
	 * verifier pointer type in each of them, which means we can't fold
	 * the two implementations together.
	 */
	switch (nexthdr) {
	case IPPROTO_TCP:
		return assign_socket_tcp(ctx, tuple, len, established);
	case IPPROTO_UDP:
		return assign_socket_udp(ctx, tuple, len, established);
	}
	return DROP_PROXY_UNKNOWN_PROTO;
}

/**
 * combine_ports joins the specified ports in a manner consistent with
 * pkg/monitor/datapath_debug.go to report the ports in monitor messages.
 */
static __always_inline __u32
combine_ports(__u16 dport, __u16 sport)
{
	return (bpf_ntohs(dport) << 16) | bpf_ntohs(sport);
}
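
/* Worked example (illustrative, not part of the original source): for a
 * packet with network-order dport = bpf_htons(443) and sport =
 * bpf_htons(34567), the helper packs both into one host-order word:
 *
 *	combine_ports(bpf_htons(443), bpf_htons(34567))
 *	    == (443 << 16) | 34567 == 0x01bb8707
 *
 * i.e. the destination port lands in the upper 16 bits and the source port
 * in the lower 16 bits of the monitor message argument.
 */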

#define CTX_REDIRECT_FN(NAME, CT_TUPLE_TYPE, SK_FIELD, \
			DBG_LOOKUP_CODE, DADDR_DBG, SADDR_DBG) \
/** \
 * ctx_redirect_to_proxy_ingress4 / ctx_redirect_to_proxy_ingress6 \
 * @ctx		pointer to program context \
 * @ct_tuple	pointer to *scratch buffer* with packet tuple \
 * @proxy_port	port to redirect traffic towards \
 * @tproxy_addr	address to look the tproxy socket up on \
 * \
 * Prefetch the proxy socket and associate it with the ctx. Must be run on \
 * tc ingress. Will modify 'ct_tuple'! \
 */ \
static __always_inline int \
NAME(struct __ctx_buff *ctx, const CT_TUPLE_TYPE *ct_tuple, \
     __be16 proxy_port, void *tproxy_addr) \
{ \
	struct bpf_sock_tuple *tuple = (struct bpf_sock_tuple *)ct_tuple; \
	__u8 nexthdr = ct_tuple->nexthdr; \
	__u32 len = sizeof(tuple->SK_FIELD); \
	__u16 port; \
	int result; \
\
	/* The provided 'ct_tuple' is in the internal Cilium format, which \
	 * reverses the source/destination ports as compared with the actual \
	 * packet contents. 'bpf_sock_tuple' in the eBPF API needs these to \
	 * match normal packet ordering to successfully look up the \
	 * corresponding socket. So, swap them here. \
	 */ \
	port = tuple->SK_FIELD.sport; \
	tuple->SK_FIELD.sport = tuple->SK_FIELD.dport; \
	tuple->SK_FIELD.dport = port; \
\
	/* Look for an established socket locally first. */ \
	cilium_dbg3(ctx, DBG_LOOKUP_CODE, \
		    tuple->SK_FIELD.SADDR_DBG, tuple->SK_FIELD.DADDR_DBG, \
		    combine_ports(tuple->SK_FIELD.dport, tuple->SK_FIELD.sport)); \
	result = assign_socket(ctx, tuple, len, nexthdr, true); \
	if (result == CTX_ACT_OK) \
		goto out; \
\
	/* If there's no established connection, locate the tproxy socket \
	 * on the tproxy_addr IP. \
	 */ \
	tuple->SK_FIELD.dport = proxy_port; \
	tuple->SK_FIELD.sport = 0; \
	memcpy(&tuple->SK_FIELD.daddr, tproxy_addr, sizeof(tuple->SK_FIELD.daddr)); \
	memset(&tuple->SK_FIELD.saddr, 0, sizeof(tuple->SK_FIELD.saddr)); \
	cilium_dbg3(ctx, DBG_LOOKUP_CODE, \
		    tuple->SK_FIELD.SADDR_DBG, tuple->SK_FIELD.DADDR_DBG, \
		    combine_ports(tuple->SK_FIELD.dport, tuple->SK_FIELD.sport)); \
	result = assign_socket(ctx, tuple, len, nexthdr, false); \
	if (result == CTX_ACT_OK) \
		goto out; \
\
	/* If there's no tproxy socket on tproxy_addr, look for one bound \
	 * to all interfaces. \
	 */ \
	memset(&tuple->SK_FIELD.daddr, 0, sizeof(tuple->SK_FIELD.daddr)); \
	cilium_dbg3(ctx, DBG_LOOKUP_CODE, \
		    tuple->SK_FIELD.SADDR_DBG, tuple->SK_FIELD.DADDR_DBG, \
		    combine_ports(tuple->SK_FIELD.dport, tuple->SK_FIELD.sport)); \
	result = assign_socket(ctx, tuple, len, nexthdr, false); \
\
out: \
	return result; \
}

#ifdef ENABLE_IPV4
CTX_REDIRECT_FN(ctx_redirect_to_proxy_ingress4, struct ipv4_ct_tuple, ipv4,
		DBG_SK_LOOKUP4, daddr, saddr)
#endif
#ifdef ENABLE_IPV6
CTX_REDIRECT_FN(ctx_redirect_to_proxy_ingress6, struct ipv6_ct_tuple, ipv6,
		DBG_SK_LOOKUP6, daddr[3], saddr[3])
#endif
#undef CTX_REDIRECT_FN
#endif /* ENABLE_TPROXY */
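
/* Usage sketch (hypothetical caller, not from this file): with a populated
 * Cilium-ordered tuple, the generated IPv4 variant tries three lookups in
 * order: an established socket matching the packet 4-tuple, a tproxy
 * socket bound to (tproxy_addr, proxy_port), and finally a tproxy socket
 * bound to the wildcard address:
 *
 *	struct ipv4_ct_tuple tuple = ...;  // scratch copy; it is clobbered
 *	__be32 lb = bpf_htonl(INADDR_LOOPBACK);
 *	int ret = ctx_redirect_to_proxy_ingress4(ctx, &tuple, proxy_port, &lb);
 *	if (ret != CTX_ACT_OK)
 *		// no socket could be assigned; 'ret' carries a DROP_* reason
 */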

/**
 * __ctx_redirect_to_proxy configures the ctx with the proxy mark and proxy
 * port number to ensure that the stack redirects the packet into the proxy.
 *
 * It is called from both the ingress and the egress side of endpoint
 * devices.
 *
 * In regular veth mode:
 * * To apply egress policy, the egressing endpoint configures the mark,
 *   which returns CTX_ACT_OK to pass the packet to the stack in the context
 *   of the source device (stack ingress).
 * * To apply ingress policy, the egressing endpoint or netdev program tail
 *   calls into the policy program which configures the mark here, which
 *   returns CTX_ACT_OK to pass the packet to the stack in the context of
 *   the source device (netdev or egress endpoint device, stack ingress).
 *
 * In chaining mode with bridged endpoint devices:
 * * To apply egress policy, the egressing endpoint configures the mark,
 *   which is propagated via ctx_store_meta() in the caller. The redirect()
 *   call here redirects the packet to the ingress TC filter configured on
 *   the bridge master device.
 * * To apply ingress policy, the stack transmits the packet into the bridge
 *   master device which tail calls into the policy program for the ingress
 *   endpoint, which configures mark and cb[] as described for the egress
 *   path. The redirect() call here redirects the packet to the ingress TC
 *   filter configured on the bridge master device.
 * * In both cases for bridged endpoint devices, the bridge master device
 *   has a BPF program configured upon ingress to transfer the cb[] to the
 *   mark before passing the traffic up to the stack towards the proxy.
 */
static __always_inline int
__ctx_redirect_to_proxy(struct __ctx_buff *ctx, void *tuple __maybe_unused,
			__be16 proxy_port, bool from_host __maybe_unused,
			bool ipv4 __maybe_unused)
{
	int result __maybe_unused = CTX_ACT_OK;

#ifdef ENABLE_TPROXY
	if (!from_host)
		ctx->mark |= MARK_MAGIC_TO_PROXY;
	else
#endif
		ctx->mark = MARK_MAGIC_TO_PROXY | proxy_port << 16;

	cilium_dbg_capture(ctx, DBG_CAPTURE_PROXY_PRE, proxy_port);

#ifdef ENABLE_TPROXY
	if (proxy_port && !from_host) {
#ifdef ENABLE_IPV4
		if (ipv4) {
			__be32 ipv4_localhost = bpf_htonl(INADDR_LOOPBACK);

			result =
			    ctx_redirect_to_proxy_ingress4(ctx, tuple, proxy_port, &ipv4_localhost);
		}
#endif /* ENABLE_IPV4 */
#ifdef ENABLE_IPV6
		if (!ipv4) {
			union v6addr ipv6_localhost = { .addr[15] = 1, };

			result =
			    ctx_redirect_to_proxy_ingress6(ctx, tuple, proxy_port, &ipv6_localhost);
		}
#endif /* ENABLE_IPV6 */
	}
#endif /* ENABLE_TPROXY */
	ctx_change_type(ctx, PACKET_HOST); /* Required for ingress packets from overlay */
	return result;
}

#ifdef ENABLE_IPV4
static __always_inline int
ctx_redirect_to_proxy4(struct __ctx_buff *ctx, void *tuple __maybe_unused,
		       __be16 proxy_port, bool from_host __maybe_unused)
{
	return __ctx_redirect_to_proxy(ctx, tuple, proxy_port, from_host, true);
}
#endif /* ENABLE_IPV4 */

#ifdef ENABLE_IPV6
static __always_inline int
ctx_redirect_to_proxy6(struct __ctx_buff *ctx, void *tuple __maybe_unused,
		       __be16 proxy_port, bool from_host __maybe_unused)
{
	return __ctx_redirect_to_proxy(ctx, tuple, proxy_port, from_host, false);
}
#endif /* ENABLE_IPV6 */
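
/* Mark encoding sketch (derived from the code above, illustrative): for
 * traffic from the host, or when ENABLE_TPROXY is not compiled in, the
 * network-order proxy port is carried in the upper 16 bits of the mark so
 * that the stack can steer the packet into the proxy:
 *
 *	ctx->mark = MARK_MAGIC_TO_PROXY | proxy_port << 16;
 *
 * With ENABLE_TPROXY and !from_host, only MARK_MAGIC_TO_PROXY is OR-ed in
 * and the target socket is instead pre-assigned via sk_assign() inside
 * ctx_redirect_to_proxy_ingress4/6.
 */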

#ifdef ENABLE_TPROXY
#define IP_TUPLE_EXTRACT_FN(NAME, PREFIX) \
/** \
 * extract_tuple4 / extract_tuple6 \
 * \
 * Extracts the packet 5-tuple into 'tuple'. \
 * \
 * Note that it doesn't fully initialize 'tuple' as the directionality \
 * bit is unused in the proxy paths. \
 */ \
static __always_inline int \
NAME(struct __ctx_buff *ctx, struct PREFIX ## _ct_tuple *tuple) \
{ \
	int err; \
\
	err = PREFIX ## _extract_tuple(ctx, tuple); \
	if (err != CTX_ACT_OK) \
		return err; \
\
	__ ## PREFIX ## _ct_tuple_reverse(tuple); \
\
	return CTX_ACT_OK; \
}

#ifdef ENABLE_IPV4
IP_TUPLE_EXTRACT_FN(extract_tuple4, ipv4)
#endif /* ENABLE_IPV4 */
#ifdef ENABLE_IPV6
IP_TUPLE_EXTRACT_FN(extract_tuple6, ipv6)
#endif /* ENABLE_IPV6 */
#endif /* ENABLE_TPROXY */

/**
 * ctx_redirect_to_proxy_first() applies changes to the context to forward
 * the packet towards the proxy. It is designed to run as the first function
 * that accesses the context from the current BPF program.
 */
static __always_inline int
ctx_redirect_to_proxy_first(struct __ctx_buff *ctx, __be16 proxy_port)
{
	int ret = CTX_ACT_OK;
#if defined(ENABLE_TPROXY)
	__u16 proto;
#ifdef ENABLE_IPV4
	__be32 ipv4_localhost = bpf_htonl(INADDR_LOOPBACK);
#endif
#ifdef ENABLE_IPV6
	union v6addr ipv6_localhost = { .addr[15] = 1, };
#endif

	/* For reply traffic to the egress proxy for a local endpoint, we
	 * skip the policy & proxy_port lookup and just hairpin & rely on
	 * local stack routing via ctx->mark to ensure that the return
	 * traffic reaches the proxy. This is only relevant for
	 * endpoint-routes mode, but we don't have a macro for this, so the
	 * logic applies unconditionally here. See ct_state.proxy_redirect
	 * usage in bpf_lxc.c for more info.
	 */
	if (!proxy_port)
		goto mark;

	if (!validate_ethertype(ctx, &proto))
		return DROP_UNSUPPORTED_L2;

	ret = DROP_UNKNOWN_L3;
	switch (proto) {
#ifdef ENABLE_IPV6
	case bpf_htons(ETH_P_IPV6): {
		struct ipv6_ct_tuple tuple;

		ret = extract_tuple6(ctx, &tuple);
		if (ret < 0)
			return ret;
		ret = ctx_redirect_to_proxy_ingress6(ctx, &tuple, proxy_port, &ipv6_localhost);
		break;
	}
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
	case bpf_htons(ETH_P_IP): {
		struct ipv4_ct_tuple tuple;

		ret = extract_tuple4(ctx, &tuple);
		if (ret < 0)
			return ret;

		ret = ctx_redirect_to_proxy_ingress4(ctx, &tuple, proxy_port, &ipv4_localhost);
		break;
	}
#endif /* ENABLE_IPV4 */
	default:
		goto out;
	}
#endif /* ENABLE_TPROXY */

mark: __maybe_unused;
	cilium_dbg_capture(ctx, DBG_CAPTURE_PROXY_POST, proxy_port);
	ctx->mark = MARK_MAGIC_TO_PROXY | (proxy_port << 16);
	ctx_change_type(ctx, PACKET_HOST);

out: __maybe_unused;
	return ret;
}

/**
 * tc_index_from_ingress_proxy - returns true if packet originates from ingress proxy
 */
static __always_inline bool tc_index_from_ingress_proxy(struct __ctx_buff *ctx)
{
	volatile __u32 tc_index = ctx->tc_index;
#ifdef DEBUG
	if (tc_index & TC_INDEX_F_FROM_INGRESS_PROXY)
		cilium_dbg(ctx, DBG_SKIP_PROXY, tc_index, 0);
#endif

	return tc_index & TC_INDEX_F_FROM_INGRESS_PROXY;
}

/**
 * tc_index_from_egress_proxy - returns true if packet originates from egress proxy
 */
static __always_inline bool tc_index_from_egress_proxy(struct __ctx_buff *ctx)
{
	volatile __u32 tc_index = ctx->tc_index;
#ifdef DEBUG
	if (tc_index & TC_INDEX_F_FROM_EGRESS_PROXY)
		cilium_dbg(ctx, DBG_SKIP_PROXY, tc_index, 0);
#endif

	return tc_index & TC_INDEX_F_FROM_EGRESS_PROXY;
}
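
/* Usage sketch (hypothetical caller, not from this file): datapath
 * programs can consult these helpers early to special-case traffic that
 * the proxy has already processed, e.g.:
 *
 *	if (tc_index_from_ingress_proxy(ctx)) {
 *		// packet was re-injected by the ingress proxy;
 *		// e.g. avoid re-applying ingress policy to it
 *	}
 */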