github.com/cilium/cilium@v1.16.2/bpf/lib/nodeport.h (about) 1 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ 2 /* Copyright Authors of Cilium */ 3 4 #pragma once 5 6 #include <bpf/ctx/ctx.h> 7 #include <bpf/api.h> 8 9 #include "bpf/compiler.h" 10 #include "tailcall.h" 11 #include "nat.h" 12 #include "edt.h" 13 #include "lb.h" 14 #include "common.h" 15 #include "overloadable.h" 16 #include "egress_gateway.h" 17 #include "eps.h" 18 #include "conntrack.h" 19 #include "csum.h" 20 #include "encap.h" 21 #include "identity.h" 22 #include "trace.h" 23 #include "ghash.h" 24 #include "pcap.h" 25 #include "host_firewall.h" 26 #include "stubs.h" 27 #include "proxy_hairpin.h" 28 #include "fib.h" 29 30 #define nodeport_nat_egress_ipv4_hook(ctx, ip4, info, tuple, l4_off, ext_err) CTX_ACT_OK 31 #define nodeport_rev_dnat_ingress_ipv4_hook(ctx, ip4, tuple, tunnel_endpoint, src_sec_identity, \ 32 dst_sec_identity) -1 33 34 #ifdef ENABLE_NODEPORT 35 /* The IPv6 extension should be 8-bytes aligned */ 36 struct dsr_opt_v6 { 37 struct ipv6_opt_hdr hdr; 38 __u8 opt_type; 39 __u8 opt_len; 40 union v6addr addr; 41 __be16 port; 42 __u16 pad; 43 }; 44 45 struct dsr_opt_v4 { 46 __u8 type; 47 __u8 len; 48 __u16 port; 49 __u32 addr; 50 }; 51 52 static __always_inline bool nodeport_uses_dsr(__u8 nexthdr __maybe_unused) 53 { 54 # if defined(ENABLE_DSR) && !defined(ENABLE_DSR_HYBRID) 55 return true; 56 # elif defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID) 57 if (nexthdr == IPPROTO_TCP) 58 return true; 59 return false; 60 # else 61 return false; 62 # endif 63 } 64 65 #ifdef HAVE_ENCAP 66 static __always_inline int 67 nodeport_add_tunnel_encap(struct __ctx_buff *ctx, __u32 src_ip, __be16 src_port, 68 __be32 dst_ip, __u32 src_sec_identity, __u32 dst_sec_identity, 69 enum trace_reason ct_reason, __u32 monitor, int *ifindex) 70 { 71 /* Let kernel choose the outer source ip */ 72 if (ctx_is_skb()) 73 src_ip = 0; 74 75 return __encap_with_nodeid(ctx, src_ip, src_port, dst_ip, 76 src_sec_identity, dst_sec_identity, NOT_VTEP_DST, 77 ct_reason, monitor, ifindex); 78 } 79 80 # if defined(ENABLE_DSR) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 81 static __always_inline int 82 nodeport_add_tunnel_encap_opt(struct __ctx_buff *ctx, __u32 src_ip, __be16 src_port, 83 __be32 dst_ip, __u32 src_sec_identity, __u32 dst_sec_identity, 84 void *opt, __u32 opt_len, enum trace_reason ct_reason, 85 __u32 monitor, int *ifindex) 86 { 87 /* Let kernel choose the outer source ip */ 88 if (ctx_is_skb()) 89 src_ip = 0; 90 91 return __encap_with_nodeid_opt(ctx, src_ip, src_port, dst_ip, 92 src_sec_identity, dst_sec_identity, NOT_VTEP_DST, 93 opt, opt_len, ct_reason, monitor, ifindex); 94 } 95 # endif 96 #endif /* HAVE_ENCAP */ 97 98 static __always_inline bool dsr_fail_needs_reply(int code __maybe_unused) 99 { 100 #ifdef ENABLE_DSR_ICMP_ERRORS 101 if (code == DROP_FRAG_NEEDED) 102 return true; 103 #endif 104 return false; 105 } 106 107 static __always_inline bool dsr_is_too_big(struct __ctx_buff *ctx __maybe_unused, 108 __u16 expanded_len __maybe_unused) 109 { 110 #ifdef ENABLE_DSR_ICMP_ERRORS 111 if (expanded_len > THIS_MTU) 112 return true; 113 #endif 114 return false; 115 } 116 117 static __always_inline int 118 nodeport_fib_lookup_and_redirect(struct __ctx_buff *ctx, 119 struct bpf_fib_lookup_padded *fib_params, 120 __s8 *ext_err) 121 { 122 int oif = NATIVE_DEV_IFINDEX; 123 int ret; 124 125 ret = fib_lookup(ctx, &fib_params->l, sizeof(fib_params->l), 0); 126 *ext_err = (__s8)ret; 127 128 switch (ret) { 129 case BPF_FIB_LKUP_RET_SUCCESS: 130 
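	/* Grouped with BPF_FIB_LKUP_RET_SUCCESS above: NO_NEIGH presumably only
	 * means the route resolved but the L2 neighbour entry is missing, which
	 * fib_do_redirect() below is expected to recover from (e.g. via the
	 * neighbour map or redirect_neigh()), so it is not treated as a lookup
	 * failure here.
	 */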
case BPF_FIB_LKUP_RET_NO_NEIGH: 131 if ((__u32)oif == fib_params->l.ifindex) 132 return CTX_ACT_OK; 133 134 return fib_do_redirect(ctx, true, fib_params, true, ext_err, &oif); 135 default: 136 return DROP_NO_FIB; 137 } 138 } 139 140 #ifdef ENABLE_IPV6 141 static __always_inline bool nodeport_uses_dsr6(const struct ipv6_ct_tuple *tuple) 142 { 143 return nodeport_uses_dsr(tuple->nexthdr); 144 } 145 146 static __always_inline bool 147 nodeport_has_nat_conflict_ipv6(const struct ipv6hdr *ip6 __maybe_unused, 148 struct ipv6_nat_target *target __maybe_unused) 149 { 150 #if defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY) 151 union v6addr router_ip; 152 153 BPF_V6(router_ip, ROUTER_IP); 154 if (ipv6_addr_equals((union v6addr *)&ip6->saddr, &router_ip)) { 155 ipv6_addr_copy(&target->addr, &router_ip); 156 target->needs_ct = true; 157 158 return true; 159 } 160 #endif /* TUNNEL_MODE && IS_BPF_OVERLAY */ 161 162 #if defined(IS_BPF_HOST) 163 const union v6addr dr_addr = IPV6_DIRECT_ROUTING; 164 __u32 dr_ifindex = DIRECT_ROUTING_DEV_IFINDEX; 165 166 /* See comment in nodeport_has_nat_conflict_ipv4(). */ 167 if (dr_ifindex == NATIVE_DEV_IFINDEX && 168 ipv6_addr_equals((union v6addr *)&ip6->saddr, &dr_addr)) { 169 ipv6_addr_copy(&target->addr, &dr_addr); 170 target->needs_ct = true; 171 172 return true; 173 } 174 #endif /* IS_BPF_HOST */ 175 176 return false; 177 } 178 179 static __always_inline int nodeport_snat_fwd_ipv6(struct __ctx_buff *ctx, 180 union v6addr *saddr, 181 struct trace_ctx *trace, 182 __s8 *ext_err) 183 { 184 struct ipv6_nat_target target = { 185 .min_port = NODEPORT_PORT_MIN_NAT, 186 .max_port = NODEPORT_PORT_MAX_NAT, 187 }; 188 struct ipv6_ct_tuple tuple = {}; 189 int hdrlen, l4_off, ret; 190 void *data, *data_end; 191 struct ipv6hdr *ip6; 192 193 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 194 return DROP_INVALID; 195 196 tuple.nexthdr = ip6->nexthdr; 197 hdrlen = ipv6_hdrlen(ctx, &tuple.nexthdr); 198 if (hdrlen < 0) 199 return hdrlen; 200 201 snat_v6_init_tuple(ip6, NAT_DIR_EGRESS, &tuple); 202 l4_off = ETH_HLEN + hdrlen; 203 204 if (lb_is_svc_proto(tuple.nexthdr) && 205 !nodeport_uses_dsr6(&tuple) && 206 nodeport_has_nat_conflict_ipv6(ip6, &target)) 207 goto apply_snat; 208 209 ret = snat_v6_needs_masquerade(ctx, &tuple, l4_off, &target); 210 if (IS_ERR(ret)) 211 goto out; 212 213 apply_snat: 214 ipv6_addr_copy(saddr, &tuple.saddr); 215 ret = snat_v6_nat(ctx, &tuple, l4_off, &target, trace, ext_err); 216 if (IS_ERR(ret)) 217 goto out; 218 219 /* See the equivalent v4 path for comment */ 220 if (is_defined(IS_BPF_HOST)) 221 ctx_snat_done_set(ctx); 222 223 out: 224 if (ret == NAT_PUNT_TO_STACK) 225 ret = CTX_ACT_OK; 226 227 return ret; 228 } 229 230 #ifdef ENABLE_DSR 231 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 232 static __always_inline void rss_gen_src6(union v6addr *src, 233 const union v6addr *client, 234 __be32 l4_hint) 235 { 236 __u32 bits = 128 - IPV6_RSS_PREFIX_BITS; 237 238 *src = (union v6addr)IPV6_RSS_PREFIX; 239 if (bits) { 240 __u32 todo; 241 242 if (bits > 96) { 243 todo = bits - 96; 244 src->p1 |= bpf_htonl(hash_32(client->p1 ^ l4_hint, todo)); 245 bits -= todo; 246 } 247 if (bits > 64) { 248 todo = bits - 64; 249 src->p2 |= bpf_htonl(hash_32(client->p2 ^ l4_hint, todo)); 250 bits -= todo; 251 } 252 if (bits > 32) { 253 todo = bits - 32; 254 src->p3 |= bpf_htonl(hash_32(client->p3 ^ l4_hint, todo)); 255 bits -= todo; 256 } 257 src->p4 |= bpf_htonl(hash_32(client->p4 ^ l4_hint, bits)); 258 } 259 } 260 261 static __always_inline int dsr_set_ipip6(struct __ctx_buff 
*ctx, 262 const struct ipv6hdr *ip6, 263 const union v6addr *backend_addr, 264 __be32 l4_hint, int *ohead) 265 { 266 __u16 payload_len = bpf_ntohs(ip6->payload_len) + sizeof(*ip6); 267 const int l3_off = ETH_HLEN; 268 union v6addr saddr; 269 struct { 270 __be16 payload_len; 271 __u8 nexthdr; 272 __u8 hop_limit; 273 } tp_new = { 274 .payload_len = bpf_htons(payload_len), 275 .nexthdr = IPPROTO_IPV6, 276 .hop_limit = IPDEFTTL, 277 }; 278 279 if (dsr_is_too_big(ctx, payload_len + sizeof(*ip6))) { 280 *ohead = sizeof(*ip6); 281 return DROP_FRAG_NEEDED; 282 } 283 284 rss_gen_src6(&saddr, (union v6addr *)&ip6->saddr, l4_hint); 285 286 if (ctx_adjust_hroom(ctx, sizeof(*ip6), BPF_ADJ_ROOM_NET, 287 ctx_adjust_hroom_flags())) 288 return DROP_INVALID; 289 if (ctx_store_bytes(ctx, l3_off + offsetof(struct ipv6hdr, payload_len), 290 &tp_new.payload_len, 4, 0) < 0) 291 return DROP_WRITE_ERROR; 292 if (ctx_store_bytes(ctx, l3_off + offsetof(struct ipv6hdr, daddr), 293 backend_addr, sizeof(ip6->daddr), 0) < 0) 294 return DROP_WRITE_ERROR; 295 if (ctx_store_bytes(ctx, l3_off + offsetof(struct ipv6hdr, saddr), 296 &saddr, sizeof(ip6->saddr), 0) < 0) 297 return DROP_WRITE_ERROR; 298 return 0; 299 } 300 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 301 static __always_inline int dsr_set_ext6(struct __ctx_buff *ctx, 302 struct ipv6hdr *ip6, 303 const union v6addr *svc_addr, 304 __be16 svc_port, int *ohead) 305 { 306 struct dsr_opt_v6 opt __align_stack_8 = {}; 307 __u16 payload_len = bpf_ntohs(ip6->payload_len) + sizeof(opt); 308 __u16 total_len = bpf_ntohs(ip6->payload_len) + sizeof(struct ipv6hdr) + sizeof(opt); 309 __u8 nexthdr = ip6->nexthdr; 310 int hdrlen; 311 312 /* The IPv6 extension should be 8-bytes aligned */ 313 build_bug_on((sizeof(struct dsr_opt_v6) % 8) != 0); 314 315 hdrlen = ipv6_hdrlen(ctx, &nexthdr); 316 if (hdrlen < 0) 317 return hdrlen; 318 319 /* See dsr_set_opt4(): */ 320 if (nexthdr == IPPROTO_TCP) { 321 union tcp_flags tcp_flags = { .value = 0 }; 322 323 if (l4_load_tcp_flags(ctx, ETH_HLEN + hdrlen, &tcp_flags) < 0) 324 return DROP_CT_INVALID_HDR; 325 326 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 327 return 0; 328 } 329 330 if (dsr_is_too_big(ctx, total_len)) { 331 *ohead = sizeof(opt); 332 return DROP_FRAG_NEEDED; 333 } 334 335 opt.hdr.nexthdr = ip6->nexthdr; 336 ip6->nexthdr = NEXTHDR_DEST; 337 ip6->payload_len = bpf_htons(payload_len); 338 339 opt.hdr.hdrlen = DSR_IPV6_EXT_LEN; 340 opt.opt_type = DSR_IPV6_OPT_TYPE; 341 opt.opt_len = DSR_IPV6_OPT_LEN; 342 ipv6_addr_copy_unaligned(&opt.addr, svc_addr); 343 opt.port = svc_port; 344 345 if (ctx_adjust_hroom(ctx, sizeof(opt), BPF_ADJ_ROOM_NET, 346 ctx_adjust_hroom_flags())) 347 return DROP_INVALID; 348 if (ctx_store_bytes(ctx, ETH_HLEN + sizeof(*ip6), &opt, 349 sizeof(opt), 0) < 0) 350 return DROP_INVALID; 351 return 0; 352 } 353 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 354 static __always_inline int encap_geneve_dsr_opt6(struct __ctx_buff *ctx, 355 struct ipv6hdr *ip6, 356 const union v6addr *svc_addr, 357 __be16 svc_port, 358 int *ifindex, int *ohead) 359 { 360 struct remote_endpoint_info *info; 361 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 362 struct geneve_dsr_opt6 gopt; 363 union v6addr *dst; 364 bool need_opt = true; 365 __u16 encap_len = sizeof(struct ipv6hdr) + sizeof(struct udphdr) + 366 sizeof(struct genevehdr) + ETH_HLEN; 367 __u16 payload_len = bpf_ntohs(ip6->payload_len) + sizeof(*ip6); 368 __u32 dst_sec_identity; 369 __be32 tunnel_endpoint; 370 __u16 total_len = 0; 371 __be16 src_port; 372 int l4_off, ret; 373 374 
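	/* Rough post-encap layout (a sketch; assumes the DSR option is needed,
	 * i.e. the first packet of a connection):
	 *
	 *   [outer IP: this node -> backend node ] } tunnel
	 *   [UDP][GENEVE + DSR opt: svc addr/port] } tunnel
	 *   [Ethernet][IPv6: client -> backend   ] } original packet
	 *
	 * Carrying the service address/port in a Geneve TLV lets the backend
	 * node create the DSR state needed to reverse-translate replies.
	 */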
build_bug_on((sizeof(gopt) % 4) != 0); 375 376 dst = (union v6addr *)&ip6->daddr; 377 info = lookup_ip6_remote_endpoint(dst, 0); 378 if (!info || info->tunnel_endpoint == 0) 379 return DROP_NO_TUNNEL_ENDPOINT; 380 381 tunnel_endpoint = info->tunnel_endpoint; 382 dst_sec_identity = info->sec_identity; 383 384 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 385 if (IS_ERR(ret)) 386 return ret; 387 388 src_port = tunnel_gen_src_port_v6(&tuple); 389 390 /* See encap_geneve_dsr_opt4(): */ 391 if (tuple.nexthdr == IPPROTO_TCP) { 392 union tcp_flags tcp_flags = { .value = 0 }; 393 394 if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0) 395 return DROP_CT_INVALID_HDR; 396 397 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 398 need_opt = false; 399 } 400 401 if (need_opt) { 402 encap_len += sizeof(struct geneve_dsr_opt6); 403 set_geneve_dsr_opt6(svc_port, svc_addr, &gopt); 404 } 405 406 total_len = encap_len + payload_len; 407 408 if (dsr_is_too_big(ctx, total_len)) { 409 *ohead = encap_len; 410 return DROP_FRAG_NEEDED; 411 } 412 413 if (need_opt) 414 return nodeport_add_tunnel_encap_opt(ctx, 415 IPV4_DIRECT_ROUTING, 416 src_port, 417 tunnel_endpoint, 418 WORLD_IPV6_ID, 419 dst_sec_identity, 420 &gopt, 421 sizeof(gopt), 422 (enum trace_reason)CT_NEW, 423 TRACE_PAYLOAD_LEN, 424 ifindex); 425 426 return nodeport_add_tunnel_encap(ctx, 427 IPV4_DIRECT_ROUTING, 428 src_port, 429 tunnel_endpoint, 430 WORLD_IPV6_ID, 431 dst_sec_identity, 432 (enum trace_reason)CT_NEW, 433 TRACE_PAYLOAD_LEN, 434 ifindex); 435 } 436 #endif /* DSR_ENCAP_MODE */ 437 438 static __always_inline int find_dsr_v6(struct __ctx_buff *ctx, __u8 nexthdr, 439 struct dsr_opt_v6 *dsr_opt, bool *found) 440 { 441 struct ipv6_opt_hdr opthdr __align_stack_8; 442 int i, len = sizeof(struct ipv6hdr); 443 __u8 nh = nexthdr; 444 445 #pragma unroll 446 for (i = 0; i < IPV6_MAX_HEADERS; i++) { 447 switch (nh) { 448 case NEXTHDR_NONE: 449 return DROP_INVALID_EXTHDR; 450 451 case NEXTHDR_FRAGMENT: 452 return DROP_FRAG_NOSUPPORT; 453 454 case NEXTHDR_HOP: 455 case NEXTHDR_ROUTING: 456 case NEXTHDR_AUTH: 457 case NEXTHDR_DEST: 458 if (ctx_load_bytes(ctx, ETH_HLEN + len, &opthdr, sizeof(opthdr)) < 0) 459 return DROP_INVALID; 460 461 if (nh == NEXTHDR_DEST && opthdr.hdrlen == DSR_IPV6_EXT_LEN) { 462 if (ctx_load_bytes(ctx, ETH_HLEN + len, dsr_opt, 463 sizeof(*dsr_opt)) < 0) 464 return DROP_INVALID; 465 if (dsr_opt->opt_type == DSR_IPV6_OPT_TYPE && 466 dsr_opt->opt_len == DSR_IPV6_OPT_LEN) { 467 *found = true; 468 return 0; 469 } 470 } 471 472 if (nh == NEXTHDR_AUTH) 473 len += ipv6_authlen(&opthdr); 474 else 475 len += ipv6_optlen(&opthdr); 476 477 nh = opthdr.nexthdr; 478 break; 479 480 default: 481 return 0; 482 } 483 } 484 485 /* Reached limit of supported extension headers */ 486 return DROP_INVALID_EXTHDR; 487 } 488 489 static __always_inline int 490 nodeport_extract_dsr_v6(struct __ctx_buff *ctx, 491 struct ipv6hdr *ip6 __maybe_unused, 492 const struct ipv6_ct_tuple *tuple, int l4_off, 493 union v6addr *addr, __be16 *port, bool *dsr) 494 { 495 struct ipv6_ct_tuple tmp = *tuple; 496 497 if (tuple->nexthdr == IPPROTO_TCP) { 498 union tcp_flags tcp_flags = {}; 499 500 if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0) 501 return DROP_CT_INVALID_HDR; 502 503 ipv6_ct_tuple_reverse(&tmp); 504 505 if (!(tcp_flags.value & TCP_FLAG_SYN)) { 506 *dsr = ct_has_dsr_egress_entry6(get_ct_map6(&tmp), &tmp); 507 *port = 0; 508 return 0; 509 } 510 } 511 512 #if defined(IS_BPF_OVERLAY) 513 { 514 struct geneve_dsr_opt6 gopt; 515 int ret = 
ctx_get_tunnel_opt(ctx, &gopt, sizeof(gopt));

		if (ret > 0) {
			if (gopt.hdr.opt_class == bpf_htons(DSR_GENEVE_OPT_CLASS) &&
			    gopt.hdr.type == DSR_GENEVE_OPT_TYPE) {
				*dsr = true;
				*port = gopt.port;
				ipv6_addr_copy_unaligned(addr,
							 (union v6addr *)&gopt.addr);
				return 0;
			}
		}
	}
#else
	{
		struct dsr_opt_v6 opt __align_stack_8 = {};
		int ret;

		ret = find_dsr_v6(ctx, ip6->nexthdr, &opt, dsr);
		if (ret != 0)
			return ret;

		if (*dsr) {
			*addr = opt.addr;
			*port = opt.port;
			return 0;
		}
	}
#endif

	/* SYN for a new connection that's not / no longer DSR.
	 * If it's reopened, avoid sending subsequent traffic down the DSR path.
	 */
	if (tuple->nexthdr == IPPROTO_TCP)
		ct_update_dsr(get_ct_map6(&tmp), &tmp, false);

	return 0;
}

static __always_inline struct ipv6_nat_entry *
nodeport_dsr_lookup_v6_nat_entry(const struct ipv6_ct_tuple *nat_tuple)
{
	return snat_v6_lookup(nat_tuple);
}

static __always_inline int dsr_reply_icmp6(struct __ctx_buff *ctx,
					   const struct ipv6hdr *ip6 __maybe_unused,
					   const union v6addr *svc_addr __maybe_unused,
					   __be16 dport __maybe_unused,
					   int code, int ohead __maybe_unused)
{
#ifdef ENABLE_DSR_ICMP_ERRORS
	const __s32 orig_dgram = 64, off = ETH_HLEN;
	__u8 orig_ipv6_hdr[orig_dgram];
	__be16 type = bpf_htons(ETH_P_IPV6);
	__u64 len_new = off + sizeof(*ip6) + orig_dgram;
	__u64 len_old = ctx_full_len(ctx);
	void *data_end = ctx_data_end(ctx);
	void *data = ctx_data(ctx);
	__u8 reason = (__u8)-code;
	__wsum wsum;
	union macaddr smac, dmac;
	struct icmp6hdr icmp __align_stack_8 = {
		.icmp6_type = ICMPV6_PKT_TOOBIG,
		.icmp6_mtu = bpf_htonl(THIS_MTU - ohead),
	};
	__u64 payload_len = sizeof(*ip6) + sizeof(icmp) + orig_dgram;
	struct ipv6hdr ip __align_stack_8 = {
		.version = 6,
		.priority = ip6->priority,
		.flow_lbl[0] = ip6->flow_lbl[0],
		.flow_lbl[1] = ip6->flow_lbl[1],
		.flow_lbl[2] = ip6->flow_lbl[2],
		.nexthdr = IPPROTO_ICMPV6,
		.hop_limit = IPDEFTTL,
		.saddr = ip6->daddr,
		.daddr = ip6->saddr,
		.payload_len = bpf_htons((__u16)payload_len),
	};
	struct ipv6hdr inner_ipv6_hdr __align_stack_8 = *ip6;
	__s32 l4_dport_offset;

	/* DSR changes the destination address from service ip to pod ip, and
	 * the destination port from service port to pod port. When responding
	 * with an ICMP error, it is necessary to restore the original ip and
	 * port.
601 */ 602 ipv6_addr_copy((union v6addr *)&inner_ipv6_hdr.daddr, svc_addr); 603 604 if (inner_ipv6_hdr.nexthdr == IPPROTO_UDP) 605 l4_dport_offset = UDP_DPORT_OFF; 606 else if (inner_ipv6_hdr.nexthdr == IPPROTO_TCP) 607 l4_dport_offset = TCP_DPORT_OFF; 608 else 609 goto drop_err; 610 611 if (ctx_load_bytes(ctx, off + sizeof(inner_ipv6_hdr), orig_ipv6_hdr, 612 sizeof(orig_ipv6_hdr)) < 0) 613 goto drop_err; 614 memcpy(orig_ipv6_hdr + l4_dport_offset, &dport, sizeof(dport)); 615 616 update_metrics(ctx_full_len(ctx), METRIC_EGRESS, reason); 617 618 if (eth_load_saddr(ctx, smac.addr, 0) < 0) 619 goto drop_err; 620 if (eth_load_daddr(ctx, dmac.addr, 0) < 0) 621 goto drop_err; 622 if (unlikely(data + len_new > data_end)) 623 goto drop_err; 624 625 wsum = ipv6_pseudohdr_checksum(&ip, IPPROTO_ICMPV6, 626 bpf_ntohs(ip.payload_len), 0); 627 icmp.icmp6_cksum = csum_fold(csum_diff(NULL, 0, orig_ipv6_hdr, sizeof(orig_ipv6_hdr), 628 csum_diff(NULL, 0, &inner_ipv6_hdr, 629 sizeof(inner_ipv6_hdr), 630 csum_diff(NULL, 0, &icmp, 631 sizeof(icmp), wsum)))); 632 633 if (ctx_adjust_troom(ctx, -(len_old - len_new)) < 0) 634 goto drop_err; 635 if (ctx_adjust_hroom(ctx, sizeof(ip) + sizeof(icmp), 636 BPF_ADJ_ROOM_NET, 637 ctx_adjust_hroom_flags()) < 0) 638 goto drop_err; 639 640 if (eth_store_daddr(ctx, smac.addr, 0) < 0) 641 goto drop_err; 642 if (eth_store_saddr(ctx, dmac.addr, 0) < 0) 643 goto drop_err; 644 if (ctx_store_bytes(ctx, ETH_ALEN * 2, &type, sizeof(type), 0) < 0) 645 goto drop_err; 646 if (ctx_store_bytes(ctx, off, &ip, sizeof(ip), 0) < 0) 647 goto drop_err; 648 if (ctx_store_bytes(ctx, off + sizeof(ip), &icmp, 649 sizeof(icmp), 0) < 0) 650 goto drop_err; 651 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp), &inner_ipv6_hdr, 652 sizeof(inner_ipv6_hdr), 0) < 0) 653 goto drop_err; 654 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp) + 655 sizeof(inner_ipv6_hdr) + l4_dport_offset, 656 &dport, sizeof(dport), 0) < 0) 657 goto drop_err; 658 659 return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0); 660 drop_err: 661 #endif 662 return send_drop_notify_error(ctx, UNKNOWN_ID, code, CTX_ACT_DROP, 663 METRIC_EGRESS); 664 } 665 666 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_DSR) 667 int tail_nodeport_ipv6_dsr(struct __ctx_buff *ctx) 668 { 669 struct bpf_fib_lookup_padded fib_params = { 670 .l = { 671 .family = AF_INET6, 672 .ifindex = ctx_get_ifindex(ctx), 673 }, 674 }; 675 int ret, oif = 0, ohead = 0; 676 void *data, *data_end; 677 struct ipv6hdr *ip6; 678 union v6addr addr; 679 __s8 ext_err = 0; 680 __be16 port; 681 682 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 683 ret = DROP_INVALID; 684 goto drop_err; 685 } 686 687 addr.p1 = ctx_load_meta(ctx, CB_ADDR_V6_1); 688 addr.p2 = ctx_load_meta(ctx, CB_ADDR_V6_2); 689 addr.p3 = ctx_load_meta(ctx, CB_ADDR_V6_3); 690 addr.p4 = ctx_load_meta(ctx, CB_ADDR_V6_4); 691 692 port = (__be16)ctx_load_meta(ctx, CB_PORT); 693 694 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 695 ret = dsr_set_ipip6(ctx, ip6, &addr, 696 ctx_load_meta(ctx, CB_HINT), &ohead); 697 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 698 ret = dsr_set_ext6(ctx, ip6, &addr, port, &ohead); 699 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 700 ret = encap_geneve_dsr_opt6(ctx, ip6, &addr, port, &oif, &ohead); 701 if (!IS_ERR(ret)) { 702 if (ret == CTX_ACT_REDIRECT && oif) { 703 cilium_capture_out(ctx); 704 return ctx_redirect(ctx, oif, 0); 705 } 706 707 fib_params.l.family = AF_INET; 708 } 709 #else 710 # error "Invalid load balancer DSR encapsulation mode!" 
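/* The alternatives handled above are the supported DSR encapsulation modes:
 * DSR_ENCAP_IPIP (IPv6-in-IPv6 with an RSS-friendly source address),
 * DSR_ENCAP_NONE (an IPv6 destination option carrying the service
 * address/port), and DSR_ENCAP_GENEVE (tunnel with a DSR TLV option).
 */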
711 #endif 712 if (IS_ERR(ret)) { 713 if (dsr_fail_needs_reply(ret)) 714 return dsr_reply_icmp6(ctx, ip6, &addr, port, ret, ohead); 715 goto drop_err; 716 } 717 718 if (fib_params.l.family == AF_INET) { 719 struct iphdr *ip4; 720 721 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 722 ret = DROP_INVALID; 723 goto drop_err; 724 } 725 726 fib_params.l.ipv4_src = ip4->saddr; 727 fib_params.l.ipv4_dst = ip4->daddr; 728 } else { 729 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 730 ret = DROP_INVALID; 731 goto drop_err; 732 } 733 734 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_src, 735 (union v6addr *)&ip6->saddr); 736 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_dst, 737 (union v6addr *)&ip6->daddr); 738 } 739 740 ret = fib_redirect(ctx, true, &fib_params, false, &ext_err, &oif); 741 if (fib_ok(ret)) { 742 cilium_capture_out(ctx); 743 return ret; 744 } 745 drop_err: 746 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 747 CTX_ACT_DROP, METRIC_EGRESS); 748 } 749 750 static __always_inline int 751 nodeport_dsr_ingress_ipv6(struct __ctx_buff *ctx, struct ipv6_ct_tuple *tuple, 752 int l4_off, union v6addr *addr, __be16 port, 753 __s8 *ext_err) 754 { 755 struct ct_state ct_state_new = {}; 756 __u32 monitor = 0; 757 int ret; 758 759 /* look up with SCOPE_FORWARD: */ 760 __ipv6_ct_tuple_reverse(tuple); 761 762 ret = ct_lazy_lookup6(get_ct_map6(tuple), tuple, ctx, l4_off, 763 CT_EGRESS, SCOPE_FORWARD, CT_ENTRY_DSR, 764 NULL, &monitor); 765 if (ret < 0) 766 return ret; 767 768 switch (ret) { 769 case CT_NEW: 770 create_ct: 771 if (port == 0) 772 return DROP_INVALID; 773 774 ct_state_new.src_sec_id = WORLD_IPV6_ID; 775 ct_state_new.dsr_internal = 1; 776 777 ret = ct_create6(get_ct_map6(tuple), NULL, tuple, ctx, 778 CT_EGRESS, &ct_state_new, ext_err); 779 if (!IS_ERR(ret)) 780 ret = snat_v6_create_dsr(tuple, addr, port, ext_err); 781 782 if (IS_ERR(ret)) 783 return ret; 784 break; 785 case CT_ESTABLISHED: 786 if (tuple->nexthdr == IPPROTO_TCP && port) 787 goto create_ct; 788 break; 789 default: 790 return DROP_UNKNOWN_CT; 791 } 792 793 return CTX_ACT_OK; 794 } 795 #endif /* ENABLE_DSR */ 796 797 static __always_inline struct lb6_reverse_nat * 798 nodeport_rev_dnat_get_info_ipv6(struct __ctx_buff *ctx, 799 struct ipv6_ct_tuple *tuple) 800 { 801 struct ipv6_nat_entry *dsr_entry __maybe_unused; 802 struct ipv6_ct_tuple dsr_tuple __maybe_unused; 803 __u16 rev_nat_index = 0; 804 805 if (!ct_has_nodeport_egress_entry6(get_ct_map6(tuple), tuple, 806 &rev_nat_index, is_defined(ENABLE_DSR))) 807 return NULL; 808 809 if (rev_nat_index) 810 return lb6_lookup_rev_nat_entry(ctx, rev_nat_index); 811 812 #ifdef ENABLE_DSR 813 dsr_tuple = *tuple; 814 815 dsr_tuple.flags = NAT_DIR_EGRESS; 816 dsr_tuple.sport = tuple->dport; 817 dsr_tuple.dport = tuple->sport; 818 819 dsr_entry = nodeport_dsr_lookup_v6_nat_entry(&dsr_tuple); 820 if (dsr_entry) 821 return &dsr_entry->nat_info; 822 #endif 823 824 return NULL; 825 } 826 827 #ifdef ENABLE_NAT_46X64_GATEWAY 828 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV46_RFC8215) 829 int tail_nat_ipv46(struct __ctx_buff *ctx) 830 { 831 int ret, oif = 0, l3_off = ETH_HLEN; 832 void *data, *data_end; 833 struct ipv6hdr *ip6; 834 struct iphdr *ip4; 835 __s8 ext_err = 0; 836 837 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 838 ret = DROP_INVALID; 839 goto drop_err; 840 } 841 if (nat46_rfc8215(ctx, ip4, l3_off)) { 842 ret = DROP_NAT46; 843 goto drop_err; 844 } 845 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 846 ret = DROP_INVALID; 847 
goto drop_err; 848 } 849 ret = fib_redirect_v6(ctx, l3_off, ip6, false, true, &ext_err, &oif); 850 if (fib_ok(ret)) { 851 cilium_capture_out(ctx); 852 return ret; 853 } 854 drop_err: 855 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 856 CTX_ACT_DROP, METRIC_EGRESS); 857 } 858 859 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV64_RFC8215) 860 int tail_nat_ipv64(struct __ctx_buff *ctx) 861 { 862 int ret, oif = 0, l3_off = ETH_HLEN; 863 void *data, *data_end; 864 struct ipv6hdr *ip6; 865 struct iphdr *ip4; 866 __s8 ext_err = 0; 867 868 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 869 ret = DROP_INVALID; 870 goto drop_err; 871 } 872 if (nat64_rfc8215(ctx, ip6)) { 873 ret = DROP_NAT64; 874 goto drop_err; 875 } 876 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 877 ret = DROP_INVALID; 878 goto drop_err; 879 } 880 ret = fib_redirect_v4(ctx, l3_off, ip4, false, true, &ext_err, &oif); 881 if (fib_ok(ret)) { 882 cilium_capture_out(ctx); 883 return ret; 884 } 885 drop_err: 886 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 887 CTX_ACT_DROP, METRIC_EGRESS); 888 } 889 #endif /* ENABLE_NAT_46X64_GATEWAY */ 890 891 static __always_inline int 892 nodeport_rev_dnat_ingress_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace, 893 __s8 *ext_err) 894 { 895 #ifdef ENABLE_NAT_46X64_GATEWAY 896 const bool nat_46x64_fib = nat46x64_cb_route(ctx); 897 #endif 898 struct bpf_fib_lookup_padded fib_params = { 899 .l = { 900 .family = AF_INET6, 901 .ifindex = ctx_get_ifindex(ctx), 902 }, 903 }; 904 int ret, l4_off; 905 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 906 struct ct_state ct_state = {}; 907 void *data, *data_end; 908 struct ipv6hdr *ip6; 909 __u32 tunnel_endpoint __maybe_unused = 0; 910 __u32 dst_sec_identity __maybe_unused = 0; 911 __be16 src_port __maybe_unused = 0; 912 bool allow_neigh_map = true; 913 int ifindex = 0; 914 915 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 916 return DROP_INVALID; 917 #ifdef ENABLE_NAT_46X64_GATEWAY 918 if (nat_46x64_fib) 919 goto fib_lookup; 920 #endif 921 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 922 if (ret < 0) { 923 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 924 goto out; 925 return ret; 926 } 927 928 ret = ct_lazy_lookup6(get_ct_map6(&tuple), &tuple, ctx, l4_off, 929 CT_INGRESS, SCOPE_REVERSE, CT_ENTRY_NODEPORT, 930 &ct_state, &trace->monitor); 931 if (ret == CT_REPLY) { 932 trace->reason = TRACE_REASON_CT_REPLY; 933 ret = ipv6_l3(ctx, ETH_HLEN, NULL, NULL, METRIC_EGRESS); 934 if (unlikely(ret != CTX_ACT_OK)) 935 return ret; 936 937 ret = lb6_rev_nat(ctx, l4_off, ct_state.rev_nat_index, 938 &tuple); 939 if (IS_ERR(ret)) 940 return ret; 941 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 942 return DROP_INVALID; 943 ctx_snat_done_set(ctx); 944 #ifndef HAVE_FIB_IFINDEX 945 ifindex = ct_state.ifindex; 946 #endif 947 #ifdef TUNNEL_MODE 948 { 949 union v6addr *dst = (union v6addr *)&ip6->daddr; 950 struct remote_endpoint_info *info; 951 952 info = lookup_ip6_remote_endpoint(dst, 0); 953 if (info && info->tunnel_endpoint && !info->flag_skip_tunnel) { 954 tunnel_endpoint = info->tunnel_endpoint; 955 dst_sec_identity = info->sec_identity; 956 goto encap_redirect; 957 } 958 } 959 #endif 960 961 goto fib_lookup; 962 } 963 out: 964 return CTX_ACT_OK; 965 966 #ifdef TUNNEL_MODE 967 encap_redirect: 968 src_port = tunnel_gen_src_port_v6(&tuple); 969 970 ret = nodeport_add_tunnel_encap(ctx, IPV4_DIRECT_ROUTING, src_port, 971 tunnel_endpoint, SECLABEL, dst_sec_identity, 972 
					trace->reason, trace->monitor, &ifindex);
	if (IS_ERR(ret))
		return ret;

	if (ret == CTX_ACT_REDIRECT && ifindex)
		return ctx_redirect(ctx, ifindex, 0);

	fib_params.l.ipv4_src = IPV4_DIRECT_ROUTING;
	fib_params.l.ipv4_dst = tunnel_endpoint;
	fib_params.l.family = AF_INET;

	/* neigh map doesn't contain DMACs for other nodes */
	allow_neigh_map = false;
	goto fib_redirect;
#endif

fib_lookup:
	if (is_v4_in_v6((union v6addr *)&ip6->saddr)) {
		struct iphdr *ip4;

		ret = lb6_to_lb4(ctx, ip6);
		if (ret < 0)
			return ret;

		if (!revalidate_data(ctx, &data, &data_end, &ip4))
			return DROP_INVALID;

		fib_params.l.ipv4_src = ip4->saddr;
		fib_params.l.ipv4_dst = ip4->daddr;
		fib_params.l.family = AF_INET;
	} else {
		ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_src,
			       (union v6addr *)&ip6->saddr);
		ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_dst,
			       (union v6addr *)&ip6->daddr);
	}

#ifdef TUNNEL_MODE
fib_redirect:
#endif
	return fib_redirect(ctx, true, &fib_params, allow_neigh_map, ext_err, &ifindex);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_REVNAT)
static __always_inline
int tail_nodeport_rev_dnat_ingress_ipv6(struct __ctx_buff *ctx)
{
	struct trace_ctx trace = {
		.reason = TRACE_REASON_CT_REPLY,
		.monitor = TRACE_PAYLOAD_LEN,
	};
	__s8 ext_err = 0;
	int ret = 0;

	ret = nodeport_rev_dnat_ingress_ipv6(ctx, &trace, &ext_err);
	if (IS_ERR(ret))
		goto drop;

	if (ret == CTX_ACT_OK) {
		if (is_defined(IS_BPF_LXC)) {
			ret = DROP_NAT_NO_MAPPING;
			goto drop;
		}

		ctx_skip_nodeport_set(ctx);
		ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_FROM_NETDEV, &ext_err);
		goto drop;
	}

#ifndef IS_BPF_LXC
	edt_set_aggregate(ctx, 0);
#endif
	cilium_capture_out(ctx);
	return ret;
drop:
	return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err,
					  CTX_ACT_DROP, METRIC_EGRESS);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS)
static __always_inline
int tail_nodeport_nat_ingress_ipv6(struct __ctx_buff *ctx)
{
	struct ipv6_nat_target target = {
		.min_port = NODEPORT_PORT_MIN_NAT,
		.max_port = NODEPORT_PORT_MAX_NAT,
	};
	struct trace_ctx trace = {
		.reason = TRACE_REASON_CT_REPLY,
		.monitor = TRACE_PAYLOAD_LEN,
	};
	__u32 src_id = 0;
	__s8 ext_err = 0;
	int ret;

	ret = snat_v6_rev_nat(ctx, &target, &trace, &ext_err);
	if (IS_ERR(ret)) {
		if (ret == NAT_PUNT_TO_STACK ||
		    /* DROP_NAT_NO_MAPPING is unwanted behavior in a
		     * rev-SNAT context. Let's continue passing it up
		     * to the host and revisit this later if needed.
		     */
		    ret == DROP_NAT_NO_MAPPING) {
			/* In case of no mapping, recircle back to the
			 * main path. SNAT is very expensive in terms
			 * of instructions and complexity. Consequently,
			 * this is done inside a tail call here (because
			 * we don't have BPF to BPF calls).
1082 */ 1083 goto recircle; 1084 } 1085 goto drop_err; 1086 } 1087 1088 ctx_snat_done_set(ctx); 1089 1090 #if !defined(ENABLE_DSR) || (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) 1091 1092 # if defined(ENABLE_HOST_FIREWALL) && defined(IS_BPF_HOST) 1093 ret = ipv6_host_policy_ingress(ctx, &src_id, &trace, &ext_err); 1094 if (IS_ERR(ret)) 1095 goto drop_err; 1096 1097 ctx_skip_host_fw_set(ctx); 1098 # endif 1099 1100 ret = invoke_traced_tailcall_if(__and(is_defined(ENABLE_HOST_FIREWALL), 1101 is_defined(IS_BPF_HOST)), 1102 CILIUM_CALL_IPV6_NODEPORT_REVNAT, 1103 nodeport_rev_dnat_ingress_ipv6, 1104 &trace, &ext_err); 1105 if (IS_ERR(ret)) 1106 goto drop_err; 1107 1108 if (ret == CTX_ACT_OK) 1109 goto recircle; 1110 1111 edt_set_aggregate(ctx, 0); 1112 cilium_capture_out(ctx); 1113 return ret; 1114 #endif 1115 1116 recircle: 1117 ctx_skip_nodeport_set(ctx); 1118 ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_FROM_NETDEV, &ext_err); 1119 1120 drop_err: 1121 return send_drop_notify_error_ext(ctx, src_id, ret, ext_err, CTX_ACT_DROP, 1122 METRIC_INGRESS); 1123 } 1124 1125 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS) 1126 static __always_inline 1127 int tail_nodeport_nat_egress_ipv6(struct __ctx_buff *ctx) 1128 { 1129 const bool nat_46x64 = nat46x64_cb_xlate(ctx); 1130 struct bpf_fib_lookup_padded fib_params = { 1131 .l = { 1132 .family = AF_INET6, 1133 .ifindex = ctx_get_ifindex(ctx), 1134 }, 1135 }; 1136 struct ipv6_nat_target target = { 1137 .min_port = NODEPORT_PORT_MIN_NAT, 1138 .max_port = NODEPORT_PORT_MAX_NAT, 1139 .addr = IPV6_DIRECT_ROUTING, 1140 }; 1141 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 1142 struct trace_ctx trace = { 1143 .reason = (enum trace_reason)CT_NEW, 1144 .monitor = TRACE_PAYLOAD_LEN, 1145 }; 1146 int ret, l4_off, oif = 0; 1147 void *data, *data_end; 1148 struct ipv6hdr *ip6; 1149 __s8 ext_err = 0; 1150 #ifdef TUNNEL_MODE 1151 struct remote_endpoint_info *info; 1152 __be32 tunnel_endpoint = 0; 1153 __u32 dst_sec_identity = 0; 1154 union v6addr *dst; 1155 #endif 1156 1157 if (nat_46x64) 1158 build_v4_in_v6(&target.addr, IPV4_DIRECT_ROUTING); 1159 1160 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 1161 ret = DROP_INVALID; 1162 goto drop_err; 1163 } 1164 1165 #ifdef TUNNEL_MODE 1166 dst = (union v6addr *)&ip6->daddr; 1167 info = lookup_ip6_remote_endpoint(dst, 0); 1168 if (info && info->tunnel_endpoint != 0 && !info->flag_skip_tunnel) { 1169 tunnel_endpoint = info->tunnel_endpoint; 1170 dst_sec_identity = info->sec_identity; 1171 1172 BPF_V6(target.addr, ROUTER_IP); 1173 } 1174 #endif 1175 1176 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 1177 if (IS_ERR(ret)) 1178 goto drop_err; 1179 1180 ipv6_ct_tuple_swap_ports(&tuple); 1181 tuple.flags = TUPLE_F_OUT; 1182 1183 ret = ipv6_l3(ctx, ETH_HLEN, NULL, NULL, METRIC_EGRESS); 1184 if (unlikely(ret != CTX_ACT_OK)) 1185 goto drop_err; 1186 1187 ret = __snat_v6_nat(ctx, &tuple, l4_off, true, &target, TCP_SPORT_OFF, 1188 &trace, &ext_err); 1189 if (IS_ERR(ret)) 1190 goto drop_err; 1191 1192 ctx_snat_done_set(ctx); 1193 1194 #ifdef TUNNEL_MODE 1195 if (tunnel_endpoint) { 1196 __be16 src_port; 1197 1198 #if __ctx_is == __ctx_skb 1199 { 1200 /* See the corresponding v4 path for details */ 1201 bool l2_hdr_required = false; 1202 1203 ret = maybe_add_l2_hdr(ctx, ENCAP_IFINDEX, &l2_hdr_required); 1204 if (ret != 0) 1205 goto drop_err; 1206 } 1207 #endif 1208 1209 src_port = tunnel_gen_src_port_v6(&tuple); 1210 1211 ret = nodeport_add_tunnel_encap(ctx, 1212 IPV4_DIRECT_ROUTING, 
1213 src_port, 1214 tunnel_endpoint, 1215 WORLD_IPV6_ID, 1216 dst_sec_identity, 1217 trace.reason, 1218 trace.monitor, 1219 &oif); 1220 if (IS_ERR(ret)) 1221 goto drop_err; 1222 1223 if (ret == CTX_ACT_REDIRECT && oif) { 1224 cilium_capture_out(ctx); 1225 return ctx_redirect(ctx, oif, 0); 1226 } 1227 1228 goto fib_ipv4; 1229 } 1230 #endif 1231 if (!revalidate_data(ctx, &data, &data_end, &ip6)) { 1232 ret = DROP_INVALID; 1233 goto drop_err; 1234 } 1235 if (nat_46x64) { 1236 struct iphdr *ip4; 1237 1238 ret = lb6_to_lb4(ctx, ip6); 1239 if (ret < 0) 1240 goto drop_err; 1241 1242 #ifdef TUNNEL_MODE 1243 fib_ipv4: 1244 #endif 1245 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 1246 ret = DROP_INVALID; 1247 goto drop_err; 1248 } 1249 fib_params.l.ipv4_src = ip4->saddr; 1250 fib_params.l.ipv4_dst = ip4->daddr; 1251 fib_params.l.family = AF_INET; 1252 } else { 1253 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_src, 1254 (union v6addr *)&ip6->saddr); 1255 ipv6_addr_copy((union v6addr *)&fib_params.l.ipv6_dst, 1256 (union v6addr *)&ip6->daddr); 1257 } 1258 ret = fib_redirect(ctx, true, &fib_params, false, &ext_err, &oif); 1259 if (fib_ok(ret)) { 1260 cilium_capture_out(ctx); 1261 return ret; 1262 } 1263 drop_err: 1264 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 1265 CTX_ACT_DROP, METRIC_EGRESS); 1266 } 1267 1268 static __always_inline int nodeport_svc_lb6(struct __ctx_buff *ctx, 1269 struct ipv6_ct_tuple *tuple, 1270 struct lb6_service *svc, 1271 struct lb6_key *key, 1272 struct ipv6hdr *ip6, 1273 int l3_off, 1274 int l4_off, 1275 __u32 src_sec_identity __maybe_unused, 1276 __s8 *ext_err) 1277 { 1278 const bool skip_l3_xlate = DSR_ENCAP_MODE == DSR_ENCAP_IPIP; 1279 struct ct_state ct_state_svc = {}; 1280 bool backend_local; 1281 __u32 monitor = 0; 1282 int ret; 1283 1284 if (!lb6_src_range_ok(svc, (union v6addr *)&ip6->saddr)) 1285 return DROP_NOT_IN_SRC_RANGE; 1286 1287 if (!lb6_svc_is_routable(svc)) 1288 return DROP_IS_CLUSTER_IP; 1289 1290 #if defined(ENABLE_L7_LB) 1291 if (lb6_svc_is_l7loadbalancer(svc) && svc->l7_lb_proxy_port > 0) { 1292 if (ctx_is_xdp()) 1293 return CTX_ACT_OK; 1294 1295 send_trace_notify(ctx, TRACE_TO_PROXY, src_sec_identity, UNKNOWN_ID, 1296 bpf_ntohs((__u16)svc->l7_lb_proxy_port), 1297 NATIVE_DEV_IFINDEX, TRACE_REASON_POLICY, monitor); 1298 return ctx_redirect_to_proxy_hairpin_ipv6(ctx, 1299 (__be16)svc->l7_lb_proxy_port); 1300 } 1301 #endif 1302 ret = lb6_local(get_ct_map6(tuple), ctx, l3_off, l4_off, 1303 key, tuple, svc, &ct_state_svc, 1304 skip_l3_xlate, ext_err); 1305 1306 #ifdef SERVICE_NO_BACKEND_RESPONSE 1307 if (ret == DROP_NO_SERVICE) { 1308 edt_set_aggregate(ctx, 0); 1309 ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_NO_SERVICE, 1310 ext_err); 1311 } 1312 #endif 1313 1314 if (IS_ERR(ret)) 1315 return ret; 1316 1317 backend_local = __lookup_ip6_endpoint(&tuple->daddr); 1318 if (!backend_local && lb6_svc_is_hostport(svc)) 1319 return DROP_INVALID; 1320 if (backend_local || !nodeport_uses_dsr6(tuple)) { 1321 struct ct_state ct_state = {}; 1322 1323 /* lookup with SCOPE_FORWARD: */ 1324 __ipv6_ct_tuple_reverse(tuple); 1325 1326 /* only match CT entries that belong to the same service: */ 1327 ct_state.rev_nat_index = ct_state_svc.rev_nat_index; 1328 1329 ret = ct_lazy_lookup6(get_ct_map6(tuple), tuple, ctx, l4_off, 1330 CT_EGRESS, SCOPE_FORWARD, CT_ENTRY_NODEPORT, 1331 &ct_state, &monitor); 1332 if (ret < 0) 1333 return ret; 1334 1335 switch (ret) { 1336 case CT_NEW: 1337 ct_state.src_sec_id = WORLD_IPV6_ID; 1338 ct_state.node_port = 
1; 1339 #ifndef HAVE_FIB_IFINDEX 1340 ct_state.ifindex = (__u16)NATIVE_DEV_IFINDEX; 1341 #endif 1342 1343 ret = ct_create6(get_ct_map6(tuple), NULL, tuple, ctx, 1344 CT_EGRESS, &ct_state, ext_err); 1345 if (IS_ERR(ret)) 1346 return ret; 1347 break; 1348 case CT_ESTABLISHED: 1349 /* Note that we don't validate whether the matched CT entry 1350 * has identical values (eg. .ifindex) as set above. 1351 */ 1352 break; 1353 default: 1354 return DROP_UNKNOWN_CT; 1355 } 1356 1357 ret = neigh_record_ip6(ctx); 1358 if (ret < 0) 1359 return ret; 1360 if (backend_local) { 1361 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 1362 return CTX_ACT_OK; 1363 } 1364 } 1365 1366 /* TX request to remote backend: */ 1367 edt_set_aggregate(ctx, 0); 1368 if (nodeport_uses_dsr6(tuple)) { 1369 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 1370 ctx_store_meta(ctx, CB_HINT, 1371 ((__u32)tuple->sport << 16) | tuple->dport); 1372 ctx_store_meta(ctx, CB_ADDR_V6_1, tuple->daddr.p1); 1373 ctx_store_meta(ctx, CB_ADDR_V6_2, tuple->daddr.p2); 1374 ctx_store_meta(ctx, CB_ADDR_V6_3, tuple->daddr.p3); 1375 ctx_store_meta(ctx, CB_ADDR_V6_4, tuple->daddr.p4); 1376 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE || DSR_ENCAP_MODE == DSR_ENCAP_NONE 1377 ctx_store_meta(ctx, CB_PORT, key->dport); 1378 ctx_store_meta(ctx, CB_ADDR_V6_1, key->address.p1); 1379 ctx_store_meta(ctx, CB_ADDR_V6_2, key->address.p2); 1380 ctx_store_meta(ctx, CB_ADDR_V6_3, key->address.p3); 1381 ctx_store_meta(ctx, CB_ADDR_V6_4, key->address.p4); 1382 #endif /* DSR_ENCAP_MODE */ 1383 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_DSR, ext_err); 1384 } else { 1385 /* This code path is not only hit for NAT64, but also 1386 * for NAT46. For the latter we initially hit the IPv4 1387 * NodePort path, then migrate the request to IPv6 and 1388 * recirculate into the regular IPv6 NodePort path. So 1389 * we need to make sure to not NAT back to IPv4 for 1390 * IPv4-in-IPv6 converted addresses. 1391 */ 1392 ctx_store_meta(ctx, CB_NAT_46X64, 1393 !is_v4_in_v6(&key->address) && 1394 lb6_to_lb4_service(svc)); 1395 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS, 1396 ext_err); 1397 } 1398 } 1399 1400 /* See nodeport_lb4(). 
*/ 1401 static __always_inline int nodeport_lb6(struct __ctx_buff *ctx, 1402 struct ipv6hdr *ip6, 1403 __u32 src_sec_identity, 1404 __s8 *ext_err, 1405 bool __maybe_unused *dsr) 1406 { 1407 bool is_svc_proto __maybe_unused = true; 1408 int ret, l3_off = ETH_HLEN, l4_off; 1409 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 1410 struct lb6_service *svc; 1411 struct lb6_key key = {}; 1412 1413 cilium_capture_in(ctx); 1414 1415 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 1416 if (IS_ERR(ret)) { 1417 if (ret == DROP_UNSUPP_SERVICE_PROTO) { 1418 is_svc_proto = false; 1419 goto skip_service_lookup; 1420 } 1421 if (ret == DROP_UNKNOWN_L4) { 1422 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 1423 return CTX_ACT_OK; 1424 } 1425 return ret; 1426 } 1427 1428 lb6_fill_key(&key, &tuple); 1429 1430 svc = lb6_lookup_service(&key, false); 1431 if (svc) { 1432 return nodeport_svc_lb6(ctx, &tuple, svc, &key, ip6, l3_off, 1433 l4_off, src_sec_identity, ext_err); 1434 } else { 1435 skip_service_lookup: 1436 #ifdef ENABLE_NAT_46X64_GATEWAY 1437 if (is_v4_in_v6_rfc8215((union v6addr *)&ip6->daddr)) { 1438 ret = neigh_record_ip6(ctx); 1439 if (ret < 0) 1440 return ret; 1441 if (is_v4_in_v6_rfc8215((union v6addr *)&ip6->saddr)) 1442 return tail_call_internal(ctx, CILIUM_CALL_IPV64_RFC8215, 1443 ext_err); 1444 ctx_store_meta(ctx, CB_NAT_46X64, NAT46x64_MODE_XLATE); 1445 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS, 1446 ext_err); 1447 } 1448 #endif 1449 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 1450 1451 #ifdef ENABLE_DSR 1452 #if (defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE) || \ 1453 (!defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE != DSR_ENCAP_GENEVE) 1454 if (is_svc_proto && nodeport_uses_dsr6(&tuple)) { 1455 ret = nodeport_extract_dsr_v6(ctx, ip6, &tuple, l4_off, 1456 &key.address, 1457 &key.dport, dsr); 1458 if (IS_ERR(ret)) 1459 return ret; 1460 1461 if (*dsr) 1462 return nodeport_dsr_ingress_ipv6(ctx, &tuple, l4_off, 1463 &key.address, key.dport, 1464 ext_err); 1465 } 1466 #endif 1467 #endif /* ENABLE_DSR */ 1468 1469 #ifndef ENABLE_MASQUERADE_IPV6 1470 if (!is_svc_proto || nodeport_uses_dsr6(&tuple)) 1471 return CTX_ACT_OK; 1472 #endif /* ENABLE_MASQUERADE_IPV6 */ 1473 1474 ctx_store_meta(ctx, CB_NAT_46X64, 0); 1475 ctx_store_meta(ctx, CB_SRC_LABEL, src_sec_identity); 1476 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS, 1477 ext_err); 1478 } 1479 } 1480 1481 static __always_inline int 1482 nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, bool *snat_done, 1483 bool revdnat_only __maybe_unused, 1484 struct trace_ctx *trace, __s8 *ext_err __maybe_unused) 1485 { 1486 struct bpf_fib_lookup_padded fib_params __maybe_unused = {}; 1487 struct lb6_reverse_nat *nat_info; 1488 struct ipv6_ct_tuple tuple __align_stack_8 = {}; 1489 void *data, *data_end; 1490 struct ipv6hdr *ip6; 1491 int ret, l4_off; 1492 1493 if (!revalidate_data(ctx, &data, &data_end, &ip6)) 1494 return DROP_INVALID; 1495 1496 ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, &l4_off, &tuple); 1497 if (ret < 0) { 1498 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 1499 return CTX_ACT_OK; 1500 return ret; 1501 } 1502 1503 nat_info = nodeport_rev_dnat_get_info_ipv6(ctx, &tuple); 1504 if (!nat_info) 1505 return CTX_ACT_OK; 1506 1507 #if defined(IS_BPF_HOST) && !defined(ENABLE_SKIP_FIB) 1508 if (revdnat_only) 1509 goto skip_fib; 1510 1511 fib_params.l.family = AF_INET6; 1512 fib_params.l.ifindex = ctx_get_ifindex(ctx); 1513 ipv6_addr_copy((union v6addr *)fib_params.l.ipv6_src, 
1514 &nat_info->address); 1515 ipv6_addr_copy((union v6addr *)fib_params.l.ipv6_dst, 1516 &tuple.daddr); 1517 1518 ret = nodeport_fib_lookup_and_redirect(ctx, &fib_params, ext_err); 1519 if (ret != CTX_ACT_OK) 1520 return ret; 1521 1522 skip_fib: 1523 #endif 1524 1525 ret = ct_lazy_lookup6(get_ct_map6(&tuple), &tuple, ctx, l4_off, CT_INGRESS, 1526 SCOPE_REVERSE, CT_ENTRY_NODEPORT | CT_ENTRY_DSR, 1527 NULL, &trace->monitor); 1528 if (ret == CT_REPLY) { 1529 trace->reason = TRACE_REASON_CT_REPLY; 1530 1531 ret = __lb6_rev_nat(ctx, l4_off, &tuple, nat_info); 1532 if (IS_ERR(ret)) 1533 return ret; 1534 1535 *snat_done = true; 1536 } 1537 1538 return CTX_ACT_OK; 1539 } 1540 1541 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD) 1542 int tail_handle_snat_fwd_ipv6(struct __ctx_buff *ctx) 1543 { 1544 struct trace_ctx trace = { 1545 .reason = TRACE_REASON_UNKNOWN, 1546 .monitor = 0, 1547 }; 1548 enum trace_point obs_point; 1549 union v6addr saddr = {}; 1550 int ret; 1551 __s8 ext_err = 0; 1552 1553 #ifdef IS_BPF_OVERLAY 1554 obs_point = TRACE_TO_OVERLAY; 1555 #else 1556 obs_point = TRACE_TO_NETWORK; 1557 #endif 1558 1559 ret = nodeport_snat_fwd_ipv6(ctx, &saddr, &trace, &ext_err); 1560 if (IS_ERR(ret)) 1561 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 1562 CTX_ACT_DROP, METRIC_EGRESS); 1563 1564 /* contrary to tail_handle_snat_fwd_ipv4, we don't check for 1565 * 1566 * ret == CTX_ACT_OK 1567 * 1568 * in order to emit the event, as egress gateway is not yet supported 1569 * for IPv6, and so it's not possible yet for masqueraded traffic to get 1570 * redirected to another interface 1571 */ 1572 send_trace_notify6(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID, &saddr, 1573 TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX, 1574 trace.reason, trace.monitor); 1575 1576 return ret; 1577 } 1578 1579 static __always_inline int 1580 __handle_nat_fwd_ipv6(struct __ctx_buff *ctx, bool revdnat_only, struct trace_ctx *trace, 1581 __s8 *ext_err) 1582 { 1583 bool snat_done = false; 1584 int ret; 1585 1586 ret = nodeport_rev_dnat_fwd_ipv6(ctx, &snat_done, revdnat_only, trace, ext_err); 1587 if (ret != CTX_ACT_OK || revdnat_only) 1588 return ret; 1589 1590 #if !defined(ENABLE_DSR) || \ 1591 (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ 1592 defined(ENABLE_MASQUERADE_IPV6) 1593 if (!snat_done) 1594 ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD, 1595 ext_err); 1596 #endif 1597 1598 if (is_defined(IS_BPF_HOST) && snat_done) 1599 ctx_snat_done_set(ctx); 1600 1601 return ret; 1602 } 1603 1604 static __always_inline int 1605 handle_nat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace, 1606 __s8 *ext_err) 1607 { 1608 __u32 cb_nat_flags = ctx_load_and_clear_meta(ctx, CB_NAT_FLAGS); 1609 bool revdnat_only = cb_nat_flags & CB_NAT_FLAGS_REVDNAT_ONLY; 1610 1611 return __handle_nat_fwd_ipv6(ctx, revdnat_only, trace, ext_err); 1612 } 1613 1614 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT_FWD) 1615 static __always_inline 1616 int tail_handle_nat_fwd_ipv6(struct __ctx_buff *ctx) 1617 { 1618 struct trace_ctx trace = { 1619 .reason = TRACE_REASON_UNKNOWN, 1620 .monitor = TRACE_PAYLOAD_LEN, 1621 }; 1622 int ret; 1623 enum trace_point obs_point; 1624 __s8 ext_err = 0; 1625 1626 #ifdef IS_BPF_OVERLAY 1627 obs_point = TRACE_TO_OVERLAY; 1628 #else 1629 obs_point = TRACE_TO_NETWORK; 1630 #endif 1631 1632 ret = handle_nat_fwd_ipv6(ctx, &trace, &ext_err); 1633 if (IS_ERR(ret)) 1634 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 1635 CTX_ACT_DROP, 
						  METRIC_EGRESS);

	if (ret == CTX_ACT_OK)
		send_trace_notify(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID,
				  TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX,
				  trace.reason, trace.monitor);

	return ret;
}
#endif /* ENABLE_IPV6 */

#ifdef ENABLE_IPV4
static __always_inline bool nodeport_uses_dsr4(const struct ipv4_ct_tuple *tuple)
{
	return nodeport_uses_dsr(tuple->nexthdr);
}

static __always_inline bool
nodeport_has_nat_conflict_ipv4(const struct iphdr *ip4 __maybe_unused,
			       struct ipv4_nat_target *target __maybe_unused)
{
#if defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY)
	if (ip4->saddr == IPV4_GATEWAY) {
		target->addr = IPV4_GATEWAY;
		target->needs_ct = true;

		return true;
	}
#endif /* TUNNEL_MODE && IS_BPF_OVERLAY */

#if defined(IS_BPF_HOST)
	__u32 dr_ifindex = DIRECT_ROUTING_DEV_IFINDEX;

	/* NATIVE_DEV_IFINDEX == DIRECT_ROUTING_DEV_IFINDEX cannot be moved into
	 * preprocessor, as the former is known only during load time (templating).
	 * This checks whether bpf_host is running on the direct routing device.
	 */
	if (dr_ifindex == NATIVE_DEV_IFINDEX &&
	    ip4->saddr == IPV4_DIRECT_ROUTING) {
		target->addr = IPV4_DIRECT_ROUTING;
		target->needs_ct = true;

		return true;
	}
#endif /* IS_BPF_HOST */

	return false;
}

static __always_inline int nodeport_snat_fwd_ipv4(struct __ctx_buff *ctx,
						  __u32 cluster_id __maybe_unused,
						  __be32 *saddr,
						  struct trace_ctx *trace,
						  __s8 *ext_err)
{
	struct ipv4_nat_target target = {
		.min_port = NODEPORT_PORT_MIN_NAT,
		.max_port = NODEPORT_PORT_MAX_NAT,
#if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)
		.cluster_id = cluster_id,
#endif
	};
	struct ipv4_ct_tuple tuple = {};
	void *data, *data_end;
	struct iphdr *ip4;
	int l4_off, ret;

	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	snat_v4_init_tuple(ip4, NAT_DIR_EGRESS, &tuple);
	l4_off = ETH_HLEN + ipv4_hdrlen(ip4);

	if (lb_is_svc_proto(tuple.nexthdr) &&
	    !nodeport_uses_dsr4(&tuple) &&
	    nodeport_has_nat_conflict_ipv4(ip4, &target))
		goto apply_snat;

	ret = snat_v4_needs_masquerade(ctx, &tuple, ip4, l4_off, &target);
	if (IS_ERR(ret))
		goto out;

#if defined(ENABLE_EGRESS_GATEWAY_COMMON) && defined(IS_BPF_HOST)
	if (target.egress_gateway) {
		/* Send packet to the correct egress interface, and SNAT it there. */
		ret = egress_gw_fib_lookup_and_redirect(ctx, target.addr,
							tuple.daddr, ext_err);
		if (ret != CTX_ACT_OK)
			return ret;

		if (!revalidate_data(ctx, &data, &data_end, &ip4))
			return DROP_INVALID;
	}
#endif

apply_snat:

	*saddr = tuple.saddr;
	ret = snat_v4_nat(ctx, &tuple, ip4, l4_off, ipv4_has_l4_header(ip4),
			  &target, trace, ext_err);
	if (IS_ERR(ret))
		goto out;

	/* If multiple netdevs process an outgoing packet, then this packet will
	 * be handled multiple times by the "to-netdev" section. This can lead
	 * to multiple SNATs. To prevent that, set the SNAT done flag.
	 *
	 * XDP doesn't need the flag (there's no egress prog that would utilize it),
	 * and for overlay traffic it makes no difference whether the inner packet
	 * was SNATed.
1745 */ 1746 if (is_defined(IS_BPF_HOST)) 1747 ctx_snat_done_set(ctx); 1748 1749 out: 1750 if (ret == NAT_PUNT_TO_STACK) 1751 ret = CTX_ACT_OK; 1752 1753 return ret; 1754 } 1755 1756 #ifdef ENABLE_DSR 1757 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 1758 static __always_inline __be32 rss_gen_src4(__be32 client, __be32 l4_hint) 1759 { 1760 const __u32 bits = 32 - IPV4_RSS_PREFIX_BITS; 1761 __be32 src = IPV4_RSS_PREFIX; 1762 1763 if (bits) 1764 src |= bpf_htonl(hash_32(client ^ l4_hint, bits)); 1765 return src; 1766 } 1767 1768 /* 1769 * Original packet: [clientIP:clientPort -> serviceIP:servicePort] } IP/L4 1770 * 1771 * After DSR IPIP: [rssSrcIP -> backendIP] } IP 1772 * [clientIP:clientPort -> serviceIP:servicePort] } IP/L4 1773 */ 1774 static __always_inline int dsr_set_ipip4(struct __ctx_buff *ctx, 1775 const struct iphdr *ip4, 1776 __be32 backend_addr, 1777 __be32 l4_hint, __be16 *ohead) 1778 { 1779 __u16 tot_len = bpf_ntohs(ip4->tot_len) + sizeof(*ip4); 1780 const int l3_off = ETH_HLEN; 1781 __be32 sum; 1782 struct { 1783 __be16 tot_len; 1784 __be16 id; 1785 __be16 frag_off; 1786 __u8 ttl; 1787 __u8 protocol; 1788 __be32 saddr; 1789 __be32 daddr; 1790 } tp_old = { 1791 .tot_len = ip4->tot_len, 1792 .ttl = ip4->ttl, 1793 .protocol = ip4->protocol, 1794 .saddr = ip4->saddr, 1795 .daddr = ip4->daddr, 1796 }, tp_new = { 1797 .tot_len = bpf_htons(tot_len), 1798 .ttl = IPDEFTTL, 1799 .protocol = IPPROTO_IPIP, 1800 .saddr = rss_gen_src4(ip4->saddr, l4_hint), 1801 .daddr = backend_addr, 1802 }; 1803 1804 if (dsr_is_too_big(ctx, tot_len)) { 1805 *ohead = sizeof(*ip4); 1806 return DROP_FRAG_NEEDED; 1807 } 1808 1809 if (ctx_adjust_hroom(ctx, sizeof(*ip4), BPF_ADJ_ROOM_NET, 1810 ctx_adjust_hroom_flags())) 1811 return DROP_INVALID; 1812 sum = csum_diff(&tp_old, 16, &tp_new, 16, 0); 1813 if (ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, tot_len), 1814 &tp_new.tot_len, 2, 0) < 0) 1815 return DROP_WRITE_ERROR; 1816 if (ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, ttl), 1817 &tp_new.ttl, 2, 0) < 0) 1818 return DROP_WRITE_ERROR; 1819 if (ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, saddr), 1820 &tp_new.saddr, 8, 0) < 0) 1821 return DROP_WRITE_ERROR; 1822 if (ipv4_csum_update_by_diff(ctx, l3_off, sum) < 0) 1823 return DROP_CSUM_L3; 1824 return 0; 1825 } 1826 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 1827 static __always_inline int dsr_set_opt4(struct __ctx_buff *ctx, 1828 struct iphdr *ip4, __be32 svc_addr, 1829 __be16 svc_port, __be16 *ohead) 1830 { 1831 __u32 iph_old, iph_new; 1832 struct dsr_opt_v4 opt; 1833 __u16 tot_len = bpf_ntohs(ip4->tot_len) + sizeof(opt); 1834 __be32 sum; 1835 1836 if (ip4->protocol == IPPROTO_TCP) { 1837 union tcp_flags tcp_flags = { .value = 0 }; 1838 1839 if (l4_load_tcp_flags(ctx, ETH_HLEN + ipv4_hdrlen(ip4), &tcp_flags) < 0) 1840 return DROP_CT_INVALID_HDR; 1841 1842 /* Setting the option is required only for the first packet 1843 * (SYN), in the case of TCP, as for further packets of the 1844 * same connection a remote node will use a NAT entry to 1845 * reverse xlate a reply. 
1846 */ 1847 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 1848 return 0; 1849 } 1850 1851 if (ipv4_hdrlen(ip4) + sizeof(opt) > sizeof(struct iphdr) + MAX_IPOPTLEN) 1852 return DROP_CT_INVALID_HDR; 1853 1854 if (dsr_is_too_big(ctx, tot_len)) { 1855 *ohead = sizeof(opt); 1856 return DROP_FRAG_NEEDED; 1857 } 1858 1859 iph_old = *(__u32 *)ip4; 1860 ip4->ihl += sizeof(opt) >> 2; 1861 ip4->tot_len = bpf_htons(tot_len); 1862 iph_new = *(__u32 *)ip4; 1863 1864 opt.type = DSR_IPV4_OPT_TYPE; 1865 opt.len = sizeof(opt); 1866 opt.port = bpf_htons(svc_port); 1867 opt.addr = bpf_htonl(svc_addr); 1868 1869 sum = csum_diff(&iph_old, 4, &iph_new, 4, 0); 1870 sum = csum_diff(NULL, 0, &opt, sizeof(opt), sum); 1871 1872 if (ctx_adjust_hroom(ctx, sizeof(opt), BPF_ADJ_ROOM_NET, 1873 ctx_adjust_hroom_flags())) 1874 return DROP_INVALID; 1875 1876 if (ctx_store_bytes(ctx, ETH_HLEN + sizeof(*ip4), 1877 &opt, sizeof(opt), 0) < 0) 1878 return DROP_INVALID; 1879 if (ipv4_csum_update_by_diff(ctx, ETH_HLEN, sum) < 0) 1880 return DROP_CSUM_L3; 1881 1882 return 0; 1883 } 1884 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 1885 static __always_inline int encap_geneve_dsr_opt4(struct __ctx_buff *ctx, int l3_off __maybe_unused, 1886 struct iphdr *ip4, __be32 svc_addr, 1887 __be16 svc_port, int *ifindex, __be16 *ohead) 1888 { 1889 struct remote_endpoint_info *info __maybe_unused; 1890 struct geneve_dsr_opt4 gopt; 1891 bool need_opt = true; 1892 __u16 encap_len = sizeof(struct iphdr) + sizeof(struct udphdr) + 1893 sizeof(struct genevehdr) + ETH_HLEN; 1894 __u16 total_len = bpf_ntohs(ip4->tot_len); 1895 __u32 src_sec_identity = WORLD_IPV4_ID; 1896 __u32 dst_sec_identity; 1897 __be32 tunnel_endpoint; 1898 __be16 src_port = 0; 1899 #if __ctx_is == __ctx_xdp 1900 bool has_encap = l3_off > ETH_HLEN; 1901 struct iphdr *outer_ip4 = ip4; 1902 void *data, *data_end; 1903 1904 build_bug_on((sizeof(gopt) % 4) != 0); 1905 1906 if (has_encap) { 1907 /* point at the inner IPv4 header */ 1908 if (!revalidate_data_l3_off(ctx, &data, &data_end, &ip4, encap_len + ETH_HLEN)) 1909 return DROP_INVALID; 1910 1911 encap_len = 0; 1912 } else { 1913 struct ipv4_ct_tuple tuple = {}; 1914 int l4_off, ret; 1915 1916 ret = lb4_extract_tuple(ctx, ip4, l3_off, &l4_off, &tuple); 1917 if (IS_ERR(ret)) 1918 return ret; 1919 1920 src_port = tunnel_gen_src_port_v4(&tuple); 1921 } 1922 #endif 1923 1924 #ifdef ENABLE_HIGH_SCALE_IPCACHE 1925 #ifdef IS_BPF_OVERLAY 1926 src_sec_identity = ctx_load_meta(ctx, CB_DSR_SRC_LABEL); 1927 #endif 1928 1929 tunnel_endpoint = ip4->daddr; 1930 dst_sec_identity = 0; 1931 #else 1932 info = lookup_ip4_remote_endpoint(ip4->daddr, 0); 1933 if (!info || info->tunnel_endpoint == 0) 1934 return DROP_NO_TUNNEL_ENDPOINT; 1935 1936 tunnel_endpoint = info->tunnel_endpoint; 1937 dst_sec_identity = info->sec_identity; 1938 #endif 1939 1940 if (ip4->protocol == IPPROTO_TCP) { 1941 union tcp_flags tcp_flags = { .value = 0 }; 1942 1943 if (l4_load_tcp_flags(ctx, l3_off + ipv4_hdrlen(ip4), &tcp_flags) < 0) 1944 return DROP_CT_INVALID_HDR; 1945 1946 /* The GENEVE option is required only for the first packet 1947 * (SYN), in the case of TCP, as for further packets of the 1948 * same connection a remote node will use a NAT entry to 1949 * reverse xlate a reply. 
1950 */ 1951 if (!(tcp_flags.value & (TCP_FLAG_SYN))) 1952 need_opt = false; 1953 } 1954 1955 if (need_opt) { 1956 encap_len += sizeof(struct geneve_dsr_opt4); 1957 set_geneve_dsr_opt4(svc_port, svc_addr, &gopt); 1958 } 1959 1960 if (dsr_is_too_big(ctx, total_len + encap_len)) { 1961 *ohead = encap_len; 1962 return DROP_FRAG_NEEDED; 1963 } 1964 1965 #if __ctx_is == __ctx_xdp 1966 if (has_encap) { 1967 int outer_l4_off = ETH_HLEN + ipv4_hdrlen(outer_ip4); 1968 __be32 lb_ip = IPV4_DIRECT_ROUTING; 1969 __wsum sum = 0; 1970 1971 /* update outer_ip4 daddr and saddr: */ 1972 sum = csum_diff(&outer_ip4->daddr, 4, &tunnel_endpoint, 4, 0); 1973 if (ctx_store_bytes(ctx, ETH_HLEN + offsetof(struct iphdr, daddr), 1974 &tunnel_endpoint, 4, 0) < 0) 1975 return DROP_WRITE_ERROR; 1976 1977 sum = csum_diff(&outer_ip4->saddr, 4, &lb_ip, 4, sum); 1978 if (ctx_store_bytes(ctx, ETH_HLEN + offsetof(struct iphdr, saddr), 1979 &lb_ip, 4, 0) < 0) 1980 return DROP_WRITE_ERROR; 1981 1982 /* adjust outer_ip4->csum: */ 1983 if (ipv4_csum_update_by_diff(ctx, ETH_HLEN, sum) < 0) 1984 return DROP_CSUM_L3; 1985 1986 /* insert the GENEVE-DSR option: */ 1987 if (need_opt) { 1988 __be16 new_length; 1989 int ret; 1990 1991 /* update udp->len */ 1992 if (ctx_load_bytes(ctx, outer_l4_off + offsetof(struct udphdr, len), 1993 &new_length, sizeof(new_length)) < 0) 1994 return DROP_INVALID; 1995 1996 new_length = bpf_htons(bpf_ntohs(new_length) + sizeof(gopt)); 1997 1998 if (ctx_store_bytes(ctx, outer_l4_off + offsetof(struct udphdr, len), 1999 &new_length, sizeof(new_length), 0) < 0) 2000 return DROP_WRITE_ERROR; 2001 2002 /* update outer_ip4->tot_len */ 2003 new_length = bpf_htons(total_len + sizeof(gopt)); 2004 2005 if (ipv4_csum_update_by_value(ctx, ETH_HLEN, outer_ip4->tot_len, 2006 new_length, sizeof(new_length)) < 0) 2007 return DROP_CSUM_L3; 2008 2009 if (ctx_store_bytes(ctx, ETH_HLEN + offsetof(struct iphdr, tot_len), 2010 &new_length, sizeof(new_length), 0) < 0) 2011 return DROP_WRITE_ERROR; 2012 2013 ret = ctx_set_tunnel_opt(ctx, (__u8 *)&gopt, sizeof(gopt)); 2014 if (ret) 2015 return ret; 2016 } 2017 2018 return CTX_ACT_REDIRECT; 2019 } 2020 #endif 2021 2022 if (need_opt) 2023 return nodeport_add_tunnel_encap_opt(ctx, 2024 IPV4_DIRECT_ROUTING, 2025 src_port, 2026 tunnel_endpoint, 2027 src_sec_identity, 2028 dst_sec_identity, 2029 &gopt, 2030 sizeof(gopt), 2031 (enum trace_reason)CT_NEW, 2032 TRACE_PAYLOAD_LEN, 2033 ifindex); 2034 2035 return nodeport_add_tunnel_encap(ctx, 2036 IPV4_DIRECT_ROUTING, 2037 src_port, 2038 tunnel_endpoint, 2039 src_sec_identity, 2040 dst_sec_identity, 2041 (enum trace_reason)CT_NEW, 2042 TRACE_PAYLOAD_LEN, 2043 ifindex); 2044 } 2045 #endif /* DSR_ENCAP_MODE */ 2046 2047 static __always_inline int 2048 nodeport_extract_dsr_v4(struct __ctx_buff *ctx, 2049 const struct iphdr *ip4 __maybe_unused, 2050 const struct ipv4_ct_tuple *tuple, int l4_off, 2051 __be32 *addr, __be16 *port, bool *dsr) 2052 { 2053 struct ipv4_ct_tuple tmp = *tuple; 2054 2055 /* Parse DSR info from the packet, to get the addr/port of the 2056 * addressed service. We need this for RevDNATing the backend's replies. 2057 * 2058 * TCP connections have the DSR Option only in their SYN packet. 2059 * To identify that a non-SYN packet belongs to a DSR connection, 2060 * we need to check whether a corresponding CT entry with .dsr flag exists. 
2061 */ 2062 if (tuple->nexthdr == IPPROTO_TCP) { 2063 union tcp_flags tcp_flags = {}; 2064 2065 if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0) 2066 return DROP_CT_INVALID_HDR; 2067 2068 ipv4_ct_tuple_reverse(&tmp); 2069 2070 if (!(tcp_flags.value & TCP_FLAG_SYN)) { 2071 /* If the packet belongs to a tracked DSR connection, 2072 * trigger a CT update. 2073 * We don't have any DSR info to report back, and that's ok. 2074 */ 2075 *dsr = ct_has_dsr_egress_entry4(get_ct_map4(&tmp), &tmp); 2076 *port = 0; 2077 return 0; 2078 } 2079 } 2080 2081 #if defined(IS_BPF_OVERLAY) 2082 { 2083 struct geneve_dsr_opt4 gopt; 2084 int ret = 0; 2085 2086 ret = ctx_get_tunnel_opt(ctx, &gopt, sizeof(gopt)); 2087 2088 if (ret > 0) { 2089 if (gopt.hdr.opt_class == bpf_htons(DSR_GENEVE_OPT_CLASS) && 2090 gopt.hdr.type == DSR_GENEVE_OPT_TYPE) { 2091 *dsr = true; 2092 *port = gopt.port; 2093 *addr = gopt.addr; 2094 return 0; 2095 } 2096 } 2097 } 2098 #else 2099 /* Check whether IPv4 header contains a 64-bit option (IPv4 header 2100 * w/o option (5 x 32-bit words) + the DSR option (2 x 32-bit words)). 2101 */ 2102 if (ip4->ihl >= 0x7) { 2103 struct dsr_opt_v4 opt; 2104 2105 if (ctx_load_bytes(ctx, ETH_HLEN + sizeof(struct iphdr), 2106 &opt, sizeof(opt)) < 0) 2107 return DROP_INVALID; 2108 2109 if (opt.type == DSR_IPV4_OPT_TYPE && opt.len == sizeof(opt)) { 2110 *dsr = true; 2111 *addr = bpf_ntohl(opt.addr); 2112 *port = bpf_ntohs(opt.port); 2113 return 0; 2114 } 2115 } 2116 #endif 2117 2118 /* SYN for a new connection that's not / no longer DSR. 2119 * If it's reopened, avoid sending subsequent traffic down the DSR path. 2120 */ 2121 if (tuple->nexthdr == IPPROTO_TCP) 2122 ct_update_dsr(get_ct_map4(&tmp), &tmp, false); 2123 2124 return 0; 2125 } 2126 2127 static __always_inline struct ipv4_nat_entry * 2128 nodeport_dsr_lookup_v4_nat_entry(const struct ipv4_ct_tuple *nat_tuple) 2129 { 2130 return snat_v4_lookup(nat_tuple); 2131 } 2132 2133 static __always_inline int dsr_reply_icmp4(struct __ctx_buff *ctx, 2134 struct iphdr *ip4 __maybe_unused, 2135 __be32 svc_addr __maybe_unused, 2136 __be16 dport __maybe_unused, 2137 int code, __be16 ohead __maybe_unused) 2138 { 2139 #ifdef ENABLE_DSR_ICMP_ERRORS 2140 const __s32 orig_dgram = 8, off = ETH_HLEN; 2141 const __u32 l3_max = MAX_IPOPTLEN + sizeof(*ip4) + orig_dgram; 2142 __be16 type = bpf_htons(ETH_P_IP); 2143 __s32 len_new = off + ipv4_hdrlen(ip4) + orig_dgram; 2144 __s32 len_old = ctx_full_len(ctx); 2145 __u8 reason = (__u8)-code; 2146 __u8 tmp[l3_max]; 2147 union macaddr smac, dmac; 2148 struct icmphdr icmp __align_stack_8 = { 2149 .type = ICMP_DEST_UNREACH, 2150 .code = ICMP_FRAG_NEEDED, 2151 .un = { 2152 .frag = { 2153 .mtu = bpf_htons(THIS_MTU - ohead), 2154 }, 2155 }, 2156 }; 2157 __u64 tot_len = sizeof(struct iphdr) + ipv4_hdrlen(ip4) + sizeof(icmp) + orig_dgram; 2158 struct iphdr ip __align_stack_8 = { 2159 .ihl = sizeof(ip) >> 2, 2160 .version = IPVERSION, 2161 .ttl = IPDEFTTL, 2162 .tos = ip4->tos, 2163 .id = ip4->id, 2164 .protocol = IPPROTO_ICMP, 2165 .saddr = ip4->daddr, 2166 .daddr = ip4->saddr, 2167 .frag_off = bpf_htons(IP_DF), 2168 .tot_len = bpf_htons((__u16)tot_len), 2169 }; 2170 2171 struct iphdr inner_ip_hdr __align_stack_8 = *ip4; 2172 __s32 l4_dport_offset; 2173 2174 /* DSR changes the destination address from service ip to pod ip and 2175 * destination port from service port to pod port. While responding 2176 * back with an ICMP error, it is necessary to set them back to the 2177 * original ip and port. 2178 * We do recompute the whole checksum here.
Another way would be to 2179 * unfold checksum and then do the math adding the diff. 2180 */ 2181 inner_ip_hdr.daddr = svc_addr; 2182 inner_ip_hdr.check = 0; 2183 inner_ip_hdr.check = csum_fold(csum_diff(NULL, 0, &inner_ip_hdr, 2184 sizeof(inner_ip_hdr), 0)); 2185 2186 if (inner_ip_hdr.protocol == IPPROTO_UDP) 2187 l4_dport_offset = UDP_DPORT_OFF; 2188 else if (inner_ip_hdr.protocol == IPPROTO_TCP) 2189 l4_dport_offset = TCP_DPORT_OFF; 2190 2191 update_metrics(ctx_full_len(ctx), METRIC_EGRESS, reason); 2192 2193 if (eth_load_saddr(ctx, smac.addr, 0) < 0) 2194 goto drop_err; 2195 if (eth_load_daddr(ctx, dmac.addr, 0) < 0) 2196 goto drop_err; 2197 2198 ip.check = csum_fold(csum_diff(NULL, 0, &ip, sizeof(ip), 0)); 2199 2200 /* We use a workaround here in that we push zero-bytes into the 2201 * payload in order to support dynamic IPv4 header size. This 2202 * works given one's complement sum does not change. 2203 */ 2204 memset(tmp, 0, MAX_IPOPTLEN); 2205 if (ctx_store_bytes(ctx, len_new, tmp, MAX_IPOPTLEN, 0) < 0) 2206 goto drop_err; 2207 if (ctx_load_bytes(ctx, off, tmp, sizeof(tmp)) < 0) 2208 goto drop_err; 2209 2210 memcpy(tmp, &inner_ip_hdr, sizeof(inner_ip_hdr)); 2211 memcpy(tmp + sizeof(inner_ip_hdr) + l4_dport_offset, &dport, sizeof(dport)); 2212 2213 icmp.checksum = csum_fold(csum_diff(NULL, 0, tmp, sizeof(tmp), 2214 csum_diff(NULL, 0, &icmp, 2215 sizeof(icmp), 0))); 2216 2217 if (ctx_adjust_troom(ctx, -(len_old - len_new)) < 0) 2218 goto drop_err; 2219 if (ctx_adjust_hroom(ctx, sizeof(ip) + sizeof(icmp), 2220 BPF_ADJ_ROOM_NET, 2221 ctx_adjust_hroom_flags()) < 0) 2222 goto drop_err; 2223 2224 if (eth_store_daddr(ctx, smac.addr, 0) < 0) 2225 goto drop_err; 2226 if (eth_store_saddr(ctx, dmac.addr, 0) < 0) 2227 goto drop_err; 2228 if (ctx_store_bytes(ctx, ETH_ALEN * 2, &type, sizeof(type), 0) < 0) 2229 goto drop_err; 2230 if (ctx_store_bytes(ctx, off, &ip, sizeof(ip), 0) < 0) 2231 goto drop_err; 2232 if (ctx_store_bytes(ctx, off + sizeof(ip), &icmp, 2233 sizeof(icmp), 0) < 0) 2234 goto drop_err; 2235 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp), 2236 &inner_ip_hdr, sizeof(inner_ip_hdr), 0) < 0) 2237 goto drop_err; 2238 if (ctx_store_bytes(ctx, off + sizeof(ip) + sizeof(icmp) 2239 + sizeof(inner_ip_hdr) + l4_dport_offset, 2240 &dport, sizeof(dport), 0) < 0) 2241 goto drop_err; 2242 2243 return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0); 2244 drop_err: 2245 #endif 2246 return send_drop_notify_error(ctx, UNKNOWN_ID, code, CTX_ACT_DROP, 2247 METRIC_EGRESS); 2248 } 2249 2250 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_DSR) 2251 int tail_nodeport_ipv4_dsr(struct __ctx_buff *ctx) 2252 { 2253 void *data, *data_end; 2254 struct iphdr *ip4; 2255 int ret, oif = 0; 2256 __be16 ohead = 0; 2257 __s8 ext_err = 0; 2258 __be32 addr; 2259 __be16 port; 2260 2261 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2262 ret = DROP_INVALID; 2263 goto drop_err; 2264 } 2265 addr = ctx_load_meta(ctx, CB_ADDR_V4); 2266 port = (__be16)ctx_load_meta(ctx, CB_PORT); 2267 2268 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 2269 ret = dsr_set_ipip4(ctx, ip4, 2270 addr, 2271 ctx_load_meta(ctx, CB_HINT), &ohead); 2272 #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE 2273 ret = dsr_set_opt4(ctx, ip4, 2274 addr, 2275 port, &ohead); 2276 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 2277 ret = encap_geneve_dsr_opt4(ctx, ctx_load_meta(ctx, CB_DSR_L3_OFF), 2278 ip4, addr, port, &oif, &ohead); 2279 if (!IS_ERR(ret)) { 2280 if (ret == CTX_ACT_REDIRECT && oif) { 2281 cilium_capture_out(ctx); 2282 return ctx_redirect(ctx, oif, 
0); 2283 } 2284 } 2285 #else 2286 # error "Invalid load balancer DSR encapsulation mode!" 2287 #endif 2288 if (IS_ERR(ret)) { 2289 if (dsr_fail_needs_reply(ret)) 2290 return dsr_reply_icmp4(ctx, ip4, addr, port, ret, ohead); 2291 goto drop_err; 2292 } 2293 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2294 ret = DROP_INVALID; 2295 goto drop_err; 2296 } 2297 ret = fib_redirect_v4(ctx, ETH_HLEN, ip4, true, false, &ext_err, &oif); 2298 if (fib_ok(ret)) { 2299 cilium_capture_out(ctx); 2300 return ret; 2301 } 2302 drop_err: 2303 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 2304 CTX_ACT_DROP, METRIC_EGRESS); 2305 } 2306 2307 static __always_inline int 2308 nodeport_dsr_ingress_ipv4(struct __ctx_buff *ctx, struct ipv4_ct_tuple *tuple, 2309 struct iphdr *ip4, bool has_l4_header, int l4_off, 2310 __be32 addr, __be16 port, __s8 *ext_err) 2311 { 2312 struct ct_state ct_state_new = {}; 2313 __u32 monitor = 0; 2314 int ret; 2315 2316 /* lookup with SCOPE_FORWARD: */ 2317 __ipv4_ct_tuple_reverse(tuple); 2318 2319 ret = ct_lazy_lookup4(get_ct_map4(tuple), tuple, ctx, ipv4_is_fragment(ip4), 2320 l4_off, has_l4_header, CT_EGRESS, SCOPE_FORWARD, 2321 CT_ENTRY_DSR, NULL, &monitor); 2322 if (ret < 0) 2323 return ret; 2324 2325 switch (ret) { 2326 case CT_NEW: 2327 create_ct: 2328 if (port == 0) 2329 /* Not expected at all - nodeport_extract_dsr_v4() said 2330 * there would be a CT entry! Without DSR info we can't 2331 * do anything smart here. 2332 */ 2333 return DROP_INVALID; 2334 2335 ct_state_new.src_sec_id = WORLD_IPV4_ID; 2336 ct_state_new.dsr_internal = 1; 2337 2338 ret = ct_create4(get_ct_map4(tuple), NULL, tuple, ctx, 2339 CT_EGRESS, &ct_state_new, ext_err); 2340 if (!IS_ERR(ret)) 2341 ret = snat_v4_create_dsr(tuple, addr, port, ext_err); 2342 2343 if (IS_ERR(ret)) 2344 return ret; 2345 break; 2346 case CT_ESTABLISHED: 2347 /* For TCP we only expect DSR info on the SYN, so CT_ESTABLISHED 2348 * is unexpected and we need to refresh the CT entry. 2349 * 2350 * Otherwise we tolerate DSR info on an established connection. 2351 * TODO: how do we know if we need to refresh the SNAT entry? 2352 */ 2353 if (tuple->nexthdr == IPPROTO_TCP && port) 2354 goto create_ct; 2355 break; 2356 default: 2357 return DROP_UNKNOWN_CT; 2358 } 2359 2360 return CTX_ACT_OK; 2361 } 2362 #endif /* ENABLE_DSR */ 2363 2364 static __always_inline struct lb4_reverse_nat * 2365 nodeport_rev_dnat_get_info_ipv4(struct __ctx_buff *ctx, 2366 struct ipv4_ct_tuple *tuple) 2367 { 2368 struct ipv4_nat_entry *dsr_entry __maybe_unused; 2369 struct ipv4_ct_tuple dsr_tuple __maybe_unused; 2370 __u16 rev_nat_index = 0; 2371 2372 if (!ct_has_nodeport_egress_entry4(get_ct_map4(tuple), tuple, 2373 &rev_nat_index, is_defined(ENABLE_DSR))) 2374 return NULL; 2375 2376 if (rev_nat_index) 2377 return lb4_lookup_rev_nat_entry(ctx, rev_nat_index); 2378 2379 #ifdef ENABLE_DSR 2380 dsr_tuple = *tuple; 2381 2382 dsr_tuple.flags = NAT_DIR_EGRESS; 2383 dsr_tuple.sport = tuple->dport; 2384 dsr_tuple.dport = tuple->sport; 2385 2386 dsr_entry = nodeport_dsr_lookup_v4_nat_entry(&dsr_tuple); 2387 if (dsr_entry) 2388 return &dsr_entry->nat_info; 2389 #endif 2390 2391 return NULL; 2392 } 2393 2394 /* Reverse NAT handling of node-port traffic for the case where the 2395 * backend i) was a local EP and bpf_lxc redirected to us, ii) was 2396 * a remote backend and we got here after reverse SNAT from the 2397 * tail_nodeport_nat_ingress_ipv4(). 2398 * 2399 * Also handles reverse NAT for return-path egress-gw traffic.
2400 * 2401 * CILIUM_CALL_IPV{4,6}_NODEPORT_REVNAT is plugged into CILIUM_MAP_CALLS 2402 * of the bpf_host, bpf_overlay and of the bpf_lxc. 2403 */ 2404 static __always_inline int 2405 nodeport_rev_dnat_ingress_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace, 2406 __s8 *ext_err) 2407 { 2408 struct bpf_fib_lookup_padded fib_params = { 2409 .l = { 2410 .family = AF_INET, 2411 .ifindex = ctx_get_ifindex(ctx), 2412 }, 2413 }; 2414 int ifindex = 0, ret, l3_off = ETH_HLEN, l4_off; 2415 struct ipv4_ct_tuple tuple = {}; 2416 struct ct_state ct_state = {}; 2417 void *data, *data_end; 2418 struct iphdr *ip4; 2419 __u32 tunnel_endpoint __maybe_unused = 0; 2420 __u32 dst_sec_identity __maybe_unused = 0; 2421 __u32 src_sec_identity __maybe_unused = SECLABEL; 2422 bool allow_neigh_map = true; 2423 bool check_revdnat = true; 2424 bool has_l4_header; 2425 2426 if (!revalidate_data(ctx, &data, &data_end, &ip4)) 2427 return DROP_INVALID; 2428 2429 has_l4_header = ipv4_has_l4_header(ip4); 2430 2431 ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple); 2432 if (ret < 0) { 2433 /* If it's not a SVC protocol, we don't need to check for RevDNAT: */ 2434 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 2435 check_revdnat = false; 2436 else 2437 return ret; 2438 } 2439 2440 #if defined(ENABLE_EGRESS_GATEWAY_COMMON) && !defined(IS_BPF_OVERLAY) 2441 /* The gateway node needs to manually steer any reply traffic 2442 * for a remote pod into the tunnel (to avoid iptables potentially 2443 * dropping or accidentally SNATing the packets). 2444 */ 2445 if (egress_gw_reply_needs_redirect_hook(ip4, &tunnel_endpoint, &dst_sec_identity)) { 2446 trace->reason = TRACE_REASON_CT_REPLY; 2447 goto redirect; 2448 } 2449 #endif /* ENABLE_EGRESS_GATEWAY_COMMON */ 2450 2451 if (!check_revdnat) 2452 goto out; 2453 2454 ret = nodeport_rev_dnat_ingress_ipv4_hook(ctx, ip4, &tuple, &tunnel_endpoint, 2455 &src_sec_identity, &dst_sec_identity); 2456 if (ret == CTX_ACT_OK) 2457 return ret; 2458 else if (ret == CTX_ACT_REDIRECT) 2459 goto redirect; 2460 2461 ret = ct_lazy_lookup4(get_ct_map4(&tuple), &tuple, ctx, ipv4_is_fragment(ip4), 2462 l4_off, has_l4_header, CT_INGRESS, SCOPE_REVERSE, 2463 CT_ENTRY_NODEPORT, &ct_state, &trace->monitor); 2464 if (ret == CT_REPLY) { 2465 trace->reason = TRACE_REASON_CT_REPLY; 2466 ret = lb4_rev_nat(ctx, l3_off, l4_off, ct_state.rev_nat_index, false, 2467 &tuple, has_l4_header); 2468 if (IS_ERR(ret)) 2469 return ret; 2470 if (!revalidate_data(ctx, &data, &data_end, &ip4)) 2471 return DROP_INVALID; 2472 ctx_snat_done_set(ctx); 2473 #ifndef HAVE_FIB_IFINDEX 2474 ifindex = ct_state.ifindex; 2475 #endif 2476 #if defined(TUNNEL_MODE) 2477 { 2478 struct remote_endpoint_info *info; 2479 2480 info = lookup_ip4_remote_endpoint(ip4->daddr, 0); 2481 if (info && info->tunnel_endpoint && !info->flag_skip_tunnel) { 2482 tunnel_endpoint = info->tunnel_endpoint; 2483 dst_sec_identity = info->sec_identity; 2484 } 2485 } 2486 #endif 2487 2488 goto redirect; 2489 } 2490 out: 2491 return CTX_ACT_OK; 2492 2493 redirect: 2494 fib_params.l.ipv4_src = ip4->saddr; 2495 fib_params.l.ipv4_dst = ip4->daddr; 2496 2497 ret = ipv4_l3(ctx, l3_off, NULL, NULL, ip4); 2498 if (unlikely(ret != CTX_ACT_OK)) 2499 return ret; 2500 2501 #if (defined(ENABLE_EGRESS_GATEWAY_COMMON) && !defined(IS_BPF_OVERLAY)) || defined(TUNNEL_MODE) 2502 if (tunnel_endpoint) { 2503 __be16 src_port = tunnel_gen_src_port_v4(&tuple); 2504 2505 ret = nodeport_add_tunnel_encap(ctx, IPV4_DIRECT_ROUTING, src_port, 2506 tunnel_endpoint, 
src_sec_identity, dst_sec_identity, 2507 trace->reason, trace->monitor, &ifindex); 2508 if (IS_ERR(ret)) 2509 return ret; 2510 2511 if (ret == CTX_ACT_REDIRECT && ifindex) 2512 return ctx_redirect(ctx, ifindex, 0); 2513 2514 fib_params.l.ipv4_src = IPV4_DIRECT_ROUTING; 2515 fib_params.l.ipv4_dst = tunnel_endpoint; 2516 2517 /* neigh map doesn't contain DMACs for other nodes */ 2518 allow_neigh_map = false; 2519 } 2520 #endif 2521 2522 return fib_redirect(ctx, true, &fib_params, allow_neigh_map, ext_err, &ifindex); 2523 } 2524 2525 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_REVNAT) 2526 static __always_inline 2527 int tail_nodeport_rev_dnat_ingress_ipv4(struct __ctx_buff *ctx) 2528 { 2529 struct trace_ctx trace = { 2530 .reason = TRACE_REASON_UNKNOWN, 2531 .monitor = TRACE_PAYLOAD_LEN, 2532 }; 2533 __s8 ext_err = 0; 2534 int ret = 0; 2535 2536 ret = nodeport_rev_dnat_ingress_ipv4(ctx, &trace, &ext_err); 2537 if (IS_ERR(ret)) 2538 goto drop_err; 2539 2540 if (ret == CTX_ACT_OK) { 2541 /* When called by bpf_lxc to handle a reply by a local backend, 2542 * the packet *must* be redirected. 2543 */ 2544 if (is_defined(IS_BPF_LXC)) { 2545 ret = DROP_NAT_NO_MAPPING; 2546 goto drop_err; 2547 } 2548 2549 ctx_skip_nodeport_set(ctx); 2550 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_FROM_NETDEV, &ext_err); 2551 goto drop_err; 2552 } 2553 2554 #ifndef IS_BPF_LXC 2555 edt_set_aggregate(ctx, 0); 2556 #endif 2557 cilium_capture_out(ctx); 2558 return ret; 2559 2560 drop_err: 2561 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 2562 CTX_ACT_DROP, METRIC_EGRESS); 2563 } 2564 2565 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS) 2566 static __always_inline 2567 int tail_nodeport_nat_ingress_ipv4(struct __ctx_buff *ctx) 2568 { 2569 struct ipv4_nat_target target = { 2570 .min_port = NODEPORT_PORT_MIN_NAT, 2571 .max_port = NODEPORT_PORT_MAX_NAT, 2572 }; 2573 struct trace_ctx trace = { 2574 .reason = TRACE_REASON_UNKNOWN, 2575 .monitor = TRACE_PAYLOAD_LEN, 2576 }; 2577 __u32 src_id = 0; 2578 __s8 ext_err = 0; 2579 int ret; 2580 2581 ret = snat_v4_rev_nat(ctx, &target, &trace, &ext_err); 2582 if (IS_ERR(ret)) { 2583 if (ret == NAT_PUNT_TO_STACK || 2584 /* DROP_NAT_NO_MAPPING is unwanted behavior in a 2585 * rev-SNAT context. Let's continue passing it up 2586 * to the host and revisit this later if 2587 * needed. 2588 */ 2589 ret == DROP_NAT_NO_MAPPING) { 2590 /* In case of no mapping, recircle back to 2591 * main path. SNAT is very expensive in terms 2592 * of instructions and 2593 * complexity. Consequently, this is done 2594 * inside a tail call here (because we don't 2595 * have BPF to BPF calls). 2596 */ 2597 goto recircle; 2598 } 2599 goto drop_err; 2600 } 2601 2602 ctx_snat_done_set(ctx); 2603 2604 /* At this point we know that a reverse SNAT mapping exists. 2605 * Otherwise, we would have tail-called back to 2606 * CALL_IPV4_FROM_NETDEV in the code above. 2607 */ 2608 #if !defined(ENABLE_DSR) || (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ 2609 (defined(ENABLE_EGRESS_GATEWAY_COMMON) && !defined(IS_BPF_OVERLAY)) 2610 2611 # if defined(ENABLE_HOST_FIREWALL) && defined(IS_BPF_HOST) 2612 ret = ipv4_host_policy_ingress(ctx, &src_id, &trace, &ext_err); 2613 if (IS_ERR(ret)) 2614 goto drop_err; 2615 2616 /* We don't want to enforce host policies a second time, 2617 * on recircle / after RevDNAT.
2618 */ 2619 ctx_skip_host_fw_set(ctx); 2620 # endif 2621 2622 /* If we're not in full DSR mode, reply traffic from remote backends 2623 * might pass back through the LB node and requires revDNAT. 2624 * 2625 * Also let nodeport_rev_dnat_ingress_ipv4() redirect EgressGW 2626 * reply traffic into tunnel (see there for details). 2627 */ 2628 ret = invoke_traced_tailcall_if(__and(is_defined(ENABLE_HOST_FIREWALL), 2629 is_defined(IS_BPF_HOST)), 2630 CILIUM_CALL_IPV4_NODEPORT_REVNAT, 2631 nodeport_rev_dnat_ingress_ipv4, 2632 &trace, &ext_err); 2633 if (IS_ERR(ret)) 2634 goto drop_err; 2635 2636 /* No redirect needed: */ 2637 if (ret == CTX_ACT_OK) 2638 goto recircle; 2639 2640 /* Redirected to egress interface: */ 2641 edt_set_aggregate(ctx, 0); 2642 cilium_capture_out(ctx); 2643 return ret; 2644 #endif 2645 2646 recircle: 2647 ctx_skip_nodeport_set(ctx); 2648 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_FROM_NETDEV, &ext_err); 2649 2650 drop_err: 2651 return send_drop_notify_error_ext(ctx, src_id, ret, ext_err, CTX_ACT_DROP, 2652 METRIC_INGRESS); 2653 } 2654 2655 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT_EGRESS) 2656 static __always_inline 2657 int tail_nodeport_nat_egress_ipv4(struct __ctx_buff *ctx) 2658 { 2659 struct bpf_fib_lookup_padded fib_params = { 2660 .l = { 2661 .family = AF_INET, 2662 .ifindex = ctx_get_ifindex(ctx), 2663 }, 2664 }; 2665 struct ipv4_nat_target target = { 2666 .min_port = NODEPORT_PORT_MIN_NAT, 2667 .max_port = NODEPORT_PORT_MAX_NAT, 2668 /* Unfortunately, the bpf_fib_lookup() is not able to set src IP addr. 2669 * So we need to assume that the direct routing device is going to be 2670 * used to fwd the NodePort request, thus SNAT-ing to its IP addr. 2671 * This will change once we have resolved GH#17158. 
2672 */ 2673 .addr = IPV4_DIRECT_ROUTING, 2674 }; 2675 struct ipv4_ct_tuple tuple = {}; 2676 struct trace_ctx trace = { 2677 .reason = (enum trace_reason)CT_NEW, 2678 .monitor = TRACE_PAYLOAD_LEN, 2679 }; 2680 int ret, l4_off, oif = 0; 2681 void *data, *data_end; 2682 bool has_l4_header; 2683 struct iphdr *ip4; 2684 __s8 ext_err = 0; 2685 __u32 dst_sec_identity __maybe_unused = 0; 2686 #ifdef TUNNEL_MODE 2687 __u32 src_sec_identity = ctx_load_meta(ctx, CB_SRC_LABEL); 2688 __u8 cluster_id __maybe_unused = (__u8)ctx_load_meta(ctx, CB_CLUSTER_ID_EGRESS); 2689 struct remote_endpoint_info *info; 2690 __be32 tunnel_endpoint = 0; 2691 #endif 2692 2693 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2694 ret = DROP_INVALID; 2695 goto drop_err; 2696 } 2697 2698 has_l4_header = ipv4_has_l4_header(ip4); 2699 2700 #ifdef TUNNEL_MODE 2701 info = lookup_ip4_remote_endpoint(ip4->daddr, cluster_id); 2702 if (info && info->tunnel_endpoint != 0 && !info->flag_skip_tunnel) { 2703 tunnel_endpoint = info->tunnel_endpoint; 2704 dst_sec_identity = info->sec_identity; 2705 2706 target.addr = IPV4_GATEWAY; 2707 #if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT) 2708 if (cluster_id && cluster_id != CLUSTER_ID) 2709 target.addr = IPV4_INTER_CLUSTER_SNAT; 2710 #endif 2711 } 2712 #endif 2713 2714 ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple); 2715 if (IS_ERR(ret)) 2716 goto drop_err; 2717 2718 /* Extracted ports are in flipped order, but SNAT wants them to 2719 * match the packet header: 2720 */ 2721 ipv4_ct_tuple_swap_ports(&tuple); 2722 tuple.flags = TUPLE_F_OUT; 2723 2724 ret = nodeport_nat_egress_ipv4_hook(ctx, ip4, dst_sec_identity, &tuple, l4_off, &ext_err); 2725 if (ret != CTX_ACT_OK) 2726 return ret; 2727 2728 ret = ipv4_l3(ctx, ETH_HLEN, NULL, NULL, ip4); 2729 if (unlikely(ret != CTX_ACT_OK)) 2730 goto drop_err; 2731 2732 ret = __snat_v4_nat(ctx, &tuple, ip4, has_l4_header, l4_off, 2733 true, &target, TCP_SPORT_OFF, &trace, &ext_err); 2734 if (IS_ERR(ret)) 2735 goto drop_err; 2736 2737 /* This is also needed for from-overlay, to avoid a second SNAT by 2738 * to-overlay or to-netdev. 2739 */ 2740 ctx_snat_done_set(ctx); 2741 2742 #ifdef TUNNEL_MODE 2743 if (tunnel_endpoint) { 2744 __be16 src_port; 2745 2746 #if __ctx_is == __ctx_skb 2747 { 2748 /* Append L2 hdr before redirecting to tunnel netdev. 2749 * Otherwise, the kernel will drop such a request in 2750 * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/net/core/filter.c?h=v6.7.4#n2147 2751 */ 2752 bool l2_hdr_required = false; 2753 2754 ret = maybe_add_l2_hdr(ctx, ENCAP_IFINDEX, &l2_hdr_required); 2755 if (ret != 0) 2756 goto drop_err; 2757 } 2758 #endif 2759 2760 src_port = tunnel_gen_src_port_v4(&tuple); 2761 2762 /* The request came from outside, so we need to 2763 * set the security id in the tunnel header to WORLD_ID. 2764 * Otherwise, the remote node will assume that the 2765 * request originated from a cluster node, which would 2766 * bypass any netpol that disallows LB requests from 2767 * outside.
2768 */ 2769 ret = nodeport_add_tunnel_encap(ctx, 2770 IPV4_DIRECT_ROUTING, 2771 src_port, 2772 tunnel_endpoint, 2773 src_sec_identity, 2774 dst_sec_identity, 2775 trace.reason, 2776 trace.monitor, 2777 &oif); 2778 if (IS_ERR(ret)) 2779 goto drop_err; 2780 2781 if (ret == CTX_ACT_REDIRECT && oif) { 2782 cilium_capture_out(ctx); 2783 return ctx_redirect(ctx, oif, 0); 2784 } 2785 } 2786 #endif 2787 if (!revalidate_data(ctx, &data, &data_end, &ip4)) { 2788 ret = DROP_INVALID; 2789 goto drop_err; 2790 } 2791 2792 fib_params.l.ipv4_src = ip4->saddr; 2793 fib_params.l.ipv4_dst = ip4->daddr; 2794 2795 ret = fib_redirect(ctx, true, &fib_params, false, &ext_err, &oif); 2796 if (fib_ok(ret)) { 2797 cilium_capture_out(ctx); 2798 return ret; 2799 } 2800 drop_err: 2801 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 2802 CTX_ACT_DROP, METRIC_EGRESS); 2803 } 2804 2805 static __always_inline int nodeport_svc_lb4(struct __ctx_buff *ctx, 2806 struct ipv4_ct_tuple *tuple, 2807 struct lb4_service *svc, 2808 struct lb4_key *key, 2809 struct iphdr *ip4, 2810 int l3_off, 2811 bool has_l4_header, 2812 int l4_off, 2813 __u32 src_sec_identity, 2814 __s8 *ext_err) 2815 { 2816 const bool skip_l3_xlate = DSR_ENCAP_MODE == DSR_ENCAP_IPIP; 2817 bool is_fragment = ipv4_is_fragment(ip4); 2818 struct ct_state ct_state_svc = {}; 2819 __u32 cluster_id = 0; 2820 bool backend_local; 2821 __u32 monitor = 0; 2822 int ret; 2823 2824 if (!lb4_src_range_ok(svc, ip4->saddr)) 2825 return DROP_NOT_IN_SRC_RANGE; 2826 2827 if (!lb4_svc_is_routable(svc)) 2828 return DROP_IS_CLUSTER_IP; 2829 2830 #if defined(ENABLE_L7_LB) 2831 if (lb4_svc_is_l7loadbalancer(svc) && svc->l7_lb_proxy_port > 0) { 2832 /* We cannot redirect from the XDP layer to cilium_host. 2833 * Therefore, let bpf_host handle the L7 ingress 2834 * request. 2835 */ 2836 if (ctx_is_xdp()) 2837 return CTX_ACT_OK; 2838 2839 send_trace_notify(ctx, TRACE_TO_PROXY, src_sec_identity, UNKNOWN_ID, 2840 bpf_ntohs((__u16)svc->l7_lb_proxy_port), 2841 NATIVE_DEV_IFINDEX, TRACE_REASON_POLICY, monitor); 2842 return ctx_redirect_to_proxy_hairpin_ipv4(ctx, ip4, 2843 (__be16)svc->l7_lb_proxy_port); 2844 } 2845 #endif 2846 if (lb4_to_lb6_service(svc)) { 2847 ret = lb4_to_lb6(ctx, ip4, l3_off); 2848 if (!ret) 2849 return NAT_46X64_RECIRC; 2850 } else { 2851 ret = lb4_local(get_ct_map4(tuple), ctx, is_fragment, l3_off, l4_off, 2852 key, tuple, svc, &ct_state_svc, 2853 has_l4_header, skip_l3_xlate, &cluster_id, 2854 ext_err); 2855 #ifdef SERVICE_NO_BACKEND_RESPONSE 2856 if (ret == DROP_NO_SERVICE) { 2857 /* Packet is TX'ed back out, avoid EDT false-positives: */ 2858 edt_set_aggregate(ctx, 0); 2859 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_NO_SERVICE, 2860 ext_err); 2861 } 2862 #endif 2863 } 2864 if (IS_ERR(ret)) 2865 return ret; 2866 2867 backend_local = __lookup_ip4_endpoint(tuple->daddr); 2868 if (!backend_local && lb4_svc_is_hostport(svc)) 2869 return DROP_INVALID; 2870 /* A reply to a DSR packet is never seen on this node again, 2871 * hence there is no need to track it here. 2872 */ 2873 if (backend_local || !nodeport_uses_dsr4(tuple)) { 2874 struct ct_state ct_state = {}; 2875 2876 #if (defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)) 2877 if (src_sec_identity == 0) 2878 src_sec_identity = WORLD_IPV4_ID; 2879 2880 /* Before forwarding the identity, make sure it's not local, 2881 * as in that case the next hop wouldn't understand it.
2882 */ 2883 if (identity_is_local(src_sec_identity)) 2884 return DROP_INVALID_IDENTITY; 2885 2886 if (identity_is_host(src_sec_identity)) 2887 return DROP_INVALID_IDENTITY; 2888 #else 2889 src_sec_identity = WORLD_IPV4_ID; 2890 #endif 2891 2892 /* lookup with SCOPE_FORWARD: */ 2893 __ipv4_ct_tuple_reverse(tuple); 2894 2895 /* only match CT entries that belong to the same service: */ 2896 ct_state.rev_nat_index = ct_state_svc.rev_nat_index; 2897 2898 /* Cache is_fragment in advance, lb4_local may invalidate ip4. */ 2899 ret = ct_lazy_lookup4(get_ct_map4(tuple), tuple, ctx, is_fragment, 2900 l4_off, has_l4_header, CT_EGRESS, SCOPE_FORWARD, 2901 CT_ENTRY_NODEPORT, &ct_state, &monitor); 2902 if (ret < 0) 2903 return ret; 2904 2905 switch (ret) { 2906 case CT_NEW: 2907 ct_state.src_sec_id = src_sec_identity; 2908 ct_state.node_port = 1; 2909 #ifndef HAVE_FIB_IFINDEX 2910 ct_state.ifindex = (__u16)NATIVE_DEV_IFINDEX; 2911 #endif 2912 2913 ret = ct_create4(get_ct_map4(tuple), NULL, tuple, ctx, 2914 CT_EGRESS, &ct_state, ext_err); 2915 if (IS_ERR(ret)) 2916 return ret; 2917 break; 2918 case CT_ESTABLISHED: 2919 /* Note that we don't validate whether the matched CT entry 2920 * has identical values (eg. .ifindex) as set above. 2921 */ 2922 break; 2923 default: 2924 return DROP_UNKNOWN_CT; 2925 } 2926 2927 /* Neighbour tracking is needed for local backend until 2928 * https://github.com/cilium/cilium/issues/24062 is resolved. 2929 */ 2930 ret = neigh_record_ip4(ctx); 2931 if (ret < 0) 2932 return ret; 2933 if (backend_local) { 2934 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 2935 return CTX_ACT_OK; 2936 } 2937 } 2938 2939 /* TX request to remote backend: */ 2940 edt_set_aggregate(ctx, 0); 2941 if (nodeport_uses_dsr4(tuple)) { 2942 #if DSR_ENCAP_MODE == DSR_ENCAP_IPIP 2943 ctx_store_meta(ctx, CB_HINT, 2944 ((__u32)tuple->sport << 16) | tuple->dport); 2945 ctx_store_meta(ctx, CB_ADDR_V4, tuple->daddr); 2946 #elif DSR_ENCAP_MODE == DSR_ENCAP_GENEVE || DSR_ENCAP_MODE == DSR_ENCAP_NONE 2947 ctx_store_meta(ctx, CB_PORT, key->dport); 2948 ctx_store_meta(ctx, CB_ADDR_V4, key->address); 2949 ctx_store_meta(ctx, CB_DSR_SRC_LABEL, src_sec_identity); 2950 ctx_store_meta(ctx, CB_DSR_L3_OFF, l3_off); 2951 #endif /* DSR_ENCAP_MODE */ 2952 return tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_DSR, ext_err); 2953 } 2954 2955 ctx_store_meta(ctx, CB_SRC_LABEL, src_sec_identity); 2956 ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id); 2957 return tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_NAT_EGRESS, 2958 ext_err); 2959 } 2960 2961 /* Main node-port entry point for host-external ingressing node-port traffic 2962 * which handles the case of: i) backend is local EP, ii) backend is remote EP, 2963 * iii) reply from remote backend EP. 
2964 */ 2965 static __always_inline int nodeport_lb4(struct __ctx_buff *ctx, 2966 struct iphdr *ip4, 2967 int l3_off, 2968 __u32 src_sec_identity, 2969 __s8 *ext_err, 2970 bool __maybe_unused *dsr) 2971 { 2972 bool has_l4_header = ipv4_has_l4_header(ip4); 2973 struct ipv4_ct_tuple tuple = {}; 2974 bool is_svc_proto = true; 2975 struct lb4_service *svc; 2976 struct lb4_key key = {}; 2977 int ret, l4_off; 2978 2979 cilium_capture_in(ctx); 2980 2981 ret = lb4_extract_tuple(ctx, ip4, l3_off, &l4_off, &tuple); 2982 if (IS_ERR(ret)) { 2983 if (ret == DROP_UNSUPP_SERVICE_PROTO) { 2984 is_svc_proto = false; 2985 goto skip_service_lookup; 2986 } 2987 if (ret == DROP_UNKNOWN_L4) { 2988 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 2989 return CTX_ACT_OK; 2990 } 2991 return ret; 2992 } 2993 2994 lb4_fill_key(&key, &tuple); 2995 2996 svc = lb4_lookup_service(&key, false); 2997 if (svc) { 2998 return nodeport_svc_lb4(ctx, &tuple, svc, &key, ip4, l3_off, 2999 has_l4_header, l4_off, 3000 src_sec_identity, ext_err); 3001 } else { 3002 skip_service_lookup: 3003 #ifdef ENABLE_NAT_46X64_GATEWAY 3004 if (ip4->daddr != IPV4_DIRECT_ROUTING) 3005 return tail_call_internal(ctx, CILIUM_CALL_IPV46_RFC8215, ext_err); 3006 #endif 3007 /* The packet is not destined to a service but it can be a reply 3008 * packet from a remote backend, in which case we need to perform 3009 * the reverse NAT. 3010 */ 3011 ctx_set_xfer(ctx, XFER_PKT_NO_SVC); 3012 3013 #ifdef ENABLE_DSR 3014 #if (defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE) || \ 3015 (!defined(IS_BPF_OVERLAY) && DSR_ENCAP_MODE != DSR_ENCAP_GENEVE) 3016 if (is_svc_proto && nodeport_uses_dsr4(&tuple)) { 3017 /* Check if packet has embedded DSR info, or belongs to 3018 * an established DSR connection: 3019 */ 3020 ret = nodeport_extract_dsr_v4(ctx, ip4, &tuple, 3021 l4_off, &key.address, 3022 &key.dport, dsr); 3023 if (IS_ERR(ret)) 3024 return ret; 3025 3026 if (*dsr) 3027 /* Packet continues on its way to local backend: */ 3028 return nodeport_dsr_ingress_ipv4(ctx, &tuple, ip4, 3029 has_l4_header, l4_off, 3030 key.address, key.dport, 3031 ext_err); 3032 } 3033 #endif 3034 #endif /* ENABLE_DSR */ 3035 3036 #ifndef ENABLE_MASQUERADE_IPV4 3037 /* When BPF-Masquerading is off, we can skip the revSNAT path via 3038 * CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS if: 3039 * - the packet is ICMP, or 3040 * - the packet is DSR-eligible (and thus not reply traffic by 3041 * a remote backend that would require revSNAT / revDNAT) 3042 */ 3043 if (!is_svc_proto || nodeport_uses_dsr4(&tuple)) 3044 return CTX_ACT_OK; 3045 #endif /* ENABLE_MASQUERADE_IPV4 */ 3046 3047 ctx_store_meta(ctx, CB_SRC_LABEL, src_sec_identity); 3048 /* For NAT64 we might see an IPv4 reply from the backend to 3049 * the LB entering this path. Thus, transform back to IPv6. 
3050 */ 3051 if (is_svc_proto && snat_v6_has_v4_match(&tuple)) { 3052 ret = lb4_to_lb6(ctx, ip4, l3_off); 3053 if (ret) 3054 return ret; 3055 ctx_store_meta(ctx, CB_NAT_46X64, 0); 3056 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS, 3057 ext_err); 3058 #ifdef ENABLE_NAT_46X64_GATEWAY 3059 } else if (is_svc_proto && 3060 snat_v6_has_v4_match_rfc8215(&tuple)) { 3061 ret = snat_remap_rfc8215(ctx, ip4, l3_off); 3062 if (ret) 3063 return ret; 3064 ctx_store_meta(ctx, CB_NAT_46X64, NAT46x64_MODE_ROUTE); 3065 return tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS, 3066 ext_err); 3067 #endif 3068 } 3069 3070 return tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS, ext_err); 3071 } 3072 } 3073 3074 static __always_inline int 3075 nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, bool *snat_done, 3076 bool revdnat_only __maybe_unused, 3077 struct trace_ctx *trace, __s8 *ext_err __maybe_unused) 3078 { 3079 struct bpf_fib_lookup_padded fib_params __maybe_unused = {}; 3080 int ret, l3_off = ETH_HLEN, l4_off; 3081 struct lb4_reverse_nat *nat_info; 3082 struct ipv4_ct_tuple tuple = {}; 3083 struct ct_state ct_state = {}; 3084 void *data, *data_end; 3085 bool has_l4_header, is_fragment; 3086 struct iphdr *ip4; 3087 3088 if (!revalidate_data(ctx, &data, &data_end, &ip4)) 3089 return DROP_INVALID; 3090 3091 has_l4_header = ipv4_has_l4_header(ip4); 3092 is_fragment = ipv4_is_fragment(ip4); 3093 3094 ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple); 3095 if (ret < 0) { 3096 /* If it's not a SVC protocol, we don't need to check for RevDNAT: */ 3097 if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4) 3098 return CTX_ACT_OK; 3099 return ret; 3100 } 3101 3102 nat_info = nodeport_rev_dnat_get_info_ipv4(ctx, &tuple); 3103 if (!nat_info) 3104 return CTX_ACT_OK; 3105 3106 #if defined(IS_BPF_HOST) && !defined(ENABLE_SKIP_FIB) 3107 if (revdnat_only) 3108 goto skip_fib; 3109 3110 /* Perform FIB lookup with post-RevDNAT src IP, and redirect 3111 * packet to the correct egress interface: 3112 */ 3113 fib_params.l.family = AF_INET; 3114 fib_params.l.ifindex = ctx_get_ifindex(ctx); 3115 fib_params.l.ipv4_src = nat_info->address; 3116 fib_params.l.ipv4_dst = tuple.daddr; 3117 3118 ret = nodeport_fib_lookup_and_redirect(ctx, &fib_params, ext_err); 3119 if (ret != CTX_ACT_OK) 3120 return ret; 3121 3122 skip_fib: 3123 #endif 3124 3125 /* Cache is_fragment in advance, nodeport_fib_lookup_and_redirect may invalidate ip4. 
*/ 3126 ret = ct_lazy_lookup4(get_ct_map4(&tuple), &tuple, ctx, is_fragment, 3127 l4_off, has_l4_header, CT_INGRESS, SCOPE_REVERSE, 3128 CT_ENTRY_NODEPORT | CT_ENTRY_DSR, 3129 &ct_state, &trace->monitor); 3130 3131 /* nodeport_rev_dnat_get_info_ipv4() just checked that such a 3132 * CT entry exists: 3133 */ 3134 if (ret == CT_REPLY) { 3135 trace->reason = TRACE_REASON_CT_REPLY; 3136 3137 ret = __lb4_rev_nat(ctx, l3_off, l4_off, &tuple, 3138 nat_info, false, has_l4_header); 3139 if (IS_ERR(ret)) 3140 return ret; 3141 3142 *snat_done = true; 3143 3144 #ifdef ENABLE_DSR 3145 #if defined(ENABLE_HIGH_SCALE_IPCACHE) && \ 3146 defined(IS_BPF_OVERLAY) && \ 3147 DSR_ENCAP_MODE == DSR_ENCAP_GENEVE 3148 /* For HS IPCache, we also need to revDNAT the OuterSrcIP: */ 3149 if (ct_state.dsr_internal) { 3150 struct bpf_tunnel_key key; 3151 3152 if (ctx_get_tunnel_key(ctx, &key, sizeof(key), 0) < 0) 3153 return DROP_NO_TUNNEL_KEY; 3154 3155 /* kernel returns addresses in flipped locations: */ 3156 key.remote_ipv4 = key.local_ipv4; 3157 key.local_ipv4 = bpf_ntohl(nat_info->address); 3158 3159 if (ctx_set_tunnel_key(ctx, &key, sizeof(key), 3160 BPF_F_ZERO_CSUM_TX) < 0) 3161 return DROP_WRITE_ERROR; 3162 } 3163 #endif 3164 #endif 3165 } 3166 3167 return CTX_ACT_OK; 3168 } 3169 3170 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD) 3171 int tail_handle_snat_fwd_ipv4(struct __ctx_buff *ctx) 3172 { 3173 __u32 cluster_id = ctx_load_and_clear_meta(ctx, CB_CLUSTER_ID_EGRESS); 3174 struct trace_ctx trace = { 3175 .reason = TRACE_REASON_UNKNOWN, 3176 .monitor = 0, 3177 }; 3178 enum trace_point obs_point; 3179 __be32 saddr = 0; 3180 int ret; 3181 __s8 ext_err = 0; 3182 3183 #ifdef IS_BPF_OVERLAY 3184 obs_point = TRACE_TO_OVERLAY; 3185 #else 3186 obs_point = TRACE_TO_NETWORK; 3187 #endif 3188 3189 ret = nodeport_snat_fwd_ipv4(ctx, cluster_id, &saddr, &trace, &ext_err); 3190 if (IS_ERR(ret)) 3191 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 3192 CTX_ACT_DROP, METRIC_EGRESS); 3193 3194 /* Don't emit a trace event if the packet has been redirected to another 3195 * interface. 3196 * This can happen for egress gateway traffic that needs to egress from 3197 * the interface to which the egress IP is assigned.
3198 */ 3199 if (ret == CTX_ACT_OK) 3200 send_trace_notify4(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID, saddr, 3201 TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX, 3202 trace.reason, trace.monitor); 3203 3204 return ret; 3205 } 3206 3207 static __always_inline int 3208 __handle_nat_fwd_ipv4(struct __ctx_buff *ctx, __u32 cluster_id __maybe_unused, 3209 bool revdnat_only, struct trace_ctx *trace, __s8 *ext_err) 3210 { 3211 bool snat_done = false; 3212 int ret; 3213 3214 ret = nodeport_rev_dnat_fwd_ipv4(ctx, &snat_done, revdnat_only, trace, ext_err); 3215 if (ret != CTX_ACT_OK || revdnat_only) 3216 return ret; 3217 3218 #if !defined(ENABLE_DSR) || \ 3219 (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ 3220 defined(ENABLE_MASQUERADE_IPV4) || \ 3221 (defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)) 3222 if (!snat_done) { 3223 ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id); 3224 ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD, 3225 ext_err); 3226 } 3227 #endif 3228 3229 if (is_defined(IS_BPF_HOST) && snat_done) 3230 ctx_snat_done_set(ctx); 3231 3232 return ret; 3233 } 3234 3235 static __always_inline int 3236 handle_nat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace, 3237 __s8 *ext_err) 3238 { 3239 __u32 cb_nat_flags = ctx_load_and_clear_meta(ctx, CB_NAT_FLAGS); 3240 bool revdnat_only = cb_nat_flags & CB_NAT_FLAGS_REVDNAT_ONLY; 3241 __u32 cluster_id = ctx_load_and_clear_meta(ctx, CB_CLUSTER_ID_EGRESS); 3242 3243 return __handle_nat_fwd_ipv4(ctx, cluster_id, revdnat_only, trace, ext_err); 3244 } 3245 3246 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT_FWD) 3247 static __always_inline 3248 int tail_handle_nat_fwd_ipv4(struct __ctx_buff *ctx) 3249 { 3250 struct trace_ctx trace = { 3251 .reason = TRACE_REASON_UNKNOWN, 3252 .monitor = TRACE_PAYLOAD_LEN, 3253 }; 3254 int ret; 3255 enum trace_point obs_point; 3256 __s8 ext_err = 0; 3257 3258 #ifdef IS_BPF_OVERLAY 3259 obs_point = TRACE_TO_OVERLAY; 3260 #else 3261 obs_point = TRACE_TO_NETWORK; 3262 #endif 3263 3264 ret = handle_nat_fwd_ipv4(ctx, &trace, &ext_err); 3265 if (IS_ERR(ret)) 3266 return send_drop_notify_error_ext(ctx, UNKNOWN_ID, ret, ext_err, 3267 CTX_ACT_DROP, METRIC_EGRESS); 3268 3269 if (ret == CTX_ACT_OK) 3270 send_trace_notify(ctx, obs_point, UNKNOWN_ID, UNKNOWN_ID, 3271 TRACE_EP_ID_UNKNOWN, NATIVE_DEV_IFINDEX, 3272 trace.reason, trace.monitor); 3273 3274 return ret; 3275 } 3276 3277 #endif /* ENABLE_IPV4 */ 3278 3279 #ifdef ENABLE_HEALTH_CHECK 3280 static __always_inline int 3281 health_encap_v4(struct __ctx_buff *ctx, __u32 tunnel_ep, 3282 __u32 seclabel) 3283 { 3284 __u32 key_size = TUNNEL_KEY_WITHOUT_SRC_IP; 3285 struct bpf_tunnel_key key; 3286 3287 /* When encapsulating, a packet originating from the local 3288 * host is being considered as a packet from a remote node 3289 * as it is being received. 3290 */ 3291 memset(&key, 0, sizeof(key)); 3292 key.tunnel_id = get_tunnel_id(seclabel == HOST_ID ? 
LOCAL_NODE_ID : seclabel); 3293 key.remote_ipv4 = bpf_htonl(tunnel_ep); 3294 key.tunnel_ttl = IPDEFTTL; 3295 3296 if (unlikely(ctx_set_tunnel_key(ctx, &key, key_size, 3297 BPF_F_ZERO_CSUM_TX) < 0)) 3298 return DROP_WRITE_ERROR; 3299 return 0; 3300 } 3301 3302 static __always_inline int 3303 health_encap_v6(struct __ctx_buff *ctx, const union v6addr *tunnel_ep, 3304 __u32 seclabel) 3305 { 3306 __u32 key_size = TUNNEL_KEY_WITHOUT_SRC_IP; 3307 struct bpf_tunnel_key key; 3308 3309 memset(&key, 0, sizeof(key)); 3310 key.tunnel_id = get_tunnel_id(seclabel == HOST_ID ? LOCAL_NODE_ID : seclabel); 3311 key.remote_ipv6[0] = tunnel_ep->p1; 3312 key.remote_ipv6[1] = tunnel_ep->p2; 3313 key.remote_ipv6[2] = tunnel_ep->p3; 3314 key.remote_ipv6[3] = tunnel_ep->p4; 3315 key.tunnel_ttl = IPDEFTTL; 3316 3317 if (unlikely(ctx_set_tunnel_key(ctx, &key, key_size, 3318 BPF_F_ZERO_CSUM_TX | 3319 BPF_F_TUNINFO_IPV6) < 0)) 3320 return DROP_WRITE_ERROR; 3321 return 0; 3322 } 3323 3324 static __always_inline int 3325 lb_handle_health(struct __ctx_buff *ctx __maybe_unused, __be16 proto) 3326 { 3327 void *data __maybe_unused, *data_end __maybe_unused; 3328 __sock_cookie key __maybe_unused; 3329 int ret __maybe_unused; 3330 3331 if ((ctx->mark & MARK_MAGIC_HEALTH_IPIP_DONE) == 3332 MARK_MAGIC_HEALTH_IPIP_DONE) 3333 return CTX_ACT_OK; 3334 3335 switch (proto) { 3336 #if defined(ENABLE_IPV4) && DSR_ENCAP_MODE == DSR_ENCAP_IPIP 3337 case bpf_htons(ETH_P_IP): { 3338 struct lb4_health *val; 3339 3340 key = get_socket_cookie(ctx); 3341 val = map_lookup_elem(&LB4_HEALTH_MAP, &key); 3342 if (!val) 3343 return CTX_ACT_OK; 3344 ret = health_encap_v4(ctx, val->peer.address, 0); 3345 if (ret != 0) 3346 return ret; 3347 ctx->mark |= MARK_MAGIC_HEALTH_IPIP_DONE; 3348 return ctx_redirect(ctx, ENCAP4_IFINDEX, 0); 3349 } 3350 #endif 3351 #if defined(ENABLE_IPV6) && DSR_ENCAP_MODE == DSR_ENCAP_IPIP 3352 case bpf_htons(ETH_P_IPV6): { 3353 struct lb6_health *val; 3354 3355 key = get_socket_cookie(ctx); 3356 val = map_lookup_elem(&LB6_HEALTH_MAP, &key); 3357 if (!val) 3358 return CTX_ACT_OK; 3359 ret = health_encap_v6(ctx, &val->peer.address, 0); 3360 if (ret != 0) 3361 return ret; 3362 ctx->mark |= MARK_MAGIC_HEALTH_IPIP_DONE; 3363 return ctx_redirect(ctx, ENCAP6_IFINDEX, 0); 3364 } 3365 #endif 3366 default: 3367 return CTX_ACT_OK; 3368 } 3369 } 3370 #endif /* ENABLE_HEALTH_CHECK */ 3371 3372 /* handle_nat_fwd() handles revDNAT, fib_lookup_redirect, and bpf_snat for 3373 * nodeport. If revdnat_only is set to true, fib_lookup and bpf_snat are 3374 * skipped. The typical use case of handle_nat_fwd(revdnat_only=true) is for 3375 * handling reply traffic that requires revDNAT prior to wireguard/IPsec 3376 * encryption. 
3377 */ 3378 static __always_inline int 3379 handle_nat_fwd(struct __ctx_buff *ctx, __u32 cluster_id, __be16 proto, 3380 bool revdnat_only, struct trace_ctx *trace __maybe_unused, 3381 __s8 *ext_err __maybe_unused) 3382 { 3383 int ret = CTX_ACT_OK; 3384 __u32 cb_nat_flags = 0; 3385 3386 if (revdnat_only) 3387 cb_nat_flags |= CB_NAT_FLAGS_REVDNAT_ONLY; 3388 3389 ctx_store_meta(ctx, CB_NAT_FLAGS, cb_nat_flags); 3390 ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id); 3391 3392 switch (proto) { 3393 #ifdef ENABLE_IPV4 3394 case bpf_htons(ETH_P_IP): 3395 ret = invoke_traced_tailcall_if(__or4(__and(is_defined(ENABLE_IPV4), 3396 is_defined(ENABLE_IPV6)), 3397 __and(is_defined(ENABLE_HOST_FIREWALL), 3398 is_defined(IS_BPF_HOST)), 3399 __and(is_defined(ENABLE_CLUSTER_AWARE_ADDRESSING), 3400 is_defined(ENABLE_INTER_CLUSTER_SNAT)), 3401 __and(is_defined(ENABLE_EGRESS_GATEWAY_COMMON), 3402 is_defined(IS_BPF_HOST))), 3403 CILIUM_CALL_IPV4_NODEPORT_NAT_FWD, 3404 handle_nat_fwd_ipv4, trace, ext_err); 3405 break; 3406 #endif /* ENABLE_IPV4 */ 3407 #ifdef ENABLE_IPV6 3408 case bpf_htons(ETH_P_IPV6): 3409 ret = invoke_traced_tailcall_if(__or(__and(is_defined(ENABLE_IPV4), 3410 is_defined(ENABLE_IPV6)), 3411 __and(is_defined(ENABLE_HOST_FIREWALL), 3412 is_defined(IS_BPF_HOST))), 3413 CILIUM_CALL_IPV6_NODEPORT_NAT_FWD, 3414 handle_nat_fwd_ipv6, trace, ext_err); 3415 break; 3416 #endif /* ENABLE_IPV6 */ 3417 default: 3418 build_bug_on(!(NODEPORT_PORT_MIN_NAT < NODEPORT_PORT_MAX_NAT)); 3419 build_bug_on(!(NODEPORT_PORT_MIN < NODEPORT_PORT_MAX)); 3420 build_bug_on(!(NODEPORT_PORT_MAX < NODEPORT_PORT_MIN_NAT)); 3421 break; 3422 } 3423 return ret; 3424 } 3425 3426 #endif /* ENABLE_NODEPORT */
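Illustrative sketch, not from the Cilium tree: a minimal userspace C program showing the 8-byte IPv4 DSR option handling that dsr_set_opt4() and nodeport_extract_dsr_v4() perform on the datapath. The option layout mirrors struct dsr_opt_v4 above; the option type value and the helper names (csum, insert_dsr_opt, parse_dsr_opt) are local to the sketch, and the header checksum is recomputed from scratch here rather than incrementally via csum_diff()/ipv4_csum_update_by_diff() as in the BPF code.

#include <arpa/inet.h>
#include <netinet/ip.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DSR_OPT_TYPE_SKETCH 0x9a	/* stand-in value, not the real DSR_IPV4_OPT_TYPE */

struct dsr_opt_v4_sketch {		/* same shape as struct dsr_opt_v4 */
	uint8_t  type;
	uint8_t  len;
	uint16_t port;			/* network byte order on the wire */
	uint32_t addr;			/* network byte order on the wire */
};

/* RFC 1071 checksum over a byte range, chained via the sum argument. */
static uint16_t csum(const void *buf, size_t len, uint32_t sum)
{
	const uint8_t *p = buf;

	for (size_t i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Record the service VIP/port in the option, grow the header (ihl, tot_len)
 * and refresh the IPv4 header checksum so it also covers the option bytes.
 */
static void insert_dsr_opt(struct iphdr *ip, struct dsr_opt_v4_sketch *opt,
			   uint32_t svc_addr, uint16_t svc_port)
{
	opt->type = DSR_OPT_TYPE_SKETCH;
	opt->len  = sizeof(*opt);
	opt->port = htons(svc_port);
	opt->addr = htonl(svc_addr);

	ip->ihl     += sizeof(*opt) >> 2;
	ip->tot_len  = htons(ntohs(ip->tot_len) + sizeof(*opt));
	ip->check    = 0;
	ip->check    = htons(~csum(opt, sizeof(*opt), csum(ip, sizeof(*ip), 0)) & 0xffff);
}

/* Backend side: recover the service VIP/port the LB recorded in the option.
 * The header and option are separate objects here; on the wire the option
 * immediately follows the fixed 20-byte header.
 */
static bool parse_dsr_opt(const struct iphdr *ip,
			  const struct dsr_opt_v4_sketch *opt,
			  uint32_t *svc_addr, uint16_t *svc_port)
{
	if (ip->ihl < 7)	/* 5 words of header + 2 words of option */
		return false;
	if (opt->type != DSR_OPT_TYPE_SKETCH || opt->len != sizeof(*opt))
		return false;
	*svc_addr = ntohl(opt->addr);
	*svc_port = ntohs(opt->port);
	return true;
}

int main(void)
{
	struct iphdr ip = {
		.version = 4, .ihl = 5, .ttl = 64, .protocol = IPPROTO_TCP,
		.tot_len = htons(40),	/* 20B header + 20B TCP, no payload */
		.saddr = htonl(0xc0a80a01), .daddr = htonl(0x0a0000fe),
	};
	struct dsr_opt_v4_sketch opt;
	uint32_t addr;
	uint16_t port;

	insert_dsr_opt(&ip, &opt, 0x0af40001 /* service VIP */, 30080);
	if (parse_dsr_opt(&ip, &opt, &addr, &port))
		printf("svc=0x%08x:%u ihl=%u tot_len=%u check=0x%04x\n",
		       (unsigned)addr, (unsigned)port, (unsigned)ip.ihl,
		       (unsigned)ntohs(ip.tot_len), (unsigned)ntohs(ip.check));
	return 0;
}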
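Also illustrative, again outside the original header: a small userspace sketch of the ICMP "fragmentation needed" reply that dsr_reply_icmp4() builds when adding the DSR option or encap would push the packet over the MTU. It assembles a fresh IPv4 header plus an ICMP type 3 / code 4 header advertising a reduced MTU, followed by the original (inner) IPv4 header and the first 8 bytes of its datagram, and checksums the ICMP part over all of that. Buffer layout, the example MTU and the csum helper are assumptions of the sketch.

#include <arpa/inet.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* RFC 1071 checksum, as in the previous sketch. */
static uint16_t csum(const void *buf, size_t len, uint32_t sum)
{
	const uint8_t *p = buf;

	for (size_t i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Write [outer IPv4][ICMP frag-needed][inner IPv4][8B of original L4] into
 * out and return the number of bytes written.
 */
static size_t build_frag_needed(uint8_t *out, const struct iphdr *orig,
				const uint8_t *orig_l4, uint16_t mtu)
{
	size_t inner_len = sizeof(*orig) + 8;	/* quoted header + 8B of datagram */
	struct icmphdr icmp = {
		.type = ICMP_DEST_UNREACH,
		.code = ICMP_FRAG_NEEDED,
		.un.frag.mtu = htons(mtu),
	};
	struct iphdr ip = {
		.version = 4, .ihl = 5, .ttl = 64,
		.protocol = IPPROTO_ICMP,
		.frag_off = htons(IP_DF),
		.tot_len = htons(sizeof(ip) + sizeof(icmp) + inner_len),
		.saddr = orig->daddr,	/* reply goes back to the sender */
		.daddr = orig->saddr,
	};
	size_t off = 0;
	uint32_t sum;

	/* ICMP checksum covers the ICMP header plus the quoted packet. */
	sum = csum(&icmp, sizeof(icmp), 0);
	sum = csum(orig, sizeof(*orig), sum);
	sum = csum(orig_l4, 8, sum);
	icmp.checksum = htons(~sum & 0xffff);

	ip.check = htons(~csum(&ip, sizeof(ip), 0) & 0xffff);

	memcpy(out + off, &ip, sizeof(ip));     off += sizeof(ip);
	memcpy(out + off, &icmp, sizeof(icmp)); off += sizeof(icmp);
	memcpy(out + off, orig, sizeof(*orig)); off += sizeof(*orig);
	memcpy(out + off, orig_l4, 8);          off += 8;
	return off;
}

int main(void)
{
	struct iphdr orig = {
		.version = 4, .ihl = 5, .ttl = 64, .protocol = IPPROTO_TCP,
		.tot_len = htons(1480),
		.saddr = htonl(0xc0a80a01), .daddr = htonl(0x0a0000fe),
	};
	uint8_t l4[8] = { 0xab, 0xcd, 0x75, 0x30 };	/* sport/dport stub, rest zero */
	uint8_t pkt[128];
	size_t n = build_frag_needed(pkt, &orig, l4, 1400);

	printf("built %zu byte frag-needed reply\n", n);
	return 0;
}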
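Finally, the encap paths above derive the outer UDP source port from the connection tuple (tunnel_gen_src_port_v4()). The real helper lives elsewhere in the tree and its algorithm is not shown in this header; the sketch below is only a stand-in, using an arbitrary FNV-style hash, to illustrate why a flow-stable source port is useful: every packet of one connection keeps the same outer 5-tuple, so ECMP and RSS place the whole flow on one underlay path.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* 4-tuple plus protocol, mirroring what the datapath keeps in the CT tuple. */
struct flow4 {
	uint32_t saddr, daddr;	/* network byte order */
	uint16_t sport, dport;	/* network byte order */
	uint8_t  proto;
};

/* Word-wise FNV-1a over the tuple; any stable hash works for this purpose. */
static uint32_t flow_hash(const struct flow4 *f)
{
	uint32_t words[4] = { f->saddr, f->daddr,
			      (uint32_t)f->sport << 16 | f->dport, f->proto };
	uint32_t h = 2166136261u;

	for (size_t i = 0; i < 4; i++) {
		h ^= words[i];
		h *= 16777619u;
	}
	return h;
}

/* Map the hash into the dynamic port range 49152..65535 (an assumption of
 * this sketch, not necessarily the range the datapath uses).
 */
static uint16_t tunnel_src_port(const struct flow4 *f)
{
	return (uint16_t)(49152 + flow_hash(f) % 16384);
}

int main(void)
{
	struct flow4 f = {
		.saddr = htonl(0xc0a80a01), .daddr = htonl(0x0a0000fe),
		.sport = htons(33412), .dport = htons(30080),
		.proto = 6,
	};

	printf("outer UDP sport = %u\n", (unsigned)tunnel_src_port(&f));
	return 0;
}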