github.com/cilium/cilium@v1.16.2/bpf/lib/nat.h

/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright Authors of Cilium */

/* Simple NAT engine in BPF. */
#pragma once

#include <linux/icmp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ip.h>
#include <linux/icmpv6.h>
#include <linux/ipv6.h>

#include "bpf/compiler.h"
#include "common.h"
#include "drop.h"
#include "signal.h"
#include "conntrack.h"
#include "conntrack_map.h"
#include "egress_gateway.h"
#include "icmp6.h"
#include "nat_46x64.h"
#include "stubs.h"
#include "trace.h"

enum nat_dir {
	NAT_DIR_EGRESS  = TUPLE_F_OUT,
	NAT_DIR_INGRESS = TUPLE_F_IN,
} __packed;

struct nat_entry {
	__u64 created;
	__u64 needs_ct;		/* Only single bit used. */
	__u64 pad1;		/* Future use. */
	__u64 pad2;		/* Future use. */
};

#define SNAT_COLLISION_RETRIES	128
#define SNAT_SIGNAL_THRES	64

#define snat_v4_needs_masquerade_hook(ctx, target) 0

static __always_inline __u16 __snat_clamp_port_range(__u16 start, __u16 end,
						     __u16 val)
{
	return (val % (__u16)(end - start)) + start;
}

static __always_inline __maybe_unused __u16
__snat_try_keep_port(__u16 start, __u16 end, __u16 val)
{
	return val >= start && val <= end ? val :
	       __snat_clamp_port_range(start, end, (__u16)get_prandom_u32());
}

static __always_inline __maybe_unused void *
__snat_lookup(const void *map, const void *tuple)
{
	return map_lookup_elem(map, tuple);
}

static __always_inline __maybe_unused int
__snat_create(const void *map, const void *tuple, const void *state)
{
	return map_update_elem(map, tuple, state, BPF_NOEXIST);
}
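/* Example (illustrative only, not part of the upstream API): the two helpers
 * above implement the source-port selection used by the SNAT engine. For a
 * made-up range [30000, 32767]:
 *
 *	__snat_try_keep_port(30000, 32767, 31000) == 31000
 *		(the original port already falls inside the range, keep it)
 *	__snat_try_keep_port(30000, 32767, 443)
 *		-> a pseudo-random value clamped into the range
 *
 * __snat_clamp_port_range() maps an arbitrary 16-bit value into
 * [start, end) by taking it modulo (end - start) and adding start.
 */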
struct ipv4_nat_entry {
	struct nat_entry common;
	union {
		struct lb4_reverse_nat nat_info;
		struct {
			__be32 to_saddr;
			__be16 to_sport;
		};
		struct {
			__be32 to_daddr;
			__be16 to_dport;
		};
	};
};

struct ipv4_nat_target {
	__be32 addr;
	const __u16 min_port; /* host endianness */
	const __u16 max_port; /* host endianness */
	bool from_local_endpoint;
	bool egress_gateway; /* NAT is needed because of an egress gateway policy */
	__u32 cluster_id;
	bool needs_ct;
};

#if defined(ENABLE_IPV4) && defined(ENABLE_NODEPORT)
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, struct ipv4_ct_tuple);
	__type(value, struct ipv4_nat_entry);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, SNAT_MAPPING_IPV4_SIZE);
} SNAT_MAPPING_IPV4 __section_maps_btf;

#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
struct per_cluster_snat_mapping_ipv4_inner_map {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, struct ipv4_ct_tuple);
	__type(value, struct ipv4_nat_entry);
	__uint(max_entries, SNAT_MAPPING_IPV4_SIZE);
#ifndef BPF_TEST
};
#else
} per_cluster_snat_mapping_ipv4_1 __section_maps_btf,
  per_cluster_snat_mapping_ipv4_2 __section_maps_btf;
#endif

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__type(key, __u32);
	__type(value, __u32);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, 256);
	__array(values, struct per_cluster_snat_mapping_ipv4_inner_map);
#ifndef BPF_TEST
} PER_CLUSTER_SNAT_MAPPING_IPV4 __section_maps_btf;
#else
} PER_CLUSTER_SNAT_MAPPING_IPV4 __section_maps_btf = {
	.values = {
		[1] = &per_cluster_snat_mapping_ipv4_1,
		[2] = &per_cluster_snat_mapping_ipv4_2,
	},
};
#endif
#endif

#ifdef ENABLE_IP_MASQ_AGENT_IPV4
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct lpm_v4_key);
	__type(value, struct lpm_val);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, 16384);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} IP_MASQ_AGENT_IPV4 __section_maps_btf;
#endif

static __always_inline void *
get_cluster_snat_map_v4(__u32 cluster_id __maybe_unused)
{
#if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)
	if (cluster_id != 0 && cluster_id != CLUSTER_ID)
		return map_lookup_elem(&PER_CLUSTER_SNAT_MAPPING_IPV4, &cluster_id);
#endif
	return &SNAT_MAPPING_IPV4;
}

static __always_inline
struct ipv4_nat_entry *snat_v4_lookup(const struct ipv4_ct_tuple *tuple)
{
	return __snat_lookup(&SNAT_MAPPING_IPV4, tuple);
}

static __always_inline int snat_v4_new_mapping(struct __ctx_buff *ctx, void *map,
					       struct ipv4_ct_tuple *otuple,
					       struct ipv4_nat_entry *ostate,
					       const struct ipv4_nat_target *target,
					       bool needs_ct, __s8 *ext_err)
{
	struct ipv4_ct_tuple rtuple = {};
	struct ipv4_nat_entry rstate;
	int ret, retries;
	__u16 port;

	memset(&rstate, 0, sizeof(rstate));
	memset(ostate, 0, sizeof(*ostate));

	rstate.to_daddr = otuple->saddr;
	rstate.to_dport = otuple->sport;

	ostate->to_saddr = target->addr;
	/* .to_sport is selected below */

	/* This tuple matches reply traffic for the SNATed connection: */
	rtuple.flags = TUPLE_F_IN;
	rtuple.nexthdr = otuple->nexthdr;
	rtuple.saddr = otuple->daddr;
	rtuple.daddr = ostate->to_saddr;
	rtuple.sport = otuple->dport;
	/* .dport is selected below */

	port = __snat_try_keep_port(target->min_port,
				    target->max_port,
				    bpf_ntohs(otuple->sport));

	ostate->common.needs_ct = needs_ct;
	rstate.common.needs_ct = needs_ct;
	rstate.common.created = bpf_mono_now();

#pragma unroll
	for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) {
		rtuple.dport = bpf_htons(port);

		/* Try to create a RevSNAT entry. */
		if (__snat_create(map, &rtuple, &rstate) == 0)
			goto create_nat_entry;

		port = __snat_clamp_port_range(target->min_port,
					       target->max_port,
					       retries ? port + 1 :
					       (__u16)get_prandom_u32());
	}

	/* Loop completed without finding a free port: */
	ret = DROP_NAT_NO_MAPPING;
	goto out;

create_nat_entry:
	ostate->to_sport = rtuple.dport;
	ostate->common.created = rstate.common.created;

	/* Create the SNAT entry. We just created the RevSNAT entry. */
	ret = __snat_create(map, otuple, ostate);
	if (ret < 0) {
		map_delete_elem(map, &rtuple); /* rollback */
		if (ext_err)
			*ext_err = (__s8)ret;
		ret = DROP_NAT_NO_MAPPING;
	}

out:
	/* We struggled to find a free port. Trigger GC in the agent to
	 * free up any ports that are held by expired connections.
	 */
	if (retries > SNAT_SIGNAL_THRES)
		send_signal_nat_fill_up(ctx, SIGNAL_PROTO_V4);

	return ret;
}
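/* Illustrative sketch (addresses below are made up): when a pod connection
 * 10.0.1.5:43710 -> 1.1.1.1:80 is masqueraded to NODE_IP, the function above
 * installs a pair of entries:
 *
 *  - a SNAT entry keyed by the original (egress) tuple, whose state carries
 *    to_saddr = NODE_IP and the selected to_sport;
 *  - a RevSNAT entry keyed by the expected reply traffic
 *    (1.1.1.1 -> NODE_IP at the selected port), whose state carries
 *    to_daddr = 10.0.1.5 and to_dport = 43710.
 *
 * The RevSNAT entry is created first, so that a failure to install the SNAT
 * entry can be rolled back by deleting the RevSNAT entry again.
 */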
static __always_inline int
snat_v4_nat_handle_mapping(struct __ctx_buff *ctx,
			   struct ipv4_ct_tuple *tuple,
			   bool has_l4_header,
			   struct ipv4_nat_entry **state,
			   struct ipv4_nat_entry *tmp,
			   struct iphdr *ip4, __u32 off,
			   const struct ipv4_nat_target *target,
			   struct trace_ctx *trace,
			   __s8 *ext_err)
{
	bool needs_ct = target->needs_ct;
	void *map;

	map = get_cluster_snat_map_v4(target->cluster_id);
	if (!map)
		return DROP_SNAT_NO_MAP_FOUND;

	*state = __snat_lookup(map, tuple);

	if (needs_ct) {
		struct ipv4_ct_tuple tuple_snat;
		int ret;

		memcpy(&tuple_snat, tuple, sizeof(tuple_snat));
		/* Lookup with SCOPE_FORWARD. Ports are already in correct layout: */
		ipv4_ct_tuple_swap_addrs(&tuple_snat);

		ret = ct_lazy_lookup4(get_ct_map4(&tuple_snat), &tuple_snat,
				      ctx, ipv4_is_fragment(ip4), off, has_l4_header,
				      CT_EGRESS, SCOPE_FORWARD, CT_ENTRY_ANY,
				      NULL, &trace->monitor);
		if (ret < 0)
			return ret;

		trace->reason = (enum trace_reason)ret;
		if (ret == CT_NEW) {
			ret = ct_create4(get_ct_map4(&tuple_snat), NULL,
					 &tuple_snat, ctx, CT_EGRESS,
					 NULL, ext_err);
			if (IS_ERR(ret))
				return ret;
		}
	}

	if (*state) {
		barrier_data(*state);
		return 0;
	}

	*state = tmp;
	return snat_v4_new_mapping(ctx, map, tuple, tmp, target, needs_ct, ext_err);
}

static __always_inline int
snat_v4_rev_nat_handle_mapping(struct __ctx_buff *ctx,
			       struct ipv4_ct_tuple *tuple,
			       bool has_l4_header,
			       struct ipv4_nat_entry **state,
			       struct iphdr *ip4, __u32 off,
			       const struct ipv4_nat_target *target,
			       struct trace_ctx *trace)
{
	void *map;

	map = get_cluster_snat_map_v4(target->cluster_id);
	if (!map)
		return DROP_SNAT_NO_MAP_FOUND;

	*state = __snat_lookup(map, tuple);

	if (*state && (*state)->common.needs_ct) {
		struct ipv4_ct_tuple tuple_revsnat;
		int ret;

		memcpy(&tuple_revsnat, tuple, sizeof(tuple_revsnat));
		tuple_revsnat.daddr = (*state)->to_daddr;
		tuple_revsnat.dport = (*state)->to_dport;

		/* CT expects a tuple with the source and destination ports reversed,
		 * while NAT uses normal tuples that match packet headers.
		 */
		ipv4_ct_tuple_swap_ports(&tuple_revsnat);

		ret = ct_lazy_lookup4(get_ct_map4(&tuple_revsnat), &tuple_revsnat,
				      ctx, ipv4_is_fragment(ip4), off, has_l4_header,
				      CT_INGRESS, SCOPE_REVERSE, CT_ENTRY_ANY,
				      NULL, &trace->monitor);
		if (ret < 0)
			return ret;

		trace->reason = (enum trace_reason)ret;
	}

	if (*state)
		return 0;

	return DROP_NAT_NO_MAPPING;
}

static __always_inline int
snat_v4_rewrite_headers(struct __ctx_buff *ctx, __u8 nexthdr, int l3_off,
			bool has_l4_header, int l4_off,
			__be32 old_addr, __be32 new_addr, __u16 addr_off,
			__be16 old_port, __be16 new_port, __u16 port_off)
{
	__wsum sum;
	int err;

	/* No change needed: */
	if (old_addr == new_addr && old_port == new_port)
		return 0;

	sum = csum_diff(&old_addr, 4, &new_addr, 4, 0);
	if (ctx_store_bytes(ctx, l3_off + addr_off, &new_addr, 4, 0) < 0)
		return DROP_WRITE_ERROR;

	if (has_l4_header) {
		int flags = BPF_F_PSEUDO_HDR;
		struct csum_offset csum = {};

		csum_l4_offset_and_flags(nexthdr, &csum);

		if (old_port != new_port) {
			switch (nexthdr) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
				break;
#ifdef ENABLE_SCTP
			case IPPROTO_SCTP:
				return DROP_CSUM_L4;
#endif /* ENABLE_SCTP */
			case IPPROTO_ICMP:
				/* Not initialized by csum_l4_offset_and_flags(), because ICMPv4
				 * doesn't use a pseudo-header, and the change in IP addresses is
				 * not supposed to change the L4 checksum.
				 * Set it temporarily to amend the checksum after changing ports.
				 */
				csum.offset = offsetof(struct icmphdr, checksum);
				break;
			default:
				return DROP_UNKNOWN_L4;
			}

			/* Amend the L4 checksum due to changing the ports. */
			err = l4_modify_port(ctx, l4_off, port_off, &csum, new_port, old_port);
			if (err < 0)
				return err;

			/* Restore the original offset. */
			if (nexthdr == IPPROTO_ICMP)
				csum.offset = 0;
		}

		/* Amend the L4 checksum due to changing the addresses. */
		if (csum.offset &&
		    csum_l4_replace(ctx, l4_off, &csum, 0, sum, flags) < 0)
			return DROP_CSUM_L4;
	}

	/* Amend the L3 checksum due to changing the addresses. */
	if (ipv4_csum_update_by_diff(ctx, l3_off, sum) < 0)
		return DROP_CSUM_L3;

	return 0;
}
static __always_inline bool
snat_v4_nat_can_skip(const struct ipv4_nat_target *target,
		     const struct ipv4_ct_tuple *tuple)
{
	__u16 sport = bpf_ntohs(tuple->sport);

#if defined(ENABLE_EGRESS_GATEWAY_COMMON) && defined(IS_BPF_HOST)
	if (target->egress_gateway)
		return false;
#endif

	return (!target->from_local_endpoint && sport < NAT_MIN_EGRESS);
}

static __always_inline bool
snat_v4_rev_nat_can_skip(const struct ipv4_nat_target *target, const struct ipv4_ct_tuple *tuple)
{
	__u16 dport = bpf_ntohs(tuple->dport);

	return dport < target->min_port || dport > target->max_port;
}

/* Expects to be called with a nodeport-level CT tuple (ie. CT_EGRESS):
 * - extracted from a request packet,
 * - on CT_NEW (ie. the tuple is reversed)
 */
static __always_inline __maybe_unused int
snat_v4_create_dsr(const struct ipv4_ct_tuple *tuple,
		   __be32 to_saddr, __be16 to_sport, __s8 *ext_err)
{
	struct ipv4_ct_tuple tmp = *tuple;
	struct ipv4_nat_entry state = {};
	int ret;

	build_bug_on(sizeof(struct ipv4_nat_entry) > 64);

	tmp.flags = TUPLE_F_OUT;
	tmp.sport = tuple->dport;
	tmp.dport = tuple->sport;

	state.common.created = bpf_mono_now();
	state.to_saddr = to_saddr;
	state.to_sport = to_sport;

	ret = map_update_elem(&SNAT_MAPPING_IPV4, &tmp, &state, 0);
	if (ret) {
		*ext_err = (__s8)ret;
		return DROP_NAT_NO_MAPPING;
	}

	return CTX_ACT_OK;
}

static __always_inline void snat_v4_init_tuple(const struct iphdr *ip4,
					       enum nat_dir dir,
					       struct ipv4_ct_tuple *tuple)
{
	tuple->nexthdr = ip4->protocol;
	tuple->daddr = ip4->daddr;
	tuple->saddr = ip4->saddr;
	tuple->flags = dir;
}
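/* Example (illustrative only): a hypothetical caller probing the default NAT
 * map for an existing egress mapping could build a tuple from the packet
 * headers and look it up. Port extraction is protocol specific and omitted:
 *
 *	struct ipv4_ct_tuple tuple = {};
 *	struct ipv4_nat_entry *entry;
 *
 *	snat_v4_init_tuple(ip4, NAT_DIR_EGRESS, &tuple);
 *	(fill tuple.sport and tuple.dport from the L4 header)
 *	entry = snat_v4_lookup(&tuple);
 *
 * Note that snat_v4_lookup() consults SNAT_MAPPING_IPV4 directly; cluster-aware
 * callers go through get_cluster_snat_map_v4() instead.
 */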
/* This function contains the core logic for deciding whether an egressing packet
 * has to be SNAT-ed, filling the relevant state in the target parameter if
 * that's the case.
 *
 * The function will set:
 * - target->addr to the SNAT IP address
 * - target->from_local_endpoint to true if the packet is sent from a local endpoint
 * - target->egress_gateway to true if the packet should be SNAT-ed because of
 *   an egress gateway policy
 *
 * On success, the function returns NAT_NEEDED if the packet should be SNAT-ed,
 * or NAT_PUNT_TO_STACK if it should not. On failure, it returns a negative
 * error code (distinct from NAT_PUNT_TO_STACK).
 */
static __always_inline int
snat_v4_needs_masquerade(struct __ctx_buff *ctx __maybe_unused,
			 struct ipv4_ct_tuple *tuple __maybe_unused,
			 struct iphdr *ip4 __maybe_unused,
			 int l4_off __maybe_unused,
			 struct ipv4_nat_target *target __maybe_unused)
{
	struct endpoint_info *local_ep __maybe_unused;
	struct remote_endpoint_info *remote_ep __maybe_unused;
	struct egress_gw_policy_entry *egress_gw_policy __maybe_unused;
	bool is_reply __maybe_unused = false;
	int ret;

	ret = snat_v4_needs_masquerade_hook(ctx, target);
	if (IS_ERR(ret))
		return ret;
	if (ret)
		return NAT_NEEDED;

#if defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY)
# if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)
	if (target->cluster_id != 0 &&
	    target->cluster_id != CLUSTER_ID) {
		target->addr = IPV4_INTER_CLUSTER_SNAT;
		target->from_local_endpoint = true;

		return NAT_NEEDED;
	}
# endif
#endif /* TUNNEL_MODE && IS_BPF_OVERLAY */

#if defined(ENABLE_MASQUERADE_IPV4) && defined(IS_BPF_HOST)
	if (tuple->saddr == IPV4_MASQUERADE) {
		target->addr = IPV4_MASQUERADE;
		target->needs_ct = true;

		return NAT_NEEDED;
	}

	local_ep = __lookup_ip4_endpoint(tuple->saddr);
	remote_ep = lookup_ip4_remote_endpoint(tuple->daddr, 0);

	/* Check if this packet belongs to reply traffic coming from a
	 * local endpoint.
	 *
	 * If local_ep is NULL, it means there's no endpoint running on the
	 * node which matches the packet source IP, which means we can
	 * skip the CT lookup since this cannot be reply traffic.
	 */
	if (local_ep) {
		int err;

		target->from_local_endpoint = true;

		err = ct_extract_ports4(ctx, ip4, l4_off, CT_EGRESS, tuple, NULL);
		switch (err) {
		case 0:
			is_reply = ct_is_reply4(get_ct_map4(tuple), tuple);

			/* SNAT code has its own port extraction logic: */
			tuple->dport = 0;
			tuple->sport = 0;

			break;
		case DROP_CT_UNKNOWN_PROTO:
			/* tolerate L4 protocols not supported by CT: */
			break;
		default:
			return err;
		}
	}

	/* Check if the packet matches an egress NAT policy and so needs to be SNAT'ed.
	 *
	 * This check must happen before the IPV4_SNAT_EXCLUSION_DST_CIDR check below as
	 * the destination may be in the SNAT exclusion CIDR but regardless of that we
	 * always want to SNAT a packet if it's matched by an egress NAT policy.
	 */
#if defined(ENABLE_EGRESS_GATEWAY_COMMON)
	/* If the packet is a reply, it means the connection was initiated
	 * from outside, so there is no need to SNAT the reply.
	 */
	if (is_reply)
		goto skip_egress_gateway;

	if (egress_gw_snat_needed_hook(tuple->saddr, tuple->daddr, &target->addr)) {
		if (target->addr == EGRESS_GATEWAY_NO_EGRESS_IP)
			return DROP_NO_EGRESS_IP;

		target->egress_gateway = true;
		/* If the endpoint is local, then the connection is already tracked. */
		if (!local_ep)
			target->needs_ct = true;

		return NAT_NEEDED;
	}
skip_egress_gateway:
#endif

#ifdef IPV4_SNAT_EXCLUSION_DST_CIDR
	/* Do not MASQ if the dst IP belongs to the pod CIDR
	 * (ipv4-native-routing-cidr if specified, otherwise local pod CIDR).
	 */
	if (ipv4_is_in_subnet(tuple->daddr, IPV4_SNAT_EXCLUSION_DST_CIDR,
			      IPV4_SNAT_EXCLUSION_DST_CIDR_LEN))
		return NAT_PUNT_TO_STACK;
#endif

	/* if this is a localhost endpoint, no SNAT is needed */
	if (local_ep && (local_ep->flags & ENDPOINT_F_HOST))
		return NAT_PUNT_TO_STACK;

	if (remote_ep) {
#ifdef ENABLE_IP_MASQ_AGENT_IPV4
		/* Do not SNAT if dst belongs to any ip-masq-agent
		 * subnet.
		 */
		struct lpm_v4_key pfx;

		pfx.lpm.prefixlen = 32;
		memcpy(pfx.lpm.data, &tuple->daddr, sizeof(pfx.addr));
		if (map_lookup_elem(&IP_MASQ_AGENT_IPV4, &pfx))
			return NAT_PUNT_TO_STACK;
#endif

		/* In tunnel mode, a packet from a local ep
		 * to a remote node is not encapsulated and is sent
		 * via a native dev. Therefore, such a packet has
		 * to be MASQ'd. Otherwise, it might be dropped
		 * either by the underlying network (e.g. AWS drops
		 * packets by default from unknown subnets) or
		 * by the remote node if its native dev's
		 * rp_filter=1.
		 */

		if (!is_defined(TUNNEL_MODE) || remote_ep->flag_skip_tunnel) {
			if (identity_is_remote_node(remote_ep->sec_identity))
				return NAT_PUNT_TO_STACK;
		}

		/* If the packet is a reply, it means the connection was
		 * initiated from outside, so there is no need to SNAT the
		 * reply.
		 */
		if (!is_reply && local_ep) {
			target->addr = IPV4_MASQUERADE;
			return NAT_NEEDED;
		}
	}
#endif /* ENABLE_MASQUERADE_IPV4 && IS_BPF_HOST */

	return NAT_PUNT_TO_STACK;
}
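/* Example (illustrative only): callers typically dispatch on the three-way
 * result of snat_v4_needs_masquerade() as described in the comment above,
 * i.e. negative error, NAT_PUNT_TO_STACK, or NAT_NEEDED:
 *
 *	ret = snat_v4_needs_masquerade(ctx, &tuple, ip4, l4_off, &target);
 *	if (IS_ERR(ret))
 *		return ret;
 *	if (ret == NAT_PUNT_TO_STACK)
 *		(leave the packet unchanged)
 *	else
 *		(ret == NAT_NEEDED: target is filled in, proceed with SNAT)
 */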
static __always_inline __maybe_unused int
snat_v4_nat_handle_icmp_frag_needed(struct __ctx_buff *ctx, __u64 off,
				    bool has_l4_header)
{
	__u32 inner_l3_off = off + sizeof(struct icmphdr);
	struct ipv4_ct_tuple tuple = {};
	struct ipv4_nat_entry *state;
	struct iphdr iphdr;
	__u16 port_off;
	__u32 icmpoff;
	__u8 type;
	int ret;

	/* According to RFC 5508, any networking equipment that is
	 * responding with an ICMP error packet should embed the original
	 * packet in its response.
	 */
	if (ctx_load_bytes(ctx, inner_l3_off, &iphdr, sizeof(iphdr)) < 0)
		return DROP_INVALID;
	/* From the embedded IP headers we should be able to determine the
	 * corresponding protocol and the IP src/dst of the packet sent to
	 * resolve the NAT session.
	 */
	tuple.nexthdr = iphdr.protocol;
	tuple.saddr = iphdr.daddr;
	tuple.daddr = iphdr.saddr;
	tuple.flags = NAT_DIR_EGRESS;

	icmpoff = inner_l3_off + ipv4_hdrlen(&iphdr);
	switch (tuple.nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		/* No reason to handle IP fragmentation for this case, as it is
		 * expected that DF isn't set for this particular context.
		 */
		if (l4_load_ports(ctx, icmpoff, &tuple.dport) < 0)
			return DROP_INVALID;

		port_off = TCP_DPORT_OFF;
		break;
	case IPPROTO_ICMP:
		/* No reason to see a packet other than ICMP_ECHOREPLY. */
		if (ctx_load_bytes(ctx, icmpoff, &type,
				   sizeof(type)) < 0 ||
		    type != ICMP_ECHOREPLY)
			return DROP_INVALID;

		port_off = offsetof(struct icmphdr, un.echo.id);

		if (ctx_load_bytes(ctx, icmpoff + port_off,
				   &tuple.sport, sizeof(tuple.sport)) < 0)
			return DROP_INVALID;
		break;
	default:
		return DROP_UNKNOWN_L4;
	}
	state = snat_v4_lookup(&tuple);
	if (!state)
		return NAT_PUNT_TO_STACK;

	/* We found a SNAT entry for the embedded packet. Its destination addr
	 * should be NATed according to the entry.
	 */
	ret = snat_v4_rewrite_headers(ctx, tuple.nexthdr, inner_l3_off, true, icmpoff,
				      tuple.saddr, state->to_saddr, IPV4_DADDR_OFF,
				      tuple.sport, state->to_sport, port_off);
	if (IS_ERR(ret))
		return ret;

	/* Rewrite outer headers for ICMP_FRAG_NEEDED. No port rewrite needed. */
	return snat_v4_rewrite_headers(ctx, IPPROTO_ICMP, ETH_HLEN, has_l4_header, off,
				       tuple.saddr, state->to_saddr, IPV4_SADDR_OFF,
				       0, 0, 0);
}
static __always_inline int
__snat_v4_nat(struct __ctx_buff *ctx, struct ipv4_ct_tuple *tuple,
	      struct iphdr *ip4, bool has_l4_header, int l4_off,
	      bool update_tuple, const struct ipv4_nat_target *target,
	      __u16 port_off, struct trace_ctx *trace, __s8 *ext_err)
{
	struct ipv4_nat_entry *state, tmp;
	int ret;

	ret = snat_v4_nat_handle_mapping(ctx, tuple, has_l4_header, &state,
					 &tmp, ip4, l4_off, target, trace, ext_err);
	if (ret < 0)
		return ret;

	ret = snat_v4_rewrite_headers(ctx, tuple->nexthdr, ETH_HLEN, has_l4_header, l4_off,
				      tuple->saddr, state->to_saddr, IPV4_SADDR_OFF,
				      tuple->sport, state->to_sport, port_off);

	if (update_tuple) {
		tuple->saddr = state->to_saddr;
		tuple->sport = state->to_sport;
	}

	return ret;
}

static __always_inline __maybe_unused int
snat_v4_nat(struct __ctx_buff *ctx, struct ipv4_ct_tuple *tuple,
	    struct iphdr *ip4, int off, bool has_l4_header,
	    const struct ipv4_nat_target *target,
	    struct trace_ctx *trace, __s8 *ext_err)
{
	struct icmphdr icmphdr __align_stack_8;
	__u16 port_off;
	int ret;

	build_bug_on(sizeof(struct ipv4_nat_entry) > 64);

	switch (tuple->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		ret = ipv4_load_l4_ports(ctx, ip4, off, CT_EGRESS,
					 &tuple->dport, &has_l4_header);
		if (ret < 0)
			return ret;

		ipv4_ct_tuple_swap_ports(tuple);
		port_off = TCP_SPORT_OFF;
		break;
	case IPPROTO_ICMP:
		if (ctx_load_bytes(ctx, off, &icmphdr, sizeof(icmphdr)) < 0)
			return DROP_INVALID;

		switch (icmphdr.type) {
		case ICMP_ECHO:
			tuple->dport = 0;
			tuple->sport = icmphdr.un.echo.id;
			port_off = offsetof(struct icmphdr, un.echo.id);
			break;
		case ICMP_ECHOREPLY:
			return NAT_PUNT_TO_STACK;
		case ICMP_DEST_UNREACH:
			if (icmphdr.code != ICMP_FRAG_NEEDED)
				return DROP_UNKNOWN_ICMP_CODE;
			return snat_v4_nat_handle_icmp_frag_needed(ctx, off, has_l4_header);
		default:
			return DROP_NAT_UNSUPP_PROTO;
		}
		break;
	default:
		return NAT_PUNT_TO_STACK;
	};

	if (snat_v4_nat_can_skip(target, tuple))
		return NAT_PUNT_TO_STACK;

	return __snat_v4_nat(ctx, tuple, ip4, has_l4_header, off, false, target,
			     port_off, trace, ext_err);
}
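/* Example (illustrative only): a minimal egress masquerading sequence in a
 * caller would first set up the NAT target and then hand the packet to
 * snat_v4_nat(), e.g.:
 *
 *	struct ipv4_nat_target target = {
 *		.min_port = NODEPORT_PORT_MIN_NAT,
 *		.max_port = NODEPORT_PORT_MAX_NAT,
 *	};
 *
 *	if (snat_v4_needs_masquerade(ctx, &tuple, ip4, l4_off, &target) == NAT_NEEDED)
 *		ret = snat_v4_nat(ctx, &tuple, ip4, l4_off, has_l4_header,
 *				  &target, &trace, &ext_err);
 *
 * NODEPORT_PORT_MIN_NAT/NODEPORT_PORT_MAX_NAT are the port-range macros used
 * elsewhere in the datapath; the exact target initialisation depends on the
 * caller.
 */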
static __always_inline __maybe_unused int
snat_v4_rev_nat_handle_icmp_frag_needed(struct __ctx_buff *ctx,
					__u64 inner_l3_off,
					struct ipv4_nat_entry **state)
{
	struct ipv4_ct_tuple tuple = {};
	struct iphdr iphdr;
	__u16 port_off;
	__u32 icmpoff;
	__u8 type;

	/* According to RFC 5508, any networking equipment that is
	 * responding with an ICMP error packet should embed the original
	 * packet in its response.
	 */

	if (ctx_load_bytes(ctx, inner_l3_off, &iphdr, sizeof(iphdr)) < 0)
		return DROP_INVALID;

	/* From the embedded IP headers we should be able to determine the
	 * corresponding protocol and the IP src/dst of the packet sent to
	 * resolve the NAT session.
	 */
	tuple.nexthdr = iphdr.protocol;
	tuple.saddr = iphdr.daddr;
	tuple.daddr = iphdr.saddr;
	tuple.flags = NAT_DIR_INGRESS;

	icmpoff = inner_l3_off + ipv4_hdrlen(&iphdr);
	switch (tuple.nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		/* No reason to handle IP fragmentation for this case, as it is
		 * expected that DF isn't set for this particular context.
		 */
		if (l4_load_ports(ctx, icmpoff, &tuple.dport) < 0)
			return DROP_INVALID;

		port_off = TCP_SPORT_OFF;
		break;
	case IPPROTO_ICMP:
		/* No reason to see a packet other than ICMP_ECHO. */
		if (ctx_load_bytes(ctx, icmpoff, &type, sizeof(type)) < 0 ||
		    type != ICMP_ECHO)
			return DROP_INVALID;

		port_off = offsetof(struct icmphdr, un.echo.id);

		if (ctx_load_bytes(ctx, icmpoff + port_off,
				   &tuple.dport, sizeof(tuple.dport)) < 0)
			return DROP_INVALID;
		break;
	default:
		return NAT_PUNT_TO_STACK;
	}

	*state = snat_v4_lookup(&tuple);
	if (!*state)
		return NAT_PUNT_TO_STACK;

	/* The embedded packet was SNATed on egress. Reverse it again: */
	return snat_v4_rewrite_headers(ctx, tuple.nexthdr, inner_l3_off, true, icmpoff,
				       tuple.daddr, (*state)->to_daddr, IPV4_SADDR_OFF,
				       tuple.dport, (*state)->to_dport, port_off);
}

static __always_inline __maybe_unused int
snat_v4_rev_nat(struct __ctx_buff *ctx, const struct ipv4_nat_target *target,
		struct trace_ctx *trace, __s8 *ext_err __maybe_unused)
{
	struct icmphdr icmphdr __align_stack_8;
	struct ipv4_nat_entry *state = NULL;
	struct ipv4_ct_tuple tuple = {};
	void *data, *data_end;
	struct iphdr *ip4;
	bool has_l4_header = true;
	__u64 off, inner_l3_off;
	__be16 to_dport = 0;
	__u16 port_off = 0;
	int ret;

	build_bug_on(sizeof(struct ipv4_nat_entry) > 64);

	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	snat_v4_init_tuple(ip4, NAT_DIR_INGRESS, &tuple);

	off = ((void *)ip4 - data) + ipv4_hdrlen(ip4);
	switch (tuple.nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		ret = ipv4_load_l4_ports(ctx, ip4, off, CT_INGRESS,
					 &tuple.dport, &has_l4_header);
		if (ret < 0)
			return ret;

		ipv4_ct_tuple_swap_ports(&tuple);
		port_off = TCP_DPORT_OFF;
		break;
	case IPPROTO_ICMP:
		if (ctx_load_bytes(ctx, off, &icmphdr, sizeof(icmphdr)) < 0)
			return DROP_INVALID;
		switch (icmphdr.type) {
		case ICMP_ECHOREPLY:
			tuple.dport = icmphdr.un.echo.id;
			tuple.sport = 0;
			port_off = offsetof(struct icmphdr, un.echo.id);
			break;
		case ICMP_DEST_UNREACH:
			if (icmphdr.code != ICMP_FRAG_NEEDED)
				return NAT_PUNT_TO_STACK;

			inner_l3_off = off + sizeof(struct icmphdr);

			ret = snat_v4_rev_nat_handle_icmp_frag_needed(ctx,
								      inner_l3_off,
								      &state);
			if (IS_ERR(ret))
				return ret;

			has_l4_header = true;
			goto rewrite;
		default:
			return NAT_PUNT_TO_STACK;
		}
		break;
	default:
		return NAT_PUNT_TO_STACK;
	};

	if (snat_v4_rev_nat_can_skip(target, &tuple))
		return NAT_PUNT_TO_STACK;
	ret = snat_v4_rev_nat_handle_mapping(ctx, &tuple, has_l4_header, &state,
					     ip4, off, target, trace);
	if (ret < 0)
		return ret;

	/* Skip port rewrite for ICMP_DEST_UNREACH by passing old_port == new_port == 0. */
	to_dport = state->to_dport;

rewrite:
	return snat_v4_rewrite_headers(ctx, tuple.nexthdr, ETH_HLEN, has_l4_header, off,
				       tuple.daddr, state->to_daddr, IPV4_DADDR_OFF,
				       tuple.dport, to_dport, port_off);
}
#else
static __always_inline __maybe_unused
int snat_v4_nat(struct __ctx_buff *ctx __maybe_unused,
		const struct ipv4_nat_target *target __maybe_unused)
{
	return CTX_ACT_OK;
}

static __always_inline __maybe_unused
int snat_v4_rev_nat(struct __ctx_buff *ctx __maybe_unused,
		    const struct ipv4_nat_target *target __maybe_unused)
{
	return CTX_ACT_OK;
}
#endif
struct ipv6_nat_entry {
	struct nat_entry common;
	union {
		struct lb6_reverse_nat nat_info;
		struct {
			union v6addr to_saddr;
			__be16 to_sport;
		};
		struct {
			union v6addr to_daddr;
			__be16 to_dport;
		};
	};
};

struct ipv6_nat_target {
	union v6addr addr;
	const __u16 min_port; /* host endianness */
	const __u16 max_port; /* host endianness */
	bool from_local_endpoint;
	bool needs_ct;
};

#if defined(ENABLE_IPV6) && defined(ENABLE_NODEPORT)
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, struct ipv6_ct_tuple);
	__type(value, struct ipv6_nat_entry);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, SNAT_MAPPING_IPV6_SIZE);
} SNAT_MAPPING_IPV6 __section_maps_btf;

#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__type(key, __u32);
	__type(value, __u32);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, 256);
	__array(values, struct {
		__uint(type, BPF_MAP_TYPE_LRU_HASH);
		__type(key, struct ipv6_ct_tuple);
		__type(value, struct ipv6_nat_entry);
		__uint(max_entries, SNAT_MAPPING_IPV6_SIZE);
	});
} PER_CLUSTER_SNAT_MAPPING_IPV6 __section_maps_btf;
#endif

#ifdef ENABLE_IP_MASQ_AGENT_IPV6
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct lpm_v6_key);
	__type(value, struct lpm_val);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, 16384);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} IP_MASQ_AGENT_IPV6 __section_maps_btf;
#endif

static __always_inline void *
get_cluster_snat_map_v6(__u32 cluster_id __maybe_unused)
{
#if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)
	if (cluster_id != 0 && cluster_id != CLUSTER_ID)
		return map_lookup_elem(&PER_CLUSTER_SNAT_MAPPING_IPV6, &cluster_id);
#endif
	return &SNAT_MAPPING_IPV6;
}

static __always_inline
struct ipv6_nat_entry *snat_v6_lookup(const struct ipv6_ct_tuple *tuple)
{
	return __snat_lookup(&SNAT_MAPPING_IPV6, tuple);
}

static __always_inline int snat_v6_new_mapping(struct __ctx_buff *ctx,
					       struct ipv6_ct_tuple *otuple,
					       struct ipv6_nat_entry *ostate,
					       const struct ipv6_nat_target *target,
					       bool needs_ct, __s8 *ext_err)
{
	struct ipv6_ct_tuple rtuple = {};
	struct ipv6_nat_entry rstate;
	int ret, retries;
	__u16 port;

	memset(&rstate, 0, sizeof(rstate));
	memset(ostate, 0, sizeof(*ostate));

	rstate.to_daddr = otuple->saddr;
	rstate.to_dport = otuple->sport;

	ostate->to_saddr = target->addr;
	/* .to_sport is selected below */

	rtuple.flags = TUPLE_F_IN;
	rtuple.nexthdr = otuple->nexthdr;
	rtuple.saddr = otuple->daddr;
	rtuple.daddr = ostate->to_saddr;
	rtuple.sport = otuple->dport;
	/* .dport is selected below */

	port = __snat_try_keep_port(target->min_port,
				    target->max_port,
				    bpf_ntohs(otuple->sport));

	ostate->common.needs_ct = needs_ct;
	rstate.common.needs_ct = needs_ct;
	rstate.common.created = bpf_mono_now();

#pragma unroll
	for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) {
		rtuple.dport = bpf_htons(port);

		if (__snat_create(&SNAT_MAPPING_IPV6, &rtuple, &rstate) == 0)
			goto create_nat_entry;

		port = __snat_clamp_port_range(target->min_port,
					       target->max_port,
					       retries ? port + 1 :
					       (__u16)get_prandom_u32());
	}

	ret = DROP_NAT_NO_MAPPING;
	goto out;

create_nat_entry:
	ostate->to_sport = rtuple.dport;
	ostate->common.created = rstate.common.created;

	ret = __snat_create(&SNAT_MAPPING_IPV6, otuple, ostate);
	if (ret < 0) {
		map_delete_elem(&SNAT_MAPPING_IPV6, &rtuple); /* rollback */
		if (ext_err)
			*ext_err = (__s8)ret;
		ret = DROP_NAT_NO_MAPPING;
	}

out:
	if (retries > SNAT_SIGNAL_THRES)
		send_signal_nat_fill_up(ctx, SIGNAL_PROTO_V6);

	return ret;
}
static __always_inline int
snat_v6_nat_handle_mapping(struct __ctx_buff *ctx,
			   struct ipv6_ct_tuple *tuple,
			   struct ipv6_nat_entry **state,
			   struct ipv6_nat_entry *tmp,
			   __u32 off,
			   const struct ipv6_nat_target *target,
			   struct trace_ctx *trace,
			   __s8 *ext_err)
{
	bool needs_ct = target->needs_ct;

	*state = snat_v6_lookup(tuple);

	if (needs_ct) {
		struct ipv6_ct_tuple tuple_snat;
		int ret;

		memcpy(&tuple_snat, tuple, sizeof(tuple_snat));
		/* Lookup with SCOPE_FORWARD. Ports are already in correct layout: */
		ipv6_ct_tuple_swap_addrs(&tuple_snat);

		ret = ct_lazy_lookup6(get_ct_map6(&tuple_snat), &tuple_snat,
				      ctx, off, CT_EGRESS, SCOPE_FORWARD,
				      CT_ENTRY_ANY, NULL, &trace->monitor);
		if (ret < 0)
			return ret;

		trace->reason = (enum trace_reason)ret;
		if (ret == CT_NEW) {
			ret = ct_create6(get_ct_map6(&tuple_snat), NULL,
					 &tuple_snat, ctx, CT_EGRESS,
					 NULL, ext_err);
			if (IS_ERR(ret))
				return ret;
		}
	}

	if (*state) {
		barrier_data(*state);
		return 0;
	}

	*state = tmp;
	return snat_v6_new_mapping(ctx, tuple, tmp, target, needs_ct, ext_err);
}

static __always_inline int
snat_v6_rev_nat_handle_mapping(struct __ctx_buff *ctx,
			       struct ipv6_ct_tuple *tuple,
			       struct ipv6_nat_entry **state,
			       __u32 off,
			       struct trace_ctx *trace)
{
	*state = snat_v6_lookup(tuple);

	if (*state && (*state)->common.needs_ct) {
		struct ipv6_ct_tuple tuple_revsnat;
		int ret;

		memcpy(&tuple_revsnat, tuple, sizeof(tuple_revsnat));
		ipv6_addr_copy(&tuple_revsnat.daddr, &(*state)->to_daddr);
		tuple_revsnat.dport = (*state)->to_dport;

		/* CT expects a tuple with the source and destination ports reversed,
		 * while NAT uses normal tuples that match packet headers.
		 */
		ipv6_ct_tuple_swap_ports(&tuple_revsnat);

		ret = ct_lazy_lookup6(get_ct_map6(&tuple_revsnat), &tuple_revsnat,
				      ctx, off, CT_INGRESS, SCOPE_REVERSE,
				      CT_ENTRY_ANY, NULL, &trace->monitor);
		if (ret < 0)
			return ret;

		trace->reason = (enum trace_reason)ret;
	}

	if (*state)
		return 0;

	return DROP_NAT_NO_MAPPING;
}

static __always_inline int
snat_v6_rewrite_headers(struct __ctx_buff *ctx, __u8 nexthdr, int l3_off, int l4_off,
			union v6addr *old_addr, union v6addr *new_addr, __u16 addr_off,
			__be16 old_port, __be16 new_port, __u16 port_off)
{
	struct csum_offset csum = {};
	__wsum sum;

	/* No change needed: */
	if (ipv6_addr_equals(old_addr, new_addr) && old_port == new_port)
		return 0;

	sum = csum_diff(old_addr, 16, new_addr, 16, 0);
	if (ctx_store_bytes(ctx, l3_off + addr_off, new_addr, 16, 0) < 0)
		return DROP_WRITE_ERROR;

	csum_l4_offset_and_flags(nexthdr, &csum);

	if (old_port != new_port) {
		int err;

		switch (nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_ICMPV6:
			break;
#ifdef ENABLE_SCTP
		case IPPROTO_SCTP:
			return DROP_CSUM_L4;
#endif /* ENABLE_SCTP */
		default:
			return DROP_UNKNOWN_L4;
		}

		/* Amend the L4 checksum due to changing the ports. */
		err = l4_modify_port(ctx, l4_off, port_off, &csum, new_port, old_port);
		if (err < 0)
			return err;
	}

	if (csum.offset &&
	    csum_l4_replace(ctx, l4_off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;

	return 0;
}
1176 */ 1177 ipv6_ct_tuple_swap_ports(&tuple_revsnat); 1178 1179 ret = ct_lazy_lookup6(get_ct_map6(&tuple_revsnat), &tuple_revsnat, 1180 ctx, off, CT_INGRESS, SCOPE_REVERSE, 1181 CT_ENTRY_ANY, NULL, &trace->monitor); 1182 if (ret < 0) 1183 return ret; 1184 1185 trace->reason = (enum trace_reason)ret; 1186 } 1187 1188 if (*state) 1189 return 0; 1190 1191 return DROP_NAT_NO_MAPPING; 1192 } 1193 1194 static __always_inline int 1195 snat_v6_rewrite_headers(struct __ctx_buff *ctx, __u8 nexthdr, int l3_off, int l4_off, 1196 union v6addr *old_addr, union v6addr *new_addr, __u16 addr_off, 1197 __be16 old_port, __be16 new_port, __u16 port_off) 1198 { 1199 struct csum_offset csum = {}; 1200 __wsum sum; 1201 1202 /* No change needed: */ 1203 if (ipv6_addr_equals(old_addr, new_addr) && old_port == new_port) 1204 return 0; 1205 1206 sum = csum_diff(old_addr, 16, new_addr, 16, 0); 1207 if (ctx_store_bytes(ctx, l3_off + addr_off, new_addr, 16, 0) < 0) 1208 return DROP_WRITE_ERROR; 1209 1210 csum_l4_offset_and_flags(nexthdr, &csum); 1211 1212 if (old_port != new_port) { 1213 int err; 1214 1215 switch (nexthdr) { 1216 case IPPROTO_TCP: 1217 case IPPROTO_UDP: 1218 case IPPROTO_ICMPV6: 1219 break; 1220 #ifdef ENABLE_SCTP 1221 case IPPROTO_SCTP: 1222 return DROP_CSUM_L4; 1223 #endif /* ENABLE_SCTP */ 1224 default: 1225 return DROP_UNKNOWN_L4; 1226 } 1227 1228 /* Amend the L4 checksum due to changing the ports. */ 1229 err = l4_modify_port(ctx, l4_off, port_off, &csum, new_port, old_port); 1230 if (err < 0) 1231 return err; 1232 } 1233 1234 if (csum.offset && 1235 csum_l4_replace(ctx, l4_off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0) 1236 return DROP_CSUM_L4; 1237 1238 return 0; 1239 } 1240 1241 static __always_inline bool 1242 snat_v6_nat_can_skip(const struct ipv6_nat_target *target, 1243 const struct ipv6_ct_tuple *tuple) 1244 { 1245 __u16 sport = bpf_ntohs(tuple->sport); 1246 1247 return (!target->from_local_endpoint && sport < NAT_MIN_EGRESS); 1248 } 1249 1250 static __always_inline bool 1251 snat_v6_rev_nat_can_skip(const struct ipv6_nat_target *target, const struct ipv6_ct_tuple *tuple) 1252 { 1253 __u16 dport = bpf_ntohs(tuple->dport); 1254 1255 return dport < target->min_port || dport > target->max_port; 1256 } 1257 1258 static __always_inline __maybe_unused int 1259 snat_v6_create_dsr(const struct ipv6_ct_tuple *tuple, union v6addr *to_saddr, 1260 __be16 to_sport, __s8 *ext_err) 1261 { 1262 struct ipv6_ct_tuple tmp = *tuple; 1263 struct ipv6_nat_entry state = {}; 1264 int ret; 1265 1266 build_bug_on(sizeof(struct ipv6_nat_entry) > 64); 1267 1268 tmp.flags = TUPLE_F_OUT; 1269 tmp.sport = tuple->dport; 1270 tmp.dport = tuple->sport; 1271 1272 state.common.created = bpf_mono_now(); 1273 ipv6_addr_copy(&state.to_saddr, to_saddr); 1274 state.to_sport = to_sport; 1275 1276 ret = map_update_elem(&SNAT_MAPPING_IPV6, &tmp, &state, 0); 1277 if (ret) { 1278 *ext_err = (__s8)ret; 1279 return DROP_NAT_NO_MAPPING; 1280 } 1281 1282 return CTX_ACT_OK; 1283 } 1284 1285 static __always_inline void snat_v6_init_tuple(const struct ipv6hdr *ip6, 1286 enum nat_dir dir, 1287 struct ipv6_ct_tuple *tuple) 1288 { 1289 ipv6_addr_copy(&tuple->daddr, (union v6addr *)&ip6->daddr); 1290 ipv6_addr_copy(&tuple->saddr, (union v6addr *)&ip6->saddr); 1291 tuple->flags = dir; 1292 } 1293 1294 static __always_inline int 1295 snat_v6_needs_masquerade(struct __ctx_buff *ctx __maybe_unused, 1296 struct ipv6_ct_tuple *tuple __maybe_unused, 1297 int l4_off __maybe_unused, 1298 struct ipv6_nat_target *target __maybe_unused) 1299 { 1300 union 
static __always_inline int
snat_v6_needs_masquerade(struct __ctx_buff *ctx __maybe_unused,
			 struct ipv6_ct_tuple *tuple __maybe_unused,
			 int l4_off __maybe_unused,
			 struct ipv6_nat_target *target __maybe_unused)
{
	union v6addr masq_addr __maybe_unused;
	struct remote_endpoint_info *remote_ep __maybe_unused;
	struct endpoint_info *local_ep __maybe_unused;
	bool is_reply __maybe_unused = false;

	/* See comments in snat_v4_needs_masquerade(). */
#if defined(ENABLE_MASQUERADE_IPV6) && defined(IS_BPF_HOST)
	BPF_V6(masq_addr, IPV6_MASQUERADE);
	if (ipv6_addr_equals(&tuple->saddr, &masq_addr)) {
		ipv6_addr_copy(&target->addr, &masq_addr);
		target->needs_ct = true;

		return NAT_NEEDED;
	}

	local_ep = __lookup_ip6_endpoint(&tuple->saddr);
	remote_ep = lookup_ip6_remote_endpoint(&tuple->daddr, 0);

	if (local_ep) {
		int err;

		target->from_local_endpoint = true;

		err = ct_extract_ports6(ctx, l4_off, tuple);
		switch (err) {
		case 0:
			is_reply = ct_is_reply6(get_ct_map6(tuple), tuple);

			/* SNAT code has its own port extraction logic: */
			tuple->dport = 0;
			tuple->sport = 0;

			break;
		case DROP_CT_UNKNOWN_PROTO:
			/* tolerate L4 protocols not supported by CT: */
			break;
		default:
			return err;
		}
	}

# ifdef IPV6_SNAT_EXCLUSION_DST_CIDR
	{
		union v6addr excl_cidr_mask = IPV6_SNAT_EXCLUSION_DST_CIDR_MASK;
		union v6addr excl_cidr = IPV6_SNAT_EXCLUSION_DST_CIDR;

		if (ipv6_addr_in_net(&tuple->daddr, &excl_cidr, &excl_cidr_mask))
			return NAT_PUNT_TO_STACK;
	}
# endif /* IPV6_SNAT_EXCLUSION_DST_CIDR */

	/* if this is a localhost endpoint, no SNAT is needed */
	if (local_ep && (local_ep->flags & ENDPOINT_F_HOST))
		return NAT_PUNT_TO_STACK;

	if (remote_ep) {
#ifdef ENABLE_IP_MASQ_AGENT_IPV6
		/* Do not SNAT if dst belongs to any ip-masq-agent subnet. */
		struct lpm_v6_key pfx __align_stack_8;

		pfx.lpm.prefixlen = sizeof(pfx.addr) * 8;
		/* pfx.lpm is aligned on 8 bytes on the stack, but pfx.lpm.data
		 * is on 4 (after pfx.lpm.prefixlen). As the CT tuple is on the
		 * stack as well, we need to copy piece-by-piece.
		 */
		memcpy(pfx.lpm.data, &tuple->daddr.p1, 4);
		memcpy(pfx.lpm.data + 4, &tuple->daddr.p2, 4);
		memcpy(pfx.lpm.data + 8, &tuple->daddr.p3, 4);
		memcpy(pfx.lpm.data + 12, &tuple->daddr.p4, 4);
		if (map_lookup_elem(&IP_MASQ_AGENT_IPV6, &pfx))
			return NAT_PUNT_TO_STACK;
#endif

		if (!is_defined(TUNNEL_MODE) || remote_ep->flag_skip_tunnel) {
			if (identity_is_remote_node(remote_ep->sec_identity))
				return NAT_PUNT_TO_STACK;
		}

		if (!is_reply && local_ep) {
			ipv6_addr_copy(&target->addr, &masq_addr);
			return NAT_NEEDED;
		}
	}
#endif /* ENABLE_MASQUERADE_IPV6 && IS_BPF_HOST */

	return NAT_PUNT_TO_STACK;
}
1364 */ 1365 memcpy(pfx.lpm.data, &tuple->daddr.p1, 4); 1366 memcpy(pfx.lpm.data + 4, &tuple->daddr.p2, 4); 1367 memcpy(pfx.lpm.data + 8, &tuple->daddr.p3, 4); 1368 memcpy(pfx.lpm.data + 12, &tuple->daddr.p4, 4); 1369 if (map_lookup_elem(&IP_MASQ_AGENT_IPV6, &pfx)) 1370 return NAT_PUNT_TO_STACK; 1371 #endif 1372 1373 if (!is_defined(TUNNEL_MODE) || remote_ep->flag_skip_tunnel) { 1374 if (identity_is_remote_node(remote_ep->sec_identity)) 1375 return NAT_PUNT_TO_STACK; 1376 } 1377 1378 if (!is_reply && local_ep) { 1379 ipv6_addr_copy(&target->addr, &masq_addr); 1380 return NAT_NEEDED; 1381 } 1382 } 1383 #endif /* ENABLE_MASQUERADE_IPV6 && IS_BPF_HOST */ 1384 1385 return NAT_PUNT_TO_STACK; 1386 } 1387 1388 static __always_inline int 1389 __snat_v6_nat(struct __ctx_buff *ctx, struct ipv6_ct_tuple *tuple, 1390 int l4_off, bool update_tuple, 1391 const struct ipv6_nat_target *target, __u16 port_off, 1392 struct trace_ctx *trace, __s8 *ext_err) 1393 { 1394 struct ipv6_nat_entry *state, tmp; 1395 int ret; 1396 1397 ret = snat_v6_nat_handle_mapping(ctx, tuple, &state, &tmp, l4_off, 1398 target, trace, ext_err); 1399 if (ret < 0) 1400 return ret; 1401 1402 ret = snat_v6_rewrite_headers(ctx, tuple->nexthdr, ETH_HLEN, l4_off, 1403 &tuple->saddr, &state->to_saddr, IPV6_SADDR_OFF, 1404 tuple->sport, state->to_sport, port_off); 1405 1406 if (update_tuple) { 1407 ipv6_addr_copy(&tuple->saddr, &state->to_saddr); 1408 tuple->sport = state->to_sport; 1409 } 1410 1411 return ret; 1412 } 1413 1414 static __always_inline __maybe_unused int 1415 snat_v6_nat(struct __ctx_buff *ctx, struct ipv6_ct_tuple *tuple, int off, 1416 const struct ipv6_nat_target *target, struct trace_ctx *trace, 1417 __s8 *ext_err) 1418 { 1419 struct icmp6hdr icmp6hdr __align_stack_8; 1420 struct { 1421 __be16 sport; 1422 __be16 dport; 1423 } l4hdr; 1424 __u16 port_off; 1425 1426 build_bug_on(sizeof(struct ipv6_nat_entry) > 64); 1427 1428 switch (tuple->nexthdr) { 1429 case IPPROTO_TCP: 1430 case IPPROTO_UDP: 1431 #ifdef ENABLE_SCTP 1432 case IPPROTO_SCTP: 1433 #endif /* ENABLE_SCTP */ 1434 if (l4_load_ports(ctx, off, (__be16 *)&l4hdr) < 0) 1435 return DROP_INVALID; 1436 1437 tuple->dport = l4hdr.dport; 1438 tuple->sport = l4hdr.sport; 1439 port_off = TCP_SPORT_OFF; 1440 break; 1441 case IPPROTO_ICMPV6: 1442 if (ctx_load_bytes(ctx, off, &icmp6hdr, sizeof(icmp6hdr)) < 0) 1443 return DROP_INVALID; 1444 1445 switch (icmp6hdr.icmp6_type) { 1446 case ICMPV6_ECHO_REPLY: 1447 case ICMP6_NS_MSG_TYPE: 1448 case ICMP6_NA_MSG_TYPE: 1449 return NAT_PUNT_TO_STACK; 1450 case ICMPV6_ECHO_REQUEST: 1451 tuple->dport = 0; 1452 tuple->sport = icmp6hdr.icmp6_dataun.u_echo.identifier; 1453 port_off = offsetof(struct icmp6hdr, 1454 icmp6_dataun.u_echo.identifier); 1455 break; 1456 default: 1457 return DROP_NAT_UNSUPP_PROTO; 1458 } 1459 break; 1460 default: 1461 return NAT_PUNT_TO_STACK; 1462 }; 1463 1464 if (snat_v6_nat_can_skip(target, tuple)) 1465 return NAT_PUNT_TO_STACK; 1466 1467 return __snat_v6_nat(ctx, tuple, off, false, target, port_off, 1468 trace, ext_err); 1469 } 1470 1471 static __always_inline __maybe_unused int 1472 snat_v6_rev_nat_handle_icmp_pkt_toobig(struct __ctx_buff *ctx, 1473 __u32 inner_l3_off, 1474 struct ipv6_nat_entry **state) 1475 { 1476 struct ipv6_ct_tuple tuple = {}; 1477 struct ipv6hdr iphdr; 1478 __u16 port_off; 1479 __u32 icmpoff; 1480 __u8 type; 1481 int hdrlen; 1482 1483 /* According to the RFC 5508, any networking 1484 * equipment that is responding with an ICMP Error 1485 * packet should embed the original packet in its 1486 
static __always_inline __maybe_unused int
snat_v6_rev_nat_handle_icmp_pkt_toobig(struct __ctx_buff *ctx,
				       __u32 inner_l3_off,
				       struct ipv6_nat_entry **state)
{
	struct ipv6_ct_tuple tuple = {};
	struct ipv6hdr iphdr;
	__u16 port_off;
	__u32 icmpoff;
	__u8 type;
	int hdrlen;

	/* According to RFC 5508, any networking
	 * equipment that is responding with an ICMP error
	 * packet should embed the original packet in its
	 * response.
	 */

	if (ctx_load_bytes(ctx, inner_l3_off, &iphdr, sizeof(iphdr)) < 0)
		return DROP_INVALID;

	/* From the embedded IP headers we should be able
	 * to determine the corresponding protocol and the IP src/dst
	 * of the packet sent to resolve the NAT session.
	 */

	tuple.nexthdr = iphdr.nexthdr;
	ipv6_addr_copy(&tuple.saddr, (union v6addr *)&iphdr.daddr);
	ipv6_addr_copy(&tuple.daddr, (union v6addr *)&iphdr.saddr);
	tuple.flags = NAT_DIR_INGRESS;

	hdrlen = ipv6_hdrlen_offset(ctx, &tuple.nexthdr, inner_l3_off);
	if (hdrlen < 0)
		return hdrlen;

	icmpoff = inner_l3_off + hdrlen;

	switch (tuple.nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		/* No reason to handle IP fragmentation for this case,
		 * as it is expected that DF isn't set for this particular
		 * context.
		 */
		if (l4_load_ports(ctx, icmpoff, &tuple.dport) < 0)
			return DROP_INVALID;

		port_off = TCP_SPORT_OFF;
		break;
	case IPPROTO_ICMPV6:
		/* No reason to see a packet other than
		 * ICMPV6_ECHO_REQUEST.
		 */
		if (icmp6_load_type(ctx, icmpoff, &type) < 0 ||
		    type != ICMPV6_ECHO_REQUEST)
			return DROP_INVALID;

		port_off = offsetof(struct icmp6hdr,
				    icmp6_dataun.u_echo.identifier);

		if (ctx_load_bytes(ctx, icmpoff + port_off,
				   &tuple.dport, sizeof(tuple.dport)) < 0)
			return DROP_INVALID;
		break;
	default:
		return NAT_PUNT_TO_STACK;
	}

	*state = snat_v6_lookup(&tuple);
	if (!*state)
		return NAT_PUNT_TO_STACK;

	/* The embedded packet was SNATed on egress. Reverse it again: */
	return snat_v6_rewrite_headers(ctx, tuple.nexthdr, inner_l3_off, icmpoff,
				       &tuple.daddr, &(*state)->to_daddr, IPV6_SADDR_OFF,
				       tuple.dport, (*state)->to_dport, port_off);
}

static __always_inline __maybe_unused int
snat_v6_rev_nat(struct __ctx_buff *ctx, const struct ipv6_nat_target *target,
		struct trace_ctx *trace, __s8 *ext_err __maybe_unused)
{
	struct icmp6hdr icmp6hdr __align_stack_8;
	struct ipv6_nat_entry *state = NULL;
	struct ipv6_ct_tuple tuple = {};
	__u32 off, inner_l3_off;
	void *data, *data_end;
	struct ipv6hdr *ip6;
	__be16 to_dport = 0;
	__u16 port_off = 0;
	int ret, hdrlen;
	struct {
		__be16 sport;
		__be16 dport;
	} l4hdr;

	build_bug_on(sizeof(struct ipv6_nat_entry) > 64);

	if (!revalidate_data(ctx, &data, &data_end, &ip6))
		return DROP_INVALID;

	tuple.nexthdr = ip6->nexthdr;
	hdrlen = ipv6_hdrlen(ctx, &tuple.nexthdr);
	if (hdrlen < 0)
		return hdrlen;

	snat_v6_init_tuple(ip6, NAT_DIR_INGRESS, &tuple);

	off = ((void *)ip6 - data) + hdrlen;
	switch (tuple.nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		if (l4_load_ports(ctx, off, (__be16 *)&l4hdr) < 0)
			return DROP_INVALID;
		tuple.dport = l4hdr.dport;
		tuple.sport = l4hdr.sport;
		port_off = TCP_DPORT_OFF;
		break;
	case IPPROTO_ICMPV6:
		if (ctx_load_bytes(ctx, off, &icmp6hdr, sizeof(icmp6hdr)) < 0)
			return DROP_INVALID;
		switch (icmp6hdr.icmp6_type) {
		case ICMPV6_ECHO_REPLY:
			tuple.dport = icmp6hdr.icmp6_dataun.u_echo.identifier;
			tuple.sport = 0;
			port_off = offsetof(struct icmp6hdr,
					    icmp6_dataun.u_echo.identifier);
			break;
		case ICMPV6_PKT_TOOBIG:
			/* ICMPV6_PKT_TOOBIG does not include identifier and
			 * sequence in its headers.
			 */
			inner_l3_off = off + sizeof(struct icmp6hdr) -
				       field_sizeof(struct icmp6hdr, icmp6_dataun.u_echo);

			ret = snat_v6_rev_nat_handle_icmp_pkt_toobig(ctx,
								     inner_l3_off,
								     &state);
			if (IS_ERR(ret))
				return ret;

			goto rewrite;
		default:
			return NAT_PUNT_TO_STACK;
		}
		break;
	default:
		return NAT_PUNT_TO_STACK;
	};

	if (snat_v6_rev_nat_can_skip(target, &tuple))
		return NAT_PUNT_TO_STACK;
	ret = snat_v6_rev_nat_handle_mapping(ctx, &tuple, &state, off, trace);
	if (ret < 0)
		return ret;

	/* Skip port rewrite for ICMPV6_PKT_TOOBIG by passing old_port == new_port == 0. */
	to_dport = state->to_dport;

rewrite:
	return snat_v6_rewrite_headers(ctx, tuple.nexthdr, ETH_HLEN, off,
				       &tuple.daddr, &state->to_daddr, IPV6_DADDR_OFF,
				       tuple.dport, to_dport, port_off);
}
#else
static __always_inline __maybe_unused
int snat_v6_nat(struct __ctx_buff *ctx __maybe_unused,
		const struct ipv6_nat_target *target __maybe_unused)
{
	return CTX_ACT_OK;
}

static __always_inline __maybe_unused
int snat_v6_rev_nat(struct __ctx_buff *ctx __maybe_unused,
		    const struct ipv6_nat_target *target __maybe_unused)
{
	return CTX_ACT_OK;
}
#endif

#if defined(ENABLE_IPV6) && defined(ENABLE_NODEPORT)
static __always_inline int
snat_remap_rfc8215(struct __ctx_buff *ctx, const struct iphdr *ip4, int l3_off)
{
	union v6addr src6, dst6;

	build_v4_in_v6_rfc8215(&src6, ip4->saddr);
	build_v4_in_v6(&dst6, ip4->daddr);
	return ipv4_to_ipv6(ctx, l3_off, &src6, &dst6);
}

static __always_inline bool
__snat_v6_has_v4_complete(struct ipv6_ct_tuple *tuple6,
			  const struct ipv4_ct_tuple *tuple4)
{
	build_v4_in_v6(&tuple6->daddr, tuple4->daddr);
	tuple6->nexthdr = tuple4->nexthdr;
	/* tuple4 has ports in swapped order: */
	tuple6->sport = tuple4->dport;
	tuple6->dport = tuple4->sport;
	tuple6->flags = NAT_DIR_INGRESS;
	return snat_v6_lookup(tuple6);
}

static __always_inline bool
snat_v6_has_v4_match_rfc8215(const struct ipv4_ct_tuple *tuple4)
{
	struct ipv6_ct_tuple tuple6;

	memset(&tuple6, 0, sizeof(tuple6));
	build_v4_in_v6_rfc8215(&tuple6.saddr, tuple4->saddr);
	return __snat_v6_has_v4_complete(&tuple6, tuple4);
}

static __always_inline bool
snat_v6_has_v4_match(const struct ipv4_ct_tuple *tuple4)
{
	struct ipv6_ct_tuple tuple6;

	memset(&tuple6, 0, sizeof(tuple6));
	build_v4_in_v6(&tuple6.saddr, tuple4->saddr);
	return __snat_v6_has_v4_complete(&tuple6, tuple4);
}
#else
static __always_inline bool
snat_v6_has_v4_match(const struct ipv4_ct_tuple *tuple4 __maybe_unused)
{
	return false;
}
#endif /* ENABLE_IPV6 && ENABLE_NODEPORT */
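/* Example (illustrative only): NAT46x64-aware callers can probe whether an
 * IPv4 CT tuple belongs to a connection whose SNAT state is kept as a
 * v4-in-v6 entry, e.g.:
 *
 *	if (snat_v6_has_v4_match(&tuple4) ||
 *	    snat_v6_has_v4_match_rfc8215(&tuple4))
 *		(hand the packet to the IPv6 handling path; the follow-up
 *		 action depends on the caller)
 *
 * As noted in __snat_v6_has_v4_complete(), tuple4 is expected with its ports
 * in swapped (CT) order, so the helper swaps them back while building the
 * IPv6 lookup key.
 */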