github.com/fafucoder/cilium@v1.6.11/bpf/lib/nat.h

/*
 * Copyright (C) 2019 Authors of Cilium
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
/* Simple NAT engine in BPF. */
#ifndef __LIB_NAT__
#define __LIB_NAT__

#include <linux/icmp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ip.h>
#include <linux/icmpv6.h>
#include <linux/ipv6.h>

#include "common.h"
#include "drop.h"
#include "signal.h"
#include "conntrack.h"
#include "conntrack_map.h"

enum {
	NAT_DIR_EGRESS  = TUPLE_F_OUT,
	NAT_DIR_INGRESS = TUPLE_F_IN,
};

struct nat_entry {
	__u64 created;
	__u64 host_local;	/* Only single bit used. */
	__u64 pad1;		/* Future use. */
	__u64 pad2;		/* Future use. */
};

#define NAT_CONTINUE_XLATE	0

#ifdef HAVE_LRU_MAP_TYPE
# define NAT_MAP_TYPE BPF_MAP_TYPE_LRU_HASH
#else
# define NAT_MAP_TYPE BPF_MAP_TYPE_HASH
#endif

#ifdef HAVE_LARGE_INSN_LIMIT
# define SNAT_COLLISION_RETRIES		128
# define SNAT_SIGNAL_THRES		64
#else
# if defined ENABLE_IPV4 && defined ENABLE_IPV6
#  define SNAT_COLLISION_RETRIES	19
# else
#  define SNAT_COLLISION_RETRIES	20
# endif
# define SNAT_SIGNAL_THRES		10
#endif

static __always_inline __be16 __snat_clamp_port_range(__u16 start, __u16 end,
						       __u16 val)
{
	return (val % (__u16)(end - start)) + start;
}

static __always_inline void *__snat_lookup(void *map, void *tuple)
{
	return map_lookup_elem(map, tuple);
}

/* Insert both directions of a mapping. The reverse entry is inserted first
 * and removed again if the forward insert fails, so a partial failure never
 * leaves a dangling entry behind.
 */
static __always_inline int __snat_update(void *map, void *otuple, void *ostate,
					 void *rtuple, void *rstate)
{
	int ret = map_update_elem(map, rtuple, rstate, BPF_NOEXIST);
	if (!ret) {
		ret = map_update_elem(map, otuple, ostate, BPF_NOEXIST);
		if (ret)
			map_delete_elem(map, rtuple);
	}
	return ret;
}

static __always_inline void __snat_delete(void *map, void *otuple,
					  void *rtuple)
{
	map_delete_elem(map, otuple);
	map_delete_elem(map, rtuple);
}

struct ipv4_nat_entry {
	struct nat_entry common;
	union {
		struct {
			__be32 to_saddr;
			__be16 to_sport;
		};
		struct {
			__be32 to_daddr;
			__be16 to_dport;
		};
	};
};

struct ipv4_nat_target {
	__be32 addr;
	const __u16 min_port; /* host endianness */
	const __u16 max_port; /* host endianness */
	bool src_from_world;
};

#if defined ENABLE_IPV4 && (defined ENABLE_MASQUERADE || defined ENABLE_NODEPORT)
struct bpf_elf_map __section_maps SNAT_MAPPING_IPV4 = {
	.type		= NAT_MAP_TYPE,
	.size_key	= sizeof(struct ipv4_ct_tuple),
	.size_value	= sizeof(struct ipv4_nat_entry),
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= SNAT_MAPPING_IPV4_SIZE,
#ifndef HAVE_LRU_MAP_TYPE
	.flags		= CONDITIONAL_PREALLOC,
#endif
};

static __always_inline
struct ipv4_nat_entry *snat_v4_lookup(struct ipv4_ct_tuple *tuple)
{
	return __snat_lookup(&SNAT_MAPPING_IPV4, tuple);
}

static __always_inline int snat_v4_update(struct ipv4_ct_tuple *otuple,
					  struct ipv4_nat_entry *ostate,
					  struct ipv4_ct_tuple *rtuple,
					  struct ipv4_nat_entry *rstate)
{
	return __snat_update(&SNAT_MAPPING_IPV4, otuple, ostate,
			     rtuple, rstate);
}

static __always_inline void snat_v4_delete(struct ipv4_ct_tuple *otuple,
					   struct ipv4_ct_tuple *rtuple)
{
	__snat_delete(&SNAT_MAPPING_IPV4, otuple, rtuple);
}

static __always_inline void snat_v4_swap_tuple(struct ipv4_ct_tuple *otuple,
					       struct ipv4_ct_tuple *rtuple)
{
	__builtin_memset(rtuple, 0, sizeof(*rtuple));
	rtuple->nexthdr = otuple->nexthdr;
	rtuple->daddr = otuple->saddr;
	rtuple->saddr = otuple->daddr;
	rtuple->dport = otuple->sport;
	rtuple->sport = otuple->dport;
	rtuple->flags = otuple->flags == NAT_DIR_EGRESS ?
			NAT_DIR_INGRESS : NAT_DIR_EGRESS;
}

static __always_inline int snat_v4_reverse_tuple(struct ipv4_ct_tuple *otuple,
						 struct ipv4_ct_tuple *rtuple)
{
	struct ipv4_nat_entry *ostate;

	ostate = snat_v4_lookup(otuple);
	if (ostate) {
		snat_v4_swap_tuple(otuple, rtuple);
		rtuple->daddr = ostate->to_saddr;
		rtuple->dport = ostate->to_sport;
	}

	return ostate ? 0 : -1;
}

static __always_inline void snat_v4_ct_canonicalize(struct ipv4_ct_tuple *otuple)
{
	__be32 addr = otuple->saddr;

	otuple->flags = NAT_DIR_EGRESS;
	/* Workaround #5848. */
	otuple->saddr = otuple->daddr;
	otuple->daddr = addr;
}

static __always_inline void snat_v4_delete_tuples(struct ipv4_ct_tuple *otuple)
{
	struct ipv4_ct_tuple rtuple;

	if (otuple->flags & TUPLE_F_IN)
		return;
	snat_v4_ct_canonicalize(otuple);
	if (!snat_v4_reverse_tuple(otuple, &rtuple))
		snat_v4_delete(otuple, &rtuple);
}

/* Allocate a new SNAT mapping: pick a random source port from the target
 * range, retry linearly on collision, and signal the agent via
 * send_signal_nat_fill_up() once more than SNAT_SIGNAL_THRES retries were
 * needed.
 */
static __always_inline int snat_v4_new_mapping(struct __sk_buff *skb,
					       struct ipv4_ct_tuple *otuple,
					       struct ipv4_nat_entry *ostate,
					       const struct ipv4_nat_target *target)
{
	int ret = DROP_NAT_NO_MAPPING, retries;
	struct ipv4_nat_entry rstate;
	struct ipv4_ct_tuple rtuple;
	__u16 port;

	__builtin_memset(&rstate, 0, sizeof(rstate));
	__builtin_memset(ostate, 0, sizeof(*ostate));

	rstate.to_daddr = otuple->saddr;
	rstate.to_dport = otuple->sport;

	ostate->to_saddr = target->addr;

	snat_v4_swap_tuple(otuple, &rtuple);
	port = __snat_clamp_port_range(target->min_port,
				       target->max_port,
				       get_prandom_u32());

	rtuple.dport = ostate->to_sport = bpf_htons(port);
	rtuple.daddr = target->addr;

	if (otuple->saddr == target->addr) {
		ostate->common.host_local = 1;
		rstate.common.host_local = ostate->common.host_local;
	}

#pragma unroll
	for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) {
		if (!snat_v4_lookup(&rtuple)) {
			ostate->common.created = bpf_ktime_get_nsec();
			rstate.common.created = ostate->common.created;

			ret = snat_v4_update(otuple, ostate, &rtuple, &rstate);
			if (!ret)
				break;
		}

		port = __snat_clamp_port_range(target->min_port,
					       target->max_port,
					       port + 1);
		rtuple.dport = ostate->to_sport = bpf_htons(port);
	}

	if (retries > SNAT_SIGNAL_THRES)
		send_signal_nat_fill_up(skb, SIGNAL_NAT_PROTO_V4);
	return !ret ? 0 : DROP_NAT_NO_MAPPING;
}

/* Flows involving the SNAT address itself are also tracked in conntrack so
 * that the NAT mapping can later be removed together with the CT entry
 * (see ct_delete4()).
 */
static __always_inline int snat_v4_track_local(struct __sk_buff *skb,
					       struct ipv4_ct_tuple *tuple,
					       struct ipv4_nat_entry *state,
					       int dir, __u32 off,
					       const struct ipv4_nat_target *target)
{
	struct ct_state ct_state;
	struct ipv4_ct_tuple tmp;
	bool needs_ct = false;
	__u32 monitor = 0;
	int ret, where;

	if (state && state->common.host_local) {
		needs_ct = true;
	} else if (!state && dir == NAT_DIR_EGRESS) {
		if (tuple->saddr == target->addr)
			needs_ct = true;
	}
	if (!needs_ct)
		return 0;

	__builtin_memset(&ct_state, 0, sizeof(ct_state));
	__builtin_memcpy(&tmp, tuple, sizeof(tmp));

	where = dir == NAT_DIR_INGRESS ? CT_INGRESS : CT_EGRESS;

	ret = ct_lookup4(get_ct_map4(&tmp), &tmp, skb, off, where,
			 &ct_state, &monitor);
	if (ret < 0) {
		return ret;
	} else if (ret == CT_NEW) {
		ret = ct_create4(get_ct_map4(&tmp), &tmp, skb, where,
				 &ct_state, false);
		if (IS_ERR(ret))
			return ret;
	}

	return 0;
}

/* Look up existing NAT state for the tuple. On egress a new mapping is
 * allocated if none exists; on ingress, unknown non-ICMP traffic below the
 * mapping port range is punted to the stack, everything else without state
 * is dropped.
 */
static __always_inline int snat_v4_handle_mapping(struct __sk_buff *skb,
						  struct ipv4_ct_tuple *tuple,
						  struct ipv4_nat_entry **state,
						  struct ipv4_nat_entry *tmp,
						  int dir, __u32 off,
						  const struct ipv4_nat_target *target)
{
	int ret;

	*state = snat_v4_lookup(tuple);
	ret = snat_v4_track_local(skb, tuple, *state, dir, off, target);
	if (ret < 0)
		return ret;
	else if (*state)
		return NAT_CONTINUE_XLATE;
	else if (dir == NAT_DIR_INGRESS)
		return tuple->nexthdr != IPPROTO_ICMP &&
		       bpf_ntohs(tuple->dport) < target->min_port ?
		       NAT_PUNT_TO_STACK : DROP_NAT_NO_MAPPING;
	else
		return snat_v4_new_mapping(skb, tuple, (*state = tmp), target);
}

static __always_inline int snat_v4_rewrite_egress(struct __sk_buff *skb,
						  struct ipv4_ct_tuple *tuple,
						  struct ipv4_nat_entry *state,
						  __u32 off)
{
	struct csum_offset csum = {};
	__be32 sum_l4 = 0, sum;
	int ret;

	if (state->to_saddr == tuple->saddr &&
	    state->to_sport == tuple->sport)
		return 0;
	sum = csum_diff(&tuple->saddr, 4, &state->to_saddr, 4, 0);
	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
	if (state->to_sport != tuple->sport) {
		switch (tuple->nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
			ret = l4_modify_port(skb, off,
					     offsetof(struct tcphdr, source),
					     &csum, state->to_sport,
					     tuple->sport);
			if (ret < 0)
				return ret;
			break;
		case IPPROTO_ICMP: {
			__be32 from, to;

			if (skb_store_bytes(skb, off +
					    offsetof(struct icmphdr, un.echo.id),
					    &state->to_sport,
					    sizeof(state->to_sport), 0) < 0)
				return DROP_WRITE_ERROR;
			from = tuple->sport;
			to = state->to_sport;
			sum_l4 = csum_diff(&from, 4, &to, 4, 0);
			csum.offset = offsetof(struct icmphdr, checksum);
			break;
		}}
	}
	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct iphdr, saddr),
			    &state->to_saddr, 4, 0) < 0)
		return DROP_WRITE_ERROR;
	if (l3_csum_replace(skb, ETH_HLEN + offsetof(struct iphdr, check),
			    0, sum, 0) < 0)
		return DROP_CSUM_L3;
	if (tuple->nexthdr == IPPROTO_ICMP)
		sum = sum_l4;
	if (csum.offset &&
	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;
	return 0;
}

static __always_inline int snat_v4_rewrite_ingress(struct __sk_buff *skb,
						   struct ipv4_ct_tuple *tuple,
						   struct ipv4_nat_entry *state,
						   __u32 off)
{
	struct csum_offset csum = {};
	__be32 sum_l4 = 0, sum;
	int ret;

	if (state->to_daddr == tuple->daddr &&
	    state->to_dport == tuple->dport)
		return 0;
	sum = csum_diff(&tuple->daddr, 4, &state->to_daddr, 4, 0);
	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
	if (state->to_dport != tuple->dport) {
		switch (tuple->nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
			ret = l4_modify_port(skb, off,
					     offsetof(struct tcphdr, dest),
					     &csum, state->to_dport,
					     tuple->dport);
			if (ret < 0)
				return ret;
			break;
		case IPPROTO_ICMP: {
			__be32 from, to;

			if (skb_store_bytes(skb, off +
					    offsetof(struct icmphdr, un.echo.id),
					    &state->to_dport,
					    sizeof(state->to_dport), 0) < 0)
				return DROP_WRITE_ERROR;
			from = tuple->dport;
			to = state->to_dport;
			sum_l4 = csum_diff(&from, 4, &to, 4, 0);
			csum.offset = offsetof(struct icmphdr, checksum);
			break;
		}}
	}
	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct iphdr, daddr),
			    &state->to_daddr, 4, 0) < 0)
		return DROP_WRITE_ERROR;
	if (l3_csum_replace(skb, ETH_HLEN + offsetof(struct iphdr, check),
			    0, sum, 0) < 0)
		return DROP_CSUM_L3;
	if (tuple->nexthdr == IPPROTO_ICMP)
		sum = sum_l4;
	if (csum.offset &&
	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;
	return 0;
}

/* NAT is skipped for egress flows with a source port below NAT_MIN_EGRESS
 * (unless the source is from the world) and for ingress flows destined to
 * ports outside the target range.
 */
static __always_inline bool snat_v4_can_skip(const struct ipv4_nat_target *target,
					     const struct ipv4_ct_tuple *tuple, int dir)
{
	__u16 dport = bpf_ntohs(tuple->dport), sport = bpf_ntohs(tuple->sport);

	if (dir == NAT_DIR_EGRESS && !target->src_from_world && sport < NAT_MIN_EGRESS)
		return true;
	if (dir == NAT_DIR_INGRESS && (dport < target->min_port || dport > target->max_port))
		return true;
	return false;
}

/* Main IPv4 SNAT entry point: parse the L3/L4 headers into a CT tuple, look
 * up or create the mapping and rewrite the relevant addresses, ports and
 * checksums.
 */
static __always_inline int snat_v4_process(struct __sk_buff *skb, int dir,
					   const struct ipv4_nat_target *target)
{
	struct ipv4_nat_entry *state, tmp;
	struct ipv4_ct_tuple tuple = {};
	struct icmphdr icmphdr;
	void *data, *data_end;
	struct iphdr *ip4;
	struct {
		__be16 sport;
		__be16 dport;
	} l4hdr;
	__u32 off;
	int ret;

	build_bug_on(sizeof(struct ipv4_nat_entry) > 64);

	if (!revalidate_data(skb, &data, &data_end, &ip4))
		return DROP_INVALID;

	tuple.nexthdr = ip4->protocol;
	tuple.daddr = ip4->daddr;
	tuple.saddr = ip4->saddr;
	tuple.flags = dir;
	off = ((void *)ip4 - data) + ipv4_hdrlen(ip4);
	switch (tuple.nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		if (skb_load_bytes(skb, off, &l4hdr, sizeof(l4hdr)) < 0)
			return DROP_INVALID;
		tuple.dport = l4hdr.dport;
		tuple.sport = l4hdr.sport;
		break;
	case IPPROTO_ICMP:
		if (skb_load_bytes(skb, off, &icmphdr, sizeof(icmphdr)) < 0)
			return DROP_INVALID;
		if (icmphdr.type != ICMP_ECHO &&
		    icmphdr.type != ICMP_ECHOREPLY)
			return DROP_NAT_UNSUPP_PROTO;
		if (dir == NAT_DIR_EGRESS) {
			tuple.dport = 0;
			tuple.sport = icmphdr.un.echo.id;
		} else {
			tuple.dport = icmphdr.un.echo.id;
			tuple.sport = 0;
		}
		break;
	default:
		return DROP_NAT_UNSUPP_PROTO;
	};

	if (snat_v4_can_skip(target, &tuple, dir))
		return NAT_PUNT_TO_STACK;
	ret = snat_v4_handle_mapping(skb, &tuple, &state, &tmp, dir, off, target);
	if (ret > 0)
		return TC_ACT_OK;
	if (ret < 0)
		return ret;

	return dir == NAT_DIR_EGRESS ?
	       snat_v4_rewrite_egress(skb, &tuple, state, off) :
	       snat_v4_rewrite_ingress(skb, &tuple, state, off);
}
#else
static __always_inline int snat_v4_process(struct __sk_buff *skb, int dir,
					   const struct ipv4_nat_target *target)
{
	return TC_ACT_OK;
}

static __always_inline void snat_v4_delete_tuples(struct ipv4_ct_tuple *tuple)
{
}
#endif

struct ipv6_nat_entry {
	struct nat_entry common;
	union {
		struct {
			union v6addr to_saddr;
			__be16       to_sport;
		};
		struct {
			union v6addr to_daddr;
			__be16       to_dport;
		};
	};
};

struct ipv6_nat_target {
	union v6addr addr;
	const __u16 min_port; /* host endianness */
	const __u16 max_port; /* host endianness */
	bool src_from_world;
};

#if defined ENABLE_IPV6 && (defined ENABLE_MASQUERADE || defined ENABLE_NODEPORT)
struct bpf_elf_map __section_maps SNAT_MAPPING_IPV6 = {
	.type		= NAT_MAP_TYPE,
	.size_key	= sizeof(struct ipv6_ct_tuple),
	.size_value	= sizeof(struct ipv6_nat_entry),
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= SNAT_MAPPING_IPV6_SIZE,
#ifndef HAVE_LRU_MAP_TYPE
	.flags		= CONDITIONAL_PREALLOC,
#endif
};

static __always_inline
struct ipv6_nat_entry *snat_v6_lookup(struct ipv6_ct_tuple *tuple)
{
	return __snat_lookup(&SNAT_MAPPING_IPV6, tuple);
}

static __always_inline int snat_v6_update(struct ipv6_ct_tuple *otuple,
					  struct ipv6_nat_entry *ostate,
					  struct ipv6_ct_tuple *rtuple,
					  struct ipv6_nat_entry *rstate)
{
	return __snat_update(&SNAT_MAPPING_IPV6, otuple, ostate,
			     rtuple, rstate);
}

static __always_inline void snat_v6_delete(struct ipv6_ct_tuple *otuple,
					   struct ipv6_ct_tuple *rtuple)
{
	__snat_delete(&SNAT_MAPPING_IPV6, otuple, rtuple);
}

static __always_inline void snat_v6_swap_tuple(struct ipv6_ct_tuple *otuple,
					       struct ipv6_ct_tuple *rtuple)
{
	__builtin_memset(rtuple, 0, sizeof(*rtuple));
	rtuple->nexthdr = otuple->nexthdr;
	rtuple->daddr = otuple->saddr;
	rtuple->saddr = otuple->daddr;
	rtuple->dport = otuple->sport;
	rtuple->sport = otuple->dport;
	rtuple->flags = otuple->flags == NAT_DIR_EGRESS ?
			NAT_DIR_INGRESS : NAT_DIR_EGRESS;
}

static __always_inline int snat_v6_reverse_tuple(struct ipv6_ct_tuple *otuple,
						 struct ipv6_ct_tuple *rtuple)
{
	struct ipv6_nat_entry *ostate;

	ostate = snat_v6_lookup(otuple);
	if (ostate) {
		snat_v6_swap_tuple(otuple, rtuple);
		rtuple->daddr = ostate->to_saddr;
		rtuple->dport = ostate->to_sport;
	}

	return ostate ? 0 : -1;
}

static __always_inline void snat_v6_ct_canonicalize(struct ipv6_ct_tuple *otuple)
{
	union v6addr addr = {};

	otuple->flags = NAT_DIR_EGRESS;
	/* Workaround #5848. */
	ipv6_addr_copy(&addr, &otuple->saddr);
	ipv6_addr_copy(&otuple->saddr, &otuple->daddr);
	ipv6_addr_copy(&otuple->daddr, &addr);
}

static __always_inline void snat_v6_delete_tuples(struct ipv6_ct_tuple *otuple)
{
	struct ipv6_ct_tuple rtuple;

	if (otuple->flags & TUPLE_F_IN)
		return;
	snat_v6_ct_canonicalize(otuple);
	if (!snat_v6_reverse_tuple(otuple, &rtuple))
		snat_v6_delete(otuple, &rtuple);
}

/* Allocate a new SNAT mapping: pick a random source port from the target
 * range, retry linearly on collision, and signal the agent via
 * send_signal_nat_fill_up() once more than SNAT_SIGNAL_THRES retries were
 * needed.
 */
static __always_inline int snat_v6_new_mapping(struct __sk_buff *skb,
					       struct ipv6_ct_tuple *otuple,
					       struct ipv6_nat_entry *ostate,
					       const struct ipv6_nat_target *target)
{
	int ret = DROP_NAT_NO_MAPPING, retries;
	struct ipv6_nat_entry rstate;
	struct ipv6_ct_tuple rtuple;
	__u16 port;

	__builtin_memset(&rstate, 0, sizeof(rstate));
	__builtin_memset(ostate, 0, sizeof(*ostate));

	rstate.to_daddr = otuple->saddr;
	rstate.to_dport = otuple->sport;

	ostate->to_saddr = target->addr;

	snat_v6_swap_tuple(otuple, &rtuple);
	port = __snat_clamp_port_range(target->min_port,
				       target->max_port,
				       get_prandom_u32());

	rtuple.dport = ostate->to_sport = bpf_htons(port);
	rtuple.daddr = target->addr;

	if (!ipv6_addrcmp(&otuple->saddr, &rtuple.daddr)) {
		ostate->common.host_local = 1;
		rstate.common.host_local = ostate->common.host_local;
	}

#pragma unroll
	for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) {
		if (!snat_v6_lookup(&rtuple)) {
			ostate->common.created = bpf_ktime_get_nsec();
			rstate.common.created = ostate->common.created;

			ret = snat_v6_update(otuple, ostate, &rtuple, &rstate);
			if (!ret)
				break;
		}

		port = __snat_clamp_port_range(target->min_port,
					       target->max_port,
					       port + 1);
		rtuple.dport = ostate->to_sport = bpf_htons(port);
	}

	if (retries > SNAT_SIGNAL_THRES)
		send_signal_nat_fill_up(skb, SIGNAL_NAT_PROTO_V6);
	return !ret ? 0 : DROP_NAT_NO_MAPPING;
}

static __always_inline int snat_v6_track_local(struct __sk_buff *skb,
					       struct ipv6_ct_tuple *tuple,
					       struct ipv6_nat_entry *state,
					       int dir, __u32 off,
					       const struct ipv6_nat_target *target)
{
	struct ct_state ct_state;
	struct ipv6_ct_tuple tmp;
	bool needs_ct = false;
	__u32 monitor = 0;
	int ret, where;

	if (state && state->common.host_local) {
		needs_ct = true;
	} else if (!state && dir == NAT_DIR_EGRESS) {
		if (!ipv6_addrcmp(&tuple->saddr, (void *)&target->addr))
			needs_ct = true;
	}
	if (!needs_ct)
		return 0;

	__builtin_memset(&ct_state, 0, sizeof(ct_state));
	__builtin_memcpy(&tmp, tuple, sizeof(tmp));

	where = dir == NAT_DIR_INGRESS ? CT_INGRESS : CT_EGRESS;

	ret = ct_lookup6(get_ct_map6(&tmp), &tmp, skb, off, where,
			 &ct_state, &monitor);
	if (ret < 0) {
		return ret;
	} else if (ret == CT_NEW) {
		ret = ct_create6(get_ct_map6(&tmp), &tmp, skb, where,
				 &ct_state, false);
		if (IS_ERR(ret))
			return ret;
	}

	return 0;
}

static __always_inline int snat_v6_handle_mapping(struct __sk_buff *skb,
						  struct ipv6_ct_tuple *tuple,
						  struct ipv6_nat_entry **state,
						  struct ipv6_nat_entry *tmp,
						  int dir, __u32 off,
						  const struct ipv6_nat_target *target)
{
	int ret;

	*state = snat_v6_lookup(tuple);
	ret = snat_v6_track_local(skb, tuple, *state, dir, off, target);
	if (ret < 0)
		return ret;
	else if (*state)
		return NAT_CONTINUE_XLATE;
	else if (dir == NAT_DIR_INGRESS)
		return tuple->nexthdr != IPPROTO_ICMPV6 &&
		       bpf_ntohs(tuple->dport) < target->min_port ?
		       NAT_PUNT_TO_STACK : DROP_NAT_NO_MAPPING;
	else
		return snat_v6_new_mapping(skb, tuple, (*state = tmp), target);
}

static __always_inline int snat_v6_rewrite_egress(struct __sk_buff *skb,
						  struct ipv6_ct_tuple *tuple,
						  struct ipv6_nat_entry *state,
						  __u32 off)
{
	struct csum_offset csum = {};
	__be32 sum;
	int ret;

	if (!ipv6_addrcmp(&state->to_saddr, &tuple->saddr) &&
	    state->to_sport == tuple->sport)
		return 0;
	sum = csum_diff(&tuple->saddr, 16, &state->to_saddr, 16, 0);
	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
	if (state->to_sport != tuple->sport) {
		switch (tuple->nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
			ret = l4_modify_port(skb, off, offsetof(struct tcphdr, source),
					     &csum, state->to_sport, tuple->sport);
			if (ret < 0)
				return ret;
			break;
		case IPPROTO_ICMPV6: {
			__be32 from, to;

			if (skb_store_bytes(skb, off +
					    offsetof(struct icmp6hdr,
						     icmp6_dataun.u_echo.identifier),
					    &state->to_sport,
					    sizeof(state->to_sport), 0) < 0)
				return DROP_WRITE_ERROR;
			from = tuple->sport;
			to = state->to_sport;
			sum = csum_diff(&from, 4, &to, 4, sum);
			break;
		}}
	}
	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct ipv6hdr, saddr),
			    &state->to_saddr, 16, 0) < 0)
		return DROP_WRITE_ERROR;
	if (csum.offset &&
	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;
	return 0;
}

static __always_inline int snat_v6_rewrite_ingress(struct __sk_buff *skb,
						   struct ipv6_ct_tuple *tuple,
						   struct ipv6_nat_entry *state,
						   __u32 off)
{
	struct csum_offset csum = {};
	__be32 sum;
	int ret;

	if (!ipv6_addrcmp(&state->to_daddr, &tuple->daddr) &&
	    state->to_dport == tuple->dport)
		return 0;
	sum = csum_diff(&tuple->daddr, 16, &state->to_daddr, 16, 0);
	csum_l4_offset_and_flags(tuple->nexthdr, &csum);
	if (state->to_dport != tuple->dport) {
		switch (tuple->nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
			ret = l4_modify_port(skb, off,
					     offsetof(struct tcphdr, dest),
					     &csum, state->to_dport,
					     tuple->dport);
			if (ret < 0)
				return ret;
			break;
		case IPPROTO_ICMPV6: {
			__be32 from, to;

			if (skb_store_bytes(skb, off +
					    offsetof(struct icmp6hdr,
						     icmp6_dataun.u_echo.identifier),
					    &state->to_dport,
					    sizeof(state->to_dport), 0) < 0)
				return DROP_WRITE_ERROR;
			from = tuple->dport;
			to = state->to_dport;
			sum = csum_diff(&from, 4, &to, 4, sum);
			break;
		}}
	}
	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct ipv6hdr, daddr),
			    &state->to_daddr, 16, 0) < 0)
		return DROP_WRITE_ERROR;
	if (csum.offset &&
	    csum_l4_replace(skb, off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;
	return 0;
}

static __always_inline bool snat_v6_can_skip(const struct ipv6_nat_target *target,
					     const struct ipv6_ct_tuple *tuple, int dir)
{
	__u16 dport = bpf_ntohs(tuple->dport), sport = bpf_ntohs(tuple->sport);

	if (dir == NAT_DIR_EGRESS && !target->src_from_world && sport < NAT_MIN_EGRESS)
		return true;
	if (dir == NAT_DIR_INGRESS && (dport < target->min_port || dport > target->max_port))
		return true;
	return false;
}

/* Main IPv6 SNAT entry point: parse the L3/L4 headers into a CT tuple, look
 * up or create the mapping and rewrite the relevant addresses, ports and
 * checksums.
 */
static __always_inline int snat_v6_process(struct __sk_buff *skb, int dir,
					   const struct ipv6_nat_target *target)
{
	struct ipv6_nat_entry *state, tmp;
	struct ipv6_ct_tuple tuple = {};
	struct icmp6hdr icmp6hdr;
	void *data, *data_end;
	struct ipv6hdr *ip6;
	int ret, hdrlen;
	struct {
		__be16 sport;
		__be16 dport;
	} l4hdr;
	__u8 nexthdr;
	__u32 off;

	build_bug_on(sizeof(struct ipv6_nat_entry) > 64);

	if (!revalidate_data(skb, &data, &data_end, &ip6))
		return DROP_INVALID;

	nexthdr = ip6->nexthdr;
	hdrlen = ipv6_hdrlen(skb, ETH_HLEN, &nexthdr);
	if (hdrlen < 0)
		return hdrlen;

	tuple.nexthdr = nexthdr;
	ipv6_addr_copy(&tuple.daddr, (union v6addr *)&ip6->daddr);
	ipv6_addr_copy(&tuple.saddr, (union v6addr *)&ip6->saddr);
	tuple.flags = dir;
	off = ((void *)ip6 - data) + hdrlen;
	switch (tuple.nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		if (skb_load_bytes(skb, off, &l4hdr, sizeof(l4hdr)) < 0)
			return DROP_INVALID;
		tuple.dport = l4hdr.dport;
		tuple.sport = l4hdr.sport;
		break;
	case IPPROTO_ICMPV6:
		if (skb_load_bytes(skb, off, &icmp6hdr, sizeof(icmp6hdr)) < 0)
			return DROP_INVALID;
		/* Letting neighbor solicitation / advertisement pass through. */
		if (icmp6hdr.icmp6_type == 135 || icmp6hdr.icmp6_type == 136)
			return TC_ACT_OK;
		if (icmp6hdr.icmp6_type != ICMPV6_ECHO_REQUEST &&
		    icmp6hdr.icmp6_type != ICMPV6_ECHO_REPLY)
			return DROP_NAT_UNSUPP_PROTO;
		if (dir == NAT_DIR_EGRESS) {
			tuple.dport = 0;
			tuple.sport = icmp6hdr.icmp6_dataun.u_echo.identifier;
		} else {
			tuple.dport = icmp6hdr.icmp6_dataun.u_echo.identifier;
			tuple.sport = 0;
		}
		break;
	default:
		return DROP_NAT_UNSUPP_PROTO;
	};

	if (snat_v6_can_skip(target, &tuple, dir))
		return NAT_PUNT_TO_STACK;
	ret = snat_v6_handle_mapping(skb, &tuple, &state, &tmp, dir, off, target);
	if (ret > 0)
		return TC_ACT_OK;
	if (ret < 0)
		return ret;

	return dir == NAT_DIR_EGRESS ?
	       snat_v6_rewrite_egress(skb, &tuple, state, off) :
	       snat_v6_rewrite_ingress(skb, &tuple, state, off);
}
#else
static __always_inline int snat_v6_process(struct __sk_buff *skb, int dir,
					   const struct ipv6_nat_target *target)
{
	return TC_ACT_OK;
}

static __always_inline void snat_v6_delete_tuples(struct ipv6_ct_tuple *tuple)
{
}
#endif

#ifdef CONNTRACK
/* Delete a CT entry and, on success, also remove any SNAT mapping that was
 * created for the same flow.
 */
static __always_inline void ct_delete4(void *map, struct ipv4_ct_tuple *tuple,
				       struct __sk_buff *skb)
{
	int err;

	if ((err = map_delete_elem(map, tuple)) < 0)
		cilium_dbg(skb, DBG_ERROR_RET, BPF_FUNC_map_delete_elem, err);
	else
		snat_v4_delete_tuples(tuple);
}

static __always_inline void ct_delete6(void *map, struct ipv6_ct_tuple *tuple,
				       struct __sk_buff *skb)
{
	int err;

	if ((err = map_delete_elem(map, tuple)) < 0)
		cilium_dbg(skb, DBG_ERROR_RET, BPF_FUNC_map_delete_elem, err);
	else
		snat_v6_delete_tuples(tuple);
}
#else
static __always_inline void ct_delete4(void *map, struct ipv4_ct_tuple *tuple,
				       struct __sk_buff *skb)
{
}

static __always_inline void ct_delete6(void *map, struct ipv6_ct_tuple *tuple,
				       struct __sk_buff *skb)
{
}
#endif

/* Top-level masquerading hook: dispatches to the IPv4/IPv6 SNAT engine based
 * on the packet's ethertype when ENABLE_MASQUERADE is defined.
 */
static __always_inline int snat_process(struct __sk_buff *skb, int dir)
{
	int ret = TC_ACT_OK;

#ifdef ENABLE_MASQUERADE
	switch (skb->protocol) {
#ifdef ENABLE_IPV4
	case bpf_htons(ETH_P_IP): {
		struct ipv4_nat_target target = {
			.min_port = SNAT_MAPPING_MIN_PORT,
			.max_port = SNAT_MAPPING_MAX_PORT,
			.addr = SNAT_IPV4_EXTERNAL,
		};
		ret = snat_v4_process(skb, dir, &target);
		break; }
#endif
#ifdef ENABLE_IPV6
	case bpf_htons(ETH_P_IPV6): {
		struct ipv6_nat_target target = {
			.min_port = SNAT_MAPPING_MIN_PORT,
			.max_port = SNAT_MAPPING_MAX_PORT,
		};
		BPF_V6(target.addr, SNAT_IPV6_EXTERNAL);
		ret = snat_v6_process(skb, dir, &target);
		break; }
#endif
	}
	if (IS_ERR(ret))
		return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, dir);
#endif
	return ret;
}
#endif /* __LIB_NAT__ */
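
/* Usage sketch (not part of the original header): a minimal illustration of
 * how a TC program that includes this file could invoke snat_process() with
 * the direction of the hook it is attached to. The section name and program
 * name below are hypothetical; the actual callers are Cilium's host-device
 * programs, and SNAT_IPV4_EXTERNAL plus the port-range macros are expected
 * to come from the generated node configuration.
 *
 *	__section("sketch-snat-egress")
 *	int sketch_snat_egress(struct __sk_buff *skb)
 *	{
 *		// With ENABLE_MASQUERADE set, this rewrites the source
 *		// address/port of outbound packets to the configured external
 *		// address; it returns TC_ACT_OK on success or when NAT does
 *		// not apply, and a drop action on error.
 *		return snat_process(skb, NAT_DIR_EGRESS);
 *	}
 */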