github.com/cilium/cilium@v1.16.2/bpf/lib/conntrack.h

/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright Authors of Cilium */

#pragma once

#include <linux/icmpv6.h>
#include <linux/icmp.h>

#include "common.h"
#include "utils.h"
#include "ipv4.h"
#include "ipv6.h"
#include "dbg.h"
#include "l4.h"
#include "signal.h"

enum ct_action {
	ACTION_UNSPEC,
	ACTION_CREATE,
	ACTION_CLOSE,
};

enum ct_scope {
	SCOPE_FORWARD,
	SCOPE_REVERSE,
	SCOPE_BIDIR,
};

enum ct_entry_type {
	CT_ENTRY_ANY		= 0,
	CT_ENTRY_NODEPORT	= (1 << 0),
	CT_ENTRY_DSR		= (1 << 1),
	CT_ENTRY_SVC		= (1 << 2),
};

#ifdef ENABLE_IPV4
struct ct_buffer4 {
	struct ipv4_ct_tuple tuple;
	struct ct_state ct_state;
	__u32 monitor;
	int ret;
	int l4_off;
};
#endif

#ifdef ENABLE_IPV6
struct ct_buffer6 {
	struct ipv6_ct_tuple tuple;
	struct ct_state ct_state;
	__u32 monitor;
	int ret;
	int l4_off;
};
#endif

static __always_inline enum ct_action ct_tcp_select_action(union tcp_flags flags)
{
	if (unlikely(flags.value & (TCP_FLAG_RST | TCP_FLAG_FIN)))
		return ACTION_CLOSE;

	if (unlikely(flags.value & TCP_FLAG_SYN))
		return ACTION_CREATE;

	return ACTION_UNSPEC;
}
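
/* Illustrative example (not part of the upstream file): how a packet's TCP
 * flags map to a CT action in ct_tcp_select_action(). The flag values below
 * are hypothetical inputs:
 *
 *	union tcp_flags flags = { .value = TCP_FLAG_SYN };
 *	ct_tcp_select_action(flags);			// -> ACTION_CREATE
 *
 *	flags.value = TCP_FLAG_ACK;
 *	ct_tcp_select_action(flags);			// -> ACTION_UNSPEC
 *
 *	flags.value = TCP_FLAG_FIN | TCP_FLAG_ACK;
 *	ct_tcp_select_action(flags);			// -> ACTION_CLOSE
 *
 * RST/FIN are checked before SYN, so a segment carrying both SYN and RST is
 * treated as ACTION_CLOSE.
 */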

static __always_inline bool ct_entry_seen_both_syns(const struct ct_entry *entry)
{
	bool rx_syn = entry->rx_flags_seen & TCP_FLAG_SYN;
	bool tx_syn = entry->tx_flags_seen & TCP_FLAG_SYN;

	return rx_syn && tx_syn;
}

/**
 * Update the CT timeout and TCP flags for the specified entry.
 *
 * We track the OR'd accumulation of seen tcp flags in the entry, and the
 * last time that a notification was sent. Multiple CPUs may enter this
 * function with packets for the same connection, in which case it is possible
 * for the CPUs to race to update the entry. In such a case, the critical
 * update section may be entered in quick succession, leading to multiple
 * updates of the entry and returning true for each CPU. The BPF architecture
 * guarantees that entire 8-bit or 32-bit values will be set within the entry,
 * so although the CPUs may race, the worst result is that multiple executions
 * of this function return non-zero for the same connection within short
 * succession, leading to multiple trace notifications being sent when one
 * might otherwise expect such notifications to be aggregated.
 *
 * Returns how many bytes of the packet should be monitored:
 * - Zero if this flow was recently monitored.
 * - Non-zero if this flow has not been monitored recently.
 */
static __always_inline __u32 __ct_update_timeout(struct ct_entry *entry,
						 __u32 lifetime, enum ct_dir dir,
						 union tcp_flags flags,
						 __u8 report_mask)
{
	__u32 now = bpf_mono_now();
	__u8 accumulated_flags;
	__u8 seen_flags = flags.lower_bits & report_mask;
	__u32 last_report;

	WRITE_ONCE(entry->lifetime, now + lifetime);

	if (dir == CT_INGRESS) {
		accumulated_flags = READ_ONCE(entry->rx_flags_seen);
		last_report = READ_ONCE(entry->last_rx_report);
	} else {
		accumulated_flags = READ_ONCE(entry->tx_flags_seen);
		last_report = READ_ONCE(entry->last_tx_report);
	}
	seen_flags |= accumulated_flags;

	/* It's possible for multiple CPUs to execute the branch statement here
	 * one after another, before the first CPU is able to execute the entry
	 * modifications within this branch. This is somewhat unlikely because
	 * packets for the same connection are typically steered towards the
	 * same CPU, but is possible in theory.
	 *
	 * If the branch is taken by multiple CPUs because of '*last_report',
	 * then this merely causes multiple notifications to be sent after
	 * CT_REPORT_INTERVAL rather than a single notification. '*last_report'
	 * will be updated by all CPUs and subsequent checks should not take
	 * this branch until the next CT_REPORT_INTERVAL. As such, the trace
	 * aggregation that uses the result of this function may reduce the
	 * number of packets per interval to a small integer value (max N_CPUS)
	 * rather than 1 notification per packet throughout the interval.
	 *
	 * Similar behaviour may happen with tcp_flags. The worst case race
	 * here would be that two or more CPUs argue over which flags have been
	 * seen and overwrite each other, with each CPU interleaving different
	 * values for which flags were seen. In practice, realistic connections
	 * are likely to progressively set SYN, ACK, then much later perhaps
	 * FIN and/or RST. Furthermore, unless such a traffic pattern were
	 * constantly received, this should self-correct as the stored
	 * tcp_flags is an OR'd set of flags and each time the above code is
	 * executed, it pulls the latest set of accumulated flags. Therefore
	 * even in the worst case such a conflict is likely only to cause a
	 * small number of additional notifications, which is still likely to
	 * be significantly less under this MONITOR_AGGREGATION mode than would
	 * otherwise be sent if the MONITOR_AGGREGATION level is set to none
	 * (ie, sending a notification for every packet).
	 */
	if (last_report + bpf_sec_to_mono(CT_REPORT_INTERVAL) < now ||
	    accumulated_flags != seen_flags) {
		/* verifier workaround: we don't use reference here. */
		if (dir == CT_INGRESS) {
			WRITE_ONCE(entry->rx_flags_seen, seen_flags);
			WRITE_ONCE(entry->last_rx_report, now);
		} else {
			WRITE_ONCE(entry->tx_flags_seen, seen_flags);
			WRITE_ONCE(entry->last_tx_report, now);
		}
		return TRACE_PAYLOAD_LEN;
	}
	return 0;
}
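
/* Illustrative example (not part of the upstream file): how a caller might
 * interpret the monitor length returned above. Within CT_REPORT_INTERVAL and
 * with no new TCP flags, the function returns 0 and the trace notification
 * can be aggregated away; otherwise it returns TRACE_PAYLOAD_LEN. Roughly:
 *
 *	__u32 monitor = __ct_update_timeout(entry, lifetime, dir, flags,
 *					    CT_REPORT_FLAGS);
 *	if (monitor)
 *		send_trace_notify(ctx, obs_point, ..., monitor);
 *
 * send_trace_notify() is mentioned only as an example of a consumer of the
 * returned length; see trace.h for its actual interface.
 */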

/**
 * Update the CT timeout for the specified entry, based on protocol, direction
 * and the TCP flags seen so far.
 *
 * Returns the result of __ct_update_timeout(): non-zero (the number of bytes
 * to monitor) if the flow should be reported again, zero if it was reported
 * recently.
 */
static __always_inline __u32 ct_update_timeout(struct ct_entry *entry,
					       bool tcp, enum ct_dir dir,
					       union tcp_flags seen_flags)
{
	__u32 lifetime = dir == CT_SERVICE ?
			 bpf_sec_to_mono(CT_SERVICE_LIFETIME_NONTCP) :
			 bpf_sec_to_mono(CT_CONNECTION_LIFETIME_NONTCP);
	bool syn = seen_flags.value & TCP_FLAG_SYN;

	if (tcp) {
		entry->seen_non_syn |= !syn;
		if (entry->seen_non_syn) {
			lifetime = dir == CT_SERVICE ?
				   bpf_sec_to_mono(CT_SERVICE_LIFETIME_TCP) :
				   bpf_sec_to_mono(CT_CONNECTION_LIFETIME_TCP);
		} else {
			lifetime = bpf_sec_to_mono(CT_SYN_TIMEOUT);
		}
	}

	return __ct_update_timeout(entry, lifetime, dir, seen_flags,
				   CT_REPORT_FLAGS);
}

static __always_inline void
ct_lookup_fill_state(struct ct_state *state, const struct ct_entry *entry,
		     enum ct_dir dir, bool syn)
{
	state->rev_nat_index = entry->rev_nat_index;
	if (dir == CT_SERVICE) {
		state->backend_id = entry->backend_id;
		state->syn = syn;
	} else if (dir == CT_INGRESS || dir == CT_EGRESS) {
#ifndef DISABLE_LOOPBACK_LB
		state->loopback = entry->lb_loopback;
#endif
		state->node_port = entry->node_port;
		state->dsr_internal = entry->dsr_internal;
		state->proxy_redirect = entry->proxy_redirect;
		state->from_l7lb = entry->from_l7lb;
		state->from_tunnel = entry->from_tunnel;
#ifndef HAVE_FIB_IFINDEX
		state->ifindex = entry->ifindex;
#endif
	}
}

static __always_inline void ct_reset_seen_flags(struct ct_entry *entry)
{
	entry->rx_flags_seen = 0;
	entry->tx_flags_seen = 0;
}

static __always_inline void ct_reset_closing(struct ct_entry *entry)
{
	entry->rx_closing = 0;
	entry->tx_closing = 0;
}

static __always_inline bool ct_entry_alive(const struct ct_entry *entry)
{
	return !entry->rx_closing || !entry->tx_closing;
}

static __always_inline bool ct_entry_closing(const struct ct_entry *entry)
{
	return entry->tx_closing || entry->rx_closing;
}

static __always_inline bool
ct_entry_expired_rebalance(const struct ct_entry *entry)
{
	__u32 wait_time = bpf_sec_to_mono(CT_SERVICE_CLOSE_REBALANCE);

	/* This doesn't check last_rx_report because we don't see closing
	 * in RX direction for CT_SERVICE.
	 */
	return READ_ONCE(entry->last_tx_report) + wait_time <= bpf_mono_now();
}

static __always_inline bool
ct_entry_matches_types(const struct ct_entry *entry __maybe_unused,
		       __u32 ct_entry_types, const struct ct_state *state)
{
	if (ct_entry_types == CT_ENTRY_ANY)
		return true;

	/* Only match CT entries that were created for the expected service: */
	if ((ct_entry_types & CT_ENTRY_SVC) &&
	    entry->rev_nat_index == state->rev_nat_index)
		return true;

#ifdef ENABLE_NODEPORT
	if ((ct_entry_types & CT_ENTRY_NODEPORT) &&
	    entry->node_port && entry->rev_nat_index) {
		if (!state || !state->rev_nat_index)
			return true;

		/* Only match CT entries that were created for the expected service: */
		if (entry->rev_nat_index == state->rev_nat_index)
			return true;
	}

# ifdef ENABLE_DSR
	if ((ct_entry_types & CT_ENTRY_DSR) && entry->dsr_internal)
		return true;
# endif
#endif

	return false;
}

/**
 * Returns CT_NEW or CT_ESTABLISHED.
 * 'ct_state', if not nullptr, will be filled in only if CT_ESTABLISHED is returned.
 */
static __always_inline enum ct_status
__ct_lookup(const void *map, struct __ctx_buff *ctx, const void *tuple,
	    enum ct_action action, enum ct_dir dir, __u32 ct_entry_types,
	    struct ct_state *ct_state, bool is_tcp, union tcp_flags seen_flags,
	    __u32 *monitor)
{
	bool syn = seen_flags.value & TCP_FLAG_SYN;
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (entry) {
		if (!ct_entry_matches_types(entry, ct_entry_types, ct_state))
			goto ct_new;

		cilium_dbg(ctx, DBG_CT_MATCH, entry->lifetime, entry->rev_nat_index);
		if (dir == CT_SERVICE && syn &&
		    ct_entry_closing(entry) &&
		    ct_entry_expired_rebalance(entry))
			goto ct_new;

		if (ct_entry_alive(entry))
			*monitor = ct_update_timeout(entry, is_tcp, dir, seen_flags);

		/* For backward-compatibility we need to update reverse NAT
		 * index in the CT_SERVICE entry for old connections.
		 */
		if (dir == CT_SERVICE && entry->rev_nat_index == 0)
			entry->rev_nat_index = ct_state->rev_nat_index;

#ifdef CONNTRACK_ACCOUNTING
		__sync_fetch_and_add(&entry->packets, 1);
		__sync_fetch_and_add(&entry->bytes, ctx_full_len(ctx));
#endif
		switch (action) {
		case ACTION_CREATE:
			if (unlikely(ct_entry_closing(entry))) {
				ct_reset_closing(entry);
				ct_reset_seen_flags(entry);
				entry->seen_non_syn = false;

				*monitor = ct_update_timeout(entry, is_tcp, dir, seen_flags);

				/* Return CT_NEW so that the caller creates a new entry instead of
				 * updating the old one. (For policy drops the old entry remains.)
				 */
				return CT_NEW;
			}
			break;

		case ACTION_CLOSE:
			switch (dir) {
			case CT_SERVICE:
				/* We only see the forward direction. Close both
				 * directions to make the ct_entry_alive() check
				 * below behave as expected.
				 */
				entry->rx_closing = 1;
				entry->tx_closing = 1;
				break;
			default:
				/* If we got an RST and have not seen both SYNs,
				 * terminate the connection.
				 */
				if (!ct_entry_seen_both_syns(entry) &&
				    (seen_flags.value & TCP_FLAG_RST)) {
					entry->rx_closing = 1;
					entry->tx_closing = 1;
				} else if (dir == CT_INGRESS) {
					entry->rx_closing = 1;
				} else {
					entry->tx_closing = 1;
				}
			}
			if (ct_state)
				ct_state->closing = 1;

			*monitor = TRACE_PAYLOAD_LEN;
			if (ct_entry_alive(entry))
				break;
			__ct_update_timeout(entry, bpf_sec_to_mono(CT_CLOSE_TIMEOUT),
					    dir, seen_flags, CT_REPORT_FLAGS);
			break;
		default:
			break;
		}

		/* Fill ct_state after all potential CT_NEW returns. */
		if (ct_state)
			ct_lookup_fill_state(ct_state, entry, dir, syn);

		return CT_ESTABLISHED;
	}

ct_new: __maybe_unused;
	*monitor = TRACE_PAYLOAD_LEN;
	return CT_NEW;
}

static __always_inline __u8
ct_lookup_select_tuple_type(enum ct_dir dir, enum ct_scope scope)
{
	if (dir == CT_SERVICE)
		return TUPLE_F_SERVICE;

	switch (scope) {
	case SCOPE_FORWARD:
		return (dir == CT_EGRESS) ? TUPLE_F_OUT : TUPLE_F_IN;
	case SCOPE_BIDIR:
		/* Due to policy requirements, RELATED or REPLY state takes
		 * precedence over ESTABLISHED. So lookup in reverse direction first:
		 */
	case SCOPE_REVERSE:
		return (dir == CT_EGRESS) ? TUPLE_F_IN : TUPLE_F_OUT;
	}
}
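
/* Illustrative examples (not part of the upstream file) of the tuple type
 * selected above:
 *
 *	ct_lookup_select_tuple_type(CT_SERVICE, SCOPE_FORWARD) == TUPLE_F_SERVICE
 *	ct_lookup_select_tuple_type(CT_EGRESS, SCOPE_FORWARD)  == TUPLE_F_OUT
 *	ct_lookup_select_tuple_type(CT_EGRESS, SCOPE_BIDIR)    == TUPLE_F_IN
 *	ct_lookup_select_tuple_type(CT_INGRESS, SCOPE_FORWARD) == TUPLE_F_IN
 *	ct_lookup_select_tuple_type(CT_INGRESS, SCOPE_BIDIR)   == TUPLE_F_OUT
 *
 * For SCOPE_BIDIR this is only the flag for the first (reverse) lookup;
 * __ct_lookup4/6() reverse the tuple before the second, forward lookup.
 */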

/* The function determines whether an egress flow identified by the given
 * tuple is a reply.
 *
 * The datapath creates a CT entry in reverse order. E.g., if a pod sends a
 * request to outside, the CT entry stored in the BPF map will be TUPLE_F_IN:
 * pod => outside. So, we can leverage this fact to determine whether the given
 * flow is a reply.
 */
#define DEFINE_FUNC_CT_IS_REPLY(FAMILY)					\
static __always_inline bool						\
ct_is_reply ## FAMILY(const void *map,					\
		      struct ipv ## FAMILY ## _ct_tuple *tuple)		\
{									\
	__u8 flags = tuple->flags;					\
	bool is_reply = false;						\
									\
	tuple->flags = TUPLE_F_IN;					\
									\
	if (map_lookup_elem(map, tuple))				\
		is_reply = true;					\
									\
	/* restore initial flags */					\
	tuple->flags = flags;						\
									\
	return is_reply;						\
}

static __always_inline int
ipv6_extract_tuple(struct __ctx_buff *ctx, struct ipv6_ct_tuple *tuple)
{
	void *data, *data_end;
	struct ipv6hdr *ip6;
	int ret;

	if (!revalidate_data(ctx, &data, &data_end, &ip6))
		return DROP_INVALID;

	tuple->nexthdr = ip6->nexthdr;
	ipv6_addr_copy(&tuple->daddr, (union v6addr *)&ip6->daddr);
	ipv6_addr_copy(&tuple->saddr, (union v6addr *)&ip6->saddr);

	ret = ipv6_hdrlen(ctx, &tuple->nexthdr);
	if (ret < 0)
		return ret;

	if (unlikely(tuple->nexthdr != IPPROTO_TCP &&
#ifdef ENABLE_SCTP
		     tuple->nexthdr != IPPROTO_SCTP &&
#endif /* ENABLE_SCTP */
		     tuple->nexthdr != IPPROTO_UDP))
		return DROP_CT_UNKNOWN_PROTO;

	ret = l4_load_ports(ctx, ETH_HLEN + ret, &tuple->dport);
	if (ret < 0)
		return DROP_CT_INVALID_HDR;

	return CTX_ACT_OK;
}

static __always_inline void ct_flip_tuple_dir6(struct ipv6_ct_tuple *tuple)
{
	if (tuple->flags & TUPLE_F_IN)
		tuple->flags &= ~TUPLE_F_IN;
	else
		tuple->flags |= TUPLE_F_IN;
}

static __always_inline void
ipv6_ct_tuple_swap_addrs(struct ipv6_ct_tuple *tuple)
{
	union v6addr tmp_addr = {};

	ipv6_addr_copy(&tmp_addr, &tuple->saddr);
	ipv6_addr_copy(&tuple->saddr, &tuple->daddr);
	ipv6_addr_copy(&tuple->daddr, &tmp_addr);
}

static __always_inline void
ipv6_ct_tuple_swap_ports(struct ipv6_ct_tuple *tuple)
{
	__be16 tmp;

	/* Conntrack code uses tuples that have source and destination ports in
	 * the reversed order. Other code, such as BPF helpers and NAT, requires
	 * normal tuples that match the actual packet contents. This function
	 * converts between these two formats.
	 */
	tmp = tuple->sport;
	tuple->sport = tuple->dport;
	tuple->dport = tmp;
}

static __always_inline void
__ipv6_ct_tuple_reverse(struct ipv6_ct_tuple *tuple)
{
	ipv6_ct_tuple_swap_addrs(tuple);
	ipv6_ct_tuple_swap_ports(tuple);
}

static __always_inline void
ipv6_ct_tuple_reverse(struct ipv6_ct_tuple *tuple)
{
	__ipv6_ct_tuple_reverse(tuple);
	ct_flip_tuple_dir6(tuple);
}

static __always_inline int
ct_extract_ports6(struct __ctx_buff *ctx, int off, struct ipv6_ct_tuple *tuple)
{
	switch (tuple->nexthdr) {
	case IPPROTO_ICMPV6:
		if (1) {
			__be16 identifier = 0;
			__u8 type;

			if (ctx_load_bytes(ctx, off, &type, 1) < 0)
				return DROP_CT_INVALID_HDR;
			if ((type == ICMPV6_ECHO_REQUEST || type == ICMPV6_ECHO_REPLY) &&
			    ctx_load_bytes(ctx, off + offsetof(struct icmp6hdr,
							       icmp6_dataun.u_echo.identifier),
					   &identifier, 2) < 0)
				return DROP_CT_INVALID_HDR;

			tuple->sport = 0;
			tuple->dport = 0;

			switch (type) {
			case ICMPV6_DEST_UNREACH:
			case ICMPV6_PKT_TOOBIG:
			case ICMPV6_TIME_EXCEED:
			case ICMPV6_PARAMPROB:
				tuple->flags |= TUPLE_F_RELATED;
				break;

			case ICMPV6_ECHO_REPLY:
				tuple->sport = identifier;
				break;

			case ICMPV6_ECHO_REQUEST:
				tuple->dport = identifier;
				fallthrough;
			default:
				break;
			}
		}
		break;

	/* TCP, UDP, and SCTP all have the ports at the same location */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		/* load sport + dport into tuple */
		if (l4_load_ports(ctx, off, &tuple->dport) < 0)
			return DROP_CT_INVALID_HDR;

		break;
	default:
		/* Can't handle extension headers yet */
		return DROP_CT_UNKNOWN_PROTO;
	}

	return 0;
}

/* This defines the ct_is_reply6 function. */
DEFINE_FUNC_CT_IS_REPLY(6)
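
/* Illustrative sketch (not part of the upstream file): ct_is_reply6(), as
 * instantiated above, probes the map for a TUPLE_F_IN entry matching the
 * given tuple and restores the caller's tuple->flags afterwards. A
 * hypothetical caller, with 'map' being the relevant IPv6 CT map and the
 * tuple already in CT layout, might do:
 *
 *	struct ipv6_ct_tuple tuple = {};
 *
 *	if (ipv6_extract_tuple(ctx, &tuple) != CTX_ACT_OK)
 *		return DROP_INVALID;
 *	if (ct_is_reply6(map, &tuple)) {
 *		// the egress packet belongs to a reply of a known connection
 *	}
 */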

static __always_inline int
__ct_lookup6(const void *map, struct ipv6_ct_tuple *tuple, struct __ctx_buff *ctx,
	     int l4_off, enum ct_dir dir, enum ct_scope scope, __u32 ct_entry_types,
	     struct ct_state *ct_state, __u32 *monitor)
{
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags tcp_flags = { .value = 0 };
	enum ct_action action;
	enum ct_status ret;

	if (is_tcp) {
		if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0)
			return DROP_CT_INVALID_HDR;

		action = ct_tcp_select_action(tcp_flags);
	} else {
		action = ACTION_UNSPEC;
	}

	cilium_dbg3(ctx, DBG_CT_LOOKUP6_1, (__u32)tuple->saddr.p4, (__u32)tuple->daddr.p4,
		    (bpf_ntohs(tuple->sport) << 16) | bpf_ntohs(tuple->dport));
	cilium_dbg3(ctx, DBG_CT_LOOKUP6_2, (tuple->nexthdr << 8) | tuple->flags,
		    dir, scope);

	switch (scope) {
	case SCOPE_REVERSE:
	case SCOPE_BIDIR:
		/* Lookup in the reverse direction first: */
		ret = __ct_lookup(map, ctx, tuple, action, dir, ct_entry_types,
				  ct_state, is_tcp, tcp_flags, monitor);
		if (ret != CT_NEW) {
			if (unlikely(tuple->flags & TUPLE_F_RELATED))
				ret = CT_RELATED;
			else
				ret = CT_REPLY;
			goto out;
		}

		if (scope != SCOPE_BIDIR)
			goto out;

		/* now lookup in forward direction: */
		ipv6_ct_tuple_reverse(tuple);
		fallthrough;
	case SCOPE_FORWARD:
		ret = __ct_lookup(map, ctx, tuple, action, dir, ct_entry_types,
				  ct_state, is_tcp, tcp_flags, monitor);
	}

out:
	cilium_dbg(ctx, DBG_CT_VERDICT, ret,
		   ct_state ? ct_state->rev_nat_index : 0);
	return ret;
}

/* An IPv6 version of ct_lazy_lookup4. */
static __always_inline int
ct_lazy_lookup6(const void *map, struct ipv6_ct_tuple *tuple,
		struct __ctx_buff *ctx, int l4_off, enum ct_dir dir,
		enum ct_scope scope, __u32 ct_entry_types,
		struct ct_state *ct_state, __u32 *monitor)
{
	tuple->flags = ct_lookup_select_tuple_type(dir, scope);

	return __ct_lookup6(map, tuple, ctx, l4_off, dir, scope,
			    ct_entry_types, ct_state, monitor);
}

/* Offset must point to IPv6 */
static __always_inline int ct_lookup6(const void *map,
				      struct ipv6_ct_tuple *tuple,
				      struct __ctx_buff *ctx, int l4_off,
				      enum ct_dir dir, struct ct_state *ct_state,
				      __u32 *monitor)
{
	int ret;

	tuple->flags = ct_lookup_select_tuple_type(dir, SCOPE_BIDIR);

	ret = ct_extract_ports6(ctx, l4_off, tuple);
	if (ret < 0)
		return ret;

	return __ct_lookup6(map, tuple, ctx, l4_off, dir, SCOPE_BIDIR,
			    CT_ENTRY_ANY, ct_state, monitor);
}

static __always_inline int
ipv4_extract_tuple(struct __ctx_buff *ctx, struct ipv4_ct_tuple *tuple)
{
	void *data, *data_end;
	struct iphdr *ip4;
	int ret;

	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	tuple->nexthdr = ip4->protocol;

	if (unlikely(tuple->nexthdr != IPPROTO_TCP &&
#ifdef ENABLE_SCTP
		     tuple->nexthdr != IPPROTO_SCTP &&
#endif /* ENABLE_SCTP */
		     tuple->nexthdr != IPPROTO_UDP))
		return DROP_CT_UNKNOWN_PROTO;

	tuple->daddr = ip4->daddr;
	tuple->saddr = ip4->saddr;

	ret = ipv4_load_l4_ports(ctx, ip4, ETH_HLEN + ipv4_hdrlen(ip4),
				 CT_EGRESS, &tuple->dport, NULL);
	if (ret < 0)
		return ret;

	return CTX_ACT_OK;
}

static __always_inline void ct_flip_tuple_dir4(struct ipv4_ct_tuple *tuple)
{
	if (tuple->flags & TUPLE_F_IN)
		tuple->flags &= ~TUPLE_F_IN;
	else
		tuple->flags |= TUPLE_F_IN;
}

static __always_inline void
ipv4_ct_tuple_swap_addrs(struct ipv4_ct_tuple *tuple)
{
	__be32 tmp_addr = tuple->saddr;

	tuple->saddr = tuple->daddr;
	tuple->daddr = tmp_addr;
}

static __always_inline void
ipv4_ct_tuple_swap_ports(struct ipv4_ct_tuple *tuple)
{
	__be16 tmp;

	/* Conntrack code uses tuples that have source and destination ports in
	 * the reversed order. Other code, such as BPF helpers and NAT, requires
	 * normal tuples that match the actual packet contents. This function
	 * converts between these two formats.
	 */
	tmp = tuple->sport;
	tuple->sport = tuple->dport;
	tuple->dport = tmp;
}

static __always_inline void
__ipv4_ct_tuple_reverse(struct ipv4_ct_tuple *tuple)
{
	ipv4_ct_tuple_swap_addrs(tuple);
	ipv4_ct_tuple_swap_ports(tuple);
}

static __always_inline void
ipv4_ct_tuple_reverse(struct ipv4_ct_tuple *tuple)
{
	__ipv4_ct_tuple_reverse(tuple);
	ct_flip_tuple_dir4(tuple);
}

static __always_inline __be32
ipv4_ct_reverse_tuple_saddr(const struct ipv4_ct_tuple *rtuple)
{
	return rtuple->daddr;
}

static __always_inline __be32
ipv4_ct_reverse_tuple_daddr(const struct ipv4_ct_tuple *rtuple)
{
	return rtuple->saddr;
}

static __always_inline int
ct_extract_ports4(struct __ctx_buff *ctx, struct iphdr *ip4, int off,
		  enum ct_dir dir, struct ipv4_ct_tuple *tuple, bool *has_l4_header)
{
	int err;

	switch (tuple->nexthdr) {
	case IPPROTO_ICMP:
		if (1) {
			__be16 identifier = 0;
			__u8 type;

			if (ctx_load_bytes(ctx, off, &type, 1) < 0)
				return DROP_CT_INVALID_HDR;
			if ((type == ICMP_ECHO || type == ICMP_ECHOREPLY) &&
			    ctx_load_bytes(ctx, off + offsetof(struct icmphdr, un.echo.id),
					   &identifier, 2) < 0)
				return DROP_CT_INVALID_HDR;

			tuple->sport = 0;
			tuple->dport = 0;

			switch (type) {
			case ICMP_DEST_UNREACH:
			case ICMP_TIME_EXCEEDED:
			case ICMP_PARAMETERPROB:
				tuple->flags |= TUPLE_F_RELATED;
				break;

			case ICMP_ECHOREPLY:
				tuple->sport = identifier;
				break;
			case ICMP_ECHO:
				tuple->dport = identifier;
				fallthrough;
			default:
				break;
			}
		}
		break;

	/* TCP, UDP, and SCTP all have the ports at the same location */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		err = ipv4_load_l4_ports(ctx, ip4, off, dir, &tuple->dport,
					 has_l4_header);
		if (err < 0)
			return err;

		break;
	default:
		/* Can't handle extension headers yet */
		return DROP_CT_UNKNOWN_PROTO;
	}

	return 0;
}
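
/* Illustrative example (not part of the upstream file): for ICMP echo
 * traffic, ct_extract_ports4() stores the echo identifier in place of an L4
 * port so that echo flows can be tracked like L4 flows. Assuming a
 * hypothetical echo exchange with identifier 0x1234:
 *
 *	request (ICMP_ECHO):       tuple->dport = 0x1234, tuple->sport = 0
 *	reply   (ICMP_ECHOREPLY):  tuple->sport = 0x1234, tuple->dport = 0
 *
 * ICMP errors (ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_PARAMETERPROB)
 * carry no identifier; they only mark the tuple as TUPLE_F_RELATED so that
 * the reverse-scope lookup can report them as CT_RELATED.
 */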

/* This defines the ct_is_reply4 function. */
DEFINE_FUNC_CT_IS_REPLY(4)

static __always_inline int
__ct_lookup4(const void *map, struct ipv4_ct_tuple *tuple, struct __ctx_buff *ctx,
	     int l4_off, bool has_l4_header, bool is_fragment __maybe_unused,
	     enum ct_dir dir, enum ct_scope scope, __u32 ct_entry_types,
	     struct ct_state *ct_state, __u32 *monitor)
{
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags tcp_flags = { .value = 0 };
	enum ct_action action;
	enum ct_status ret;

#ifdef ENABLE_IPV4_FRAGMENTS
	if (unlikely(is_fragment))
		update_metrics(ctx_full_len(ctx), ct_to_metrics_dir(dir), REASON_FRAG_PACKET);
#endif

	if (is_tcp && has_l4_header) {
		if (l4_load_tcp_flags(ctx, l4_off, &tcp_flags) < 0)
			return DROP_CT_INVALID_HDR;

		action = ct_tcp_select_action(tcp_flags);
	} else {
		action = ACTION_UNSPEC;
	}

#ifndef QUIET_CT
	cilium_dbg3(ctx, DBG_CT_LOOKUP4_1, tuple->saddr, tuple->daddr,
		    (bpf_ntohs(tuple->sport) << 16) | bpf_ntohs(tuple->dport));
	cilium_dbg3(ctx, DBG_CT_LOOKUP4_2, (tuple->nexthdr << 8) | tuple->flags,
		    dir, scope);
#endif

	switch (scope) {
	case SCOPE_REVERSE:
	case SCOPE_BIDIR:
		/* Lookup in the reverse direction first: */
		ret = __ct_lookup(map, ctx, tuple, action, dir, ct_entry_types,
				  ct_state, is_tcp, tcp_flags, monitor);
		if (ret != CT_NEW) {
			if (unlikely(tuple->flags & TUPLE_F_RELATED))
				ret = CT_RELATED;
			else
				ret = CT_REPLY;
			goto out;
		}

		if (scope != SCOPE_BIDIR)
			goto out;

		/* now lookup in forward direction: */
		ipv4_ct_tuple_reverse(tuple);
		fallthrough;
	case SCOPE_FORWARD:
		ret = __ct_lookup(map, ctx, tuple, action, dir, ct_entry_types,
				  ct_state, is_tcp, tcp_flags, monitor);
	}

out:
	cilium_dbg(ctx, DBG_CT_VERDICT, ret,
		   ct_state ? ct_state->rev_nat_index : 0);
	return ret;
}

/** Lookup a CT entry for a fully populated CT tuple
 * @arg map		CT map
 * @arg tuple		CT tuple (with populated L4 ports)
 * @arg ctx		packet
 * @arg is_fragment	the result of ipv4_is_fragment(ip4)
 * @arg l4_off		offset to L4 header
 * @arg has_l4_header	packet has L4 header
 * @arg dir		lookup direction
 * @arg scope		CT scope. For SCOPE_FORWARD, the tuple also needs to
 *			be in forward layout.
 * @arg ct_entry_types	a mask of CT_ENTRY_* values that selects the expected
 *			entry type(s)
 * @arg ct_state	returned CT entry information (or NULL if none required)
 * @arg monitor		monitor feedback for trace aggregation
 *
 * This differs from ct_lookup4(), as here we expect that the CT tuple has its
 * L4 ports populated.
 *
 * Note that certain ICMP types are not supported by this function (see cases
 * where ct_extract_ports4 sets tuple->flags), because it overwrites
 * tuple->flags, but this works well in LB and NAT flows that don't pass these
 * ICMP types to ct_lazy_lookup4.
 */
static __always_inline int
ct_lazy_lookup4(const void *map, struct ipv4_ct_tuple *tuple, struct __ctx_buff *ctx,
		bool is_fragment, int l4_off, bool has_l4_header,
		enum ct_dir dir, enum ct_scope scope, __u32 ct_entry_types,
		struct ct_state *ct_state, __u32 *monitor)
{
	tuple->flags = ct_lookup_select_tuple_type(dir, scope);

	return __ct_lookup4(map, tuple, ctx, l4_off, has_l4_header, is_fragment,
			    dir, scope, ct_entry_types, ct_state, monitor);
}

/* Offset must point to IPv4 header */
static __always_inline int ct_lookup4(const void *map,
				      struct ipv4_ct_tuple *tuple,
				      struct __ctx_buff *ctx, struct iphdr *ip4,
				      int off, enum ct_dir dir,
				      struct ct_state *ct_state, __u32 *monitor)
{
	bool is_fragment = ipv4_is_fragment(ip4);
	bool has_l4_header = true;
	int ret;

	tuple->flags = ct_lookup_select_tuple_type(dir, SCOPE_BIDIR);

	ret = ct_extract_ports4(ctx, ip4, off, dir, tuple, &has_l4_header);
	if (ret < 0)
		return ret;

	return __ct_lookup4(map, tuple, ctx, off, has_l4_header, is_fragment,
			    dir, SCOPE_BIDIR, CT_ENTRY_ANY, ct_state, monitor);
}

static __always_inline void
ct_create_fill_entry(struct ct_entry *entry, const struct ct_state *state,
		     enum ct_dir dir)
{
	entry->rev_nat_index = state->rev_nat_index;
	entry->src_sec_id = state->src_sec_id;

	if (dir == CT_SERVICE) {
		entry->backend_id = state->backend_id;
	} else if (dir == CT_INGRESS || dir == CT_EGRESS) {
#ifndef DISABLE_LOOPBACK_LB
		entry->lb_loopback = state->loopback;
#endif
		entry->node_port = state->node_port;
		entry->dsr_internal = state->dsr_internal;
		entry->from_tunnel = state->from_tunnel;
#ifndef HAVE_FIB_IFINDEX
		entry->ifindex = state->ifindex;
#endif
		/* Note if this is a proxy connection so that replies can be redirected
		 * back to the proxy.
		 */
		entry->proxy_redirect = state->proxy_redirect;
		entry->from_l7lb = state->from_l7lb;
	}
}

/* Offset must point to IPv6 */
static __always_inline int ct_create6(const void *map_main, const void *map_related,
				      struct ipv6_ct_tuple *tuple,
				      struct __ctx_buff *ctx, const enum ct_dir dir,
				      const struct ct_state *ct_state, __s8 *ext_err)
{
	/* Create entry in original direction */
	struct ct_entry entry = { };
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags seen_flags = { .value = 0 };
	int err;

	if (ct_state)
		ct_create_fill_entry(&entry, ct_state, dir);

	seen_flags.value |= is_tcp ? TCP_FLAG_SYN : 0;
	ct_update_timeout(&entry, is_tcp, dir, seen_flags);

	cilium_dbg3(ctx, DBG_CT_CREATED6, entry.rev_nat_index,
		    entry.src_sec_id, 0);

	if (map_related != NULL) {
		/* Create an ICMPv6 entry to relate errors */
		struct ipv6_ct_tuple icmp_tuple = {
			.nexthdr = IPPROTO_ICMPV6,
			.sport = 0,
			.dport = 0,
			.flags = tuple->flags | TUPLE_F_RELATED,
		};

		ipv6_addr_copy(&icmp_tuple.daddr, &tuple->daddr);
		ipv6_addr_copy(&icmp_tuple.saddr, &tuple->saddr);

		err = map_update_elem(map_related, &icmp_tuple, &entry, 0);
		if (unlikely(err < 0))
			goto err_ct_fill_up;
	}

#ifdef CONNTRACK_ACCOUNTING
	entry.packets = 1;
	entry.bytes = ctx_full_len(ctx);
#endif

	err = map_update_elem(map_main, tuple, &entry, 0);
	if (unlikely(err < 0))
		goto err_ct_fill_up;

	return 0;

err_ct_fill_up:
	if (ext_err)
		*ext_err = (__s8)err;
	send_signal_ct_fill_up(ctx, SIGNAL_PROTO_V6);
	return DROP_CT_CREATE_FAILED;
}

static __always_inline int ct_create4(const void *map_main,
				      const void *map_related,
				      struct ipv4_ct_tuple *tuple,
				      struct __ctx_buff *ctx, const enum ct_dir dir,
				      const struct ct_state *ct_state,
				      __s8 *ext_err)
{
	/* Create entry in original direction */
	struct ct_entry entry = { };
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags seen_flags = { .value = 0 };
	int err;

	if (ct_state)
		ct_create_fill_entry(&entry, ct_state, dir);

	seen_flags.value |= is_tcp ? TCP_FLAG_SYN : 0;
	ct_update_timeout(&entry, is_tcp, dir, seen_flags);

	cilium_dbg3(ctx, DBG_CT_CREATED4, entry.rev_nat_index,
		    entry.src_sec_id, 0);

	if (map_related != NULL) {
		/* Create an ICMP entry to relate errors */
		struct ipv4_ct_tuple icmp_tuple = {
			.daddr = tuple->daddr,
			.saddr = tuple->saddr,
			.nexthdr = IPPROTO_ICMP,
			.sport = 0,
			.dport = 0,
			.flags = tuple->flags | TUPLE_F_RELATED,
		};

		err = map_update_elem(map_related, &icmp_tuple, &entry, 0);
		if (unlikely(err < 0))
			goto err_ct_fill_up;
	}

#ifdef CONNTRACK_ACCOUNTING
	entry.packets = 1;
	entry.bytes = ctx_full_len(ctx);
#endif

	/* Previous map update succeeded, we could delete it in case
	 * the below throws an error, but we might as well just let
	 * it time out.
	 */
	err = map_update_elem(map_main, tuple, &entry, 0);
	if (unlikely(err < 0))
		goto err_ct_fill_up;

	return 0;

err_ct_fill_up:
	if (ext_err)
		*ext_err = (__s8)err;
	send_signal_ct_fill_up(ctx, SIGNAL_PROTO_V4);
	return DROP_CT_CREATE_FAILED;
}

#ifndef DISABLE_LOOPBACK_LB
static __always_inline bool
ct_has_loopback_egress_entry4(const void *map, struct ipv4_ct_tuple *tuple,
			      __u16 *rev_nat_index)
{
	__u8 flags = tuple->flags;
	struct ct_entry *entry;

	tuple->flags = TUPLE_F_OUT;
	entry = map_lookup_elem(map, tuple);
	tuple->flags = flags;

	if (entry && entry->lb_loopback) {
		*rev_nat_index = entry->rev_nat_index;
		return true;
	}

	return false;
}
#endif

static __always_inline bool
__ct_has_nodeport_egress_entry(const struct ct_entry *entry,
			       __u16 *rev_nat_index, bool check_dsr)
{
	if (entry->node_port) {
		if (rev_nat_index)
			*rev_nat_index = entry->rev_nat_index;
		return true;
	}

	return check_dsr && entry->dsr_internal;
}

/* The function tries to determine whether the flow identified by the given
 * CT_INGRESS tuple belongs to NodePort traffic (i.e., outside client => N/S
 * LB => local backend).
 *
 * When the client sends the NodePort request, the NodePort BPF
 * (nodeport_lb{4,6}()) creates the CT_EGRESS entry for the
 * (saddr=client,daddr=backend) tuple. So, to derive whether the reply packet
 * backend => client belongs to the LB flow we can query the CT_EGRESS entry.
 */
static __always_inline bool
ct_has_nodeport_egress_entry4(const void *map,
			      struct ipv4_ct_tuple *ingress_tuple,
			      __u16 *rev_nat_index, bool check_dsr)
{
	__u8 prev_flags = ingress_tuple->flags;
	struct ct_entry *entry;

	ingress_tuple->flags = TUPLE_F_OUT;
	entry = map_lookup_elem(map, ingress_tuple);
	ingress_tuple->flags = prev_flags;

	if (!entry)
		return false;

	return __ct_has_nodeport_egress_entry(entry, rev_nat_index, check_dsr);
}

static __always_inline bool
ct_has_dsr_egress_entry4(const void *map, struct ipv4_ct_tuple *ingress_tuple)
{
	__u8 prev_flags = ingress_tuple->flags;
	struct ct_entry *entry;

	ingress_tuple->flags = TUPLE_F_OUT;
	entry = map_lookup_elem(map, ingress_tuple);
	ingress_tuple->flags = prev_flags;

	if (entry)
		return entry->dsr_internal;

	return 0;
}

static __always_inline bool
ct_has_nodeport_egress_entry6(const void *map,
			      struct ipv6_ct_tuple *ingress_tuple,
			      __u16 *rev_nat_index, bool check_dsr)
{
	__u8 prev_flags = ingress_tuple->flags;
	struct ct_entry *entry;

	ingress_tuple->flags = TUPLE_F_OUT;
	entry = map_lookup_elem(map, ingress_tuple);
	ingress_tuple->flags = prev_flags;

	if (!entry)
		return false;

	return __ct_has_nodeport_egress_entry(entry, rev_nat_index, check_dsr);
}

static __always_inline bool
ct_has_dsr_egress_entry6(const void *map, struct ipv6_ct_tuple *ingress_tuple)
{
	__u8 prev_flags = ingress_tuple->flags;
	struct ct_entry *entry;

	ingress_tuple->flags = TUPLE_F_OUT;
	entry = map_lookup_elem(map, ingress_tuple);
	ingress_tuple->flags = prev_flags;

	if (entry)
		return entry->dsr_internal;

	return 0;
}
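
/* Illustrative example (not part of the upstream file): detecting an LB reply
 * with the helpers above. For a reply packet backend => client, the CT_INGRESS
 * tuple is looked up with its flags temporarily forced to TUPLE_F_OUT, i.e.
 * against the CT_EGRESS entry that nodeport_lb4() created for the original
 * client => backend request. A hypothetical caller:
 *
 *	__u16 rev_nat_index = 0;
 *
 *	if (ct_has_nodeport_egress_entry4(map, &ingress_tuple,
 *					  &rev_nat_index, false)) {
 *		// reply belongs to a NodePort flow; rev_nat_index selects
 *		// the reverse NAT mapping for restoring the service address
 *	}
 *
 * The caller's tuple flags are restored before the helper returns.
 */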

static __always_inline void
ct_update_svc_entry(const void *map, const void *tuple,
		    __u32 backend_id, __u16 rev_nat_index)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	entry->backend_id = backend_id;
	entry->rev_nat_index = rev_nat_index;
}

static __always_inline void
ct_update_dsr(const void *map, const void *tuple, const bool dsr)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	entry->dsr_internal = dsr;
}
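
/* Illustrative end-to-end sketch (not part of the upstream file): the typical
 * lookup-then-create pattern built from the helpers in this header. Names such
 * as 'ct_map', 'ip4' and 'l4_off' are placeholders for values the caller
 * derived while parsing the packet, and real datapath code does more (e.g.
 * policy enforcement) between the two steps:
 *
 *	struct ipv4_ct_tuple tuple = {};
 *	struct ct_state ct_state = {};
 *	__u32 monitor = 0;
 *	__s8 ext_err = 0;
 *	int ret;
 *
 *	tuple.nexthdr = ip4->protocol;
 *	tuple.daddr = ip4->daddr;
 *	tuple.saddr = ip4->saddr;
 *
 *	ret = ct_lookup4(ct_map, &tuple, ctx, ip4, l4_off, CT_EGRESS,
 *			 &ct_state, &monitor);
 *	if (ret == CT_NEW)
 *		ret = ct_create4(ct_map, NULL, &tuple, ctx, CT_EGRESS,
 *				 &ct_state, &ext_err);
 *
 * Replies and related ICMP errors show up as CT_REPLY / CT_RELATED on later
 * lookups, and ct_update_svc_entry() / ct_update_dsr() patch service-specific
 * fields of an existing entry in place.
 */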