github.com/looshlee/beatles@v0.0.0-20220727174639-742810ab631c/bpf/lib/conntrack.h
/*
 * Copyright (C) 2016-2019 Authors of Cilium
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef __LIB_CONNTRACK_H_
#define __LIB_CONNTRACK_H_

#include <linux/icmpv6.h>
#include <linux/icmp.h>

#include "common.h"
#include "utils.h"
#include "ipv6.h"
#include "dbg.h"
#include "l4.h"
#include "nat46.h"

/* CT_REPORT_INTERVAL, when MONITOR_AGGREGATION is >= TRACE_AGGREGATE_ACTIVE_CT,
 * determines how frequently monitor notifications should be sent for active
 * connections. A notification is always triggered on a packet event.
 */
#ifndef CT_REPORT_INTERVAL
# define CT_REPORT_INTERVAL	5	/* 5 seconds */
#endif

#ifdef CONNTRACK
enum {
	ACTION_UNSPEC,
	ACTION_CREATE,
	ACTION_CLOSE,
};

/* conn_is_dns returns true if the connection is DNS, false otherwise.
 *
 * @dport: Connection destination port.
 *
 * To reduce program complexity, we ignore nexthdr and dir here:
 * nexthdr: The parser will not fill dport if nexthdr is not TCP/UDP.
 * dir:     Ideally we would only consider responses, but requests are likely
 *          to be small anyway.
 */
static inline bool conn_is_dns(__u16 dport)
{
	if (dport == bpf_htons(53)) {
		relax_verifier();
		return true;
	}
	return false;
}

union tcp_flags {
	struct {
		__u8 upper_bits;
		__u8 lower_bits;
		__u16 pad;
	};
	__u32 value;
};
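
/* Descriptive note: the lookup functions below fill this union by loading the
 * two bytes at offset 12 of the TCP header (the data-offset/reserved byte
 * followed by the flag byte), so 'lower_bits' accumulates the TCP flag bits
 * and tests such as (flags.value & TCP_FLAG_SYN) select individual flags. The
 * layout itself is determined by those skb_load_bytes() calls, not by this
 * comment.
 */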

/**
 * Update the CT timeout and TCP flags for the specified entry.
 *
 * We track the OR'd accumulation of seen tcp flags in the entry, and the
 * last time that a notification was sent. Multiple CPUs may enter this
 * function with packets for the same connection, in which case it is possible
 * for the CPUs to race to update the entry. In such a case, the critical
 * update section may be entered in quick succession, leading to multiple
 * updates of the entry and returning true for each CPU. The BPF architecture
 * guarantees that entire 8-bit or 32-bit values will be set within the entry,
 * so although the CPUs may race, the worst result is that multiple executions
 * of this function return non-zero for the same connection within short
 * succession, leading to multiple trace notifications being sent when one
 * might otherwise expect such notifications to be aggregated.
 *
 * Returns how many bytes of the packet should be monitored:
 * - Zero if this flow was recently monitored.
 * - Non-zero if this flow has not been monitored recently.
 */
static inline __u32 __inline__ __ct_update_timeout(struct ct_entry *entry,
						   __u32 lifetime, int dir,
						   union tcp_flags flags)
{
	__u32 now = bpf_ktime_get_sec();
	__u8 accumulated_flags;
	__u8 seen_flags = flags.lower_bits;
	__u32 last_report;

#ifdef NEEDS_TIMEOUT
	entry->lifetime = now + lifetime;
#endif
	if (dir == CT_INGRESS) {
		accumulated_flags = READ_ONCE(entry->rx_flags_seen);
		last_report = READ_ONCE(entry->last_rx_report);
	} else {
		accumulated_flags = READ_ONCE(entry->tx_flags_seen);
		last_report = READ_ONCE(entry->last_tx_report);
	}
	seen_flags |= accumulated_flags;

	/* It's possible for multiple CPUs to execute the branch statement here
	 * one after another, before the first CPU is able to execute the entry
	 * modifications within this branch. This is somewhat unlikely because
	 * packets for the same connection are typically steered towards the
	 * same CPU, but is possible in theory.
	 *
	 * If the branch is taken by multiple CPUs because of '*last_report',
	 * then this merely causes multiple notifications to be sent after
	 * CT_REPORT_INTERVAL rather than a single notification. '*last_report'
	 * will be updated by all CPUs and subsequent checks should not take
	 * this branch until the next CT_REPORT_INTERVAL. As such, the trace
	 * aggregation that uses the result of this function may reduce the
	 * number of packets per interval to a small integer value (max N_CPUS)
	 * rather than 1 notification per packet throughout the interval.
	 *
	 * Similar behaviour may happen with tcp_flags. The worst case race
	 * here would be that two or more CPUs argue over which flags have been
	 * seen and overwrite each other, with each CPU interleaving different
	 * values for which flags were seen. In practice, realistic connections
	 * are likely to progressively set SYN, ACK, then much later perhaps
	 * FIN and/or RST. Furthermore, unless such a traffic pattern were
	 * constantly received, this should self-correct as the stored
	 * tcp_flags is an OR'd set of flags and each time the above code is
	 * executed, it pulls the latest set of accumulated flags. Therefore
	 * even in the worst case such a conflict is likely only to cause a
	 * small number of additional notifications, which is still likely to
	 * be significantly less under this MONITOR_AGGREGATION mode than would
	 * otherwise be sent if the MONITOR_AGGREGATION level is set to none
	 * (ie, sending a notification for every packet).
	 */
	if (last_report + CT_REPORT_INTERVAL < now ||
	    accumulated_flags != seen_flags) {
		/* verifier workaround: we don't use reference here. */
		if (dir == CT_INGRESS) {
			WRITE_ONCE(entry->rx_flags_seen, seen_flags);
			WRITE_ONCE(entry->last_rx_report, now);
		} else {
			WRITE_ONCE(entry->tx_flags_seen, seen_flags);
			WRITE_ONCE(entry->last_tx_report, now);
		}
		return TRACE_PAYLOAD_LEN;
	}
	return 0;
}
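
/* Example of the aggregation behaviour above (descriptive only): with the
 * default CT_REPORT_INTERVAL of 5 seconds, an established flow whose packets
 * carry no previously unseen TCP flags yields a non-zero return at most once
 * per direction per interval per CPU (typically once in total, see the race
 * discussion above); every other call returns 0 and the packet is not
 * reported to the monitor.
 */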

/**
 * Update the CT timeouts for the specified entry.
 *
 * If CT_REPORT_INTERVAL has elapsed since the last report, updates the
 * last report timestamp and returns the number of bytes to monitor;
 * otherwise returns zero (see __ct_update_timeout()).
 */
static inline __u32 __inline__ ct_update_timeout(struct ct_entry *entry,
						 bool tcp, int dir,
						 union tcp_flags seen_flags)
{
	__u32 lifetime = dir == CT_SERVICE ?
			 CT_SERVICE_LIFETIME_NONTCP :
			 CT_CONNECTION_LIFETIME_NONTCP;
	bool syn = seen_flags.value & TCP_FLAG_SYN;

	if (tcp) {
		entry->seen_non_syn |= !syn;
		if (entry->seen_non_syn) {
			lifetime = dir == CT_SERVICE ?
				   CT_SERVICE_LIFETIME_TCP :
				   CT_CONNECTION_LIFETIME_TCP;
		} else {
			lifetime = CT_SYN_TIMEOUT;
		}
	}

	return __ct_update_timeout(entry, lifetime, dir, seen_flags);
}

static inline void __inline__ ct_reset_closing(struct ct_entry *entry)
{
	entry->rx_closing = 0;
	entry->tx_closing = 0;
}

static inline bool __inline__ ct_entry_alive(const struct ct_entry *entry)
{
	return !entry->rx_closing || !entry->tx_closing;
}
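
/* Note on the lifetime selection in ct_update_timeout() above: until a
 * non-SYN segment has been seen on a TCP flow, the shorter CT_SYN_TIMEOUT is
 * used, presumably so that half-open entries (e.g. unanswered SYNs) expire
 * quickly instead of occupying the CT map for the full TCP lifetime.
 */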

static inline __u8 __inline__ __ct_lookup(void *map, struct __sk_buff *skb,
					  void *tuple, int action, int dir,
					  struct ct_state *ct_state,
					  bool is_tcp, union tcp_flags seen_flags,
					  __u32 *monitor)
{
	struct ct_entry *entry;
	int reopen;

	if ((entry = map_lookup_elem(map, tuple))) {
		cilium_dbg(skb, DBG_CT_MATCH, entry->lifetime, entry->rev_nat_index);
		if (ct_entry_alive(entry)) {
			*monitor = ct_update_timeout(entry, is_tcp, dir, seen_flags);
		}
		if (ct_state) {
			ct_state->rev_nat_index = entry->rev_nat_index;
			ct_state->loopback = entry->lb_loopback;
			ct_state->node_port = entry->node_port;
			ct_state->proxy_redirect = entry->proxy_redirect;
			/* To support seamless upgrade from an earlier service
			 * implementation, we store references to the backend
			 * in the "ct_entry.rx_bytes" field.
			 * Previously, the field "ct_entry.backend_id" was used
			 * for legacy services so we co-opted the field
			 * "ct_entry.rx_bytes" to store the services v2
			 * backend (as it is not used with dir=CT_SERVICE).
			 *
			 * As of v1.6, "ct_entry.backend_id" is zeroed so that
			 * users who migrate to v1.6 will end up with CT
			 * entries that assign no meaning to this field.
			 * In v1.7 it will be safe to reuse this field for
			 * other purposes. Current plans are to expand the
			 * backend_id to 32 bits, which would involve creating
			 * a union across the backend_id and [rt]x_bytes fields.
			 * For now, just retrieve the backend out of rx_bytes.
			 *
			 * TODO (1.7+): Switch to entry->backend_id
			 */
			if (dir == CT_SERVICE) {
				ct_state->backend_id = entry->rx_bytes;
			}
		}

#ifdef ENABLE_NAT46
		/* This packet needs nat46 translation */
		if (entry->nat46 && !skb->cb[CB_NAT46_STATE])
			skb->cb[CB_NAT46_STATE] = NAT46;
#endif

#ifdef CONNTRACK_ACCOUNTING
		/* FIXME: This is slow, per-cpu counters? */
		if (dir == CT_INGRESS) {
			__sync_fetch_and_add(&entry->rx_packets, 1);
			__sync_fetch_and_add(&entry->rx_bytes, skb->len);
		} else if (dir == CT_EGRESS) {
			__sync_fetch_and_add(&entry->tx_packets, 1);
			__sync_fetch_and_add(&entry->tx_bytes, skb->len);
		}
#endif

		switch (action) {
		case ACTION_CREATE:
			reopen = entry->rx_closing | entry->tx_closing;
			reopen |= seen_flags.value & TCP_FLAG_SYN;
			if (unlikely(reopen == (TCP_FLAG_SYN|0x1))) {
				ct_reset_closing(entry);
				*monitor = ct_update_timeout(entry, is_tcp, dir, seen_flags);
			}
			break;
		case ACTION_CLOSE:
			/* RST or similar, immediately delete ct entry */
			if (dir == CT_INGRESS)
				entry->rx_closing = 1;
			else
				entry->tx_closing = 1;

			*monitor = TRACE_PAYLOAD_LEN;
			if (ct_entry_alive(entry))
				break;
			__ct_update_timeout(entry, CT_CLOSE_TIMEOUT, dir, seen_flags);
			break;
		}

		return CT_ESTABLISHED;
	}

	*monitor = TRACE_PAYLOAD_LEN;
	return CT_NEW;
}
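
/* Note on the ACTION_CREATE case in __ct_lookup() above: 'reopen' combines the
 * closing bits (either one collapses to 0x1) with the SYN bit of the current
 * packet, so the test reopen == (TCP_FLAG_SYN|0x1) reads "this entry was
 * (half-)closed and the packet carries a SYN"; only then is the entry reset
 * and its timeout refreshed.
 */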

static inline void __inline__ ct_flip_tuple_dir6(struct ipv6_ct_tuple *tuple)
{
	if (tuple->flags & TUPLE_F_IN)
		tuple->flags &= ~TUPLE_F_IN;
	else
		tuple->flags |= TUPLE_F_IN;
}

static inline void __inline__ ipv6_ct_tuple_reverse(struct ipv6_ct_tuple *tuple)
{
	union v6addr tmp_addr = {};
	__be16 tmp;

	ipv6_addr_copy(&tmp_addr, &tuple->saddr);
	ipv6_addr_copy(&tuple->saddr, &tuple->daddr);
	ipv6_addr_copy(&tuple->daddr, &tmp_addr);

	tmp = tuple->sport;
	tuple->sport = tuple->dport;
	tuple->dport = tmp;

	ct_flip_tuple_dir6(tuple);
}

/* Offset must point to IPv6 */
static inline int __inline__ ct_lookup6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int l4_off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	int ret = CT_NEW, action = ACTION_UNSPEC;
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags tcp_flags = { .value = 0 };

	/* The tuple is created in reverse order initially to find a
	 * potential reverse flow. This is required because the RELATED
	 * or REPLY state takes precedence over ESTABLISHED due to
	 * policy requirements.
	 *
	 * tuple->flags separates entries that could otherwise be overlapping.
	 */
	if (dir == CT_INGRESS)
		tuple->flags = TUPLE_F_OUT;
	else if (dir == CT_EGRESS)
		tuple->flags = TUPLE_F_IN;
	else if (dir == CT_SERVICE)
		tuple->flags = TUPLE_F_SERVICE;
	else
		return DROP_CT_INVALID_HDR;

	switch (tuple->nexthdr) {
	case IPPROTO_ICMPV6:
		if (1) {
			__be16 identifier = 0;
			__u8 type;

			if (skb_load_bytes(skb, l4_off, &type, 1) < 0)
				return DROP_CT_INVALID_HDR;
			if ((type == ICMPV6_ECHO_REQUEST || type == ICMPV6_ECHO_REPLY) &&
			    skb_load_bytes(skb, l4_off + offsetof(struct icmp6hdr,
								  icmp6_dataun.u_echo.identifier),
					   &identifier, 2) < 0)
				return DROP_CT_INVALID_HDR;

			tuple->sport = 0;
			tuple->dport = 0;

			switch (type) {
			case ICMPV6_DEST_UNREACH:
			case ICMPV6_PKT_TOOBIG:
			case ICMPV6_TIME_EXCEED:
			case ICMPV6_PARAMPROB:
				tuple->flags |= TUPLE_F_RELATED;
				break;

			case ICMPV6_ECHO_REPLY:
				tuple->sport = identifier;
				break;

			case ICMPV6_ECHO_REQUEST:
				tuple->dport = identifier;
				/* fall through */
			default:
				action = ACTION_CREATE;
				break;
			}
		}
		break;

	case IPPROTO_TCP:
		if (1) {
			if (skb_load_bytes(skb, l4_off + 12, &tcp_flags, 2) < 0)
				return DROP_CT_INVALID_HDR;

			if (unlikely(tcp_flags.value & (TCP_FLAG_RST|TCP_FLAG_FIN)))
				action = ACTION_CLOSE;
			else
				action = ACTION_CREATE;
		}

		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, l4_off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;
		break;

	case IPPROTO_UDP:
		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, l4_off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;

		action = ACTION_CREATE;
		break;

	default:
		/* Can't handle extension headers yet */
		relax_verifier();
		return DROP_CT_UNKNOWN_PROTO;
	}

	/* Lookup the reverse direction
	 *
	 * This will find an existing flow in the reverse direction.
	 * The reverse direction is the one where reverse nat index is stored.
	 */
	cilium_dbg3(skb, DBG_CT_LOOKUP6_1, (__u32) tuple->saddr.p4, (__u32) tuple->daddr.p4,
		    (bpf_ntohs(tuple->sport) << 16) | bpf_ntohs(tuple->dport));
	cilium_dbg3(skb, DBG_CT_LOOKUP6_2, (tuple->nexthdr << 8) | tuple->flags, 0, 0);
	ret = __ct_lookup(map, skb, tuple, action, dir, ct_state, is_tcp,
			  tcp_flags, monitor);
	if (ret != CT_NEW) {
		if (likely(ret == CT_ESTABLISHED)) {
			if (unlikely(tuple->flags & TUPLE_F_RELATED))
				ret = CT_RELATED;
			else
				ret = CT_REPLY;
		}
		goto out;
	}

	/* Lookup entry in forward direction */
	if (dir != CT_SERVICE) {
		ipv6_ct_tuple_reverse(tuple);
		ret = __ct_lookup(map, skb, tuple, action, dir, ct_state,
				  is_tcp, tcp_flags, monitor);
	}

#ifdef ENABLE_NAT46
	skb->cb[CB_NAT46_STATE] = NAT46_CLEAR;
#endif
out:
	cilium_dbg(skb, DBG_CT_VERDICT, ret < 0 ? -ret : ret, ct_state->rev_nat_index);
	if (conn_is_dns(tuple->dport))
		*monitor = MTU;
	return ret;
}
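
/* Note on the "load sport + dport into tuple" reads in ct_lookup6() above and
 * ct_lookup4() below: a single 4-byte load at the L4 offset fills both port
 * fields at once, relying on dport and sport being adjacent in the tuple
 * (dport first). As a result the wire source port lands in tuple->dport and
 * the wire destination port in tuple->sport, consistent with the tuple being
 * built in reverse order for the initial lookup (see the comment at the top
 * of each lookup function).
 */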

static inline void __inline__ ct_flip_tuple_dir4(struct ipv4_ct_tuple *tuple)
{
	if (tuple->flags & TUPLE_F_IN)
		tuple->flags &= ~TUPLE_F_IN;
	else
		tuple->flags |= TUPLE_F_IN;
}

static inline void __inline__ ipv4_ct_tuple_reverse(struct ipv4_ct_tuple *tuple)
{
	__be32 tmp_addr = tuple->saddr;
	__be16 tmp;

	tuple->saddr = tuple->daddr;
	tuple->daddr = tmp_addr;

	tmp = tuple->sport;
	tuple->sport = tuple->dport;
	tuple->dport = tmp;

	ct_flip_tuple_dir4(tuple);
}

static inline void ct4_cilium_dbg_tuple(struct __sk_buff *skb, __u8 type,
					const struct ipv4_ct_tuple *tuple,
					__u32 rev_nat_index, int dir)
{
	__be32 addr = (dir == CT_INGRESS) ? tuple->saddr : tuple->daddr;
	cilium_dbg(skb, type, addr, rev_nat_index);
}

/* Offset must point to IPv4 header */
static inline int __inline__ ct_lookup4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	int ret = CT_NEW, action = ACTION_UNSPEC;
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags tcp_flags = { .value = 0 };

	/* The tuple is created in reverse order initially to find a
	 * potential reverse flow. This is required because the RELATED
	 * or REPLY state takes precedence over ESTABLISHED due to
	 * policy requirements.
	 *
	 * tuple->flags separates entries that could otherwise be overlapping.
	 */
	if (dir == CT_INGRESS)
		tuple->flags = TUPLE_F_OUT;
	else if (dir == CT_EGRESS)
		tuple->flags = TUPLE_F_IN;
	else if (dir == CT_SERVICE)
		tuple->flags = TUPLE_F_SERVICE;
	else
		return DROP_CT_INVALID_HDR;

	switch (tuple->nexthdr) {
	case IPPROTO_ICMP:
		if (1) {
			__be16 identifier = 0;
			__u8 type;

			if (skb_load_bytes(skb, off, &type, 1) < 0)
				return DROP_CT_INVALID_HDR;
			if ((type == ICMP_ECHO || type == ICMP_ECHOREPLY) &&
			    skb_load_bytes(skb, off + offsetof(struct icmphdr, un.echo.id),
					   &identifier, 2) < 0)
				return DROP_CT_INVALID_HDR;

			tuple->sport = 0;
			tuple->dport = 0;

			switch (type) {
			case ICMP_DEST_UNREACH:
			case ICMP_TIME_EXCEEDED:
			case ICMP_PARAMETERPROB:
				tuple->flags |= TUPLE_F_RELATED;
				break;

			case ICMP_ECHOREPLY:
				tuple->sport = identifier;
				break;

			case ICMP_ECHO:
				tuple->dport = identifier;
				/* fall through */
			default:
				action = ACTION_CREATE;
				break;
			}
		}
		break;

	case IPPROTO_TCP:
		if (1) {
			if (skb_load_bytes(skb, off + 12, &tcp_flags, 2) < 0)
				return DROP_CT_INVALID_HDR;

			if (unlikely(tcp_flags.value & (TCP_FLAG_RST|TCP_FLAG_FIN)))
				action = ACTION_CLOSE;
			else
				action = ACTION_CREATE;
		}

		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;
		break;

	case IPPROTO_UDP:
		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;

		action = ACTION_CREATE;
		break;

	default:
		/* Can't handle extension headers yet */
		relax_verifier();
		return DROP_CT_UNKNOWN_PROTO;
	}

	/* Lookup the reverse direction
	 *
	 * This will find an existing flow in the reverse direction.
	 */
#ifndef QUIET_CT
	cilium_dbg3(skb, DBG_CT_LOOKUP4_1, tuple->saddr, tuple->daddr,
		    (bpf_ntohs(tuple->sport) << 16) | bpf_ntohs(tuple->dport));
	cilium_dbg3(skb, DBG_CT_LOOKUP4_2, (tuple->nexthdr << 8) | tuple->flags, 0, 0);
#endif
	ret = __ct_lookup(map, skb, tuple, action, dir, ct_state, is_tcp,
			  tcp_flags, monitor);
	if (ret != CT_NEW) {
		if (likely(ret == CT_ESTABLISHED)) {
			if (unlikely(tuple->flags & TUPLE_F_RELATED))
				ret = CT_RELATED;
			else
				ret = CT_REPLY;
		}
		goto out;
	}

	/* Lookup entry in forward direction */
	if (dir != CT_SERVICE) {
		ipv4_ct_tuple_reverse(tuple);
		ret = __ct_lookup(map, skb, tuple, action, dir, ct_state,
				  is_tcp, tcp_flags, monitor);
	}
out:
	cilium_dbg(skb, DBG_CT_VERDICT, ret < 0 ? -ret : ret, ct_state->rev_nat_index);
	if (conn_is_dns(tuple->dport))
		*monitor = MTU;
	return ret;
}
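
/* Note on the conn_is_dns() checks at the end of ct_lookup4()/ct_lookup6():
 * for DNS flows the monitor length is raised from TRACE_PAYLOAD_LEN to MTU,
 * i.e. up to a full MTU's worth of the packet is passed to the monitor so
 * that DNS traffic can be observed in full rather than truncated.
 */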

static inline void __inline__ ct_update6_backend_id(void *map,
						    struct ipv6_ct_tuple *tuple,
						    struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	/* See the ct_create4 comments re the rx_bytes hack */
	entry->backend_id = 0;
	entry->rx_bytes = state->backend_id;
	return;
}

static inline void __inline__
ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple,
			 struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	entry->rev_nat_index = state->rev_nat_index;
	return;
}

/* Offset must point to IPv6 */
static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool proxy_redirect)
{
	/* Create entry in original direction */
	struct ct_entry entry = { };
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags seen_flags = { .value = 0 };

	/* Note if this is a proxy connection so that replies can be redirected back to the proxy. */
	entry.proxy_redirect = proxy_redirect;

	/* See the ct_create4 comments re the rx_bytes hack */
	if (dir == CT_SERVICE) {
		entry.backend_id = 0;
		entry.rx_bytes = ct_state->backend_id;
	}

	entry.lb_loopback = ct_state->loopback;
	entry.node_port = ct_state->node_port;

	entry.rev_nat_index = ct_state->rev_nat_index;
	seen_flags.value |= is_tcp ? TCP_FLAG_SYN : 0;
	ct_update_timeout(&entry, is_tcp, dir, seen_flags);

	if (dir == CT_INGRESS) {
		entry.rx_packets = 1;
		entry.rx_bytes = skb->len;
	} else if (dir == CT_EGRESS) {
		entry.tx_packets = 1;
		entry.tx_bytes = skb->len;
	}

	cilium_dbg3(skb, DBG_CT_CREATED6, entry.rev_nat_index, ct_state->src_sec_id, 0);

	entry.src_sec_id = ct_state->src_sec_id;
	if (map_update_elem(map, tuple, &entry, 0) < 0)
		return DROP_CT_CREATE_FAILED;

	/* Create an ICMPv6 entry to relate errors */
	struct ipv6_ct_tuple icmp_tuple = {
		.nexthdr = IPPROTO_ICMPV6,
		.sport = 0,
		.dport = 0,
		.flags = tuple->flags | TUPLE_F_RELATED,
	};

	entry.seen_non_syn = true; /* For ICMP, there is no SYN. */

	ipv6_addr_copy(&icmp_tuple.daddr, &tuple->daddr);
	ipv6_addr_copy(&icmp_tuple.saddr, &tuple->saddr);

	/* FIXME: We could do a lookup and check if an L3 entry already exists */
	if (map_update_elem(map, &icmp_tuple, &entry, 0) < 0) {
		/* Previous map update succeeded, we could delete it
		 * but we might as well just let it time out.
		 */
		return DROP_CT_CREATE_FAILED;
	}

	return 0;
}

static inline void __inline__ ct_update4_backend_id(void *map,
						    struct ipv4_ct_tuple *tuple,
						    struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	/* See the ct_create4 comments re the rx_bytes hack */
	entry->backend_id = 0;
	entry->rx_bytes = state->backend_id;
	return;
}

static inline void __inline__
ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple,
			 struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	entry->rev_nat_index = state->rev_nat_index;
	return;
}

static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool proxy_redirect)
{
	/* Create entry in original direction */
	struct ct_entry entry = { };
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags seen_flags = { .value = 0 };

	/* Note if this is a proxy connection so that replies can be redirected back to the proxy. */
	entry.proxy_redirect = proxy_redirect;

	entry.lb_loopback = ct_state->loopback;
	entry.node_port = ct_state->node_port;

	/* We need to store the backend_id (points to a svc v2 endpoint), while
	 * handling migration for users upgrading from prior releases, where
	 * the "ct_entry.backend_id" field was used for legacy services.
	 *
	 * Previously, the rx_bytes field was not used for entries with
	 * dir=CT_SERVICE (see GH#7060). Therefore, we can safely abuse
	 * this field to save the backend_id. The hack will go away once we stop
	 * supporting the legacy svc (in v1.6 we will zero the backend_id
	 * field, in v1.7 we can remove the rx_bytes hack).
	 */
	if (dir == CT_SERVICE) {
		entry.backend_id = 0;
		entry.rx_bytes = ct_state->backend_id;
	}
	entry.rev_nat_index = ct_state->rev_nat_index;
	seen_flags.value |= is_tcp ? TCP_FLAG_SYN : 0;
	ct_update_timeout(&entry, is_tcp, dir, seen_flags);

	if (dir == CT_INGRESS) {
		entry.rx_packets = 1;
		entry.rx_bytes = skb->len;
	} else if (dir == CT_EGRESS) {
		entry.tx_packets = 1;
		entry.tx_bytes = skb->len;
	}

#ifdef ENABLE_NAT46
	if (skb->cb[CB_NAT46_STATE] == NAT64)
		entry.nat46 = dir == CT_EGRESS;
#endif

	cilium_dbg3(skb, DBG_CT_CREATED4, entry.rev_nat_index, ct_state->src_sec_id, ct_state->addr);

	entry.src_sec_id = ct_state->src_sec_id;
	if (map_update_elem(map, tuple, &entry, 0) < 0)
		return DROP_CT_CREATE_FAILED;

	if (ct_state->addr && ct_state->loopback) {
		__u8 flags = tuple->flags;
		__be32 saddr, daddr;

		saddr = tuple->saddr;
		daddr = tuple->daddr;

		/* We are looping back into the origin endpoint through a service,
		 * set up a conntrack tuple for the reply to ensure we do rev NAT
		 * before attempting to route the destination address which will
		 * not point back to the right source.
		 */
		tuple->flags = TUPLE_F_IN;
		if (dir == CT_INGRESS) {
			tuple->saddr = ct_state->addr;
			tuple->daddr = ct_state->svc_addr;
		} else {
			tuple->saddr = ct_state->svc_addr;
			tuple->daddr = ct_state->addr;
		}

		if (map_update_elem(map, tuple, &entry, 0) < 0)
			return DROP_CT_CREATE_FAILED;
		tuple->saddr = saddr;
		tuple->daddr = daddr;
		tuple->flags = flags;
	}

	/* Create an ICMP entry to relate errors */
	struct ipv4_ct_tuple icmp_tuple = {
		.daddr = tuple->daddr,
		.saddr = tuple->saddr,
		.nexthdr = IPPROTO_ICMP,
		.sport = 0,
		.dport = 0,
		.flags = tuple->flags | TUPLE_F_RELATED,
	};

	entry.seen_non_syn = true; /* For ICMP, there is no SYN. */

	/* FIXME: We could do a lookup and check if an L3 entry already exists */
	if (map_update_elem(map, &icmp_tuple, &entry, 0) < 0)
		return DROP_CT_CREATE_FAILED;

	return 0;
}
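
/* Illustrative sketch only: a minimal example of how a caller might drive the
 * IPv4 conntrack API above, assuming the caller has already parsed the packet,
 * filled the tuple addresses/nexthdr and computed the L4 offset. The function
 * name and the bare-bones CT_NEW handling are hypothetical; real callers in
 * the datapath do considerably more (policy, reverse NAT, proxy redirection).
 */
static inline int __inline__ ct4_usage_sketch(void *map,
					      struct ipv4_ct_tuple *tuple,
					      struct __sk_buff *skb,
					      int l4_off, int dir)
{
	struct ct_state ct_state = {};
	__u32 monitor = 0;
	int ret;

	/* Look up the flow; on a miss (CT_NEW) the tuple has been flipped to
	 * the forward direction and can be passed on to ct_create4(). */
	ret = ct_lookup4(map, tuple, skb, l4_off, dir, &ct_state, &monitor);
	if (ret < 0)
		return ret;	/* DROP_CT_* error code */
	if (ret == CT_NEW)
		ret = ct_create4(map, tuple, skb, dir, &ct_state, false);

	return ret;
}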

#else /* !CONNTRACK */
static inline int __inline__ ct_lookup6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	return 0;
}

static inline int __inline__ ct_lookup4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	return 0;
}

static inline void __inline__ ct_update6_backend_id(void *map,
						    struct ipv6_ct_tuple *tuple,
						    struct ct_state *state)
{
}

static inline void __inline__
ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple,
			 struct ct_state *state)
{
}

static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool from_proxy)
{
	return 0;
}

static inline void __inline__ ct_update4_backend_id(void *map,
						    struct ipv4_ct_tuple *tuple,
						    struct ct_state *state)
{
}

static inline void __inline__
ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple,
			 struct ct_state *state)
{
}

static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool from_proxy)
{
	return 0;
}

#endif /* CONNTRACK */

#endif /* __LIB_CONNTRACK_H_ */