github.com/cilium/cilium@v1.16.2/bpf/lib/lb.h

/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright Authors of Cilium */

#pragma once

#include "bpf/compiler.h"
#include "csum.h"
#include "conntrack.h"
#include "ipv4.h"
#include "hash.h"
#include "ids.h"
#include "nat_46x64.h"
#include "ratelimit.h"

#ifndef SKIP_CALLS_MAP
#include "drop.h"
#endif

#ifdef ENABLE_IPV6
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u16);
	__type(value, struct lb6_reverse_nat);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_REV_NAT_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB6_REVERSE_NAT_MAP __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct lb6_key);
	__type(value, struct lb6_service);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_SERVICE_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB6_SERVICES_MAP_V2 __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u32);
	__type(value, struct lb6_backend);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB6_BACKEND_MAP __section_maps_btf;

#ifdef ENABLE_SESSION_AFFINITY
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, struct lb6_affinity_key);
	__type(value, struct lb_affinity_val);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_AFFINITY_MAP_MAX_ENTRIES);
} LB6_AFFINITY_MAP __section_maps_btf;
#endif

#ifdef ENABLE_SRC_RANGE_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct lb6_src_range_key);
	__type(value, __u8);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, LB6_SRC_RANGE_MAP_SIZE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} LB6_SRC_RANGE_MAP __section_maps_btf;
#endif

#ifdef ENABLE_HEALTH_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, __sock_cookie);
	__type(value, struct lb6_health);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
} LB6_HEALTH_MAP __section_maps_btf;
#endif

#if LB_SELECTION == LB_SELECTION_MAGLEV
struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	__type(key, __u16);
	__type(value, __u32);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_MAGLEV_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
	/* Maglev inner map definition */
	__array(values, struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(key_size, sizeof(__u32));
		__uint(value_size, sizeof(__u32) * LB_MAGLEV_LUT_SIZE);
		__uint(max_entries, 1);
	});
} LB6_MAGLEV_MAP_OUTER __section_maps_btf;
#endif /* LB_SELECTION == LB_SELECTION_MAGLEV */
#endif /* ENABLE_IPV6 */

#ifdef ENABLE_IPV4
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u16);
	__type(value, struct lb4_reverse_nat);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_REV_NAT_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB4_REVERSE_NAT_MAP __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct lb4_key);
	__type(value, struct lb4_service);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_SERVICE_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB4_SERVICES_MAP_V2 __section_maps_btf;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u32);
	__type(value, struct lb4_backend);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB4_BACKEND_MAP __section_maps_btf;

#ifdef ENABLE_SESSION_AFFINITY
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, struct lb4_affinity_key);
	__type(value, struct lb_affinity_val);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_AFFINITY_MAP_MAX_ENTRIES);
} LB4_AFFINITY_MAP __section_maps_btf;
#endif

#ifdef ENABLE_SRC_RANGE_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct lb4_src_range_key);
	__type(value, __u8);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, LB4_SRC_RANGE_MAP_SIZE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} LB4_SRC_RANGE_MAP __section_maps_btf;
#endif

#ifdef ENABLE_HEALTH_CHECK
struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__type(key, __sock_cookie);
	__type(value, struct lb4_health);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_BACKENDS_MAP_MAX_ENTRIES);
} LB4_HEALTH_MAP __section_maps_btf;
#endif

#if LB_SELECTION == LB_SELECTION_MAGLEV
struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	__type(key, __u16);
	__type(value, __u32);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_MAGLEV_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
	/* Maglev inner map definition */
	__array(values, struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(key_size, sizeof(__u32));
		__uint(value_size, sizeof(__u32) * LB_MAGLEV_LUT_SIZE);
		__uint(max_entries, 1);
	});
} LB4_MAGLEV_MAP_OUTER __section_maps_btf;
#endif /* LB_SELECTION == LB_SELECTION_MAGLEV */
#endif /* ENABLE_IPV4 */

#ifdef ENABLE_SESSION_AFFINITY
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct lb_affinity_match);
	__type(value, __u8);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, CILIUM_LB_AFFINITY_MAP_MAX_ENTRIES);
	__uint(map_flags, CONDITIONAL_PREALLOC);
} LB_AFFINITY_MATCH_MAP __section_maps_btf;
#endif

#ifndef DSR_XLATE_MODE
# define DSR_XLATE_MODE 0
# define DSR_XLATE_FRONTEND 1
#endif
#ifdef LB_DEBUG
#define cilium_dbg_lb cilium_dbg
#else
#define cilium_dbg_lb(a, b, c, d)
#endif

#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
#include "act.h"
#endif
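/* Illustrative sketch only (not part of this header's API): how the maps
 * above relate to each other for IPv4, using helpers defined further down
 * in this file. The service entry at backend_slot 0 describes the frontend
 * and carries the backend count, slots 1..count resolve to backend IDs, and
 * LB4_BACKEND_MAP maps a backend ID to the backend's address and port.
 * example_resolve_slot() is a hypothetical name used for illustration.
 */
#if 0
static __always_inline struct lb4_backend *
example_resolve_slot(struct __ctx_buff *ctx, struct lb4_key *key, __u16 slot)
{
	/* The slot lookup returns a service entry whose backend_id is set. */
	struct lb4_service *entry = lb4_lookup_backend_slot(ctx, key, slot);

	return entry ? lb4_lookup_backend(ctx, entry->backend_id) : NULL;
}
#endif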

static __always_inline bool lb_is_svc_proto(__u8 proto)
{
	switch (proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		return true;
	default:
		return false;
	}
}

static __always_inline
bool lb4_svc_is_loadbalancer(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_LOADBALANCER;
}

static __always_inline
bool lb6_svc_is_loadbalancer(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_LOADBALANCER;
}

static __always_inline
bool lb4_svc_is_nodeport(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_NODEPORT
	return svc->flags & SVC_FLAG_NODEPORT;
#else
	return false;
#endif /* ENABLE_NODEPORT */
}

static __always_inline
bool lb6_svc_is_nodeport(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_NODEPORT
	return svc->flags & SVC_FLAG_NODEPORT;
#else
	return false;
#endif /* ENABLE_NODEPORT */
}

static __always_inline
bool lb4_svc_is_external_ip(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_EXTERNAL_IP;
}

static __always_inline
bool lb6_svc_is_external_ip(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_EXTERNAL_IP;
}

static __always_inline
bool lb4_svc_is_hostport(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_HOSTPORT;
}

static __always_inline
bool lb6_svc_is_hostport(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags & SVC_FLAG_HOSTPORT;
}

static __always_inline
bool lb4_svc_is_loopback(const struct lb4_service *svc __maybe_unused)
{
	return svc->flags2 & SVC_FLAG_LOOPBACK;
}

static __always_inline
bool lb6_svc_is_loopback(const struct lb6_service *svc __maybe_unused)
{
	return svc->flags2 & SVC_FLAG_LOOPBACK;
}

static __always_inline
bool lb4_svc_has_src_range_check(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	return svc->flags & SVC_FLAG_SOURCE_RANGE;
#else
	return false;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline
bool lb6_svc_has_src_range_check(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	return svc->flags & SVC_FLAG_SOURCE_RANGE;
#else
	return false;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline bool lb_skip_l4_dnat(void)
{
	return DSR_XLATE_MODE == DSR_XLATE_FRONTEND;
}

static __always_inline
bool lb4_svc_is_two_scopes(const struct lb4_service *svc)
{
	return svc->flags2 & SVC_FLAG_TWO_SCOPES;
}

static __always_inline
bool lb6_svc_is_two_scopes(const struct lb6_service *svc)
{
	return svc->flags2 & SVC_FLAG_TWO_SCOPES;
}

static __always_inline
bool lb4_svc_is_affinity(const struct lb4_service *svc)
{
	return svc->flags & SVC_FLAG_AFFINITY;
}

static __always_inline
bool lb6_svc_is_affinity(const struct lb6_service *svc)
{
	return svc->flags & SVC_FLAG_AFFINITY;
}

static __always_inline bool __lb_svc_is_routable(__u8 flags)
{
	return (flags & SVC_FLAG_ROUTABLE) != 0;
}

static __always_inline
bool lb4_svc_is_routable(const struct lb4_service *svc)
{
	return __lb_svc_is_routable(svc->flags);
}

static __always_inline
bool lb6_svc_is_routable(const struct lb6_service *svc)
{
	return __lb_svc_is_routable(svc->flags);
}

#ifdef ENABLE_LOCAL_REDIRECT_POLICY
static __always_inline
bool lb4_svc_is_localredirect(const struct lb4_service *svc)
{
	return svc->flags2 & SVC_FLAG_LOCALREDIRECT;
}

static __always_inline
bool lb6_svc_is_localredirect(const struct lb6_service *svc)
{
	return svc->flags2 & SVC_FLAG_LOCALREDIRECT;
}
#endif /* ENABLE_LOCAL_REDIRECT_POLICY */

static __always_inline
bool lb4_svc_is_l7loadbalancer(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_L7_LB
	return svc->flags2 & SVC_FLAG_L7LOADBALANCER;
#else
	return false;
#endif
}

static __always_inline
bool lb6_svc_is_l7loadbalancer(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_L7_LB
	return svc->flags2 & SVC_FLAG_L7LOADBALANCER;
#else
	return false;
#endif
}

static __always_inline int reverse_map_l4_port(struct __ctx_buff *ctx, __u8 nexthdr,
					       __be16 old_port, __be16 port, int l4_off,
					       struct csum_offset *csum_off)
{
	switch (nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		if (port) {
			int ret;

			if (port != old_port) {
#ifdef ENABLE_SCTP
				/* This will change the SCTP checksum, which we cannot fix right now.
				 * This will likely need kernel changes before we can remove this.
				 */
				if (nexthdr == IPPROTO_SCTP)
					return DROP_CSUM_L4;
#endif /* ENABLE_SCTP */
				ret = l4_modify_port(ctx, l4_off, TCP_SPORT_OFF,
						     csum_off, port, old_port);
				if (IS_ERR(ret))
					return ret;
			}
		}
		break;

	case IPPROTO_ICMPV6:
	case IPPROTO_ICMP:
		return CTX_ACT_OK;

	default:
		return DROP_UNKNOWN_L4;
	}

	return 0;
}

static __always_inline int
lb_l4_xlate(struct __ctx_buff *ctx, __u8 nexthdr __maybe_unused, int l4_off,
	    struct csum_offset *csum_off, __be16 dport, __be16 backend_port)
{
	if (likely(backend_port) && dport != backend_port) {
		int ret;

#ifdef ENABLE_SCTP
		/* This will change the SCTP checksum, which we cannot fix right now.
		 * This will likely need kernel changes before we can remove this.
		 */
		if (nexthdr == IPPROTO_SCTP)
			return DROP_CSUM_L4;
#endif /* ENABLE_SCTP */

		/* Port offsets for UDP and TCP are the same */
		ret = l4_modify_port(ctx, l4_off, TCP_DPORT_OFF, csum_off,
				     backend_port, dport);
		if (IS_ERR(ret))
			return ret;
	}

	return CTX_ACT_OK;
}
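/* Usage sketch (illustrative only): translating the service port to the
 * backend port on a TCP packet. csum_l4_offset_and_flags() locates the L4
 * checksum field so that lb_l4_xlate() can fix it up along with the port.
 * example_dnat_port() is a hypothetical helper name, not part of this file.
 */
#if 0
static __always_inline int
example_dnat_port(struct __ctx_buff *ctx, int l4_off,
		  __be16 svc_port, __be16 backend_port)
{
	struct csum_offset csum_off = {};

	csum_l4_offset_and_flags(IPPROTO_TCP, &csum_off);

	/* No-op if backend_port is 0 or already equals svc_port. */
	return lb_l4_xlate(ctx, IPPROTO_TCP, l4_off, &csum_off,
			   svc_port, backend_port);
}
#endif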
#ifdef ENABLE_IPV6
static __always_inline int __lb6_rev_nat(struct __ctx_buff *ctx, int l4_off,
					 struct ipv6_ct_tuple *tuple,
					 struct lb6_reverse_nat *nat)
{
	struct csum_offset csum_off = {};
	union v6addr old_saddr __align_stack_8;
	__be32 sum;
	int ret;

	cilium_dbg_lb(ctx, DBG_LB6_REVERSE_NAT, nat->address.p4, nat->port);

	csum_l4_offset_and_flags(tuple->nexthdr, &csum_off);

	if (nat->port) {
		ret = reverse_map_l4_port(ctx, tuple->nexthdr, tuple->dport,
					  nat->port, l4_off, &csum_off);
		if (IS_ERR(ret))
			return ret;
	}

	ipv6_addr_copy(&old_saddr, &tuple->saddr);
	ipv6_addr_copy(&tuple->saddr, &nat->address);

	ret = ipv6_store_saddr(ctx, nat->address.addr, ETH_HLEN);
	if (IS_ERR(ret))
		return DROP_WRITE_ERROR;

	sum = csum_diff(old_saddr.addr, 16, nat->address.addr, 16, 0);
	if (csum_off.offset &&
	    csum_l4_replace(ctx, l4_off, &csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;

	return 0;
}

static __always_inline struct lb6_reverse_nat *
lb6_lookup_rev_nat_entry(struct __ctx_buff *ctx __maybe_unused, __u16 index)
{
	cilium_dbg_lb(ctx, DBG_LB6_REVERSE_NAT_LOOKUP, index, 0);

	return map_lookup_elem(&LB6_REVERSE_NAT_MAP, &index);
}

/** Perform IPv6 reverse NAT based on reverse NAT index
 * @arg ctx		packet
 * @arg l4_off		offset to L4
 * @arg index		reverse NAT index
 * @arg tuple		tuple
 */
static __always_inline int lb6_rev_nat(struct __ctx_buff *ctx, int l4_off,
				       __u16 index, struct ipv6_ct_tuple *tuple)
{
	struct lb6_reverse_nat *nat;

	nat = lb6_lookup_rev_nat_entry(ctx, index);
	if (nat == NULL)
		return 0;

	return __lb6_rev_nat(ctx, l4_off, tuple, nat);
}

static __always_inline void
lb6_fill_key(struct lb6_key *key, struct ipv6_ct_tuple *tuple)
{
	/* FIXME: set after adding support for different L4 protocols in LB */
	key->proto = 0;
	ipv6_addr_copy(&key->address, &tuple->daddr);
	key->dport = tuple->sport;
}

/** Extract IPv6 CT tuple from packet
 * @arg ctx		Packet
 * @arg ip6		Pointer to L3 header
 * @arg l3_off		Offset to L3 header
 * @arg l4_off		Offset to L4 header
 * @arg tuple		CT tuple
 *
 * Expects the ctx to be validated for direct packet access up to L4.
 *
 * Returns:
 *   - CTX_ACT_OK on successful extraction
 *   - DROP_UNKNOWN_L4 if packet should be ignored (sent to stack)
 *   - Negative error code
 */
static __always_inline int
lb6_extract_tuple(struct __ctx_buff *ctx, struct ipv6hdr *ip6, int l3_off,
		  int *l4_off, struct ipv6_ct_tuple *tuple)
{
	int ret;

	tuple->nexthdr = ip6->nexthdr;
	ipv6_addr_copy(&tuple->daddr, (union v6addr *)&ip6->daddr);
	ipv6_addr_copy(&tuple->saddr, (union v6addr *)&ip6->saddr);

	ret = ipv6_hdrlen_offset(ctx, &tuple->nexthdr, l3_off);
	if (ret < 0) {
		/* Make sure *l4_off is always initialized on return, because
		 * Clang can spill it from a register to the stack even in error
		 * flows where this value is no longer used, and this pattern is
		 * rejected by the verifier.
		 * Use a prominent value (-1) to highlight any potential misuse.
		 */
		*l4_off = -1;
		return ret;
	}

	*l4_off = l3_off + ret;

	switch (tuple->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		if (l4_load_ports(ctx, *l4_off, &tuple->dport) < 0)
			return DROP_CT_INVALID_HDR;
		return 0;
	case IPPROTO_ICMPV6:
		return DROP_UNSUPP_SERVICE_PROTO;
	default:
		return DROP_UNKNOWN_L4;
	}
}

static __always_inline
bool lb6_src_range_ok(const struct lb6_service *svc __maybe_unused,
		      const union v6addr *saddr __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	struct lb6_src_range_key key;

	if (!lb6_svc_has_src_range_check(svc))
		return true;

	key = (typeof(key)) {
		.lpm_key = { SRC_RANGE_STATIC_PREFIX(key), {} },
		.rev_nat_id = svc->rev_nat_index,
		.addr = *saddr,
	};

	if (map_lookup_elem(&LB6_SRC_RANGE_MAP, &key))
		return true;

	return false;
#else
	return true;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline bool
lb6_to_lb4_service(const struct lb6_service *svc __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	return svc->flags2 & SVC_FLAG_NAT_46X64;
#else
	return false;
#endif
}

static __always_inline
struct lb6_service *lb6_lookup_service(struct lb6_key *key,
				       const bool scope_switch)
{
	struct lb6_service *svc;

	key->scope = LB_LOOKUP_SCOPE_EXT;
	key->backend_slot = 0;
	svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
	if (svc) {
		if (!scope_switch || !lb6_svc_is_two_scopes(svc))
			return svc;
		key->scope = LB_LOOKUP_SCOPE_INT;
		svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
	}

	return svc;
}
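/* Usage sketch (illustrative only): how the IPv6 helpers above typically
 * compose on the request path. The surrounding program and the hypothetical
 * helper name example_lb6_service_lookup() are assumptions; actual callers
 * (e.g. in bpf_lxc or the nodeport code) add more checks around this.
 */
#if 0
static __always_inline struct lb6_service *
example_lb6_service_lookup(struct __ctx_buff *ctx, struct ipv6hdr *ip6,
			   struct ipv6_ct_tuple *tuple, struct lb6_key *key,
			   int *l4_off)
{
	int ret;

	ret = lb6_extract_tuple(ctx, ip6, ETH_HLEN, l4_off, tuple);
	if (ret < 0)
		return NULL;

	lb6_fill_key(key, tuple);

	/* scope_switch=false: only the external lookup scope is consulted. */
	return lb6_lookup_service(key, false);
}
#endif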

static __always_inline struct lb6_backend *__lb6_lookup_backend(__u32 backend_id)
{
	return map_lookup_elem(&LB6_BACKEND_MAP, &backend_id);
}

static __always_inline struct lb6_backend *
lb6_lookup_backend(struct __ctx_buff *ctx __maybe_unused, __u32 backend_id)
{
	struct lb6_backend *backend;

	backend = __lb6_lookup_backend(backend_id);
	if (!backend)
		cilium_dbg_lb(ctx, DBG_LB6_LOOKUP_BACKEND_FAIL, backend_id, 0);

	return backend;
}

static __always_inline
struct lb6_service *__lb6_lookup_backend_slot(struct lb6_key *key)
{
	return map_lookup_elem(&LB6_SERVICES_MAP_V2, key);
}

static __always_inline
struct lb6_service *lb6_lookup_backend_slot(struct __ctx_buff *ctx __maybe_unused,
					    struct lb6_key *key, __u16 slot)
{
	struct lb6_service *svc;

	key->backend_slot = slot;
	cilium_dbg_lb(ctx, DBG_LB6_LOOKUP_BACKEND_SLOT, key->backend_slot, key->dport);
	svc = __lb6_lookup_backend_slot(key);
	if (svc)
		return svc;

	cilium_dbg_lb(ctx, DBG_LB6_LOOKUP_BACKEND_SLOT_V2_FAIL, key->backend_slot, key->dport);

	return NULL;
}

/* Backend slot 0 is always reserved for the service frontend. */
#if LB_SELECTION == LB_SELECTION_RANDOM
static __always_inline __u32
lb6_select_backend_id(struct __ctx_buff *ctx,
		      struct lb6_key *key,
		      const struct ipv6_ct_tuple *tuple __maybe_unused,
		      const struct lb6_service *svc)
{
	__u16 slot = (get_prandom_u32() % svc->count) + 1;
	struct lb6_service *be = lb6_lookup_backend_slot(ctx, key, slot);

	return be ? be->backend_id : 0;
}
#elif LB_SELECTION == LB_SELECTION_MAGLEV
static __always_inline __u32
lb6_select_backend_id(struct __ctx_buff *ctx __maybe_unused,
		      struct lb6_key *key __maybe_unused,
		      const struct ipv6_ct_tuple *tuple,
		      const struct lb6_service *svc)
{
	__u32 zero = 0, index = svc->rev_nat_index;
	__u32 *backend_ids;
	void *maglev_lut;

	maglev_lut = map_lookup_elem(&LB6_MAGLEV_MAP_OUTER, &index);
	if (unlikely(!maglev_lut))
		return 0;

	backend_ids = map_lookup_elem(maglev_lut, &zero);
	if (unlikely(!backend_ids))
		return 0;

	index = hash_from_tuple_v6(tuple) % LB_MAGLEV_LUT_SIZE;
	return map_array_get_32(backend_ids, index, (LB_MAGLEV_LUT_SIZE - 1) << 2);
}
#elif LB_SELECTION == LB_SELECTION_FIRST
/* Backend selection for tests that always chooses first slot. */
static __always_inline __u32
lb6_select_backend_id(struct __ctx_buff *ctx,
		      struct lb6_key *key,
		      const struct ipv6_ct_tuple *tuple __maybe_unused,
		      const struct lb6_service *svc)
{
	struct lb6_service *be = lb6_lookup_backend_slot(ctx, key, 1);

	return be ? be->backend_id : 0;
}
#else
# error "Invalid load balancer backend selection algorithm!"
#endif /* LB_SELECTION */
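/* Illustrative sketch only: a plain-C rendering of the Maglev selection
 * above, assuming a hypothetical, verifier-unconstrained array of
 * LB_MAGLEV_LUT_SIZE backend IDs. In the real code map_array_get_32()
 * performs the same indexed read, bounded by the byte offset of the last
 * valid entry, i.e. (LB_MAGLEV_LUT_SIZE - 1) << 2.
 */
#if 0
static __always_inline __u32
example_maglev_pick(const __u32 *lut, __u32 hash)
{
	__u32 slot = hash % LB_MAGLEV_LUT_SIZE;

	return lut[slot]; /* backend ID; 0 means no backend selected */
}
#endif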

static __always_inline int lb6_xlate(struct __ctx_buff *ctx, __u8 nexthdr,
				     int l3_off, int l4_off,
				     const struct lb6_key *key,
				     const struct lb6_backend *backend,
				     const bool skip_l3_xlate)
{
	const union v6addr *new_dst = &backend->address;
	struct csum_offset csum_off = {};

	csum_l4_offset_and_flags(nexthdr, &csum_off);

	if (skip_l3_xlate)
		goto l4_xlate;

	if (ipv6_store_daddr(ctx, new_dst->addr, l3_off) < 0)
		return DROP_WRITE_ERROR;
	if (csum_off.offset) {
		__be32 sum = csum_diff(key->address.addr, 16, new_dst->addr,
				       16, 0);

		if (csum_l4_replace(ctx, l4_off, &csum_off, 0, sum,
				    BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

l4_xlate:
	return lb_l4_xlate(ctx, nexthdr, l4_off, &csum_off, key->dport,
			   backend->port);
}

#ifdef ENABLE_SESSION_AFFINITY
static __always_inline __u32
__lb6_affinity_backend_id(const struct lb6_service *svc, bool netns_cookie,
			  union lb6_affinity_client_id *id)
{
	struct lb6_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
	};
	struct lb_affinity_val *val;

	if (netns_cookie)
		key.client_id.client_cookie = id->client_cookie;
	else
		ipv6_addr_copy_unaligned(&key.client_id.client_ip, &id->client_ip);

	val = map_lookup_elem(&LB6_AFFINITY_MAP, &key);
	if (val != NULL) {
		__u32 now = bpf_mono_now();
		struct lb_affinity_match match = {
			.rev_nat_id = svc->rev_nat_index,
			.backend_id = val->backend_id,
		};

		if (READ_ONCE(val->last_used) +
		    bpf_sec_to_mono(svc->affinity_timeout) <= now) {
			map_delete_elem(&LB6_AFFINITY_MAP, &key);
			return 0;
		}

		if (!map_lookup_elem(&LB_AFFINITY_MATCH_MAP, &match)) {
			map_delete_elem(&LB6_AFFINITY_MAP, &key);
			return 0;
		}

		WRITE_ONCE(val->last_used, now);
		return val->backend_id;
	}

	return 0;
}

static __always_inline __u32
lb6_affinity_backend_id_by_addr(const struct lb6_service *svc,
				union lb6_affinity_client_id *id)
{
	return __lb6_affinity_backend_id(svc, false, id);
}

static __always_inline void
__lb6_update_affinity(const struct lb6_service *svc, bool netns_cookie,
		      union lb6_affinity_client_id *id, __u32 backend_id)
{
	__u32 now = bpf_mono_now();
	struct lb6_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
	};
	struct lb_affinity_val val = {
		.backend_id = backend_id,
		.last_used = now,
	};

	if (netns_cookie)
		key.client_id.client_cookie = id->client_cookie;
	else
		ipv6_addr_copy_unaligned(&key.client_id.client_ip, &id->client_ip);

	map_update_elem(&LB6_AFFINITY_MAP, &key, &val, 0);
}

static __always_inline void
lb6_update_affinity_by_addr(const struct lb6_service *svc,
			    union lb6_affinity_client_id *id, __u32 backend_id)
{
	__lb6_update_affinity(svc, false, id, backend_id);
}
#endif /* ENABLE_SESSION_AFFINITY */
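/* Illustrative sketch only: the expiry rule applied by the affinity lookups
 * above. An entry stays sticky for [last_used, last_used + affinity_timeout)
 * seconds; once that window has passed, the entry is deleted and a fresh
 * backend is selected. example_affinity_expired() is a hypothetical name.
 */
#if 0
static __always_inline bool
example_affinity_expired(__u32 last_used, __u32 affinity_timeout_sec)
{
	return last_used + bpf_sec_to_mono(affinity_timeout_sec) <= bpf_mono_now();
}
#endif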

static __always_inline __u32
lb6_affinity_backend_id_by_netns(const struct lb6_service *svc __maybe_unused,
				 union lb6_affinity_client_id *id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	return __lb6_affinity_backend_id(svc, true, id);
#else
	return 0;
#endif
}

static __always_inline void
lb6_update_affinity_by_netns(const struct lb6_service *svc __maybe_unused,
			     union lb6_affinity_client_id *id __maybe_unused,
			     __u32 backend_id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	__lb6_update_affinity(svc, true, id, backend_id);
#endif
}

static __always_inline int
lb6_to_lb4(struct __ctx_buff *ctx __maybe_unused,
	   const struct ipv6hdr *ip6 __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	__be32 src4, dst4;

	build_v4_from_v6((const union v6addr *)&ip6->saddr, &src4);
	build_v4_from_v6((const union v6addr *)&ip6->daddr, &dst4);

	return ipv6_to_ipv4(ctx, src4, dst4);
#else
	return DROP_NAT_46X64_DISABLED;
#endif
}

static __always_inline int lb6_local(const void *map, struct __ctx_buff *ctx,
				     int l3_off, int l4_off,
				     struct lb6_key *key,
				     struct ipv6_ct_tuple *tuple,
				     const struct lb6_service *svc,
				     struct ct_state *state,
				     const bool skip_l3_xlate,
				     __s8 *ext_err)
{
	__u32 monitor; /* Deliberately ignored; regular CT will determine monitoring. */
	__u8 flags = tuple->flags;
	struct lb6_backend *backend;
	__u32 backend_id = 0;
	int ret;
#ifdef ENABLE_SESSION_AFFINITY
	union lb6_affinity_client_id client_id;

	ipv6_addr_copy(&client_id.client_ip, &tuple->saddr);
#endif

	state->rev_nat_index = svc->rev_nat_index;

	/* See lb4_local comments re svc endpoint lookup process */
	ret = ct_lazy_lookup6(map, tuple, ctx, l4_off, CT_SERVICE,
			      SCOPE_REVERSE, CT_ENTRY_SVC, state, &monitor);
	if (ret < 0)
		goto drop_err;

	switch (ret) {
	case CT_NEW:
		if (unlikely(svc->count == 0))
			goto no_service;

#ifdef ENABLE_SESSION_AFFINITY
		if (lb6_svc_is_affinity(svc)) {
			backend_id = lb6_affinity_backend_id_by_addr(svc, &client_id);
			if (backend_id != 0) {
				backend = lb6_lookup_backend(ctx, backend_id);
				if (backend == NULL)
					backend_id = 0;
			}
		}
#endif
		if (backend_id == 0) {
			backend_id = lb6_select_backend_id(ctx, key, tuple, svc);
			backend = lb6_lookup_backend(ctx, backend_id);
			if (backend == NULL)
				goto no_service;
		}

		state->backend_id = backend_id;

		ret = ct_create6(map, NULL, tuple, ctx, CT_SERVICE, state, ext_err);
		/* Fail closed, if the conntrack entry create fails drop
		 * service lookup.
		 */
		if (IS_ERR(ret))
			goto drop_err;

#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		_lb_act_conn_open(state->rev_nat_index, backend->zone);
#endif

		break;
	case CT_REPLY:
		backend_id = state->backend_id;

		/* If the lookup fails it means the user deleted the backend out from
		 * underneath us. To resolve this fall back to hash. If this is a TCP
		 * session we are likely to get a TCP RST.
		 */
		backend = lb6_lookup_backend(ctx, backend_id);
#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		if (state->closing && backend)
			_lb_act_conn_closed(svc->rev_nat_index, backend->zone);
#endif
		if (unlikely(!backend || backend->flags != BE_STATE_ACTIVE)) {
			/* Drain existing connections, but redirect new ones to only
			 * active backends.
			 */
			if (backend && !state->syn)
				break;

			if (unlikely(svc->count == 0))
				goto no_service;

			backend_id = lb6_select_backend_id(ctx, key, tuple, svc);
			backend = lb6_lookup_backend(ctx, backend_id);
			if (!backend)
				goto no_service;

			state->rev_nat_index = svc->rev_nat_index;
			ct_update_svc_entry(map, tuple, backend_id, svc->rev_nat_index);
		}

		break;
	default:
		ret = DROP_UNKNOWN_CT;
		goto drop_err;
	}

	/* Restore flags so that SERVICE flag is only used when the service
	 * lookup happens and future lookups use EGRESS or INGRESS.
	 */
	tuple->flags = flags;
#ifdef ENABLE_SESSION_AFFINITY
	if (lb6_svc_is_affinity(svc))
		lb6_update_affinity_by_addr(svc, &client_id, backend_id);
#endif

	ipv6_addr_copy(&tuple->daddr, &backend->address);

	if (lb_skip_l4_dnat())
		return CTX_ACT_OK;

	if (likely(backend->port))
		tuple->sport = backend->port;

	return lb6_xlate(ctx, tuple->nexthdr, l3_off, l4_off,
			 key, backend, skip_l3_xlate);
no_service:
	ret = DROP_NO_SERVICE;
drop_err:
	tuple->flags = flags;
	return ret;
}

/* lb6_ctx_store_state() stores per packet load balancing state to be picked
 * up on the continuation tail call.
 * Note that the IP headers are already xlated and the tuple is re-initialized
 * from the xlated headers before restoring state.
 * NOTE: if lb_skip_l4_dnat() this is not the case as xlate is skipped. We
 * lose the updated tuple daddr in that case.
 */
static __always_inline void lb6_ctx_store_state(struct __ctx_buff *ctx,
						const struct ct_state *state,
						__u16 proxy_port)
{
	ctx_store_meta(ctx, CB_PROXY_MAGIC, (__u32)proxy_port << 16);
	ctx_store_meta(ctx, CB_CT_STATE, (__u32)state->rev_nat_index);
}

/* lb6_ctx_restore_state() restores per packet load balancing state from the
 * previous tail call.
 * tuple->flags does not need to be restored, as it will be reinitialized from
 * the packet.
 */
static __always_inline void lb6_ctx_restore_state(struct __ctx_buff *ctx,
						  struct ct_state *state,
						  __u16 *proxy_port)
{
	state->rev_nat_index = (__u16)ctx_load_and_clear_meta(ctx, CB_CT_STATE);

	/* No loopback support for IPv6, see lb6_local() above. */

	*proxy_port = ctx_load_and_clear_meta(ctx, CB_PROXY_MAGIC) >> 16;
}
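/* Usage sketch (illustrative only): the store/restore pair above brackets a
 * tail call. The hypothetical example_lb6_state_handoff() below only shows
 * the calling convention; in practice the two calls live in different
 * programs, with the tail call in between.
 */
#if 0
static __always_inline void
example_lb6_state_handoff(struct __ctx_buff *ctx, struct ct_state *state,
			  __u16 proxy_port)
{
	/* In the program that performed the service translation: */
	lb6_ctx_store_state(ctx, state, proxy_port);

	/* In the continuation program, after the tail call: */
	lb6_ctx_restore_state(ctx, state, &proxy_port);
}
#endif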

#else

/* Stubs for v4-in-v6 socket cgroup hook case when only v4 is enabled to avoid
 * additional map management.
 */
static __always_inline
struct lb6_service *lb6_lookup_service(struct lb6_key *key __maybe_unused,
				       const bool scope_switch __maybe_unused)
{
	return NULL;
}

static __always_inline
struct lb6_service *__lb6_lookup_backend_slot(struct lb6_key *key __maybe_unused)
{
	return NULL;
}

static __always_inline struct lb6_backend *
__lb6_lookup_backend(__u16 backend_id __maybe_unused)
{
	return NULL;
}

static __always_inline bool
lb6_to_lb4_service(const struct lb6_service *svc __maybe_unused)
{
	return false;
}
#endif /* ENABLE_IPV6 */

#ifdef ENABLE_IPV4
static __always_inline int __lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int l4_off,
					 struct ipv4_ct_tuple *tuple,
					 const struct lb4_reverse_nat *nat,
					 bool loopback __maybe_unused, bool has_l4_header)
{
	__be32 old_sip = tuple->saddr, sum = 0;
	int ret;

	cilium_dbg_lb(ctx, DBG_LB4_REVERSE_NAT, nat->address, nat->port);

	tuple->saddr = nat->address;

#ifndef DISABLE_LOOPBACK_LB
	if (loopback) {
		/* The packet was looped back to the sending endpoint on the
		 * forward service translation. This implies that the original
		 * source address of the packet is the source address of the
		 * current packet. We therefore need to make the current source
		 * address the new destination address.
		 */
		__be32 old_dip = tuple->daddr;

		cilium_dbg_lb(ctx, DBG_LB4_LOOPBACK_SNAT_REV, old_dip, old_sip);

		ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, daddr), &old_sip, 4, 0);
		if (IS_ERR(ret))
			return DROP_WRITE_ERROR;

		sum = csum_diff(&old_dip, 4, &old_sip, 4, 0);

		/* Update the tuple address which is representing the destination address */
		tuple->saddr = old_sip;
	}
#endif

	ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, saddr),
			      &nat->address, 4, 0);
	if (IS_ERR(ret))
		return DROP_WRITE_ERROR;

	sum = csum_diff(&old_sip, 4, &nat->address, 4, sum);
	if (ipv4_csum_update_by_diff(ctx, l3_off, sum) < 0)
		return DROP_CSUM_L3;

	if (has_l4_header) {
		struct csum_offset csum_off = {};

		csum_l4_offset_and_flags(tuple->nexthdr, &csum_off);

		if (nat->port) {
			/* We expect to only handle replies. Thus the extracted CT tuple
			 * will have the packet's source port in .dport.
			 */
			ret = reverse_map_l4_port(ctx, tuple->nexthdr, tuple->dport,
						  nat->port, l4_off, &csum_off);
			if (IS_ERR(ret))
				return ret;
		}

		if (csum_off.offset &&
		    csum_l4_replace(ctx, l4_off, &csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

	return 0;
}

static __always_inline struct lb4_reverse_nat *
lb4_lookup_rev_nat_entry(struct __ctx_buff *ctx __maybe_unused, __u16 index)
{
	cilium_dbg_lb(ctx, DBG_LB4_REVERSE_NAT_LOOKUP, index, 0);

	return map_lookup_elem(&LB4_REVERSE_NAT_MAP, &index);
}

/** Perform IPv4 reverse NAT based on reverse NAT index
 * @arg ctx		packet
 * @arg l3_off		offset to L3
 * @arg l4_off		offset to L4
 * @arg index		reverse NAT index
 * @arg loopback	loopback connection
 * @arg tuple		tuple
 */
static __always_inline int lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int l4_off,
				       __u16 index, bool loopback,
				       struct ipv4_ct_tuple *tuple, bool has_l4_header)
{
	struct lb4_reverse_nat *nat;

	nat = lb4_lookup_rev_nat_entry(ctx, index);
	if (nat == NULL)
		return 0;

	return __lb4_rev_nat(ctx, l3_off, l4_off, tuple, nat,
			     loopback, has_l4_header);
}
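/* Usage sketch (illustrative only): reverse-translating a reply once the
 * conntrack lookup has recovered the service's rev_nat_index, for example
 * after lb4_ctx_restore_state() on a continuation tail call. The helper
 * name example_lb4_reply_rev_nat() is hypothetical.
 */
#if 0
static __always_inline int
example_lb4_reply_rev_nat(struct __ctx_buff *ctx, int l3_off, int l4_off,
			  struct ipv4_ct_tuple *tuple, __u16 rev_nat_index,
			  bool has_l4_header)
{
	/* false: this reply does not belong to a loopback connection. */
	return lb4_rev_nat(ctx, l3_off, l4_off, rev_nat_index, false,
			   tuple, has_l4_header);
}
#endif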

static __always_inline void
lb4_fill_key(struct lb4_key *key, const struct ipv4_ct_tuple *tuple)
{
	/* FIXME: set after adding support for different L4 protocols in LB */
	key->proto = 0;
	key->address = tuple->daddr;
	/* CT tuple has ports in reverse order: */
	key->dport = tuple->sport;
}

/** Extract IPv4 CT tuple from packet
 * @arg ctx		Packet
 * @arg ip4		Pointer to L3 header
 * @arg l3_off		Offset to L3 header
 * @arg l4_off		Offset to L4 header
 * @arg tuple		CT tuple
 *
 * Returns:
 *   - CTX_ACT_OK on successful extraction
 *   - DROP_UNKNOWN_L4 if packet should be ignored (sent to stack)
 *   - Negative error code
 */
static __always_inline int
lb4_extract_tuple(struct __ctx_buff *ctx, struct iphdr *ip4, int l3_off, int *l4_off,
		  struct ipv4_ct_tuple *tuple)
{
	int ret;

	tuple->nexthdr = ip4->protocol;
	tuple->daddr = ip4->daddr;
	tuple->saddr = ip4->saddr;

	*l4_off = l3_off + ipv4_hdrlen(ip4);

	switch (tuple->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef ENABLE_SCTP
	case IPPROTO_SCTP:
#endif /* ENABLE_SCTP */
		ret = ipv4_load_l4_ports(ctx, ip4, *l4_off, CT_EGRESS,
					 &tuple->dport, NULL);

		if (IS_ERR(ret))
			return ret;
		return 0;
	case IPPROTO_ICMP:
		return DROP_UNSUPP_SERVICE_PROTO;
	default:
		return DROP_UNKNOWN_L4;
	}
}

static __always_inline
bool lb4_src_range_ok(const struct lb4_service *svc __maybe_unused,
		      __u32 saddr __maybe_unused)
{
#ifdef ENABLE_SRC_RANGE_CHECK
	struct lb4_src_range_key key;

	if (!lb4_svc_has_src_range_check(svc))
		return true;

	key = (typeof(key)) {
		.lpm_key = { SRC_RANGE_STATIC_PREFIX(key), {} },
		.rev_nat_id = svc->rev_nat_index,
		.addr = saddr,
	};

	if (map_lookup_elem(&LB4_SRC_RANGE_MAP, &key))
		return true;

	return false;
#else
	return true;
#endif /* ENABLE_SRC_RANGE_CHECK */
}

static __always_inline bool
lb4_to_lb6_service(const struct lb4_service *svc __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	return svc->flags2 & SVC_FLAG_NAT_46X64;
#else
	return false;
#endif
}

static __always_inline
struct lb4_service *lb4_lookup_service(struct lb4_key *key,
				       const bool scope_switch)
{
	struct lb4_service *svc;

	key->scope = LB_LOOKUP_SCOPE_EXT;
	key->backend_slot = 0;
	svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
	if (svc) {
		if (!scope_switch || !lb4_svc_is_two_scopes(svc))
			return svc;
		key->scope = LB_LOOKUP_SCOPE_INT;
		svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
	}

	return svc;
}

static __always_inline struct lb4_backend *__lb4_lookup_backend(__u32 backend_id)
{
	return map_lookup_elem(&LB4_BACKEND_MAP, &backend_id);
}

static __always_inline struct lb4_backend *
lb4_lookup_backend(struct __ctx_buff *ctx __maybe_unused, __u32 backend_id)
{
	struct lb4_backend *backend;

	backend = __lb4_lookup_backend(backend_id);
	if (!backend)
		cilium_dbg_lb(ctx, DBG_LB4_LOOKUP_BACKEND_FAIL, backend_id, 0);

	return backend;
}

static __always_inline
struct lb4_service *__lb4_lookup_backend_slot(struct lb4_key *key)
{
	return map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
}

static __always_inline
struct lb4_service *lb4_lookup_backend_slot(struct __ctx_buff *ctx __maybe_unused,
					    struct lb4_key *key, __u16 slot)
{
	struct lb4_service *svc;

	key->backend_slot = slot;
	cilium_dbg_lb(ctx, DBG_LB4_LOOKUP_BACKEND_SLOT, key->backend_slot, key->dport);
	svc = __lb4_lookup_backend_slot(key);
	if (svc)
		return svc;

	cilium_dbg_lb(ctx, DBG_LB4_LOOKUP_BACKEND_SLOT_V2_FAIL, key->backend_slot, key->dport);

	return NULL;
}

/* Backend slot 0 is always reserved for the service frontend. */
#if LB_SELECTION == LB_SELECTION_RANDOM
static __always_inline __u32
lb4_select_backend_id(struct __ctx_buff *ctx,
		      struct lb4_key *key,
		      const struct ipv4_ct_tuple *tuple __maybe_unused,
		      const struct lb4_service *svc)
{
	__u16 slot = (get_prandom_u32() % svc->count) + 1;
	struct lb4_service *be = lb4_lookup_backend_slot(ctx, key, slot);

	return be ? be->backend_id : 0;
}
#elif LB_SELECTION == LB_SELECTION_MAGLEV
static __always_inline __u32
lb4_select_backend_id(struct __ctx_buff *ctx __maybe_unused,
		      struct lb4_key *key __maybe_unused,
		      const struct ipv4_ct_tuple *tuple,
		      const struct lb4_service *svc)
{
	__u32 zero = 0, index = svc->rev_nat_index;
	__u32 *backend_ids;
	void *maglev_lut;

	maglev_lut = map_lookup_elem(&LB4_MAGLEV_MAP_OUTER, &index);
	if (unlikely(!maglev_lut))
		return 0;

	backend_ids = map_lookup_elem(maglev_lut, &zero);
	if (unlikely(!backend_ids))
		return 0;

	index = hash_from_tuple_v4(tuple) % LB_MAGLEV_LUT_SIZE;
	return map_array_get_32(backend_ids, index, (LB_MAGLEV_LUT_SIZE - 1) << 2);
}
#elif LB_SELECTION == LB_SELECTION_FIRST
/* Backend selection for tests that always chooses first slot. */
static __always_inline __u32
lb4_select_backend_id(struct __ctx_buff *ctx,
		      struct lb4_key *key,
		      const struct ipv4_ct_tuple *tuple __maybe_unused,
		      const struct lb4_service *svc)
{
	struct lb4_service *be = lb4_lookup_backend_slot(ctx, key, 1);

	return be ? be->backend_id : 0;
}
#else
# error "Invalid load balancer backend selection algorithm!"
#endif /* LB_SELECTION */
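/* Illustrative note: with LB_SELECTION_RANDOM the slot is drawn from
 * 1..svc->count because slot 0 is the frontend entry itself; a service with
 * count == 3 spreads new connections over backend slots 1, 2 and 3. The
 * helper below is a hypothetical restatement of that computation and
 * assumes count > 0, which the callers in this file already guarantee.
 */
#if 0
static __always_inline __u16
example_random_slot(__u16 count)
{
	return (__u16)(get_prandom_u32() % count) + 1;
}
#endif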

static __always_inline int
lb4_xlate(struct __ctx_buff *ctx, __be32 *new_saddr __maybe_unused,
	  __be32 *old_saddr __maybe_unused, __u8 nexthdr __maybe_unused, int l3_off,
	  int l4_off, struct lb4_key *key,
	  const struct lb4_backend *backend __maybe_unused, bool has_l4_header,
	  const bool skip_l3_xlate)
{
	const __be32 *new_daddr = &backend->address;
	struct csum_offset csum_off = {};
	__be32 sum;
	int ret;

	if (has_l4_header)
		csum_l4_offset_and_flags(nexthdr, &csum_off);

	if (skip_l3_xlate)
		goto l4_xlate;

	ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, daddr),
			      new_daddr, 4, 0);
	if (ret < 0)
		return DROP_WRITE_ERROR;

	sum = csum_diff(&key->address, 4, new_daddr, 4, 0);
#ifndef DISABLE_LOOPBACK_LB
	if (new_saddr && *new_saddr) {
		cilium_dbg_lb(ctx, DBG_LB4_LOOPBACK_SNAT, *old_saddr, *new_saddr);

		ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, saddr),
				      new_saddr, 4, 0);
		if (ret < 0)
			return DROP_WRITE_ERROR;

		sum = csum_diff(old_saddr, 4, new_saddr, 4, sum);
	}
#endif /* DISABLE_LOOPBACK_LB */
	if (ipv4_csum_update_by_diff(ctx, l3_off, sum) < 0)
		return DROP_CSUM_L3;
	if (csum_off.offset) {
		if (csum_l4_replace(ctx, l4_off, &csum_off, 0, sum,
				    BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

l4_xlate:
	return has_l4_header ? lb_l4_xlate(ctx, nexthdr, l4_off, &csum_off,
					   key->dport, backend->port) :
			       CTX_ACT_OK;
}

#ifdef ENABLE_SESSION_AFFINITY
static __always_inline __u32
__lb4_affinity_backend_id(const struct lb4_service *svc, bool netns_cookie,
			  const union lb4_affinity_client_id *id)
{
	struct lb4_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
		.client_id = *id,
	};
	struct lb_affinity_val *val;

	val = map_lookup_elem(&LB4_AFFINITY_MAP, &key);
	if (val != NULL) {
		__u32 now = bpf_mono_now();
		struct lb_affinity_match match = {
			.rev_nat_id = svc->rev_nat_index,
			.backend_id = val->backend_id,
		};

		/* We have seconds granularity for timing values here.
		 * To ensure that session affinity timeout works properly we don't include
		 * the upper bound from the time range.
		 * Session is sticky for range [current, last_used + affinity_timeout)
		 */
		if (READ_ONCE(val->last_used) +
		    bpf_sec_to_mono(svc->affinity_timeout) <= now) {
			map_delete_elem(&LB4_AFFINITY_MAP, &key);
			return 0;
		}

		if (!map_lookup_elem(&LB_AFFINITY_MATCH_MAP, &match)) {
			map_delete_elem(&LB4_AFFINITY_MAP, &key);
			return 0;
		}

		WRITE_ONCE(val->last_used, now);
		return val->backend_id;
	}

	return 0;
}

static __always_inline __u32
lb4_affinity_backend_id_by_addr(const struct lb4_service *svc,
				union lb4_affinity_client_id *id)
{
	return __lb4_affinity_backend_id(svc, false, id);
}

static __always_inline void
__lb4_update_affinity(const struct lb4_service *svc, bool netns_cookie,
		      const union lb4_affinity_client_id *id,
		      __u32 backend_id)
{
	__u32 now = bpf_mono_now();
	struct lb4_affinity_key key = {
		.rev_nat_id = svc->rev_nat_index,
		.netns_cookie = netns_cookie,
		.client_id = *id,
	};
	struct lb_affinity_val val = {
		.backend_id = backend_id,
		.last_used = now,
	};

	map_update_elem(&LB4_AFFINITY_MAP, &key, &val, 0);
}

static __always_inline void
lb4_update_affinity_by_addr(const struct lb4_service *svc,
			    union lb4_affinity_client_id *id, __u32 backend_id)
{
	__lb4_update_affinity(svc, false, id, backend_id);
}
#endif /* ENABLE_SESSION_AFFINITY */

static __always_inline __u32
lb4_affinity_backend_id_by_netns(const struct lb4_service *svc __maybe_unused,
				 union lb4_affinity_client_id *id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	return __lb4_affinity_backend_id(svc, true, id);
#else
	return 0;
#endif
}

static __always_inline void
lb4_update_affinity_by_netns(const struct lb4_service *svc __maybe_unused,
			     union lb4_affinity_client_id *id __maybe_unused,
			     __u32 backend_id __maybe_unused)
{
#if defined(ENABLE_SESSION_AFFINITY)
	__lb4_update_affinity(svc, true, id, backend_id);
#endif
}

static __always_inline int
lb4_to_lb6(struct __ctx_buff *ctx __maybe_unused,
	   const struct iphdr *ip4 __maybe_unused,
	   int l3_off __maybe_unused)
{
#ifdef ENABLE_NAT_46X64
	union v6addr src6, dst6;

	build_v4_in_v6(&src6, ip4->saddr);
	build_v4_in_v6(&dst6, ip4->daddr);

	return ipv4_to_ipv6(ctx, l3_off, &src6, &dst6);
#else
	return DROP_NAT_46X64_DISABLED;
#endif
}

static __always_inline int lb4_local(const void *map, struct __ctx_buff *ctx,
				     bool is_fragment, int l3_off, int l4_off,
				     struct lb4_key *key,
				     struct ipv4_ct_tuple *tuple,
				     const struct lb4_service *svc,
				     struct ct_state *state,
				     bool has_l4_header,
				     const bool skip_l3_xlate,
				     __u32 *cluster_id __maybe_unused,
				     __s8 *ext_err)
{
	__u32 monitor; /* Deliberately ignored; regular CT will determine monitoring. */
	__be32 saddr = tuple->saddr;
	__u8 flags = tuple->flags;
	struct lb4_backend *backend;
	__u32 backend_id = 0;
	__be32 new_saddr = 0;
	int ret;
#ifdef ENABLE_SESSION_AFFINITY
	union lb4_affinity_client_id client_id = {
		.client_ip = saddr,
	};
#endif

	state->rev_nat_index = svc->rev_nat_index;

	ret = ct_lazy_lookup4(map, tuple, ctx, is_fragment, l4_off, has_l4_header,
			      CT_SERVICE, SCOPE_REVERSE, CT_ENTRY_SVC, state, &monitor);
	if (ret < 0)
		goto drop_err;

	switch (ret) {
	case CT_NEW:
		if (unlikely(svc->count == 0))
			goto no_service;

#ifdef ENABLE_SESSION_AFFINITY
		if (lb4_svc_is_affinity(svc)) {
			backend_id = lb4_affinity_backend_id_by_addr(svc, &client_id);
			if (backend_id != 0) {
				backend = lb4_lookup_backend(ctx, backend_id);
				if (backend == NULL)
					backend_id = 0;
			}
		}
#endif
		if (backend_id == 0) {
			/* No CT entry has been found, so select a svc endpoint */
			backend_id = lb4_select_backend_id(ctx, key, tuple, svc);
			backend = lb4_lookup_backend(ctx, backend_id);
			if (backend == NULL)
				goto no_service;
		}

		state->backend_id = backend_id;

		ret = ct_create4(map, NULL, tuple, ctx, CT_SERVICE, state, ext_err);
		/* Fail closed, if the conntrack entry create fails drop
		 * service lookup.
		 */
		if (IS_ERR(ret))
			goto drop_err;

#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		_lb_act_conn_open(state->rev_nat_index, backend->zone);
#endif

		break;
	case CT_REPLY:
		backend_id = state->backend_id;

		/* If the lookup fails it means the user deleted the backend out from
		 * underneath us. To resolve this fall back to hash. If this is a TCP
		 * session we are likely to get a TCP RST.
		 */
		backend = lb4_lookup_backend(ctx, backend_id);
#ifdef ENABLE_ACTIVE_CONNECTION_TRACKING
		if (state->closing && backend)
			_lb_act_conn_closed(svc->rev_nat_index, backend->zone);
#endif
		if (unlikely(!backend || backend->flags != BE_STATE_ACTIVE)) {
			/* Drain existing connections, but redirect new ones to only
			 * active backends.
			 */
			if (backend && !state->syn)
				break;

			if (unlikely(svc->count == 0))
				goto no_service;

			backend_id = lb4_select_backend_id(ctx, key, tuple, svc);
			backend = lb4_lookup_backend(ctx, backend_id);
			if (!backend)
				goto no_service;

			state->rev_nat_index = svc->rev_nat_index;
			ct_update_svc_entry(map, tuple, backend_id, svc->rev_nat_index);
		}

		break;
	default:
		ret = DROP_UNKNOWN_CT;
		goto drop_err;
	}

#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
	*cluster_id = backend->cluster_id;
#endif

	/* Restore flags so that SERVICE flag is only used when the service
	 * lookup happens and future lookups use EGRESS or INGRESS.
	 */
	tuple->flags = flags;
#ifdef ENABLE_SESSION_AFFINITY
	if (lb4_svc_is_affinity(svc))
		lb4_update_affinity_by_addr(svc, &client_id, backend_id);
#endif
#ifndef DISABLE_LOOPBACK_LB
	/* Special loopback case: The origin endpoint has transmitted to a
	 * service which is being translated back to the source. This would
	 * result in a packet with identical source and destination address.
	 * Linux considers such packets as martian source and will drop unless
	 * received on a loopback device. Perform NAT on the source address
	 * to make it appear from an outside address.
	 */
	if (saddr == backend->address) {
		new_saddr = IPV4_LOOPBACK;
		state->loopback = 1;
	}

	if (!state->loopback)
#endif
		tuple->daddr = backend->address;

	if (lb_skip_l4_dnat())
		return CTX_ACT_OK;

	/* CT tuple contains ports in reverse order: */
	if (likely(backend->port))
		tuple->sport = backend->port;

	return lb4_xlate(ctx, &new_saddr, &saddr,
			 tuple->nexthdr, l3_off, l4_off, key,
			 backend, has_l4_header, skip_l3_xlate);
no_service:
	ret = DROP_NO_SERVICE;
drop_err:
	tuple->flags = flags;
	return ret;
}
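/* Usage sketch (illustrative only): the canonical IPv4 forward path through
 * this file, from tuple extraction to lb4_local(). Names such as
 * example_lb4_forward() and the surrounding error handling are assumptions;
 * real callers also handle fragments, L7 redirection and NAT 46x64 cases.
 */
#if 0
static __always_inline int
example_lb4_forward(const void *ct_map, struct __ctx_buff *ctx,
		    struct iphdr *ip4, __s8 *ext_err)
{
	struct ipv4_ct_tuple tuple = {};
	struct ct_state ct_state = {};
	struct lb4_service *svc;
	struct lb4_key key = {};
	__u32 cluster_id = 0;
	int l4_off, ret;

	ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple);
	if (ret < 0)
		return ret;

	lb4_fill_key(&key, &tuple);

	svc = lb4_lookup_service(&key, false);
	if (!svc)
		return CTX_ACT_OK;	/* not a service VIP:port */

	if (!lb4_src_range_ok(svc, ip4->saddr))
		return DROP_NO_SERVICE;	/* real callers use a dedicated drop reason */

	return lb4_local(ct_map, ctx, false /* is_fragment */, ETH_HLEN, l4_off,
			 &key, &tuple, svc, &ct_state, true /* has_l4_header */,
			 false /* skip_l3_xlate */, &cluster_id, ext_err);
}
#endif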

/* lb4_ctx_store_state() stores per packet load balancing state to be picked
 * up on the continuation tail call.
 * Note that the IP headers are already xlated and the tuple is re-initialized
 * from the xlated headers before restoring state.
 * NOTE: if lb_skip_l4_dnat() this is not the case as xlate is skipped. We
 * lose the updated tuple daddr in that case.
 */
static __always_inline void lb4_ctx_store_state(struct __ctx_buff *ctx,
						const struct ct_state *state,
						__u16 proxy_port, __u32 cluster_id)
{
	ctx_store_meta(ctx, CB_PROXY_MAGIC, (__u32)proxy_port << 16);
	ctx_store_meta(ctx, CB_CT_STATE, (__u32)state->rev_nat_index << 16 |
#ifndef DISABLE_LOOPBACK_LB
		       state->loopback);
#else
		       0);
#endif
	ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id);
}

/* lb4_ctx_restore_state() restores per packet load balancing state from the
 * previous tail call.
 * tuple->flags does not need to be restored, as it will be reinitialized from
 * the packet.
 */
static __always_inline void
lb4_ctx_restore_state(struct __ctx_buff *ctx, struct ct_state *state,
		      __u16 *proxy_port, __u32 *cluster_id __maybe_unused)
{
	__u32 meta = ctx_load_and_clear_meta(ctx, CB_CT_STATE);
#ifndef DISABLE_LOOPBACK_LB
	if (meta & 1)
		state->loopback = 1;
#endif
	state->rev_nat_index = meta >> 16;

	*proxy_port = ctx_load_and_clear_meta(ctx, CB_PROXY_MAGIC) >> 16;

#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
	*cluster_id = ctx_load_and_clear_meta(ctx, CB_CLUSTER_ID_EGRESS);
#endif
}

/* Because we use tail calls and this file is included in bpf_sock.h */
#ifndef SKIP_CALLS_MAP
#ifdef SERVICE_NO_BACKEND_RESPONSE

#define ICMP_PACKET_MAX_SAMPLE_SIZE 64

static __always_inline
__wsum icmp_wsum_accumulate(void *data_start, void *data_end, int sample_len);

static __always_inline
int __tail_no_service_ipv4(struct __ctx_buff *ctx)
{
	void *data, *data_end;
	struct ethhdr *ethhdr;
	struct iphdr *ip4;
	struct icmphdr *icmphdr;
	union macaddr smac = {};
	union macaddr dmac = {};
	__be32 saddr;
	__be32 daddr;
	__u8 tos;
	__wsum csum;
	int sample_len;
	int ret;
	const int inner_offset = sizeof(struct ethhdr) + sizeof(struct iphdr) +
				 sizeof(struct icmphdr);

	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	/* copy the incoming src and dest IPs and mac addresses to the stack.
	 * the pointers will not be valid after adding headroom.
	 */

	if (eth_load_saddr(ctx, smac.addr, 0) < 0)
		return DROP_INVALID;

	if (eth_load_daddr(ctx, dmac.addr, 0) < 0)
		return DROP_INVALID;

	saddr = ip4->saddr;
	daddr = ip4->daddr;
	tos = ip4->tos;

	/* Resize to ethernet header + 64 bytes or less */
	sample_len = ctx_full_len(ctx);
	if (sample_len > ICMP_PACKET_MAX_SAMPLE_SIZE)
		sample_len = ICMP_PACKET_MAX_SAMPLE_SIZE;
	ctx_adjust_troom(ctx, sample_len + sizeof(struct ethhdr) - ctx_full_len(ctx));

	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Calculate the checksum of the ICMP sample */
	csum = icmp_wsum_accumulate(data + sizeof(struct ethhdr), data_end, sample_len);

	/* We need to insert a IPv4 and ICMP header before the original packet.
	 * Make that room.
	 */

#if __ctx_is == __ctx_xdp
	ret = xdp_adjust_head(ctx, 0 - (int)(sizeof(struct iphdr) + sizeof(struct icmphdr)));
#else
	ret = skb_adjust_room(ctx, sizeof(struct iphdr) + sizeof(struct icmphdr),
			      BPF_ADJ_ROOM_MAC, 0);
#endif

	if (ret < 0)
		return DROP_INVALID;

	/* changing size invalidates pointers, so we need to re-fetch them. */
	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Bound check all 3 headers at once. */
	if (data + inner_offset > data_end)
		return DROP_INVALID;

	/* Write reversed eth header, ready for egress */
	ethhdr = data;
	memcpy(ethhdr->h_dest, smac.addr, sizeof(smac.addr));
	memcpy(ethhdr->h_source, dmac.addr, sizeof(dmac.addr));
	ethhdr->h_proto = bpf_htons(ETH_P_IP);

	/* Write reversed ip header, ready for egress */
	ip4 = data + sizeof(struct ethhdr);
	ip4->version = 4;
	ip4->ihl = sizeof(struct iphdr) >> 2;
	ip4->tos = tos;
	ip4->tot_len = bpf_htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
				 (__u16)sample_len);
	ip4->id = 0;
	ip4->frag_off = 0;
	ip4->ttl = IPDEFTTL;
	ip4->protocol = IPPROTO_ICMP;
	ip4->check = 0;
	ip4->daddr = saddr;
	ip4->saddr = daddr;
	ip4->check = csum_fold(csum_diff(ip4, 0, ip4, sizeof(struct iphdr), 0));

	/* Write reversed icmp header */
	icmphdr = data + sizeof(struct ethhdr) + sizeof(struct iphdr);
	icmphdr->type = ICMP_DEST_UNREACH;
	icmphdr->code = ICMP_PORT_UNREACH;
	icmphdr->checksum = 0;
	icmphdr->un.gateway = 0;

	/* Add ICMP header checksum to sum of its body */
	csum += csum_diff(icmphdr, 0, icmphdr, sizeof(struct icmphdr), 0);
	icmphdr->checksum = csum_fold(csum);

	/* Redirect ICMP to the interface we received it on. */
	cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY,
			   ctx_get_ifindex(ctx));
	return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NO_SERVICE)
int tail_no_service_ipv4(struct __ctx_buff *ctx)
{
	__u32 src_sec_identity = ctx_load_meta(ctx, CB_SRC_LABEL);
	int ret;

	ret = __tail_no_service_ipv4(ctx);
	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, src_sec_identity, ret,
					      CTX_ACT_DROP, METRIC_INGRESS);

	return ret;
}
#endif /* SERVICE_NO_BACKEND_RESPONSE */
#endif /* SKIP_CALLS_MAP */

#endif /* ENABLE_IPV4 */

#ifdef ENABLE_IPV6

/* Because we use tail calls and this file is included in bpf_sock.h */
#ifndef SKIP_CALLS_MAP
#ifdef SERVICE_NO_BACKEND_RESPONSE

#define ICMPV6_PACKET_MAX_SAMPLE_SIZE 1280 - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)

static __always_inline
__wsum icmp_wsum_accumulate(void *data_start, void *data_end, int sample_len);

/* The IPv6 pseudo-header */
struct ipv6_pseudo_header_t {
	union {
		struct header {
			struct in6_addr src_ip;
			struct in6_addr dst_ip;
			__be32 top_level_length;
			__u8 zero[3];
			__u8 next_header;
		} __packed fields;
		__u16 words[20];
	};
};

static __always_inline
int __tail_no_service_ipv6(struct __ctx_buff *ctx)
{
	void *data, *data_end;
	struct ethhdr *ethhdr;
	struct ipv6hdr *ip6;
	struct icmp6hdr *icmphdr;
	struct ipv6_pseudo_header_t pseudo_header;
	union macaddr smac = {};
	union macaddr dmac = {};
	struct in6_addr saddr;
	struct in6_addr daddr;
	struct ratelimit_key rkey = {};
	/* Rate limit to 100 ICMPv6 replies per second, burstable to 1000 responses/s */
	struct ratelimit_settings settings = {
		.bucket_size = 1000,
		.tokens_per_topup = 100,
		.topup_interval_ns = NSEC_PER_SEC,
	};
	__wsum csum;
	__u64 sample_len;
	int i;
	int ret;
	const int inner_offset = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) +
				 sizeof(struct icmp6hdr);

	rkey.netdev_idx = ctx_get_ifindex(ctx);
	if (!ratelimit_check_and_take(&rkey, &settings))
		return DROP_RATE_LIMITED;

	if (!revalidate_data(ctx, &data, &data_end, &ip6))
		return DROP_INVALID;

	/* copy the incoming src and dest IPs and mac addresses to the stack.
	 * the pointers will not be valid after adding headroom.
	 */

	if (eth_load_saddr(ctx, smac.addr, 0) < 0)
		return DROP_INVALID;

	if (eth_load_daddr(ctx, dmac.addr, 0) < 0)
		return DROP_INVALID;

	memcpy(&saddr, &ip6->saddr, sizeof(struct in6_addr));
	memcpy(&daddr, &ip6->daddr, sizeof(struct in6_addr));

	/* Resize to min MTU - IPv6 hdr + ICMPv6 hdr */
	sample_len = ctx_full_len(ctx);
	if (sample_len > (__u64)ICMPV6_PACKET_MAX_SAMPLE_SIZE)
		sample_len = ICMPV6_PACKET_MAX_SAMPLE_SIZE;
	ctx_adjust_troom(ctx, sample_len + sizeof(struct ethhdr) - ctx_full_len(ctx));

	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Calculate the unfolded checksum of the ICMPv6 sample */
	csum = icmp_wsum_accumulate(data + sizeof(struct ethhdr), data_end, sample_len);

	/* We need to insert a IPv6 and ICMPv6 header before the original packet.
	 * Make that room.
	 */

#if __ctx_is == __ctx_xdp
	ret = xdp_adjust_head(ctx, 0 - (int)(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr)));
#else
	ret = skb_adjust_room(ctx, sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr),
			      BPF_ADJ_ROOM_MAC, 0);
#endif

	if (ret < 0)
		return DROP_INVALID;

	/* changing size invalidates pointers, so we need to re-fetch them. */
	data = ctx_data(ctx);
	data_end = ctx_data_end(ctx);

	/* Bound check all 3 headers at once. */
	if (data + inner_offset > data_end)
		return DROP_INVALID;

	/* Write reversed eth header, ready for egress */
	ethhdr = data;
	memcpy(ethhdr->h_dest, smac.addr, sizeof(smac.addr));
	memcpy(ethhdr->h_source, dmac.addr, sizeof(dmac.addr));
	ethhdr->h_proto = bpf_htons(ETH_P_IPV6);

	/* Write reversed ip header, ready for egress */
	ip6 = data + sizeof(struct ethhdr);
	ip6->version = 6;
	ip6->priority = 0;
	ip6->flow_lbl[0] = 0;
	ip6->flow_lbl[1] = 0;
	ip6->flow_lbl[2] = 0;
	ip6->payload_len = bpf_htons(sizeof(struct icmp6hdr) + (__u16)sample_len);
	ip6->nexthdr = IPPROTO_ICMPV6;
	ip6->hop_limit = IPDEFTTL;
	memcpy(&ip6->daddr, &saddr, sizeof(struct in6_addr));
	memcpy(&ip6->saddr, &daddr, sizeof(struct in6_addr));

	/* Write reversed icmp header */
	icmphdr = data + sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
	icmphdr->icmp6_type = ICMPV6_DEST_UNREACH;
	icmphdr->icmp6_code = ICMPV6_PORT_UNREACH;
	icmphdr->icmp6_cksum = 0;
	icmphdr->icmp6_dataun.un_data32[0] = 0;

	/* Add the ICMP header to the checksum (only type and code are non-zero) */
	csum += ((__u16)icmphdr->icmp6_code) << 8 | (__u16)icmphdr->icmp6_type;

	/* Fill pseudo header */
	memcpy(&pseudo_header.fields.src_ip, &ip6->saddr, sizeof(struct in6_addr));
	memcpy(&pseudo_header.fields.dst_ip, &ip6->daddr, sizeof(struct in6_addr));
	pseudo_header.fields.top_level_length = bpf_htonl(sizeof(struct icmp6hdr) +
							  (__u32)sample_len);
	__bpf_memzero(pseudo_header.fields.zero, sizeof(pseudo_header.fields.zero));
	pseudo_header.fields.next_header = IPPROTO_ICMPV6;

#pragma unroll
	for (i = 0; i < (int)(sizeof(pseudo_header.words) / sizeof(__u16)); i++)
		csum += pseudo_header.words[i];

	icmphdr->icmp6_cksum = csum_fold(csum);

	/* Redirect ICMP to the interface we received it on. */
	cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY,
			   ctx_get_ifindex(ctx));
	return ctx_redirect(ctx, ctx_get_ifindex(ctx), 0);
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NO_SERVICE)
int tail_no_service_ipv6(struct __ctx_buff *ctx)
{
	__u32 src_sec_identity = ctx_load_meta(ctx, CB_SRC_LABEL);
	int ret;

	ret = __tail_no_service_ipv6(ctx);
	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, src_sec_identity, ret,
					      CTX_ACT_DROP, METRIC_INGRESS);

	return ret;
}
#endif /* SERVICE_NO_BACKEND_RESPONSE */
#endif /* SKIP_CALLS_MAP */
#endif /* ENABLE_IPV6 */

#ifdef SERVICE_NO_BACKEND_RESPONSE

static __always_inline
__wsum icmp_wsum_accumulate(void *data_start, void *data_end, int sample_len)
{
	/* Unrolled loop to calculate the checksum of the ICMP sample
	 * Done manually because the compiler refuses with #pragma unroll
	 */
	__wsum wsum = 0;

#define body(i) if ((i) > sample_len) \
			return wsum; \
		if (data_start + (i) + sizeof(__u16) > data_end) { \
			if (data_start + (i) + sizeof(__u8) <= data_end)\
				wsum += *(__u8 *)(data_start + (i)); \
			return wsum; \
		} \
		wsum += *(__u16 *)(data_start + (i));

#define body4(i) body(i)\
		body(i + 2) \
		body(i + 4) \
		body(i + 6)

#define body16(i) body4(i)\
		body4(i + 8) \
		body4(i + 16) \
		body4(i + 24)

#define body128(i) body16(i)\
		body16(i + 32) \
		body16(i + 64) \
		body16(i + 96)

	body128(0)
	body128(256)
	body128(512)
	body128(768)
	body128(1024)

	return wsum;
}

#endif /* SERVICE_NO_BACKEND_RESPONSE */

/* sock_local_cookie retrieves the socket cookie for the
 * passed socket structure.
 */
static __always_inline __maybe_unused
__sock_cookie sock_local_cookie(struct bpf_sock_addr *ctx)
{
#ifdef HAVE_SOCKET_COOKIE
	/* prandom() breaks down on UDP, hence preference is on
	 * socket cookie as built-in selector. On older kernels,
	 * get_socket_cookie() provides a unique per netns cookie
	 * for the life-time of the socket. For newer kernels this
	 * is fixed to be a unique system _global_ cookie. Older
	 * kernels could have a cookie collision when two pods with
	 * different netns talk to same service backend, but that
	 * is fine since we always reverse translate to the same
	 * service IP/port pair. The only case that could happen
	 * for older kernels is that we have a cookie collision
	 * where one pod talks to the service IP/port and the
	 * other pod talks to that same specific backend IP/port
	 * directly _w/o_ going over service IP/port. Then the
	 * reverse sock addr is translated to the service IP/port.
	 * With a global socket cookie this collision cannot take
	 * place. There, only the even more unlikely case could
	 * happen where the same UDP socket talks first to the
	 * service and then to the same selected backend IP/port
	 * directly which can be considered negligible.
	 */
	return get_socket_cookie(ctx);
#else
	return ctx->protocol == IPPROTO_TCP ? get_prandom_u32() : 0;
#endif
}
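/* Illustrative sketch only (assumes ENABLE_HEALTH_CHECK): the socket cookie
 * returned by sock_local_cookie() matches the key type of the
 * LB{4,6}_HEALTH_MAP definitions at the top of this file, so a health-check
 * datapath could look up its per-socket state as shown below. The helper
 * name example_health_lookup() is hypothetical.
 */
#if 0
static __always_inline struct lb4_health *
example_health_lookup(struct bpf_sock_addr *ctx)
{
	__sock_cookie key = sock_local_cookie(ctx);

	return map_lookup_elem(&LB4_HEALTH_MAP, &key);
}
#endif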