/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright Authors of Cilium */

/* Shared definitions for the Cilium BPF datapath: fallback defines,
 * tail-call indices, XDP->SKB transfer flags and common constants.
 */

#pragma once

#include <bpf/ctx/ctx.h>
#include <bpf/api.h>

#include <linux/if_ether.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/socket.h>

#include "endian.h"
#include "eth.h"
#include "mono.h"
#include "config.h"
#include "tunnel.h"

#include "source_info.h"

/* Address-family and IP constants, provided here in case the kernel
 * headers in use do not define them.
 */
#ifndef AF_INET
#define AF_INET 2
#endif

#ifndef AF_INET6
#define AF_INET6 10
#endif

#ifndef IP_DF
#define IP_DF 0x4000
#endif

#ifndef EVENT_SOURCE
#define EVENT_SOURCE 0
#endif

#ifndef THIS_MTU
/* If not available, fall back to generically detected MTU instead of more
 * fine-grained per-device MTU.
 */
# define THIS_MTU MTU
#endif

#ifdef PREALLOCATE_MAPS
#define CONDITIONAL_PREALLOC 0
#else
#define CONDITIONAL_PREALLOC BPF_F_NO_PREALLOC
#endif

#if defined(ENABLE_EGRESS_GATEWAY)
#define ENABLE_EGRESS_GATEWAY_COMMON
#endif

/* HAVE_ENCAP is set whenever any feature that requires tunnel
 * encapsulation is enabled (overlay, egress gateway, Geneve DSR).
 */
#if defined(ENCAP_IFINDEX) || defined(ENABLE_EGRESS_GATEWAY_COMMON) || \
	(defined(ENABLE_DSR) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE)
#define HAVE_ENCAP	1

/* NOT_VTEP_DST is passed to an encapsulation function when the
 * destination of the tunnel is not a VTEP.
 */
#define NOT_VTEP_DST 0
#endif

/* XFER_FLAGS that get transferred from XDP to SKB */
enum {
	XFER_PKT_NO_SVC		= (1 << 0),	/* Skip upper service handling. */
	XFER_UNUSED		= (1 << 1),
	XFER_PKT_SNAT_DONE	= (1 << 2),	/* SNAT is done */
};

/* For use in ctx_get_xfer(), after XDP called ctx_move_xfer(). */
enum {
	XFER_FLAGS = 0,		/* XFER_PKT_* */
};

/* FIB errors from BPF neighbor map.
 */
#define BPF_FIB_MAP_NO_NEIGH	100

/* Tail-call program indices. NOTE(review): these values look like a stable
 * ABI between datapath objects (see the backporting caution below) —
 * renumbering requires coordinated updates.
 */
#define CILIUM_CALL_DROP_NOTIFY			1
#define CILIUM_CALL_ERROR_NOTIFY		2
/*
 * A gap in the macro numbering sequence was created by #24921.
 * It can be reused for a new macro in the future, but caution is needed when
 * backporting changes as it may conflict with older versions of the code.
 */
#define CILIUM_CALL_HANDLE_ICMP6_NS		4
#define CILIUM_CALL_SEND_ICMP6_TIME_EXCEEDED	5
#define CILIUM_CALL_ARP				6
#define CILIUM_CALL_IPV4_FROM_LXC		7
#define CILIUM_CALL_IPV4_FROM_NETDEV		CILIUM_CALL_IPV4_FROM_LXC
#define CILIUM_CALL_IPV4_FROM_OVERLAY		CILIUM_CALL_IPV4_FROM_LXC
#define CILIUM_CALL_IPV46_RFC8215		8
#define CILIUM_CALL_IPV64_RFC8215		9
#define CILIUM_CALL_IPV6_FROM_LXC		10
#define CILIUM_CALL_IPV6_FROM_NETDEV		CILIUM_CALL_IPV6_FROM_LXC
#define CILIUM_CALL_IPV6_FROM_OVERLAY		CILIUM_CALL_IPV6_FROM_LXC
#define CILIUM_CALL_IPV4_TO_LXC_POLICY_ONLY	11
#define CILIUM_CALL_IPV4_TO_HOST_POLICY_ONLY	CILIUM_CALL_IPV4_TO_LXC_POLICY_ONLY
#define CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY	12
#define CILIUM_CALL_IPV6_TO_HOST_POLICY_ONLY	CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY
#define CILIUM_CALL_IPV4_TO_ENDPOINT		13
#define CILIUM_CALL_IPV6_TO_ENDPOINT		14
#define CILIUM_CALL_IPV4_NODEPORT_NAT_EGRESS	15
#define CILIUM_CALL_IPV6_NODEPORT_NAT_EGRESS	16
#define CILIUM_CALL_IPV4_NODEPORT_REVNAT	17
#define CILIUM_CALL_IPV6_NODEPORT_REVNAT	18
#define CILIUM_CALL_IPV4_NODEPORT_NAT_FWD	19
#define CILIUM_CALL_IPV4_NODEPORT_DSR		20
#define CILIUM_CALL_IPV6_NODEPORT_DSR		21
#define CILIUM_CALL_IPV4_FROM_HOST		22
#define CILIUM_CALL_IPV6_FROM_HOST		23
#define CILIUM_CALL_IPV6_NODEPORT_NAT_FWD	24
#define CILIUM_CALL_IPV4_FROM_LXC_CONT		25
#define CILIUM_CALL_IPV6_FROM_LXC_CONT		26
#define CILIUM_CALL_IPV4_CT_INGRESS		27
#define CILIUM_CALL_IPV4_CT_INGRESS_POLICY_ONLY	28
#define CILIUM_CALL_IPV4_CT_EGRESS		29
#define CILIUM_CALL_IPV6_CT_INGRESS		30
#define CILIUM_CALL_IPV6_CT_INGRESS_POLICY_ONLY	31
#define CILIUM_CALL_IPV6_CT_EGRESS		32
#define CILIUM_CALL_SRV6_ENCAP			33
#define CILIUM_CALL_SRV6_DECAP			34
/* Unused CILIUM_CALL_SRV6_REPLY		35 */
#define CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS	36
#define CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS	37
#define CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD	38
#define CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD	39
/* Unused CILIUM_CALL_IPV4_NODEPORT_DSR_INGRESS	40
 * Unused CILIUM_CALL_IPV6_NODEPORT_DSR_INGRESS	41
 */
#define CILIUM_CALL_IPV4_INTER_CLUSTER_REVSNAT	42
#define CILIUM_CALL_IPV4_CONT_FROM_HOST		43
#define CILIUM_CALL_IPV4_CONT_FROM_NETDEV	44
#define CILIUM_CALL_IPV6_CONT_FROM_HOST		45
#define CILIUM_CALL_IPV6_CONT_FROM_NETDEV	46
#define CILIUM_CALL_IPV4_NO_SERVICE		47
#define CILIUM_CALL_IPV6_NO_SERVICE		48
#define CILIUM_CALL_MULTICAST_EP_DELIVERY	49
/* Number of tail-call slots; must stay above every index in use. */
#define CILIUM_CALL_SIZE			50

typedef __u64 mac_t;

/* IPv6 address, accessible as four 32-bit words, two 64-bit words or
 * 16 raw bytes.
 */
union v6addr {
	struct {
		__u32 p1;
		__u32 p2;
		__u32 p3;
		__u32 p4;
	};
	struct {
		__u64 d1;
		__u64 d2;
	};
	__u8 addr[16];
} __packed;

/* Determine the L3 protocol of the packet whose Ethernet header (if any)
 * starts at l2_off. Returns false when the packet is too short or the
 * ethertype is not supported; on success *proto holds the ethertype as
 * read from the header (network byte order).
 */
static __always_inline bool validate_ethertype_l2_off(struct __ctx_buff *ctx,
						      int l2_off, __u16 *proto)
{
	const __u64 tot_len = l2_off + ETH_HLEN;
	void *data_end = ctx_data_end(ctx);
	void *data = ctx_data(ctx);
	struct ethhdr *eth;

	if (ETH_HLEN == 0) {
		/* The packet is received on L2-less device. Determine L3
		 * protocol from skb->protocol.
		 */
		*proto = ctx_get_protocol(ctx);
		return true;
	}

	if (data + tot_len > data_end)
		return false;

	eth = data + l2_off;

	*proto = eth->h_proto;

	return eth_is_supported_ethertype(*proto);
}

/* Convenience wrapper for the common case of the Ethernet header at offset 0. */
static __always_inline bool validate_ethertype(struct __ctx_buff *ctx,
					       __u16 *proto)
{
	return validate_ethertype_l2_off(ctx, 0, proto);
}

/* Re-derive ctx data pointers and bounds-check l3_off + l3_len bytes,
 * optionally pulling the data in first. Returns false if the packet is
 * shorter than required; on success *data_, *data_end_ and *l3 are set.
 */
static __always_inline __maybe_unused bool
____revalidate_data_pull(struct __ctx_buff *ctx, void **data_, void **data_end_,
			 void **l3, const __u32 l3_len, const bool pull,
			 __u32 l3_off)
{
	const __u64 tot_len = l3_off + l3_len;
	void *data_end;
	void *data;

	/* Verifier workaround, do this unconditionally: invalid size of register spill. */
	if (pull)
		ctx_pull_data(ctx, tot_len);
	data_end = ctx_data_end(ctx);
	data = ctx_data(ctx);
	if (data + tot_len > data_end)
		return false;

	/* Verifier workaround: pointer arithmetic on pkt_end prohibited. */
	*data_ = data;
	*data_end_ = data_end;

	*l3 = data + l3_off;
	return true;
}

static __always_inline __maybe_unused bool
__revalidate_data_pull(struct __ctx_buff *ctx, void **data, void **data_end,
		       void **l3, const __u32 l3_off, const __u32 l3_len,
		       const bool pull)
{
	return ____revalidate_data_pull(ctx, data, data_end, l3, l3_len, pull,
					l3_off);
}

/* Map the per-family WORLD identities onto the common WORLD_ID when both
 * address families are enabled, so a single ID is carried in the tunnel.
 */
static __always_inline __u32 get_tunnel_id(__u32 identity)
{
#if defined ENABLE_IPV4 && defined ENABLE_IPV6
	if (identity == WORLD_IPV4_ID || identity == WORLD_IPV6_ID)
		return WORLD_ID;
#endif
	return identity;
}

/* Inverse of get_tunnel_id(): recover the per-family WORLD identity from
 * the tunnel ID, using the packet's ethertype to pick the family.
 */
static __always_inline __u32 get_id_from_tunnel_id(__u32 tunnel_id, __u16 proto __maybe_unused)
{
#if defined ENABLE_IPV4 && defined ENABLE_IPV6
	if (tunnel_id == WORLD_ID) {
		switch (proto) {
		case bpf_htons(ETH_P_IP):
			return WORLD_IPV4_ID;
		case bpf_htons(ETH_P_IPV6):
			return WORLD_IPV6_ID;
		}
	}
#endif
	return tunnel_id;
}

/* revalidate_data_pull() initializes the provided pointers from the ctx and
 * ensures that the data is pulled in for access. Should be used the first
 * time that the ctx data is accessed, subsequent calls can be made to
 * revalidate_data() which is cheaper.
 * Returns true if 'ctx' is long enough for an IP header of the provided type,
 * false otherwise.
 */
#define revalidate_data_pull(ctx, data, data_end, ip)			\
	__revalidate_data_pull(ctx, data, data_end, (void **)ip, ETH_HLEN, sizeof(**ip), true)

#define revalidate_data_l3_off(ctx, data, data_end, ip, l3_off)		\
	__revalidate_data_pull(ctx, data, data_end, (void **)ip, l3_off, sizeof(**ip), false)

/* revalidate_data() initializes the provided pointers from the ctx.
 * Returns true if 'ctx' is long enough for an IP header of the provided type,
 * false otherwise.
 */
#define revalidate_data(ctx, data, data_end, ip)			\
	revalidate_data_l3_off(ctx, data, data_end, ip, ETH_HLEN)

/* Macros for working with L3 cilium defined IPV6 addresses */
#define BPF_V6(dst, ...)	BPF_V6_1(dst, fetch_ipv6(__VA_ARGS__))
#define BPF_V6_1(dst, ...)	BPF_V6_2(dst, __VA_ARGS__)
#define BPF_V6_2(dst, a1, a2)		\
	({				\
		dst.d1 = a1;		\
		dst.d2 = a2;		\
	})

#define ENDPOINT_KEY_IPV4 1
#define ENDPOINT_KEY_IPV6 2

/* Structure representing an IPv4 or IPv6 address, being used as the key
 * for the endpoints map.
 */
struct endpoint_key {
	union {
		struct {
			__u32 ip4;
			__u32 pad1;
			__u32 pad2;
			__u32 pad3;
		};
		union v6addr ip6;
	};
	__u8 family;
	__u8 key;
	__u16 cluster_id;
} __packed;

struct tunnel_key {
	union {
		struct {
			__u32 ip4;
			__u32 pad1;
			__u32 pad2;
			__u32 pad3;
		};
		union v6addr ip6;
	};
	__u8 family;
	__u8 pad;
	__u16 cluster_id;
} __packed;

struct tunnel_value {
	union {
		struct {
			__u32 ip4;
			__u32 pad1;
			__u32 pad2;
			__u32 pad3;
		};
		union v6addr ip6;
	};
	__u8 family;
	__u8 key;
	__u16 pad;
} __packed;

#define ENDPOINT_F_HOST		1 /* Special endpoint representing local host */

/* Value of endpoint map */
struct endpoint_info {
	__u32		ifindex;
	__u16		unused;		/* used to be sec_label, no longer used */
	__u16		lxc_id;
	__u32		flags;
	mac_t		mac;
	mac_t		node_mac;
	__u32		sec_id;
	__u32		pad[3];
};

struct edt_id {
	__u64		id;
};

struct edt_info {
	__u64		bps;
	__u64		t_last;
	__u64		t_horizon_drop;
	__u64		pad[4];
};

struct remote_endpoint_info {
	__u32		sec_identity;
	__u32		tunnel_endpoint;
	__u16		pad;
	__u8		key;
	__u8		flag_skip_tunnel:1,
			pad2:7;
};

/*
 * Longest-prefix match map lookup only matches the number of bits from the
 * beginning of the key stored in the map indicated by the 'lpm_key' field in
 * the same stored map key, not including the 'lpm_key' field itself. Note that
 * the 'lpm_key' value passed in the lookup function argument needs to be a
 * "full prefix" (POLICY_FULL_PREFIX defined below).
 *
 * Since we need to be able to wildcard 'sec_label' independently on 'protocol'
 * and 'dport' fields, we'll need to do that explicitly with a separate lookup
 * where 'sec_label' is zero. For the 'protocol' and 'port' we can use the
 * longest-prefix match by placing them at the end of the key in this specific
 * order, as we want to be able to wildcard those fields in a specific pattern:
 * 'protocol' can only be wildcarded if dport is also fully wildcarded.
 * 'protocol' is never partially wildcarded, so it is either fully wildcarded or
 * not wildcarded at all. 'dport' can be partially wildcarded, but only when
 * 'protocol' is fully specified. This follows the logic that the destination
 * port is a property of a transport protocol and can not be specified without
 * also specifying the protocol.
380 */ 381 struct policy_key { 382 struct bpf_lpm_trie_key lpm_key; 383 __u32 sec_label; 384 __u8 egress:1, 385 pad:7; 386 __u8 protocol; /* can be wildcarded if 'dport' is fully wildcarded */ 387 __u16 dport; /* can be wildcarded with CIDR-like prefix */ 388 }; 389 390 /* POLICY_FULL_PREFIX gets full prefix length of policy_key */ 391 #define POLICY_FULL_PREFIX \ 392 (8 * (sizeof(struct policy_key) - sizeof(struct bpf_lpm_trie_key))) 393 394 struct policy_entry { 395 __be16 proxy_port; 396 __u8 deny:1, 397 wildcard_protocol:1, /* protocol is fully wildcarded */ 398 wildcard_dport:1, /* dport is fully wildcarded */ 399 pad:5; 400 __u8 auth_type; 401 __u16 pad1; 402 __u16 pad2; 403 __u64 packets; 404 __u64 bytes; 405 }; 406 407 struct auth_key { 408 __u32 local_sec_label; 409 __u32 remote_sec_label; 410 __u16 remote_node_id; /* zero for local node */ 411 __u8 auth_type; 412 __u8 pad; 413 }; 414 415 /* expiration is Unix epoch time in unit nanosecond/2^9 (ns/512). */ 416 struct auth_info { 417 __u64 expiration; 418 }; 419 420 /* 421 * Runtime configuration items for the datapath. 
422 */ 423 enum { 424 RUNTIME_CONFIG_UTIME_OFFSET = 0, /* Index to Unix time offset in 512 ns units */ 425 /* Last monotonic time, periodically set by the agent to 426 * tell the datapath its still updating maps 427 */ 428 RUNTIME_CONFIG_AGENT_LIVENESS = 1, 429 }; 430 431 struct metrics_key { 432 __u8 reason; /* 0: forwarded, >0 dropped */ 433 __u8 dir:2, /* 1: ingress 2: egress */ 434 pad:6; 435 __u16 line; /* __MAGIC_LINE__ */ 436 __u8 file; /* __MAGIC_FILE__, needs to fit __source_file_name_to_id */ 437 __u8 reserved[3]; /* reserved for future extension */ 438 }; 439 440 441 struct metrics_value { 442 __u64 count; 443 __u64 bytes; 444 }; 445 446 struct egress_gw_policy_key { 447 struct bpf_lpm_trie_key lpm_key; 448 __u32 saddr; 449 __u32 daddr; 450 }; 451 452 struct egress_gw_policy_entry { 453 __u32 egress_ip; 454 __u32 gateway_ip; 455 }; 456 457 struct srv6_vrf_key4 { 458 struct bpf_lpm_trie_key lpm; 459 __u32 src_ip; 460 __u32 dst_cidr; 461 }; 462 463 struct srv6_vrf_key6 { 464 struct bpf_lpm_trie_key lpm; 465 union v6addr src_ip; 466 union v6addr dst_cidr; 467 }; 468 469 struct srv6_policy_key4 { 470 struct bpf_lpm_trie_key lpm; 471 __u32 vrf_id; 472 __u32 dst_cidr; 473 }; 474 475 struct srv6_policy_key6 { 476 struct bpf_lpm_trie_key lpm; 477 __u32 vrf_id; 478 union v6addr dst_cidr; 479 }; 480 481 struct vtep_key { 482 __u32 vtep_ip; 483 }; 484 485 struct vtep_value { 486 __u64 vtep_mac; 487 __u32 tunnel_endpoint; 488 }; 489 490 struct node_key { 491 __u16 pad1; 492 __u8 pad2; 493 __u8 family; 494 union { 495 struct { 496 __u32 ip4; 497 __u32 pad4; 498 __u32 pad5; 499 __u32 pad6; 500 }; 501 union v6addr ip6; 502 }; 503 }; 504 505 enum { 506 POLICY_INGRESS = 1, 507 POLICY_EGRESS = 2, 508 }; 509 510 enum { 511 POLICY_MATCH_NONE = 0, 512 POLICY_MATCH_L3_ONLY = 1, 513 POLICY_MATCH_L3_L4 = 2, 514 POLICY_MATCH_L4_ONLY = 3, 515 POLICY_MATCH_ALL = 4, 516 POLICY_MATCH_L3_PROTO = 5, 517 POLICY_MATCH_PROTO_ONLY = 6, 518 }; 519 520 enum { 521 CAPTURE_INGRESS = 1, 522 
CAPTURE_EGRESS = 2, 523 }; 524 525 enum { 526 CILIUM_NOTIFY_UNSPEC, 527 CILIUM_NOTIFY_DROP, 528 CILIUM_NOTIFY_DBG_MSG, 529 CILIUM_NOTIFY_DBG_CAPTURE, 530 CILIUM_NOTIFY_TRACE, 531 CILIUM_NOTIFY_POLICY_VERDICT, 532 CILIUM_NOTIFY_CAPTURE, 533 CILIUM_NOTIFY_TRACE_SOCK, 534 }; 535 536 #define NOTIFY_COMMON_HDR \ 537 __u8 type; \ 538 __u8 subtype; \ 539 __u16 source; \ 540 __u32 hash; 541 542 #define NOTIFY_CAPTURE_HDR \ 543 NOTIFY_COMMON_HDR \ 544 __u32 len_orig; /* Length of original packet */ \ 545 __u16 len_cap; /* Length of captured bytes */ \ 546 __u16 version; /* Capture header version */ 547 548 #define __notify_common_hdr(t, s) \ 549 .type = (t), \ 550 .subtype = (s), \ 551 .source = EVENT_SOURCE, \ 552 .hash = get_hash_recalc(ctx) 553 554 #define __notify_pktcap_hdr(o, c) \ 555 .len_orig = (o), \ 556 .len_cap = (c), \ 557 .version = NOTIFY_CAPTURE_VER 558 559 /* Capture notifications version. Must be incremented when format changes. */ 560 #define NOTIFY_CAPTURE_VER 1 561 562 #ifndef TRACE_PAYLOAD_LEN 563 #define TRACE_PAYLOAD_LEN 128ULL 564 #endif 565 566 #ifndef BPF_F_PSEUDO_HDR 567 # define BPF_F_PSEUDO_HDR (1ULL << 4) 568 #endif 569 570 #define IS_ERR(x) (unlikely((x < 0) || (x == CTX_ACT_DROP))) 571 572 /* Return value to indicate that proxy redirection is required */ 573 #define POLICY_ACT_PROXY_REDIRECT (1 << 16) 574 575 /* Cilium error codes, must NOT overlap with TC return codes. 576 * These also serve as drop reasons for metrics, 577 * where reason > 0 corresponds to -(DROP_*) 578 * 579 * These are shared with pkg/monitor/api/drop.go and api/v1/flow/flow.proto. 580 * When modifying any of the below, those files should also be updated. 
581 */ 582 #define DROP_UNUSED1 -130 /* unused */ 583 #define DROP_UNUSED2 -131 /* unused */ 584 #define DROP_INVALID_SIP -132 585 #define DROP_POLICY -133 586 #define DROP_INVALID -134 587 #define DROP_CT_INVALID_HDR -135 588 #define DROP_FRAG_NEEDED -136 589 #define DROP_CT_UNKNOWN_PROTO -137 590 #define DROP_UNUSED4 -138 /* unused */ 591 #define DROP_UNKNOWN_L3 -139 592 #define DROP_MISSED_TAIL_CALL -140 593 #define DROP_WRITE_ERROR -141 594 #define DROP_UNKNOWN_L4 -142 595 #define DROP_UNKNOWN_ICMP_CODE -143 596 #define DROP_UNKNOWN_ICMP_TYPE -144 597 #define DROP_UNKNOWN_ICMP6_CODE -145 598 #define DROP_UNKNOWN_ICMP6_TYPE -146 599 #define DROP_NO_TUNNEL_KEY -147 600 #define DROP_UNUSED5 -148 /* unused */ 601 #define DROP_UNUSED6 -149 /* unused */ 602 #define DROP_UNKNOWN_TARGET -150 603 #define DROP_UNROUTABLE -151 604 #define DROP_UNUSED7 -152 /* unused */ 605 #define DROP_CSUM_L3 -153 606 #define DROP_CSUM_L4 -154 607 #define DROP_CT_CREATE_FAILED -155 608 #define DROP_INVALID_EXTHDR -156 609 #define DROP_FRAG_NOSUPPORT -157 610 #define DROP_NO_SERVICE -158 611 #define DROP_UNSUPP_SERVICE_PROTO -159 612 #define DROP_NO_TUNNEL_ENDPOINT -160 613 #define DROP_NAT_46X64_DISABLED -161 614 #define DROP_EDT_HORIZON -162 615 #define DROP_UNKNOWN_CT -163 616 #define DROP_HOST_UNREACHABLE -164 617 #define DROP_NO_CONFIG -165 618 #define DROP_UNSUPPORTED_L2 -166 619 #define DROP_NAT_NO_MAPPING -167 620 #define DROP_NAT_UNSUPP_PROTO -168 621 #define DROP_NO_FIB -169 622 #define DROP_ENCAP_PROHIBITED -170 623 #define DROP_INVALID_IDENTITY -171 624 #define DROP_UNKNOWN_SENDER -172 625 #define DROP_NAT_NOT_NEEDED -173 /* Mapped as drop code, though drop not necessary. 
*/ 626 #define DROP_IS_CLUSTER_IP -174 627 #define DROP_FRAG_NOT_FOUND -175 628 #define DROP_FORBIDDEN_ICMP6 -176 629 #define DROP_NOT_IN_SRC_RANGE -177 630 #define DROP_PROXY_LOOKUP_FAILED -178 631 #define DROP_PROXY_SET_FAILED -179 632 #define DROP_PROXY_UNKNOWN_PROTO -180 633 #define DROP_POLICY_DENY -181 634 #define DROP_VLAN_FILTERED -182 635 #define DROP_INVALID_VNI -183 636 #define DROP_INVALID_TC_BUFFER -184 637 #define DROP_NO_SID -185 638 #define DROP_MISSING_SRV6_STATE -186 /* unused */ 639 #define DROP_NAT46 -187 640 #define DROP_NAT64 -188 641 #define DROP_POLICY_AUTH_REQUIRED -189 642 #define DROP_CT_NO_MAP_FOUND -190 643 #define DROP_SNAT_NO_MAP_FOUND -191 644 #define DROP_INVALID_CLUSTER_ID -192 645 #define DROP_DSR_ENCAP_UNSUPP_PROTO -193 646 #define DROP_NO_EGRESS_GATEWAY -194 647 #define DROP_UNENCRYPTED_TRAFFIC -195 648 #define DROP_TTL_EXCEEDED -196 649 #define DROP_NO_NODE_ID -197 650 #define DROP_RATE_LIMITED -198 651 #define DROP_IGMP_HANDLED -199 652 #define DROP_IGMP_SUBSCRIBED -200 653 #define DROP_MULTICAST_HANDLED -201 654 #define DROP_HOST_NOT_READY -202 655 #define DROP_EP_NOT_READY -203 656 #define DROP_NO_EGRESS_IP -204 657 658 #define NAT_PUNT_TO_STACK DROP_NAT_NOT_NEEDED 659 #define NAT_NEEDED CTX_ACT_OK 660 #define NAT_46X64_RECIRC 100 661 662 /* Cilium metrics reasons for forwarding packets and other stats. 663 * If reason is larger than below then this is a drop reason and 664 * value corresponds to -(DROP_*), see above. 665 * 666 * These are shared with pkg/monitor/api/drop.go. 667 * When modifying any of the below, those files should also be updated. 
668 */ 669 #define REASON_FORWARDED 0 670 #define REASON_PLAINTEXT 3 671 #define REASON_DECRYPT 4 672 #define REASON_LB_NO_BACKEND_SLOT 5 673 #define REASON_LB_NO_BACKEND 6 674 #define REASON_LB_REVNAT_UPDATE 7 675 #define REASON_LB_REVNAT_STALE 8 676 #define REASON_FRAG_PACKET 9 677 #define REASON_FRAG_PACKET_UPDATE 10 678 #define REASON_MISSED_CUSTOM_CALL 11 679 680 /* Lookup scope for externalTrafficPolicy=Local */ 681 #define LB_LOOKUP_SCOPE_EXT 0 682 #define LB_LOOKUP_SCOPE_INT 1 683 684 /* Cilium metrics direction for dropping/forwarding packet */ 685 enum metric_dir { 686 METRIC_INGRESS = 1, 687 METRIC_EGRESS, 688 METRIC_SERVICE 689 } __packed; 690 691 /* Magic ctx->mark identifies packets origination and encryption status. 692 * 693 * The upper 16 bits plus lower 8 bits (e.g. mask 0XFFFF00FF) contain the 694 * packets security identity. The lower/upper halves are swapped to recover 695 * the identity. 696 * 697 * In case of MARK_MAGIC_PROXY_EGRESS_EPID the upper 16 bits carry the Endpoint 698 * ID instead of the security identity and the lower 8 bits will be zeroes. 699 * 700 * The 4 bits at 0X0F00 provide 701 * - the magic marker values which indicate whether the packet is coming from 702 * an ingress or egress proxy, a local process and its current encryption 703 * status. 704 * 705 * The 4 bits at 0xF000 provide 706 * - the key index to use for encryption when multiple keys are in-flight. 707 * In the IPsec case this becomes the SPI on the wire. 708 */ 709 #define MARK_MAGIC_HOST_MASK 0x0F00 710 #define MARK_MAGIC_PROXY_TO_WORLD 0x0800 711 #define MARK_MAGIC_PROXY_EGRESS_EPID 0x0900 /* mark carries source endpoint ID */ 712 #define MARK_MAGIC_PROXY_INGRESS 0x0A00 713 #define MARK_MAGIC_PROXY_EGRESS 0x0B00 714 #define MARK_MAGIC_HOST 0x0C00 715 #define MARK_MAGIC_DECRYPT 0x0D00 716 /* used to identify encrypted overlay traffic post decryption. 717 * therefore, SPI bit can be reused to not steal an additional magic mark value. 
718 */ 719 #define MARK_MAGIC_DECRYPTED_OVERLAY 0x1D00 720 #define MARK_MAGIC_ENCRYPT 0x0E00 721 #define MARK_MAGIC_IDENTITY 0x0F00 /* mark carries identity */ 722 #define MARK_MAGIC_TO_PROXY 0x0200 723 #define MARK_MAGIC_SNAT_DONE 0x0300 724 #define MARK_MAGIC_OVERLAY 0x0400 725 #define MARK_MAGIC_EGW_DONE 0x0500 726 727 #define MARK_MAGIC_KEY_MASK 0xFF00 728 729 730 /* The mark is used to indicate that the WireGuard tunnel device is done 731 * encrypting a packet. The MSB invades the Kubernetes mark "space" which is 732 * fine, as it's not used by K8s. See pkg/datapath/linux/linux_defaults/mark.go 733 * for more details. 734 */ 735 #define MARK_MAGIC_WG_ENCRYPTED 0x1E00 736 737 /* MARK_MAGIC_HEALTH_IPIP_DONE can overlap with MARK_MAGIC_SNAT_DONE with both 738 * being mutual exclusive given former is only under DSR. Used to push health 739 * probe packets to ipip tunnel device & to avoid looping back. 740 */ 741 #define MARK_MAGIC_HEALTH_IPIP_DONE MARK_MAGIC_SNAT_DONE 742 743 /* MARK_MAGIC_HEALTH can overlap with MARK_MAGIC_DECRYPT with both being 744 * mutual exclusive. Note, MARK_MAGIC_HEALTH is user-facing UAPI for LB! 745 */ 746 #define MARK_MAGIC_HEALTH MARK_MAGIC_DECRYPT 747 748 /* MARK_MAGIC_CLUSTER_ID shouldn't interfere with MARK_MAGIC_TO_PROXY. Lower 749 * 8bits carries cluster_id, and when extended via the 'max-connected-clusters' 750 * option, the upper 16bits may also be used for cluster_id, starting at the 751 * most significant bit. 752 */ 753 #define MARK_MAGIC_CLUSTER_ID MARK_MAGIC_TO_PROXY 754 755 /* IPv4 option used to carry service addr and port for DSR. 
756 * 757 * Copy = 1 (option is copied to each fragment) 758 * Class = 0 (control option) 759 * Number = 26 (not used according to [1]) 760 * Len = 8 (option type (1) + option len (1) + addr (4) + port (2)) 761 * 762 * [1]: https://www.iana.org/assignments/ip-parameters/ip-parameters.xhtml 763 */ 764 #define DSR_IPV4_OPT_TYPE (IPOPT_COPY | 0x1a) 765 766 /* IPv6 option type of Destination Option used to carry service IPv6 addr and 767 * port for DSR. 768 * 769 * 0b00 - "skip over this option and continue processing the header" 770 * 0 - "Option Data does not change en-route" 771 * 11011 - Unassigned [1] 772 * 773 * [1]: https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#ipv6-parameters-2 774 */ 775 #define DSR_IPV6_OPT_TYPE 0x1B 776 #define DSR_IPV6_OPT_LEN (sizeof(struct dsr_opt_v6) - 4) 777 #define DSR_IPV6_EXT_LEN ((sizeof(struct dsr_opt_v6) - 8) / 8) 778 779 /* encrypt_config is the current encryption context on the node */ 780 struct encrypt_config { 781 __u8 encrypt_key; 782 } __packed; 783 784 /** 785 * or_encrypt_key - mask and shift key into encryption format 786 */ 787 static __always_inline __u32 or_encrypt_key(__u8 key) 788 { 789 return (((__u32)key & 0x0F) << 12) | MARK_MAGIC_ENCRYPT; 790 } 791 792 /* 793 * ctx->tc_index uses 794 * 795 * cilium_host @egress 796 * bpf_host -> bpf_lxc 797 */ 798 #define TC_INDEX_F_FROM_INGRESS_PROXY 1 799 #define TC_INDEX_F_FROM_EGRESS_PROXY 2 800 #define TC_INDEX_F_SKIP_NODEPORT 4 801 #define TC_INDEX_F_UNUSED 8 802 #define TC_INDEX_F_SKIP_HOST_FIREWALL 16 803 804 #define CB_NAT_FLAGS_REVDNAT_ONLY (1 << 0) 805 806 /* 807 * For use in ctx_{load,store}_meta(), which operates on sk_buff->cb or 808 * the cilium_xdp_scratch pad. 809 * The verifier only exposes the first 5 slots in cb[], so this enum 810 * only contains 5 entries. Aliases are added to the slots to re-use 811 * them under different names in different parts of the datapath. 
812 * Take care to not clobber slots used by other functions in the same 813 * code path. 814 */ 815 /* ctx_{load,store}_meta() usage: */ 816 enum { 817 CB_SRC_LABEL, 818 #define CB_PORT CB_SRC_LABEL /* Alias, non-overlapping */ 819 #define CB_HINT CB_SRC_LABEL /* Alias, non-overlapping */ 820 #define CB_PROXY_MAGIC CB_SRC_LABEL /* Alias, non-overlapping */ 821 #define CB_ENCRYPT_MAGIC CB_SRC_LABEL /* Alias, non-overlapping */ 822 #define CB_DST_ENDPOINT_ID CB_SRC_LABEL /* Alias, non-overlapping */ 823 #define CB_SRV6_SID_1 CB_SRC_LABEL /* Alias, non-overlapping */ 824 CB_IFINDEX, 825 #define CB_NAT_46X64 CB_IFINDEX /* Alias, non-overlapping */ 826 #define CB_ADDR_V4 CB_IFINDEX /* Alias, non-overlapping */ 827 #define CB_ADDR_V6_1 CB_IFINDEX /* Alias, non-overlapping */ 828 #define CB_IPCACHE_SRC_LABEL CB_IFINDEX /* Alias, non-overlapping */ 829 #define CB_SRV6_SID_2 CB_IFINDEX /* Alias, non-overlapping */ 830 #define CB_CLUSTER_ID_EGRESS CB_IFINDEX /* Alias, non-overlapping */ 831 #define CB_HSIPC_ADDR_V4 CB_IFINDEX /* Alias, non-overlapping */ 832 #define CB_TRACED CB_IFINDEX /* Alias, non-overlapping */ 833 CB_POLICY, 834 #define CB_ADDR_V6_2 CB_POLICY /* Alias, non-overlapping */ 835 #define CB_SRV6_SID_3 CB_POLICY /* Alias, non-overlapping */ 836 #define CB_CLUSTER_ID_INGRESS CB_POLICY /* Alias, non-overlapping */ 837 #define CB_HSIPC_PORT CB_POLICY /* Alias, non-overlapping */ 838 #define CB_DSR_SRC_LABEL CB_POLICY /* Alias, non-overlapping */ 839 #define CB_NAT_FLAGS CB_POLICY /* Alias, non-overlapping */ 840 CB_3, 841 #define CB_ADDR_V6_3 CB_3 /* Alias, non-overlapping */ 842 #define CB_FROM_HOST CB_3 /* Alias, non-overlapping */ 843 #define CB_SRV6_SID_4 CB_3 /* Alias, non-overlapping */ 844 #define CB_DSR_L3_OFF CB_3 /* Alias, non-overlapping */ 845 CB_CT_STATE, 846 #define CB_ADDR_V6_4 CB_CT_STATE /* Alias, non-overlapping */ 847 #define CB_ENCRYPT_IDENTITY CB_CT_STATE /* Alias, non-overlapping, 848 * Not used by xfrm. 
849 */ 850 #define CB_ENCRYPT_DST CB_CT_STATE /* Alias, non-overlapping, 851 * Not used by xfrm. 852 * Can be removed in v1.15. 853 */ 854 #define CB_CUSTOM_CALLS CB_CT_STATE /* Alias, non-overlapping */ 855 #define CB_SRV6_VRF_ID CB_CT_STATE /* Alias, non-overlapping */ 856 #define CB_FROM_TUNNEL CB_CT_STATE /* Alias, non-overlapping */ 857 }; 858 859 /* Magic values for CB_FROM_HOST. 860 * CB_FROM_HOST overlaps with CB_NAT46_STATE, so this value must be distinct 861 * from any in enum NAT46 below! 862 */ 863 #define FROM_HOST_L7_LB 0xFACADE42 864 865 #define TUPLE_F_OUT 0 /* Outgoing flow */ 866 #define TUPLE_F_IN 1 /* Incoming flow */ 867 #define TUPLE_F_RELATED 2 /* Flow represents related packets */ 868 #define TUPLE_F_SERVICE 4 /* Flow represents packets to service */ 869 870 enum ct_dir { 871 CT_EGRESS, 872 CT_INGRESS, 873 CT_SERVICE, 874 } __packed; 875 876 #ifdef ENABLE_NODEPORT 877 #define NAT_MIN_EGRESS NODEPORT_PORT_MIN_NAT 878 #else 879 #define NAT_MIN_EGRESS EPHEMERAL_MIN 880 #endif 881 882 enum ct_status { 883 CT_NEW, 884 CT_ESTABLISHED, 885 CT_REPLY, 886 CT_RELATED, 887 } __packed; 888 889 /* Service flags (lb{4,6}_service->flags) */ 890 enum { 891 SVC_FLAG_EXTERNAL_IP = (1 << 0), /* External IPs */ 892 SVC_FLAG_NODEPORT = (1 << 1), /* NodePort service */ 893 SVC_FLAG_EXT_LOCAL_SCOPE = (1 << 2), /* externalTrafficPolicy=Local */ 894 SVC_FLAG_HOSTPORT = (1 << 3), /* hostPort forwarding */ 895 SVC_FLAG_AFFINITY = (1 << 4), /* sessionAffinity=clientIP */ 896 SVC_FLAG_LOADBALANCER = (1 << 5), /* LoadBalancer service */ 897 SVC_FLAG_ROUTABLE = (1 << 6), /* Not a surrogate/ClusterIP entry */ 898 SVC_FLAG_SOURCE_RANGE = (1 << 7), /* Check LoadBalancer source range */ 899 }; 900 901 /* Service flags (lb{4,6}_service->flags2) */ 902 enum { 903 SVC_FLAG_LOCALREDIRECT = (1 << 0), /* local redirect */ 904 SVC_FLAG_NAT_46X64 = (1 << 1), /* NAT-46/64 entry */ 905 SVC_FLAG_L7LOADBALANCER = (1 << 2), /* tproxy redirect to local l7 loadbalancer */ 906 
SVC_FLAG_LOOPBACK = (1 << 3), /* hostport with a loopback hostIP */ 907 SVC_FLAG_INT_LOCAL_SCOPE = (1 << 4), /* internalTrafficPolicy=Local */ 908 SVC_FLAG_TWO_SCOPES = (1 << 5), /* two sets of backends are used for external/internal connections */ 909 }; 910 911 /* Backend flags (lb{4,6}_backends->flags) */ 912 enum { 913 BE_STATE_ACTIVE = 0, 914 BE_STATE_TERMINATING, 915 BE_STATE_QUARANTINED, 916 BE_STATE_MAINTENANCE, 917 }; 918 919 struct ipv6_ct_tuple { 920 /* Address fields are reversed, i.e., 921 * these field names are correct for reply direction traffic. 922 */ 923 union v6addr daddr; 924 union v6addr saddr; 925 /* The order of dport+sport must not be changed! 926 * These field names are correct for original direction traffic. 927 */ 928 __be16 dport; 929 __be16 sport; 930 __u8 nexthdr; 931 __u8 flags; 932 } __packed; 933 934 struct ipv4_ct_tuple { 935 /* Address fields are reversed, i.e., 936 * these field names are correct for reply direction traffic. 937 */ 938 __be32 daddr; 939 __be32 saddr; 940 /* The order of dport+sport must not be changed! 941 * These field names are correct for original direction traffic. 942 */ 943 __be16 dport; 944 __be16 sport; 945 __u8 nexthdr; 946 __u8 flags; 947 } __packed; 948 949 struct ct_entry { 950 __u64 reserved0; /* unused since v1.16 */ 951 __u64 backend_id; 952 __u64 packets; 953 __u64 bytes; 954 __u32 lifetime; 955 __u16 rx_closing:1, 956 tx_closing:1, 957 reserved1:1, /* unused since v1.12 */ 958 lb_loopback:1, 959 seen_non_syn:1, 960 node_port:1, 961 proxy_redirect:1, /* Connection is redirected to a proxy */ 962 dsr_internal:1, /* DSR is k8s service related, cluster internal */ 963 from_l7lb:1, /* Connection is originated from an L7 LB proxy */ 964 reserved2:1, /* unused since v1.14 */ 965 from_tunnel:1, /* Connection is over tunnel */ 966 reserved3:5; 967 __u16 rev_nat_index; 968 /* In the kernel ifindex is u32, so we need to check in cilium-agent 969 * that ifindex of a NodePort device is <= MAX(u16). 
970 * Unused when HAVE_FIB_INDEX is available. 971 */ 972 __u16 ifindex; 973 974 /* *x_flags_seen represents the OR of all TCP flags seen for the 975 * transmit/receive direction of this entry. 976 */ 977 __u8 tx_flags_seen; 978 __u8 rx_flags_seen; 979 980 __u32 src_sec_id; /* Used from userspace proxies, do not change offset! */ 981 982 /* last_*x_report is a timestamp of the last time a monitor 983 * notification was sent for the transmit/receive direction. 984 */ 985 __u32 last_tx_report; 986 __u32 last_rx_report; 987 }; 988 989 struct lb6_key { 990 union v6addr address; /* Service virtual IPv6 address */ 991 __be16 dport; /* L4 port filter, if unset, all ports apply */ 992 __u16 backend_slot; /* Backend iterator, 0 indicates the svc frontend */ 993 __u8 proto; /* L4 protocol, currently not used (set to 0) */ 994 __u8 scope; /* LB_LOOKUP_SCOPE_* for externalTrafficPolicy=Local */ 995 __u8 pad[2]; 996 }; 997 998 /* See lb4_service comments */ 999 struct lb6_service { 1000 union { 1001 __u32 backend_id; /* Backend ID in lb6_backends */ 1002 __u32 affinity_timeout; /* In seconds, only for svc frontend */ 1003 __u32 l7_lb_proxy_port; /* In host byte order, only when flags2 && SVC_FLAG_L7LOADBALANCER */ 1004 }; 1005 __u16 count; 1006 __u16 rev_nat_index; 1007 __u8 flags; 1008 __u8 flags2; 1009 __u8 pad[2]; 1010 }; 1011 1012 /* See lb4_backend comments */ 1013 struct lb6_backend { 1014 union v6addr address; 1015 __be16 port; 1016 __u8 proto; 1017 __u8 flags; 1018 __u16 cluster_id; /* With this field, we can distinguish two 1019 * backends that have the same IP address, 1020 * but belong to the different cluster. 
1021 */ 1022 __u8 zone; 1023 __u8 pad; 1024 }; 1025 1026 struct lb6_health { 1027 struct lb6_backend peer; 1028 }; 1029 1030 struct lb6_reverse_nat { 1031 union v6addr address; 1032 __be16 port; 1033 } __packed; 1034 1035 struct ipv6_revnat_tuple { 1036 __sock_cookie cookie; 1037 union v6addr address; 1038 __be16 port; 1039 __u16 pad; 1040 }; 1041 1042 struct ipv6_revnat_entry { 1043 union v6addr address; 1044 __be16 port; 1045 __u16 rev_nat_index; 1046 }; 1047 1048 struct lb4_key { 1049 __be32 address; /* Service virtual IPv4 address */ 1050 __be16 dport; /* L4 port filter, if unset, all ports apply */ 1051 __u16 backend_slot; /* Backend iterator, 0 indicates the svc frontend */ 1052 __u8 proto; /* L4 protocol, currently not used (set to 0) */ 1053 __u8 scope; /* LB_LOOKUP_SCOPE_* for externalTrafficPolicy=Local */ 1054 __u8 pad[2]; 1055 }; 1056 1057 struct lb4_service { 1058 union { 1059 __u32 backend_id; /* Backend ID in lb4_backends */ 1060 __u32 affinity_timeout; /* In seconds, only for svc frontend */ 1061 __u32 l7_lb_proxy_port; /* In host byte order, only when flags2 && SVC_FLAG_L7LOADBALANCER */ 1062 }; 1063 /* For the service frontend, count denotes number of service backend 1064 * slots (otherwise zero). 1065 */ 1066 __u16 count; 1067 __u16 rev_nat_index; /* Reverse NAT ID in lb4_reverse_nat */ 1068 __u8 flags; 1069 __u8 flags2; 1070 __u8 pad[2]; 1071 }; 1072 1073 struct lb4_backend { 1074 __be32 address; /* Service endpoint IPv4 address */ 1075 __be16 port; /* L4 port filter */ 1076 __u8 proto; /* L4 protocol, currently not used (set to 0) */ 1077 __u8 flags; 1078 __u16 cluster_id; /* With this field, we can distinguish two 1079 * backends that have the same IP address, 1080 * but belong to the different cluster. 
1081 */ 1082 __u8 zone; 1083 __u8 pad; 1084 }; 1085 1086 struct lb4_health { 1087 struct lb4_backend peer; 1088 }; 1089 1090 struct lb4_reverse_nat { 1091 __be32 address; 1092 __be16 port; 1093 } __packed; 1094 1095 struct ipv4_revnat_tuple { 1096 __sock_cookie cookie; 1097 __be32 address; 1098 __be16 port; 1099 __u16 pad; 1100 }; 1101 1102 struct ipv4_revnat_entry { 1103 __be32 address; 1104 __be16 port; 1105 __u16 rev_nat_index; 1106 }; 1107 1108 union lb4_affinity_client_id { 1109 __u32 client_ip; 1110 __net_cookie client_cookie; 1111 } __packed; 1112 1113 struct lb4_affinity_key { 1114 union lb4_affinity_client_id client_id; 1115 __u16 rev_nat_id; 1116 __u8 netns_cookie:1, 1117 reserved:7; 1118 __u8 pad1; 1119 __u32 pad2; 1120 } __packed; 1121 1122 union lb6_affinity_client_id { 1123 union v6addr client_ip; 1124 __net_cookie client_cookie; 1125 } __packed; 1126 1127 struct lb6_affinity_key { 1128 union lb6_affinity_client_id client_id; 1129 __u16 rev_nat_id; 1130 __u8 netns_cookie:1, 1131 reserved:7; 1132 __u8 pad1; 1133 __u32 pad2; 1134 } __packed; 1135 1136 struct lb_affinity_val { 1137 __u64 last_used; 1138 __u32 backend_id; 1139 __u32 pad; 1140 } __packed; 1141 1142 struct lb_affinity_match { 1143 __u32 backend_id; 1144 __u16 rev_nat_id; 1145 __u16 pad; 1146 } __packed; 1147 1148 struct ct_state { 1149 __u16 rev_nat_index; 1150 #ifndef DISABLE_LOOPBACK_LB 1151 __u16 loopback:1, 1152 #else 1153 __u16 loopback_disabled:1, 1154 #endif 1155 node_port:1, 1156 dsr_internal:1, /* DSR is k8s service related, cluster internal */ 1157 syn:1, 1158 proxy_redirect:1, /* Connection is redirected to a proxy */ 1159 from_l7lb:1, /* Connection is originated from an L7 LB proxy */ 1160 reserved1:1, /* Was auth_required, not used in production anywhere */ 1161 from_tunnel:1, /* Connection is from tunnel */ 1162 closing:1, 1163 reserved:7; 1164 __u32 src_sec_id; 1165 #ifndef HAVE_FIB_IFINDEX 1166 __u16 ifindex; 1167 #endif 1168 __u32 backend_id; /* Backend ID in lb4_backends 
*/ 1169 }; 1170 1171 static __always_inline bool ct_state_is_from_l7lb(const struct ct_state *ct_state __maybe_unused) 1172 { 1173 #ifdef ENABLE_L7_LB 1174 return ct_state->from_l7lb; 1175 #else 1176 return false; 1177 #endif 1178 } 1179 1180 #define SRC_RANGE_STATIC_PREFIX(STRUCT) \ 1181 (8 * (sizeof(STRUCT) - sizeof(struct bpf_lpm_trie_key))) 1182 1183 struct lb4_src_range_key { 1184 struct bpf_lpm_trie_key lpm_key; 1185 __u16 rev_nat_id; 1186 __u16 pad; 1187 __u32 addr; 1188 }; 1189 1190 struct lb6_src_range_key { 1191 struct bpf_lpm_trie_key lpm_key; 1192 __u16 rev_nat_id; 1193 __u16 pad; 1194 union v6addr addr; 1195 }; 1196 1197 static __always_inline int redirect_ep(struct __ctx_buff *ctx __maybe_unused, 1198 int ifindex __maybe_unused, 1199 bool needs_backlog __maybe_unused, 1200 bool from_tunnel) 1201 { 1202 /* Going via CPU backlog queue (aka needs_backlog) is required 1203 * whenever we cannot do a fast ingress -> ingress switch but 1204 * instead need an ingress -> egress netns traversal or vice 1205 * versa. 1206 * 1207 * This is also the case if BPF host routing is disabled, or if 1208 * we are currently on egress which is indicated by ingress_ifindex 1209 * being 0. The latter is cleared upon skb scrubbing. 1210 * 1211 * In case of netkit, we're on the egress side and need a regular 1212 * redirect to the peer device's ifindex. In case of veth we're 1213 * on ingress and need a redirect peer to get to the target. Both 1214 * only traverse the CPU backlog queue once. In case of phys -> 1215 * Pod, the ingress_ifindex is > 0 and in both device types we 1216 * do want a redirect peer into the target Pod's netns. 1217 */ 1218 if (needs_backlog || !is_defined(ENABLE_HOST_ROUTING) || 1219 ctx_get_ingress_ifindex(ctx) == 0) { 1220 return ctx_redirect(ctx, ifindex, 0); 1221 } 1222 1223 /* When coming from overlay, we need to set packet type 1224 * to HOST as otherwise we might get dropped in IP layer. 
1225 */ 1226 if (from_tunnel) 1227 ctx_change_type(ctx, PACKET_HOST); 1228 1229 return ctx_redirect_peer(ctx, ifindex, 0); 1230 } 1231 1232 static __always_inline __u64 ctx_adjust_hroom_flags(void) 1233 { 1234 #ifdef HAVE_CSUM_LEVEL 1235 return BPF_F_ADJ_ROOM_NO_CSUM_RESET; 1236 #else 1237 return 0; 1238 #endif 1239 } 1240 1241 struct lpm_v4_key { 1242 struct bpf_lpm_trie_key lpm; 1243 __u8 addr[4]; 1244 }; 1245 1246 struct lpm_v6_key { 1247 struct bpf_lpm_trie_key lpm; 1248 __u8 addr[16]; 1249 }; 1250 1251 struct lpm_val { 1252 /* Just dummy for now. */ 1253 __u8 flags; 1254 }; 1255 1256 struct skip_lb4_key { 1257 __u64 netns_cookie; /* Source pod netns cookie */ 1258 __u32 address; /* Destination service virtual IPv4 address */ 1259 __u16 port; /* Destination service virtual layer4 port */ 1260 __u16 pad; 1261 }; 1262 1263 struct skip_lb6_key { 1264 __u64 netns_cookie; /* Source pod netns cookie */ 1265 union v6addr address; /* Destination service virtual IPv6 address */ 1266 __u32 pad; 1267 __u16 port; /* Destination service virtual layer4 port */ 1268 __u16 pad2; 1269 }; 1270 1271 /* Older kernels don't support the larger tunnel key structure and we don't 1272 * need it since we only want to retrieve the tunnel ID anyway. 1273 */ 1274 #define TUNNEL_KEY_WITHOUT_SRC_IP offsetof(struct bpf_tunnel_key, local_ipv4) 1275 1276 #include "overloadable.h"