github.com/datadog/cilium@v1.6.12/bpf/lib/lb.h (about) 1 /* 2 * Copyright (C) 2016-2019 Authors of Cilium 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 19 /** 20 * Configuration: 21 * LB_L4: Include L4 matching and rewriting capabilities 22 * LB_L3: Enable fallback to L3 LB entries 23 * 24 * Either LB_L4, LB_L3, or both need to be set to enable forward 25 * translation. Reverse translation will awlays occur regardless 26 * of the settings. 27 */ 28 29 30 #ifndef __LB_H_ 31 #define __LB_H_ 32 33 #include "csum.h" 34 #include "conntrack.h" 35 36 #define CILIUM_LB_MAP_MAX_FE 256 37 38 #ifdef ENABLE_IPV6 39 struct bpf_elf_map __section_maps LB6_REVERSE_NAT_MAP = { 40 .type = BPF_MAP_TYPE_HASH, 41 .size_key = sizeof(__u16), 42 .size_value = sizeof(struct lb6_reverse_nat), 43 .pinning = PIN_GLOBAL_NS, 44 .max_elem = CILIUM_LB_MAP_MAX_ENTRIES, 45 .flags = CONDITIONAL_PREALLOC, 46 }; 47 48 struct bpf_elf_map __section_maps LB6_SERVICES_MAP_V2 = { 49 .type = BPF_MAP_TYPE_HASH, 50 .size_key = sizeof(struct lb6_key_v2), 51 .size_value = sizeof(struct lb6_service_v2), 52 .pinning = PIN_GLOBAL_NS, 53 .max_elem = CILIUM_LB_MAP_MAX_ENTRIES, 54 .flags = CONDITIONAL_PREALLOC, 55 }; 56 57 struct bpf_elf_map __section_maps LB6_RR_SEQ_MAP_V2 = { 58 .type = BPF_MAP_TYPE_HASH, 59 .size_key = sizeof(struct lb6_key_v2), 60 .size_value = sizeof(struct lb_sequence), 61 .pinning = PIN_GLOBAL_NS, 62 .max_elem = CILIUM_LB_MAP_MAX_FE, 63 .flags = CONDITIONAL_PREALLOC, 64 }; 65 66 struct bpf_elf_map __section_maps LB6_BACKEND_MAP = { 67 .type = BPF_MAP_TYPE_HASH, 68 .size_key = sizeof(__u16), 69 .size_value = sizeof(struct lb6_backend), 70 .pinning = PIN_GLOBAL_NS, 71 .max_elem = CILIUM_LB_MAP_MAX_ENTRIES, 72 .flags = CONDITIONAL_PREALLOC, 73 }; 74 75 #endif /* ENABLE_IPV6 */ 76 77 #ifdef ENABLE_IPV4 78 struct bpf_elf_map __section_maps LB4_REVERSE_NAT_MAP = { 79 .type = BPF_MAP_TYPE_HASH, 80 .size_key = sizeof(__u16), 81 .size_value = sizeof(struct lb4_reverse_nat), 82 .pinning = PIN_GLOBAL_NS, 83 .max_elem = CILIUM_LB_MAP_MAX_ENTRIES, 84 .flags = CONDITIONAL_PREALLOC, 85 }; 86 87 struct bpf_elf_map __section_maps LB4_SERVICES_MAP_V2 = { 88 .type = BPF_MAP_TYPE_HASH, 89 .size_key = sizeof(struct lb4_key_v2), 90 .size_value = sizeof(struct lb4_service_v2), 91 .pinning = PIN_GLOBAL_NS, 92 .max_elem = CILIUM_LB_MAP_MAX_ENTRIES, 93 .flags = CONDITIONAL_PREALLOC, 94 }; 95 96 struct bpf_elf_map __section_maps LB4_RR_SEQ_MAP_V2 = { 97 .type = BPF_MAP_TYPE_HASH, 98 .size_key = sizeof(struct lb4_key_v2), 99 .size_value = sizeof(struct lb_sequence), 100 .pinning = PIN_GLOBAL_NS, 101 .max_elem = CILIUM_LB_MAP_MAX_FE, 102 .flags = CONDITIONAL_PREALLOC, 103 }; 104 105 struct bpf_elf_map __section_maps LB4_BACKEND_MAP = { 106 .type = BPF_MAP_TYPE_HASH, 107 .size_key = sizeof(__u16), 108 .size_value = sizeof(struct lb4_backend), 109 .pinning = PIN_GLOBAL_NS, 110 .max_elem = CILIUM_LB_MAP_MAX_ENTRIES, 111 .flags = CONDITIONAL_PREALLOC, 112 }; 113 114 #endif /* ENABLE_IPV4 */ 115 116 117 #define REV_NAT_F_TUPLE_SADDR 1 118 #ifdef LB_DEBUG 119 #define cilium_dbg_lb cilium_dbg 120 #else 121 #define cilium_dbg_lb(a, b, c, d) 122 #endif 123 124 static inline int lb6_select_slave(__u16 count) 125 { 126 /* Slave 0 is reserved for the master slot */ 127 return (get_prandom_u32() % count) + 1; 128 } 129 130 static inline int lb4_select_slave(__u16 count) 131 { 132 /* Slave 0 is reserved for the master slot */ 133 return (get_prandom_u32() % count) + 1; 134 } 135 136 static inline int __inline__ extract_l4_port(struct __sk_buff *skb, __u8 nexthdr, 137 int l4_off, __be16 *port) 138 { 139 int ret; 140 141 switch (nexthdr) { 142 case IPPROTO_TCP: 143 case IPPROTO_UDP: 144 /* Port offsets for UDP and TCP are the same */ 145 ret = l4_load_port(skb, l4_off + TCP_DPORT_OFF, port); 146 if (IS_ERR(ret)) 147 return ret; 148 break; 149 150 case IPPROTO_ICMPV6: 151 case IPPROTO_ICMP: 152 break; 153 154 default: 155 /* Pass unknown L4 to stack */ 156 return DROP_UNKNOWN_L4; 157 } 158 159 return 0; 160 } 161 162 static inline int __inline__ reverse_map_l4_port(struct __sk_buff *skb, __u8 nexthdr, 163 __be16 port, int l4_off, 164 struct csum_offset *csum_off) 165 { 166 switch (nexthdr) { 167 case IPPROTO_TCP: 168 case IPPROTO_UDP: 169 if (port) { 170 __be16 old_port; 171 int ret; 172 173 /* Port offsets for UDP and TCP are the same */ 174 ret = l4_load_port(skb, l4_off + TCP_SPORT_OFF, &old_port); 175 if (IS_ERR(ret)) 176 return ret; 177 178 if (port != old_port) { 179 ret = l4_modify_port(skb, l4_off, TCP_SPORT_OFF, 180 csum_off, port, old_port); 181 if (IS_ERR(ret)) 182 return ret; 183 } 184 } 185 break; 186 187 case IPPROTO_ICMPV6: 188 case IPPROTO_ICMP: 189 break; 190 191 default: 192 return DROP_UNKNOWN_L4; 193 } 194 195 return 0; 196 } 197 198 #ifdef ENABLE_IPV6 199 static inline int __inline__ __lb6_rev_nat(struct __sk_buff *skb, int l4_off, 200 struct csum_offset *csum_off, 201 struct ipv6_ct_tuple *tuple, int flags, 202 struct lb6_reverse_nat *nat) 203 { 204 union v6addr old_saddr; 205 union v6addr tmp; 206 __u8 *new_saddr; 207 __be32 sum; 208 int ret; 209 210 cilium_dbg_lb(skb, DBG_LB6_REVERSE_NAT, nat->address.p4, nat->port); 211 212 if (nat->port) { 213 ret = reverse_map_l4_port(skb, tuple->nexthdr, nat->port, l4_off, csum_off); 214 if (IS_ERR(ret)) 215 return ret; 216 } 217 218 if (flags & REV_NAT_F_TUPLE_SADDR) { 219 ipv6_addr_copy(&old_saddr, &tuple->saddr); 220 ipv6_addr_copy(&tuple->saddr, &nat->address); 221 new_saddr = tuple->saddr.addr; 222 } else { 223 if (ipv6_load_saddr(skb, ETH_HLEN, &old_saddr) < 0) 224 return DROP_INVALID; 225 226 ipv6_addr_copy(&tmp, &nat->address); 227 new_saddr = tmp.addr; 228 } 229 230 ret = ipv6_store_saddr(skb, new_saddr, ETH_HLEN); 231 if (IS_ERR(ret)) 232 return DROP_WRITE_ERROR; 233 234 sum = csum_diff(old_saddr.addr, 16, new_saddr, 16, 0); 235 if (csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0) 236 return DROP_CSUM_L4; 237 238 return 0; 239 } 240 241 /** Perform IPv6 reverse NAT based on reverse NAT index 242 * @arg skb packet 243 * @arg l4_off offset to L4 244 * @arg csum_off offset to L4 checksum field 245 * @arg csum_flags checksum flags 246 * @arg index reverse NAT index 247 * @arg tuple tuple 248 * @arg saddr_tuple If set, tuple address will be updated with new source address 249 */ 250 static inline int __inline__ lb6_rev_nat(struct __sk_buff *skb, int l4_off, 251 struct csum_offset *csum_off, __u16 index, 252 struct ipv6_ct_tuple *tuple, int flags) 253 { 254 struct lb6_reverse_nat *nat; 255 256 cilium_dbg_lb(skb, DBG_LB6_REVERSE_NAT_LOOKUP, index, 0); 257 nat = map_lookup_elem(&LB6_REVERSE_NAT_MAP, &index); 258 if (nat == NULL) 259 return 0; 260 261 return __lb6_rev_nat(skb, l4_off, csum_off, tuple, flags, nat); 262 } 263 264 /** Extract IPv6 LB key from packet 265 * @arg skb Packet 266 * @arg tuple Tuple 267 * @arg l4_off Offset to L4 header 268 * @arg key Pointer to store LB key in 269 * @arg csum_off Pointer to store L4 checksum field offset and flags 270 * @arg dir Flow direction 271 * 272 * Expects the skb to be validated for direct packet access up to L4. Fills 273 * lb6_key_v2 based on L4 nexthdr. 274 * 275 * Returns: 276 * - TC_ACT_OK on successful extraction 277 * - DROP_UNKNOWN_L4 if packet should be ignore (sent to stack) 278 * - Negative error code 279 */ 280 static inline int __inline__ lb6_extract_key_v2(struct __sk_buff *skb, 281 struct ipv6_ct_tuple *tuple, 282 int l4_off, 283 struct lb6_key_v2 *key, 284 struct csum_offset *csum_off, 285 int dir) 286 { 287 union v6addr *addr; 288 // FIXME(brb): set after adding support for different L4 protocols in LB 289 key->proto = 0; 290 addr = (dir == CT_INGRESS) ? &tuple->saddr : &tuple->daddr; 291 ipv6_addr_copy(&key->address, addr); 292 csum_l4_offset_and_flags(tuple->nexthdr, csum_off); 293 294 #ifdef LB_L4 295 return extract_l4_port(skb, tuple->nexthdr, l4_off, &key->dport); 296 #else 297 return 0; 298 #endif 299 } 300 301 static inline 302 struct lb6_service_v2 *__lb6_lookup_service_v2(struct lb6_key_v2 *key) 303 { 304 key->slave = 0; 305 #ifdef LB_L4 306 if (key->dport) { 307 struct lb6_service_v2 *svc; 308 309 /* FIXME: The verifier barks on these calls right now for some reason */ 310 /* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */ 311 svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key); 312 if (svc && svc->count != 0) 313 return svc; 314 315 key->dport = 0; 316 } 317 #endif 318 319 #ifdef LB_L3 320 if (1) { 321 struct lb6_service_v2 *svc; 322 323 /* FIXME: The verifier barks on these calls right now for some reason */ 324 /* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */ 325 svc = map_lookup_elem(&LB6_SERVICES_MAP_V2, key); 326 if (svc && svc->count != 0) 327 return svc; 328 } 329 #endif 330 return NULL; 331 } 332 333 static inline 334 struct lb6_service_v2 *lb6_lookup_service_v2(struct __sk_buff *skb, 335 struct lb6_key_v2 *key) 336 { 337 struct lb6_service_v2 *svc = __lb6_lookup_service_v2(key); 338 339 340 if (!svc) 341 cilium_dbg_lb(skb, DBG_LB6_LOOKUP_MASTER_FAIL, 0, 0); 342 343 return svc; 344 } 345 346 static inline struct lb6_backend *__lb6_lookup_backend(__u16 backend_id) 347 { 348 return map_lookup_elem(&LB6_BACKEND_MAP, &backend_id); 349 } 350 351 static inline struct lb6_backend *lb6_lookup_backend(struct __sk_buff *skb, 352 __u16 backend_id) 353 { 354 struct lb6_backend *backend; 355 356 backend = __lb6_lookup_backend(backend_id); 357 if (!backend) { 358 cilium_dbg_lb(skb, DBG_LB6_LOOKUP_BACKEND_FAIL, backend_id, 0); 359 } 360 361 return backend; 362 } 363 364 static inline 365 struct lb6_service_v2 *__lb6_lookup_slave_v2(struct lb6_key_v2 *key) 366 { 367 return map_lookup_elem(&LB6_SERVICES_MAP_V2, key); 368 } 369 370 static inline 371 struct lb6_service_v2 *lb6_lookup_slave_v2(struct __sk_buff *skb, 372 struct lb6_key_v2 *key, __u16 slave) 373 { 374 struct lb6_service_v2 *svc; 375 376 key->slave = slave; 377 cilium_dbg_lb(skb, DBG_LB6_LOOKUP_SLAVE, key->slave, key->dport); 378 svc = __lb6_lookup_slave_v2(key); 379 if (svc != NULL) { 380 return svc; 381 } 382 383 cilium_dbg_lb(skb, DBG_LB6_LOOKUP_SLAVE_V2_FAIL, key->slave, key->dport); 384 385 return NULL; 386 } 387 388 static inline int __inline__ lb6_xlate_v2(struct __sk_buff *skb, 389 union v6addr *new_dst, __u8 nexthdr, 390 int l3_off, int l4_off, 391 struct csum_offset *csum_off, 392 struct lb6_key_v2 *key, 393 struct lb6_service_v2 *svc, 394 struct lb6_backend *backend) 395 { 396 ipv6_store_daddr(skb, new_dst->addr, l3_off); 397 398 if (csum_off) { 399 __be32 sum = csum_diff(key->address.addr, 16, new_dst->addr, 16, 0); 400 if (csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0) 401 return DROP_CSUM_L4; 402 } 403 404 #ifdef LB_L4 405 if (backend->port && key->dport != backend->port && 406 (nexthdr == IPPROTO_TCP || nexthdr == IPPROTO_UDP)) { 407 __be16 tmp = backend->port; 408 int ret; 409 410 /* Port offsets for UDP and TCP are the same */ 411 ret = l4_modify_port(skb, l4_off, TCP_DPORT_OFF, csum_off, tmp, key->dport); 412 if (IS_ERR(ret)) 413 return ret; 414 } 415 #endif 416 417 return TC_ACT_OK; 418 } 419 420 static inline int __inline__ lb6_local(void *map, struct __sk_buff *skb, 421 int l3_off, int l4_off, 422 struct csum_offset *csum_off, 423 struct lb6_key_v2 *key, 424 struct ipv6_ct_tuple *tuple, 425 struct lb6_service_v2 *svc_v2, 426 struct ct_state *state) 427 { 428 __u32 monitor; // Deliberately ignored; regular CT will determine monitoring. 429 union v6addr *addr; 430 __u8 flags = tuple->flags; 431 struct lb6_backend *backend; 432 struct lb6_service_v2 *slave_svc; 433 int slave; 434 int ret; 435 436 /* See lb4_local comments re svc endpoint lookup process */ 437 438 ret = ct_lookup6(map, tuple, skb, l4_off, CT_SERVICE, state, &monitor); 439 switch(ret) { 440 case CT_NEW: 441 slave = lb6_select_slave(svc_v2->count); 442 if ((slave_svc = lb6_lookup_slave_v2(skb, key, slave)) == NULL) { 443 goto drop_no_service; 444 } 445 backend = lb6_lookup_backend(skb, slave_svc->backend_id); 446 if (backend == NULL) { 447 goto drop_no_service; 448 } 449 state->backend_id = slave_svc->backend_id; 450 state->rev_nat_index = svc_v2->rev_nat_index; 451 ret = ct_create6(map, tuple, skb, CT_SERVICE, state, false); 452 /* Fail closed, if the conntrack entry create fails drop 453 * service lookup. 454 */ 455 if (IS_ERR(ret)) { 456 goto drop_no_service; 457 } 458 goto update_state; 459 case CT_ESTABLISHED: 460 case CT_RELATED: 461 case CT_REPLY: 462 // See lb4_local comment 463 if (state->rev_nat_index == 0) { 464 state->rev_nat_index = svc_v2->rev_nat_index; 465 ct_update6_rev_nat_index(map, tuple, state); 466 } 467 break; 468 default: 469 goto drop_no_service; 470 } 471 472 // See lb4_local comment 473 if (state->rev_nat_index != svc_v2->rev_nat_index) { 474 cilium_dbg_lb(skb, DBG_LB_STALE_CT, svc_v2->rev_nat_index, 475 state->rev_nat_index); 476 slave = lb6_select_slave(svc_v2->count); 477 if (!(slave_svc = lb6_lookup_slave_v2(skb, key, slave))) { 478 goto drop_no_service; 479 } 480 state->backend_id = slave_svc->backend_id; 481 ct_update6_backend_id(map, tuple, state); 482 state->rev_nat_index = svc_v2->rev_nat_index; 483 ct_update6_rev_nat_index(map, tuple, state); 484 } 485 /* If the lookup fails it means the user deleted the backend out from 486 * underneath us. To resolve this fall back to hash. If this is a TCP 487 * session we are likely to get a TCP RST. 488 */ 489 if (!(backend = lb6_lookup_backend(skb, state->backend_id))) { 490 key->slave = 0; 491 if (!(svc_v2 = lb6_lookup_service_v2(skb, key))) { 492 goto drop_no_service; 493 } 494 slave = lb6_select_slave(svc_v2->count); 495 if (!(slave_svc = lb6_lookup_slave_v2(skb, key, slave))) { 496 goto drop_no_service; 497 } 498 backend = lb6_lookup_backend(skb, slave_svc->backend_id); 499 if (backend == NULL) { 500 goto drop_no_service; 501 } 502 state->backend_id = slave_svc->backend_id; 503 ct_update6_backend_id(map, tuple, state); 504 } 505 506 update_state: 507 /* Restore flags so that SERVICE flag is only used in used when the 508 * service lookup happens and future lookups use EGRESS or INGRESS. 509 */ 510 tuple->flags = flags; 511 ipv6_addr_copy(&tuple->daddr, &backend->address); 512 addr = &tuple->daddr; 513 state->rev_nat_index = svc_v2->rev_nat_index; 514 515 return lb6_xlate_v2(skb, addr, tuple->nexthdr, l3_off, l4_off, 516 csum_off, key, svc_v2, backend); 517 518 drop_no_service: 519 tuple->flags = flags; 520 return DROP_NO_SERVICE; 521 } 522 #endif /* ENABLE_IPV6 */ 523 524 #ifdef ENABLE_IPV4 525 static inline int __inline__ __lb4_rev_nat(struct __sk_buff *skb, int l3_off, int l4_off, 526 struct csum_offset *csum_off, 527 struct ipv4_ct_tuple *tuple, int flags, 528 struct lb4_reverse_nat *nat, 529 struct ct_state *ct_state) 530 { 531 __be32 old_sip, new_sip, sum = 0; 532 int ret; 533 534 cilium_dbg_lb(skb, DBG_LB4_REVERSE_NAT, nat->address, nat->port); 535 536 if (nat->port) { 537 ret = reverse_map_l4_port(skb, tuple->nexthdr, nat->port, l4_off, csum_off); 538 if (IS_ERR(ret)) 539 return ret; 540 } 541 542 if (flags & REV_NAT_F_TUPLE_SADDR) { 543 old_sip = tuple->saddr; 544 tuple->saddr = new_sip = nat->address; 545 } else { 546 ret = skb_load_bytes(skb, l3_off + offsetof(struct iphdr, saddr), &old_sip, 4); 547 if (IS_ERR(ret)) 548 return ret; 549 550 new_sip = nat->address; 551 } 552 553 if (ct_state->loopback) { 554 /* The packet was looped back to the sending endpoint on the 555 * forward service translation. This implies that the original 556 * source address of the packet is the source address of the 557 * current packet. We therefore need to make the current source 558 * address the new destination address */ 559 __be32 old_dip; 560 561 ret = skb_load_bytes(skb, l3_off + offsetof(struct iphdr, daddr), &old_dip, 4); 562 if (IS_ERR(ret)) 563 return ret; 564 565 cilium_dbg_lb(skb, DBG_LB4_LOOPBACK_SNAT_REV, old_dip, old_sip); 566 567 ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, daddr), &old_sip, 4, 0); 568 if (IS_ERR(ret)) 569 return DROP_WRITE_ERROR; 570 571 sum = csum_diff(&old_dip, 4, &old_sip, 4, 0); 572 573 /* Update the tuple address which is representing the destination address */ 574 tuple->saddr = old_sip; 575 } 576 577 ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, saddr), &new_sip, 4, 0); 578 if (IS_ERR(ret)) 579 return DROP_WRITE_ERROR; 580 581 sum = csum_diff(&old_sip, 4, &new_sip, 4, sum); 582 if (l3_csum_replace(skb, l3_off + offsetof(struct iphdr, check), 0, sum, 0) < 0) 583 return DROP_CSUM_L3; 584 585 if (csum_off->offset && 586 csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0) 587 return DROP_CSUM_L4; 588 589 return 0; 590 } 591 592 593 /** Perform IPv4 reverse NAT based on reverse NAT index 594 * @arg skb packet 595 * @arg l3_off offset to L3 596 * @arg l4_off offset to L4 597 * @arg csum_off offset to L4 checksum field 598 * @arg csum_flags checksum flags 599 * @arg index reverse NAT index 600 * @arg tuple tuple 601 */ 602 static inline int __inline__ lb4_rev_nat(struct __sk_buff *skb, int l3_off, int l4_off, 603 struct csum_offset *csum_off, 604 struct ct_state *ct_state, 605 struct ipv4_ct_tuple *tuple, int flags) 606 { 607 struct lb4_reverse_nat *nat; 608 609 cilium_dbg_lb(skb, DBG_LB4_REVERSE_NAT_LOOKUP, ct_state->rev_nat_index, 0); 610 nat = map_lookup_elem(&LB4_REVERSE_NAT_MAP, &ct_state->rev_nat_index); 611 if (nat == NULL) 612 return 0; 613 614 return __lb4_rev_nat(skb, l3_off, l4_off, csum_off, tuple, flags, nat, 615 ct_state); 616 } 617 618 /** Extract IPv4 LB key from packet 619 * @arg skb Packet 620 * @arg tuple Tuple 621 * @arg l4_off Offset to L4 header 622 * @arg key Pointer to store LB key in 623 * @arg csum_off Pointer to store L4 checksum field offset in 624 * @arg dir Flow direction 625 * 626 * Returns: 627 * - TC_ACT_OK on successful extraction 628 * - DROP_UNKNOWN_L4 if packet should be ignore (sent to stack) 629 * - Negative error code 630 */ 631 static inline int __inline__ lb4_extract_key_v2(struct __sk_buff *skb, 632 struct ipv4_ct_tuple *tuple, 633 int l4_off, 634 struct lb4_key_v2 *key, 635 struct csum_offset *csum_off, 636 int dir) 637 { 638 // FIXME: set after adding support for different L4 protocols in LB 639 key->proto = 0; 640 key->address = (dir == CT_INGRESS) ? tuple->saddr : tuple->daddr; 641 csum_l4_offset_and_flags(tuple->nexthdr, csum_off); 642 643 #ifdef LB_L4 644 return extract_l4_port(skb, tuple->nexthdr, l4_off, &key->dport); 645 #else 646 return 0; 647 #endif 648 } 649 650 static inline 651 struct lb4_service_v2 *__lb4_lookup_service_v2(struct lb4_key_v2 *key) 652 { 653 key->slave = 0; 654 #ifdef LB_L4 655 if (key->dport) { 656 struct lb4_service_v2 *svc; 657 658 /* FIXME: The verifier barks on these calls right now for some reason */ 659 /* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */ 660 svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key); 661 if (svc && svc->count != 0) 662 return svc; 663 664 key->dport = 0; 665 } 666 #endif 667 668 #ifdef LB_L3 669 if (1) { 670 struct lb4_service_v2 *svc; 671 672 /* FIXME: The verifier barks on these calls right now for some reason */ 673 /* cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER, key->address, key->dport); */ 674 svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key); 675 if (svc && svc->count != 0) 676 return svc; 677 } 678 #endif 679 return NULL; 680 } 681 682 static inline 683 struct lb4_service_v2 *lb4_lookup_service_v2(struct __sk_buff *skb, 684 struct lb4_key_v2 *key) 685 { 686 struct lb4_service_v2 *svc = __lb4_lookup_service_v2(key); 687 688 if (!svc) 689 cilium_dbg_lb(skb, DBG_LB4_LOOKUP_MASTER_FAIL, 0, 0); 690 691 return svc; 692 } 693 694 static inline struct lb4_backend *__lb4_lookup_backend(__u16 backend_id) 695 { 696 return map_lookup_elem(&LB4_BACKEND_MAP, &backend_id); 697 } 698 699 static inline struct lb4_backend *lb4_lookup_backend(struct __sk_buff *skb, 700 __u16 backend_id) 701 { 702 struct lb4_backend *backend; 703 704 backend = __lb4_lookup_backend(backend_id); 705 if (!backend) { 706 cilium_dbg_lb(skb, DBG_LB4_LOOKUP_BACKEND_FAIL, backend_id, 0); 707 } 708 709 return backend; 710 } 711 712 static inline 713 struct lb4_service_v2 *__lb4_lookup_slave_v2(struct lb4_key_v2 *key) 714 { 715 return map_lookup_elem(&LB4_SERVICES_MAP_V2, key); 716 } 717 718 static inline 719 struct lb4_service_v2 *lb4_lookup_slave_v2(struct __sk_buff *skb, 720 struct lb4_key_v2 *key, __u16 slave) 721 { 722 struct lb4_service_v2 *svc; 723 724 key->slave = slave; 725 cilium_dbg_lb(skb, DBG_LB4_LOOKUP_SLAVE, key->slave, key->dport); 726 svc = __lb4_lookup_slave_v2(key); 727 if (svc != NULL) { 728 return svc; 729 } 730 731 cilium_dbg_lb(skb, DBG_LB4_LOOKUP_SLAVE_V2_FAIL, key->slave, key->dport); 732 733 return NULL; 734 } 735 736 static inline int __inline__ 737 lb4_xlate_v2(struct __sk_buff *skb, __be32 *new_daddr, __be32 *new_saddr, 738 __be32 *old_saddr, __u8 nexthdr, int l3_off, int l4_off, 739 struct csum_offset *csum_off, struct lb4_key_v2 *key, 740 struct lb4_service_v2 *svc, struct lb4_backend *backend) 741 { 742 int ret; 743 __be32 sum; 744 745 ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, daddr), new_daddr, 4, 0); 746 if (ret < 0) 747 return DROP_WRITE_ERROR; 748 749 sum = csum_diff(&key->address, 4, new_daddr, 4, 0); 750 751 if (new_saddr && *new_saddr) { 752 cilium_dbg_lb(skb, DBG_LB4_LOOPBACK_SNAT, *old_saddr, *new_saddr); 753 ret = skb_store_bytes(skb, l3_off + offsetof(struct iphdr, saddr), new_saddr, 4, 0); 754 if (ret < 0) 755 return DROP_WRITE_ERROR; 756 757 sum = csum_diff(old_saddr, 4, new_saddr, 4, sum); 758 } 759 760 if (l3_csum_replace(skb, l3_off + offsetof(struct iphdr, check), 0, sum, 0) < 0) 761 return DROP_CSUM_L3; 762 763 if (csum_off->offset) { 764 if (csum_l4_replace(skb, l4_off, csum_off, 0, sum, BPF_F_PSEUDO_HDR) < 0) 765 return DROP_CSUM_L4; 766 } 767 768 #ifdef LB_L4 769 if (backend->port && key->dport != backend->port && 770 (nexthdr == IPPROTO_TCP || nexthdr == IPPROTO_UDP)) { 771 __be16 tmp = backend->port; 772 /* Port offsets for UDP and TCP are the same */ 773 ret = l4_modify_port(skb, l4_off, TCP_DPORT_OFF, csum_off, tmp, key->dport); 774 if (IS_ERR(ret)) 775 return ret; 776 } 777 #endif 778 779 return TC_ACT_OK; 780 } 781 782 static inline int __inline__ lb4_local(void *map, struct __sk_buff *skb, 783 int l3_off, int l4_off, 784 struct csum_offset *csum_off, 785 struct lb4_key_v2 *key, 786 struct ipv4_ct_tuple *tuple, 787 struct lb4_service_v2 *svc_v2, 788 struct ct_state *state, __be32 saddr) 789 { 790 __u32 monitor; // Deliberately ignored; regular CT will determine monitoring. 791 __be32 new_saddr = 0, new_daddr; 792 __u8 flags = tuple->flags; 793 struct lb4_backend *backend; 794 struct lb4_service_v2 *slave_svc; 795 int slave; 796 int ret; 797 798 ret = ct_lookup4(map, tuple, skb, l4_off, CT_SERVICE, state, &monitor); 799 switch(ret) { 800 case CT_NEW: 801 /* No CT entry has been found, so select a svc endpoint */ 802 slave = lb4_select_slave(svc_v2->count); 803 if ((slave_svc = lb4_lookup_slave_v2(skb, key, slave)) == NULL) { 804 goto drop_no_service; 805 } 806 backend = lb4_lookup_backend(skb, slave_svc->backend_id); 807 if (backend == NULL) { 808 goto drop_no_service; 809 } 810 state->backend_id = slave_svc->backend_id; 811 state->rev_nat_index = svc_v2->rev_nat_index; 812 ret = ct_create4(map, tuple, skb, CT_SERVICE, state, false); 813 /* Fail closed, if the conntrack entry create fails drop 814 * service lookup. 815 */ 816 if (IS_ERR(ret)) { 817 goto drop_no_service; 818 } 819 goto update_state; 820 case CT_ESTABLISHED: 821 case CT_RELATED: 822 case CT_REPLY: 823 // For backward-compatibility we need to update reverse NAT index 824 // in the CT_SERVICE entry for old connections, as later in the code 825 // we check whether the right backend is used. Having it set to 0 826 // would trigger a new backend selection which would in many cases 827 // would pick a different backend. 828 if (unlikely(state->rev_nat_index == 0)) { 829 state->rev_nat_index = svc_v2->rev_nat_index; 830 ct_update4_rev_nat_index(map, tuple, state); 831 } 832 break; 833 default: 834 goto drop_no_service; 835 } 836 837 // If the CT_SERVICE entry is from a non-related connection (e.g. 838 // endpoint has been removed, but its CT entries were not (it is 839 // totally possible due to the bug in DumpReliablyWithCallback)), 840 // then a wrong (=from unrelated service) backend can be selected. 841 // To avoid this, check that reverse NAT indices match. If not, 842 // select a new backend. 843 if (state->rev_nat_index != svc_v2->rev_nat_index) { 844 cilium_dbg_lb(skb, DBG_LB_STALE_CT, svc_v2->rev_nat_index, 845 state->rev_nat_index); 846 slave = lb4_select_slave(svc_v2->count); 847 if (!(slave_svc = lb4_lookup_slave_v2(skb, key, slave))) { 848 goto drop_no_service; 849 } 850 state->backend_id = slave_svc->backend_id; 851 ct_update4_backend_id(map, tuple, state); 852 state->rev_nat_index = svc_v2->rev_nat_index; 853 ct_update4_rev_nat_index(map, tuple, state); 854 } 855 /* If the lookup fails it means the user deleted the backend out from 856 * underneath us. To resolve this fall back to hash. If this is a TCP 857 * session we are likely to get a TCP RST. 858 */ 859 if (!(backend = lb4_lookup_backend(skb, state->backend_id))) { 860 key->slave = 0; 861 if (!(svc_v2 = lb4_lookup_service_v2(skb, key))) { 862 goto drop_no_service; 863 } 864 slave = lb4_select_slave(svc_v2->count); 865 if (!(slave_svc = lb4_lookup_slave_v2(skb, key, slave))) { 866 goto drop_no_service; 867 } 868 backend = lb4_lookup_backend(skb, slave_svc->backend_id); 869 if (backend == NULL) { 870 goto drop_no_service; 871 } 872 state->backend_id = slave_svc->backend_id; 873 ct_update4_backend_id(map, tuple, state); 874 } 875 876 update_state: 877 /* Restore flags so that SERVICE flag is only used in used when the 878 * service lookup happens and future lookups use EGRESS or INGRESS. 879 */ 880 tuple->flags = flags; 881 state->rev_nat_index = svc_v2->rev_nat_index; 882 state->addr = new_daddr = backend->address; 883 884 #ifndef DISABLE_LOOPBACK_LB 885 /* Special loopback case: The origin endpoint has transmitted to a 886 * service which is being translated back to the source. This would 887 * result in a packet with identical source and destination address. 888 * Linux considers such packets as martian source and will drop unless 889 * received on a loopback device. Perform NAT on the source address 890 * to make it appear from an outside address. 891 */ 892 if (saddr == backend->address) { 893 new_saddr = IPV4_LOOPBACK; 894 state->loopback = 1; 895 state->addr = new_saddr; 896 state->svc_addr = saddr; 897 } 898 #endif 899 900 if (!state->loopback) 901 tuple->daddr = backend->address; 902 903 return lb4_xlate_v2(skb, &new_daddr, &new_saddr, &saddr, 904 tuple->nexthdr, l3_off, l4_off, csum_off, key, 905 svc_v2, backend); 906 907 drop_no_service: 908 tuple->flags = flags; 909 return DROP_NO_SERVICE; 910 } 911 #endif /* ENABLE_IPV4 */ 912 913 #endif /* __LB_H_ */