github.com/datadog/cilium@v1.6.12/bpf/lib/nodeport.h (about) 1 /* 2 * Copyright (C) 2019 Authors of Cilium 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 19 #ifndef __NODEPORT_H_ 20 #define __NODEPORT_H_ 21 22 #include <bpf/api.h> 23 24 #include "nat.h" 25 #include "lb.h" 26 #include "conntrack.h" 27 #include "csum.h" 28 #include "encap.h" 29 30 #define CB_SRC_IDENTITY 0 31 32 /* No nodeport on cilium_host interface. */ 33 #ifdef FROM_HOST 34 # undef ENABLE_NODEPORT 35 # undef ENABLE_MASQUERADE 36 #endif 37 38 #ifdef ENABLE_NODEPORT 39 40 #ifdef ENABLE_IPV4 41 struct bpf_elf_map __section_maps NODEPORT_NEIGH4 = { 42 .type = BPF_MAP_TYPE_LRU_HASH, 43 .size_key = sizeof(__be32), // ipv4 addr 44 .size_value = sizeof(union macaddr), // hw addr 45 .pinning = PIN_GLOBAL_NS, 46 .max_elem = SNAT_MAPPING_IPV4_SIZE, 47 }; 48 #endif /* ENABLE_IPV4 */ 49 50 #ifdef ENABLE_IPV6 51 struct bpf_elf_map __section_maps NODEPORT_NEIGH6 = { 52 .type = BPF_MAP_TYPE_LRU_HASH, 53 .size_key = sizeof(union v6addr), // ipv6 addr 54 .size_value = sizeof(union macaddr), // hw addr 55 .pinning = PIN_GLOBAL_NS, 56 .max_elem = SNAT_MAPPING_IPV6_SIZE, 57 }; 58 #endif /* ENABLE_IPV6 */ 59 60 #endif /* ENABLE_NODEPORT */ 61 62 static inline void bpf_clear_nodeport(struct __sk_buff *skb) 63 { 64 #ifdef ENABLE_NODEPORT 65 skb->tc_index &= ~TC_INDEX_F_SKIP_NODEPORT; 66 #endif 67 } 68 69 #ifdef ENABLE_NODEPORT 70 static inline bool __inline__ bpf_skip_nodeport(struct __sk_buff *skb) 71 { 72 volatile __u32 tc_index = skb->tc_index; 73 skb->tc_index &= ~TC_INDEX_F_SKIP_NODEPORT; 74 return tc_index & TC_INDEX_F_SKIP_NODEPORT; 75 } 76 #endif /* ENABLE_NODEPORT */ 77 78 #ifdef ENABLE_NODEPORT 79 #ifdef ENABLE_IPV6 80 static __always_inline bool nodeport_nat_ipv6_needed(struct __sk_buff *skb, 81 union v6addr *addr, int dir) 82 { 83 void *data, *data_end; 84 struct ipv6hdr *ip6; 85 86 if (!revalidate_data(skb, &data, &data_end, &ip6)) 87 return false; 88 /* See nodeport_nat_ipv4_needed(). */ 89 if (dir == NAT_DIR_EGRESS) 90 return !ipv6_addrcmp((union v6addr *)&ip6->saddr, addr); 91 else 92 return !ipv6_addrcmp((union v6addr *)&ip6->daddr, addr); 93 return false; 94 } 95 96 #define NODEPORT_DO_NAT_IPV6(ADDR, NDIR) \ 97 ({ \ 98 struct ipv6_nat_target target = { \ 99 .min_port = NODEPORT_PORT_MIN_NAT, \ 100 .max_port = 65535, \ 101 }; \ 102 ipv6_addr_copy(&target.addr, (ADDR)); \ 103 int ____ret = nodeport_nat_ipv6_needed(skb, (ADDR), (NDIR)) ? \ 104 snat_v6_process(skb, (NDIR), &target) : TC_ACT_OK;\ 105 if (____ret == NAT_PUNT_TO_STACK) \ 106 ____ret = TC_ACT_OK; \ 107 ____ret; \ 108 }) 109 110 static __always_inline int nodeport_nat_ipv6_fwd(struct __sk_buff *skb, 111 union v6addr *addr) 112 { 113 return NODEPORT_DO_NAT_IPV6(addr, NAT_DIR_EGRESS); 114 } 115 116 static __always_inline int nodeport_nat_ipv6_rev(struct __sk_buff *skb, 117 union v6addr *addr) 118 { 119 return NODEPORT_DO_NAT_IPV6(addr, NAT_DIR_INGRESS); 120 } 121 122 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_NAT) 123 int tail_nodeport_nat_ipv6(struct __sk_buff *skb) 124 { 125 int ifindex = NATIVE_DEV_IFINDEX, ret, dir = skb->cb[CB_NAT]; 126 struct bpf_fib_lookup fib_params = {}; 127 struct ipv6_nat_target target = { 128 .min_port = NODEPORT_PORT_MIN_NAT, 129 .max_port = NODEPORT_PORT_MAX_NAT, 130 .src_from_world = true, 131 }; 132 void *data, *data_end; 133 struct ipv6hdr *ip6; 134 135 BPF_V6(target.addr, IPV6_NODEPORT); 136 #ifdef ENCAP_IFINDEX 137 if (dir == NAT_DIR_EGRESS) { 138 struct remote_endpoint_info *info; 139 union v6addr *dst; 140 141 if (!revalidate_data(skb, &data, &data_end, &ip6)) 142 return DROP_INVALID; 143 144 dst = (union v6addr *)&ip6->daddr; 145 info = ipcache_lookup6(&IPCACHE_MAP, dst, V6_CACHE_KEY_LEN); 146 if (info != NULL && info->tunnel_endpoint != 0) { 147 int ret = __encap_with_nodeid(skb, info->tunnel_endpoint, 148 SECLABEL, TRACE_PAYLOAD_LEN); 149 if (ret) 150 return ret; 151 152 BPF_V6(target.addr, ROUTER_IP); 153 ifindex = ENCAP_IFINDEX; 154 155 /* fib lookup not necessary when going over tunnel. */ 156 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) 157 return DROP_WRITE_ERROR; 158 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) 159 return DROP_WRITE_ERROR; 160 } 161 } 162 #endif 163 ret = snat_v6_process(skb, dir, &target); 164 if (IS_ERR(ret)) { 165 /* In case of no mapping, recircle back to main path. SNAT is very 166 * expensive in terms of instructions (since we don't have BPF to 167 * BPF calls as we use tail calls) and complexity, hence this is 168 * done inside a tail call here. 169 */ 170 if (dir == NAT_DIR_INGRESS) { 171 skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT; 172 ep_tail_call(skb, CILIUM_CALL_IPV6_FROM_LXC); 173 ret = DROP_MISSED_TAIL_CALL; 174 } 175 if (ret == NAT_PUNT_TO_STACK) 176 ret = TC_ACT_OK; 177 else 178 goto drop_err; 179 } 180 181 skb->mark |= MARK_MAGIC_SNAT_DONE; 182 if (dir == NAT_DIR_INGRESS) { 183 ep_tail_call(skb, CILIUM_CALL_IPV6_NODEPORT_REVNAT); 184 ret = DROP_MISSED_TAIL_CALL; 185 goto drop_err; 186 } 187 #ifdef ENCAP_IFINDEX 188 if (ifindex == ENCAP_IFINDEX) 189 goto out_send; 190 #endif 191 if (!revalidate_data(skb, &data, &data_end, &ip6)) { 192 ret = DROP_INVALID; 193 goto drop_err; 194 } 195 196 fib_params.family = AF_INET6; 197 fib_params.ifindex = ifindex; 198 ipv6_addr_copy((union v6addr *) &fib_params.ipv6_src, (union v6addr *) &ip6->saddr); 199 ipv6_addr_copy((union v6addr *) &fib_params.ipv6_dst, (union v6addr *) &ip6->daddr); 200 201 ret = fib_lookup(skb, &fib_params, sizeof(fib_params), 202 BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT); 203 if (ret != 0) { 204 ret = DROP_NO_FIB; 205 goto drop_err; 206 } 207 208 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) { 209 ret = DROP_WRITE_ERROR; 210 goto drop_err; 211 } 212 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) { 213 ret = DROP_WRITE_ERROR; 214 goto drop_err; 215 } 216 ifindex = fib_params.ifindex; 217 out_send: 218 return redirect(ifindex, 0); 219 drop_err: 220 return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, 221 dir == NAT_DIR_INGRESS ? 222 METRIC_INGRESS : METRIC_EGRESS); 223 } 224 225 /* See nodeport_lb4(). */ 226 static inline int nodeport_lb6(struct __sk_buff *skb, __u32 src_identity) 227 { 228 int ret, l3_off = ETH_HLEN, l4_off, hdrlen; 229 struct ipv6_ct_tuple tuple = {}; 230 void *data, *data_end; 231 struct ipv6hdr *ip6; 232 struct csum_offset csum_off = {}; 233 struct lb6_service_v2 *svc; 234 struct lb6_key_v2 key = {}; 235 struct ct_state ct_state_new = {}; 236 struct ct_state ct_state = {}; 237 bool backend_local; 238 __u32 monitor = 0; 239 __u16 service_port; 240 union macaddr smac; 241 242 if (!revalidate_data(skb, &data, &data_end, &ip6)) 243 return DROP_INVALID; 244 245 tuple.nexthdr = ip6->nexthdr; 246 ipv6_addr_copy(&tuple.daddr, (union v6addr *) &ip6->daddr); 247 ipv6_addr_copy(&tuple.saddr, (union v6addr *) &ip6->saddr); 248 249 hdrlen = ipv6_hdrlen(skb, l3_off, &tuple.nexthdr); 250 if (hdrlen < 0) 251 return hdrlen; 252 253 l4_off = l3_off + hdrlen; 254 255 ret = lb6_extract_key_v2(skb, &tuple, l4_off, &key, &csum_off, CT_EGRESS); 256 if (IS_ERR(ret)) { 257 if (ret == DROP_UNKNOWN_L4) 258 return TC_ACT_OK; 259 else 260 return ret; 261 } 262 263 service_port = bpf_ntohs(key.dport); 264 if (service_port < NODEPORT_PORT_MIN || 265 service_port > NODEPORT_PORT_MAX) { 266 skb->cb[CB_NAT] = NAT_DIR_INGRESS; 267 skb->cb[CB_SRC_IDENTITY] = src_identity; 268 ep_tail_call(skb, CILIUM_CALL_IPV6_NODEPORT_NAT); 269 return DROP_MISSED_TAIL_CALL; 270 } 271 272 ct_state_new.orig_dport = key.dport; 273 274 if ((svc = lb6_lookup_service_v2(skb, &key)) != NULL) { 275 ret = lb6_local(get_ct_map6(&tuple), skb, l3_off, l4_off, 276 &csum_off, &key, &tuple, svc, &ct_state_new); 277 if (IS_ERR(ret)) 278 return ret; 279 } else { 280 return TC_ACT_OK; 281 } 282 283 ret = ct_lookup6(get_ct_map6(&tuple), &tuple, skb, l4_off, CT_EGRESS, 284 &ct_state, &monitor); 285 if (ret < 0) 286 return ret; 287 if (!revalidate_data(skb, &data, &data_end, &ip6)) 288 return DROP_INVALID; 289 290 backend_local = lookup_ip6_endpoint(ip6); 291 292 switch (ret) { 293 case CT_NEW: 294 ct_state_new.src_sec_id = SECLABEL; 295 ct_state_new.node_port = 1; 296 ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_EGRESS, 297 &ct_state_new, false); 298 if (IS_ERR(ret)) 299 return ret; 300 if (backend_local) { 301 ct_flip_tuple_dir6(&tuple); 302 ct_state_new.rev_nat_index = 0; 303 ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, 304 CT_INGRESS, &ct_state_new, false); 305 if (IS_ERR(ret)) 306 return ret; 307 } 308 break; 309 310 case CT_ESTABLISHED: 311 case CT_REPLY: 312 break; 313 314 default: 315 return DROP_UNKNOWN_CT; 316 } 317 318 if (!revalidate_data(skb, &data, &data_end, &ip6)) 319 return DROP_INVALID; 320 if (eth_load_saddr(skb, &smac.addr, 0) < 0) 321 return DROP_INVALID; 322 ret = map_update_elem(&NODEPORT_NEIGH6, &ip6->saddr, &smac, 0); 323 if (ret < 0) { 324 return ret; 325 } 326 327 if (!backend_local) { 328 skb->cb[CB_NAT] = NAT_DIR_EGRESS; 329 ep_tail_call(skb, CILIUM_CALL_IPV6_NODEPORT_NAT); 330 return DROP_MISSED_TAIL_CALL; 331 } 332 333 return TC_ACT_OK; 334 } 335 336 /* See comment in tail_rev_nodeport_lb4(). */ 337 static inline int rev_nodeport_lb6(struct __sk_buff *skb, int *ifindex, 338 union macaddr *mac) 339 { 340 int ret, ret2, l3_off = ETH_HLEN, l4_off, hdrlen; 341 struct ipv6_ct_tuple tuple = {}; 342 void *data, *data_end; 343 struct ipv6hdr *ip6; 344 struct csum_offset csum_off = {}; 345 struct ct_state ct_state = {}; 346 struct bpf_fib_lookup fib_params = {}; 347 union macaddr *dmac; 348 __u32 monitor = 0; 349 350 if (!revalidate_data(skb, &data, &data_end, &ip6)) 351 return DROP_INVALID; 352 353 tuple.nexthdr = ip6->nexthdr; 354 ipv6_addr_copy(&tuple.daddr, (union v6addr *) &ip6->daddr); 355 ipv6_addr_copy(&tuple.saddr, (union v6addr *) &ip6->saddr); 356 357 hdrlen = ipv6_hdrlen(skb, l3_off, &tuple.nexthdr); 358 if (hdrlen < 0) 359 return hdrlen; 360 361 l4_off = l3_off + hdrlen; 362 csum_l4_offset_and_flags(tuple.nexthdr, &csum_off); 363 364 ret = ct_lookup6(get_ct_map6(&tuple), &tuple, skb, l4_off, CT_INGRESS, &ct_state, 365 &monitor); 366 367 if (ret == CT_REPLY && ct_state.node_port == 1 && ct_state.rev_nat_index != 0) { 368 ret2 = lb6_rev_nat(skb, l4_off, &csum_off, ct_state.rev_nat_index, 369 &tuple, REV_NAT_F_TUPLE_SADDR); 370 if (IS_ERR(ret2)) 371 return ret2; 372 373 if (!revalidate_data(skb, &data, &data_end, &ip6)) 374 return DROP_INVALID; 375 376 skb->mark |= MARK_MAGIC_SNAT_DONE; 377 #ifdef ENCAP_IFINDEX 378 { 379 union v6addr *dst = (union v6addr *)&ip6->daddr; 380 struct remote_endpoint_info *info; 381 382 info = ipcache_lookup6(&IPCACHE_MAP, dst, V6_CACHE_KEY_LEN); 383 if (info != NULL && info->tunnel_endpoint != 0) { 384 int ret = __encap_with_nodeid(skb, info->tunnel_endpoint, 385 SECLABEL, TRACE_PAYLOAD_LEN); 386 if (ret) 387 return ret; 388 389 *ifindex = ENCAP_IFINDEX; 390 391 /* fib lookup not necessary when going over tunnel. */ 392 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) 393 return DROP_WRITE_ERROR; 394 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) 395 return DROP_WRITE_ERROR; 396 397 return TC_ACT_OK; 398 } 399 } 400 #endif 401 402 dmac = map_lookup_elem(&NODEPORT_NEIGH6, &tuple.daddr); 403 if (dmac) { 404 if (eth_store_daddr(skb, &dmac->addr, 0) < 0) 405 return DROP_WRITE_ERROR; 406 if (eth_store_saddr(skb, &mac->addr, 0) < 0) 407 return DROP_WRITE_ERROR; 408 } else { 409 fib_params.family = AF_INET6; 410 fib_params.ifindex = *ifindex; 411 412 ipv6_addr_copy((union v6addr *) &fib_params.ipv6_src, &tuple.saddr); 413 ipv6_addr_copy((union v6addr *) &fib_params.ipv6_dst, &tuple.daddr); 414 415 int rc = fib_lookup(skb, &fib_params, sizeof(fib_params), 416 BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT); 417 if (rc != 0) 418 return DROP_NO_FIB; 419 420 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) 421 return DROP_WRITE_ERROR; 422 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) 423 return DROP_WRITE_ERROR; 424 } 425 } else { 426 if (!(skb->tc_index & TC_INDEX_F_SKIP_RECIRCULATION)) { 427 skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT; 428 ep_tail_call(skb, CILIUM_CALL_IPV6_FROM_LXC); 429 return DROP_MISSED_TAIL_CALL; 430 } 431 } 432 433 return TC_ACT_OK; 434 } 435 436 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_REVNAT) 437 int tail_rev_nodeport_lb6(struct __sk_buff *skb) 438 { 439 int ifindex = NATIVE_DEV_IFINDEX; 440 union macaddr mac = NATIVE_DEV_MAC; 441 int ret = 0; 442 443 ret = rev_nodeport_lb6(skb, &ifindex, &mac); 444 if (IS_ERR(ret)) 445 return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, METRIC_EGRESS); 446 return redirect(ifindex, 0); 447 } 448 #endif /* ENABLE_IPV6 */ 449 450 #ifdef ENABLE_IPV4 451 static __always_inline bool nodeport_nat_ipv4_needed(struct __sk_buff *skb, 452 __be32 addr, int dir) 453 { 454 void *data, *data_end; 455 struct iphdr *ip4; 456 457 if (!revalidate_data(skb, &data, &data_end, &ip4)) 458 return false; 459 /* Basic minimum is to only NAT when there is a potential of 460 * overlapping tuples, e.g. applications in hostns reusing 461 * source IPs we SNAT in node-port. 462 */ 463 if (dir == NAT_DIR_EGRESS) 464 return ip4->saddr == addr; 465 else 466 return ip4->daddr == addr; 467 return false; 468 } 469 470 #define NODEPORT_DO_NAT_IPV4(ADDR, NDIR) \ 471 ({ \ 472 struct ipv4_nat_target target = { \ 473 .min_port = NODEPORT_PORT_MIN_NAT, \ 474 .max_port = 65535, \ 475 .addr = (ADDR), \ 476 }; \ 477 int ____ret = nodeport_nat_ipv4_needed(skb, (ADDR), (NDIR)) ? \ 478 snat_v4_process(skb, (NDIR), &target) : TC_ACT_OK;\ 479 if (____ret == NAT_PUNT_TO_STACK) \ 480 ____ret = TC_ACT_OK; \ 481 ____ret; \ 482 }) 483 484 static __always_inline int nodeport_nat_ipv4_fwd(struct __sk_buff *skb, 485 const __be32 addr) 486 { 487 return NODEPORT_DO_NAT_IPV4(addr, NAT_DIR_EGRESS); 488 } 489 490 static __always_inline int nodeport_nat_ipv4_rev(struct __sk_buff *skb, 491 const __be32 addr) 492 { 493 return NODEPORT_DO_NAT_IPV4(addr, NAT_DIR_INGRESS); 494 } 495 496 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_NAT) 497 int tail_nodeport_nat_ipv4(struct __sk_buff *skb) 498 { 499 int ifindex = NATIVE_DEV_IFINDEX, ret, dir = skb->cb[CB_NAT]; 500 struct bpf_fib_lookup fib_params = {}; 501 struct ipv4_nat_target target = { 502 .min_port = NODEPORT_PORT_MIN_NAT, 503 .max_port = NODEPORT_PORT_MAX_NAT, 504 .src_from_world = true, 505 }; 506 void *data, *data_end; 507 struct iphdr *ip4; 508 509 target.addr = IPV4_NODEPORT; 510 #ifdef ENCAP_IFINDEX 511 if (dir == NAT_DIR_EGRESS) { 512 struct remote_endpoint_info *info; 513 514 if (!revalidate_data(skb, &data, &data_end, &ip4)) 515 return DROP_INVALID; 516 517 info = ipcache_lookup4(&IPCACHE_MAP, ip4->daddr, V4_CACHE_KEY_LEN); 518 if (info != NULL && info->tunnel_endpoint != 0) { 519 int ret = __encap_with_nodeid(skb, info->tunnel_endpoint, 520 SECLABEL, TRACE_PAYLOAD_LEN); 521 if (ret) 522 return ret; 523 524 target.addr = IPV4_GATEWAY; 525 ifindex = ENCAP_IFINDEX; 526 527 /* fib lookup not necessary when going over tunnel. */ 528 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) 529 return DROP_WRITE_ERROR; 530 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) 531 return DROP_WRITE_ERROR; 532 } 533 } 534 #endif 535 ret = snat_v4_process(skb, dir, &target); 536 if (IS_ERR(ret)) { 537 /* In case of no mapping, recircle back to main path. SNAT is very 538 * expensive in terms of instructions (since we don't have BPF to 539 * BPF calls as we use tail calls) and complexity, hence this is 540 * done inside a tail call here. 541 */ 542 if (dir == NAT_DIR_INGRESS) { 543 skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT; 544 ep_tail_call(skb, CILIUM_CALL_IPV4_FROM_LXC); 545 ret = DROP_MISSED_TAIL_CALL; 546 } 547 if (ret == NAT_PUNT_TO_STACK) 548 ret = TC_ACT_OK; 549 else 550 goto drop_err; 551 } 552 553 skb->mark |= MARK_MAGIC_SNAT_DONE; 554 if (dir == NAT_DIR_INGRESS) { 555 ep_tail_call(skb, CILIUM_CALL_IPV4_NODEPORT_REVNAT); 556 ret = DROP_MISSED_TAIL_CALL; 557 goto drop_err; 558 } 559 #ifdef ENCAP_IFINDEX 560 if (ifindex == ENCAP_IFINDEX) 561 goto out_send; 562 #endif 563 if (!revalidate_data(skb, &data, &data_end, &ip4)) { 564 ret = DROP_INVALID; 565 goto drop_err; 566 } 567 568 fib_params.family = AF_INET; 569 fib_params.ifindex = ifindex; 570 fib_params.ipv4_src = ip4->saddr; 571 fib_params.ipv4_dst = ip4->daddr; 572 573 ret = fib_lookup(skb, &fib_params, sizeof(fib_params), 574 BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT); 575 if (ret != 0) { 576 ret = DROP_NO_FIB; 577 goto drop_err; 578 } 579 580 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) { 581 ret = DROP_WRITE_ERROR; 582 goto drop_err; 583 } 584 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) { 585 ret = DROP_WRITE_ERROR; 586 goto drop_err; 587 } 588 ifindex = fib_params.ifindex; 589 out_send: 590 return redirect(ifindex, 0); 591 drop_err: 592 return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, 593 dir == NAT_DIR_INGRESS ? 594 METRIC_INGRESS : METRIC_EGRESS); 595 } 596 597 /* Main node-port entry point for host-external ingressing node-port traffic 598 * which handles the case of: i) backend is local EP, ii) backend is remote EP, 599 * iii) reply from remote backend EP. 600 */ 601 static inline int nodeport_lb4(struct __sk_buff *skb, __u32 src_identity) 602 { 603 struct ipv4_ct_tuple tuple = {}; 604 void *data, *data_end; 605 struct iphdr *ip4; 606 int ret, l3_off = ETH_HLEN, l4_off; 607 struct csum_offset csum_off = {}; 608 struct lb4_service_v2 *svc; 609 struct lb4_key_v2 key = {}; 610 struct ct_state ct_state_new = {}; 611 struct ct_state ct_state = {}; 612 bool backend_local; 613 __u32 monitor = 0; 614 __u16 service_port; 615 union macaddr smac; 616 617 if (!revalidate_data(skb, &data, &data_end, &ip4)) 618 return DROP_INVALID; 619 620 tuple.nexthdr = ip4->protocol; 621 tuple.daddr = ip4->daddr; 622 tuple.saddr = ip4->saddr; 623 624 l4_off = l3_off + ipv4_hdrlen(ip4); 625 626 ret = lb4_extract_key_v2(skb, &tuple, l4_off, &key, &csum_off, CT_EGRESS); 627 if (IS_ERR(ret)) { 628 if (ret == DROP_UNKNOWN_L4) 629 return TC_ACT_OK; 630 else 631 return ret; 632 } 633 634 service_port = bpf_ntohs(key.dport); 635 if (service_port < NODEPORT_PORT_MIN || 636 service_port > NODEPORT_PORT_MAX) { 637 skb->cb[CB_NAT] = NAT_DIR_INGRESS; 638 skb->cb[CB_SRC_IDENTITY] = src_identity; 639 ep_tail_call(skb, CILIUM_CALL_IPV4_NODEPORT_NAT); 640 return DROP_MISSED_TAIL_CALL; 641 } 642 643 ct_state_new.orig_dport = key.dport; 644 645 if ((svc = lb4_lookup_service_v2(skb, &key)) != NULL) { 646 ret = lb4_local(get_ct_map4(&tuple), skb, l3_off, l4_off, &csum_off, 647 &key, &tuple, svc, &ct_state_new, ip4->saddr); 648 if (IS_ERR(ret)) 649 return ret; 650 } else { 651 return TC_ACT_OK; 652 } 653 654 ret = ct_lookup4(get_ct_map4(&tuple), &tuple, skb, l4_off, CT_EGRESS, 655 &ct_state, &monitor); 656 if (ret < 0) 657 return ret; 658 if (!revalidate_data(skb, &data, &data_end, &ip4)) 659 return DROP_INVALID; 660 661 backend_local = lookup_ip4_endpoint(ip4); 662 663 switch (ret) { 664 case CT_NEW: 665 ct_state_new.src_sec_id = SECLABEL; 666 ct_state_new.node_port = 1; 667 ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_EGRESS, 668 &ct_state_new, false); 669 if (IS_ERR(ret)) 670 return ret; 671 if (backend_local) { 672 ct_flip_tuple_dir4(&tuple); 673 ct_state_new.rev_nat_index = 0; 674 ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, 675 CT_INGRESS, &ct_state_new, false); 676 if (IS_ERR(ret)) 677 return ret; 678 } 679 break; 680 681 case CT_ESTABLISHED: 682 case CT_REPLY: 683 break; 684 685 default: 686 return DROP_UNKNOWN_CT; 687 } 688 689 if (!revalidate_data(skb, &data, &data_end, &ip4)) 690 return DROP_INVALID; 691 if (eth_load_saddr(skb, &smac.addr, 0) < 0) 692 return DROP_INVALID; 693 ret = map_update_elem(&NODEPORT_NEIGH4, &ip4->saddr, &smac, 0); 694 if (ret < 0) { 695 return ret; 696 } 697 698 if (!backend_local) { 699 skb->cb[CB_NAT] = NAT_DIR_EGRESS; 700 ep_tail_call(skb, CILIUM_CALL_IPV4_NODEPORT_NAT); 701 return DROP_MISSED_TAIL_CALL; 702 } 703 704 return TC_ACT_OK; 705 } 706 707 /* Reverse NAT handling of node-port traffic for the case where the 708 * backend i) was a local EP and bpf_lxc redirected to us, ii) was 709 * a remote backend and we got here after reverse SNAT from the 710 * tail_nodeport_nat_ipv4(). 711 * 712 * CILIUM_CALL_IPV{4,6}_NODEPORT_REVNAT is plugged into CILIUM_MAP_CALLS 713 * of the bpf_netdev, bpf_overlay and of the bpf_lxc. 714 */ 715 static inline int rev_nodeport_lb4(struct __sk_buff *skb, int *ifindex, 716 union macaddr *mac) 717 { 718 struct ipv4_ct_tuple tuple = {}; 719 void *data, *data_end; 720 struct iphdr *ip4; 721 struct csum_offset csum_off = {}; 722 int ret, ret2, l3_off = ETH_HLEN, l4_off; 723 struct ct_state ct_state = {}; 724 struct bpf_fib_lookup fib_params = {}; 725 union macaddr *dmac; 726 __u32 monitor = 0; 727 728 if (!revalidate_data(skb, &data, &data_end, &ip4)) 729 return DROP_INVALID; 730 731 tuple.nexthdr = ip4->protocol; 732 tuple.daddr = ip4->daddr; 733 tuple.saddr = ip4->saddr; 734 735 l4_off = l3_off + ipv4_hdrlen(ip4); 736 csum_l4_offset_and_flags(tuple.nexthdr, &csum_off); 737 738 ret = ct_lookup4(get_ct_map4(&tuple), &tuple, skb, l4_off, CT_INGRESS, &ct_state, 739 &monitor); 740 741 if (ret == CT_REPLY && ct_state.node_port == 1 && ct_state.rev_nat_index != 0) { 742 ret2 = lb4_rev_nat(skb, l3_off, l4_off, &csum_off, 743 &ct_state, &tuple, 744 REV_NAT_F_TUPLE_SADDR); 745 if (IS_ERR(ret2)) 746 return ret2; 747 748 if (!revalidate_data(skb, &data, &data_end, &ip4)) 749 return DROP_INVALID; 750 751 skb->mark |= MARK_MAGIC_SNAT_DONE; 752 #ifdef ENCAP_IFINDEX 753 { 754 struct remote_endpoint_info *info; 755 756 info = ipcache_lookup4(&IPCACHE_MAP, ip4->daddr, V4_CACHE_KEY_LEN); 757 if (info != NULL && info->tunnel_endpoint != 0) { 758 int ret = __encap_with_nodeid(skb, info->tunnel_endpoint, 759 SECLABEL, TRACE_PAYLOAD_LEN); 760 if (ret) 761 return ret; 762 763 *ifindex = ENCAP_IFINDEX; 764 765 /* fib lookup not necessary when going over tunnel. */ 766 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) 767 return DROP_WRITE_ERROR; 768 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) 769 return DROP_WRITE_ERROR; 770 771 return TC_ACT_OK; 772 } 773 } 774 #endif 775 776 dmac = map_lookup_elem(&NODEPORT_NEIGH4, &ip4->daddr); 777 if (dmac) { 778 if (eth_store_daddr(skb, &dmac->addr, 0) < 0) 779 return DROP_WRITE_ERROR; 780 if (eth_store_saddr(skb, &mac->addr, 0) < 0) 781 return DROP_WRITE_ERROR; 782 } else { 783 fib_params.family = AF_INET; 784 fib_params.ifindex = *ifindex; 785 786 fib_params.ipv4_src = ip4->saddr; 787 fib_params.ipv4_dst = ip4->daddr; 788 789 int rc = fib_lookup(skb, &fib_params, sizeof(fib_params), 790 BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT); 791 if (rc != 0) 792 return DROP_NO_FIB; 793 794 if (eth_store_daddr(skb, fib_params.dmac, 0) < 0) 795 return DROP_WRITE_ERROR; 796 if (eth_store_saddr(skb, fib_params.smac, 0) < 0) 797 return DROP_WRITE_ERROR; 798 } 799 } else { 800 if (!(skb->tc_index & TC_INDEX_F_SKIP_RECIRCULATION)) { 801 skb->tc_index |= TC_INDEX_F_SKIP_NODEPORT; 802 ep_tail_call(skb, CILIUM_CALL_IPV4_FROM_LXC); 803 return DROP_MISSED_TAIL_CALL; 804 } 805 } 806 807 return TC_ACT_OK; 808 } 809 810 __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_REVNAT) 811 int tail_rev_nodeport_lb4(struct __sk_buff *skb) 812 { 813 int ifindex = NATIVE_DEV_IFINDEX; 814 union macaddr mac = NATIVE_DEV_MAC; 815 int ret = 0; 816 817 ret = rev_nodeport_lb4(skb, &ifindex, &mac); 818 if (IS_ERR(ret)) 819 return send_drop_notify_error(skb, 0, ret, TC_ACT_SHOT, METRIC_EGRESS); 820 return redirect(ifindex, 0); 821 } 822 #endif /* ENABLE_IPV4 */ 823 824 static __always_inline int nodeport_nat_fwd(struct __sk_buff *skb, 825 const bool encap) 826 { 827 __u16 proto; 828 829 if (!validate_ethertype(skb, &proto)) 830 return TC_ACT_OK; 831 switch (proto) { 832 #ifdef ENABLE_IPV4 833 case bpf_htons(ETH_P_IP): { 834 __be32 addr; 835 #ifdef ENCAP_IFINDEX 836 if (encap) 837 addr = IPV4_GATEWAY; 838 else 839 #endif 840 addr = IPV4_NODEPORT; 841 return nodeport_nat_ipv4_fwd(skb, addr); 842 } 843 #endif /* ENABLE_IPV4 */ 844 #ifdef ENABLE_IPV6 845 case bpf_htons(ETH_P_IPV6): { 846 union v6addr addr; 847 #ifdef ENCAP_IFINDEX 848 if (encap) 849 BPF_V6(addr, ROUTER_IP); 850 else 851 #endif 852 BPF_V6(addr, IPV6_NODEPORT); 853 return nodeport_nat_ipv6_fwd(skb, &addr); 854 } 855 #endif /* ENABLE_IPV6 */ 856 default: 857 break; 858 } 859 return TC_ACT_OK; 860 } 861 862 static __always_inline int nodeport_nat_rev(struct __sk_buff *skb, 863 const bool encap) 864 { 865 __u16 proto; 866 867 if (!validate_ethertype(skb, &proto)) 868 return TC_ACT_OK; 869 switch (proto) { 870 #ifdef ENABLE_IPV4 871 case bpf_htons(ETH_P_IP): { 872 __be32 addr; 873 #ifdef ENCAP_IFINDEX 874 if (encap) 875 addr = IPV4_GATEWAY; 876 else 877 #endif 878 addr = IPV4_NODEPORT; 879 return nodeport_nat_ipv4_rev(skb, addr); 880 } 881 #endif /* ENABLE_IPV4 */ 882 #ifdef ENABLE_IPV6 883 case bpf_htons(ETH_P_IPV6): { 884 union v6addr addr; 885 #ifdef ENCAP_IFINDEX 886 if (encap) 887 BPF_V6(addr, ROUTER_IP); 888 else 889 #endif 890 BPF_V6(addr, IPV6_NODEPORT); 891 return nodeport_nat_ipv6_rev(skb, &addr); 892 } 893 #endif /* ENABLE_IPV6 */ 894 default: 895 build_bug_on(!(NODEPORT_PORT_MIN_NAT < NODEPORT_PORT_MAX_NAT)); 896 build_bug_on(!(NODEPORT_PORT_MIN < NODEPORT_PORT_MAX)); 897 build_bug_on(!(NODEPORT_PORT_MAX < NODEPORT_PORT_MIN_NAT)); 898 build_bug_on(!(NODEPORT_PORT_MAX < EPHERMERAL_MIN)); 899 break; 900 } 901 return TC_ACT_OK; 902 } 903 #endif /* ENABLE_NODEPORT */ 904 #endif /* __NODEPORT_H_ */