github.com/fafucoder/cilium@v1.6.11/bpf/bpf_sock.c (about) 1 /* 2 * Copyright (C) 2019 Authors of Cilium 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 19 #include <node_config.h> 20 #include <netdev_config.h> 21 22 #include <bpf/api.h> 23 24 #include <stdint.h> 25 #include <stdio.h> 26 27 #define SKIP_POLICY_MAP 1 28 #define SKIP_CALLS_MAP 1 29 30 #include "lib/utils.h" 31 #include "lib/common.h" 32 #include "lib/lb.h" 33 #include "lib/eps.h" 34 #include "lib/metrics.h" 35 36 #define CONNECT_REJECT 0 37 #define CONNECT_PROCEED 1 38 #define SENDMSG_PROCEED CONNECT_PROCEED 39 #define RECVMSG_PROCEED CONNECT_PROCEED 40 41 static __always_inline __maybe_unused bool is_v4_loopback(__be32 daddr) 42 { 43 /* Check for 127.0.0.0/8 range, RFC3330. */ 44 return (daddr & bpf_htonl(0x7f000000)) == bpf_htonl(0x7f000000); 45 } 46 47 static __always_inline __maybe_unused bool is_v6_loopback(union v6addr *daddr) 48 { 49 /* Check for ::1/128, RFC4291. */ 50 union v6addr loopback = { .addr[15] = 1, }; 51 return ipv6_addrcmp(&loopback, daddr) == 0; 52 } 53 54 /* Hack due to missing narrow ctx access. 
*/ 55 static __always_inline __maybe_unused __be16 56 ctx_get_port(struct bpf_sock_addr *ctx) 57 { 58 volatile __u32 dport = ctx->user_port; 59 return (__be16)dport; 60 } 61 62 static __always_inline __maybe_unused 63 void ctx_set_port(struct bpf_sock_addr *ctx, __be16 dport) 64 { 65 ctx->user_port = (__u32)dport; 66 } 67 68 static __always_inline __maybe_unused 69 __u64 sock_local_cookie(struct bpf_sock_addr *ctx) 70 { 71 #ifdef HAVE_GET_SOCK_COOKIE 72 /* prandom() breaks down on UDP, hence preference is on 73 * socket cookie as built-in selector. On older kernels, 74 * get_socket_cookie() provides a unique per netns cookie 75 * for the life-time of the socket. For newer kernels this 76 * is fixed to be a unique system _global_ cookie. Older 77 * kernels could have a cookie collision when two pods with 78 * different netns talk to same service backend, but that 79 * is fine since we always reverse translate to the same 80 * service IP/port pair. The only case that could happen 81 * for older kernels is that we have a cookie collision 82 * where one pod talks to the service IP/port and the 83 * other pod talks to that same specific backend IP/port 84 * directly _w/o_ going over service IP/port. Then the 85 * reverse sock addr is translated to the service IP/port. 86 * With a global socket cookie this collision cannot take 87 * place. There, only the even more unlikely case could 88 * happen where the same UDP socket talks first to the 89 * service and then to the same selected backend IP/port 90 * directly which can be considered negligible. 91 */ 92 return get_socket_cookie(ctx); 93 #else 94 return ctx->protocol == IPPROTO_TCP ? 
get_prandom_u32() : 0; 95 #endif 96 } 97 98 static __always_inline __maybe_unused 99 bool sock_proto_enabled(const struct bpf_sock_addr *ctx) 100 { 101 switch (ctx->protocol) { 102 #ifdef ENABLE_HOST_SERVICES_TCP 103 case IPPROTO_TCP: 104 return true; 105 #endif /* ENABLE_HOST_SERVICES_TCP */ 106 #ifdef ENABLE_HOST_SERVICES_UDP 107 case IPPROTO_UDPLITE: 108 case IPPROTO_UDP: 109 return true; 110 #endif /* ENABLE_HOST_SERVICES_UDP */ 111 default: 112 return false; 113 } 114 } 115 116 #ifdef ENABLE_IPV4 117 #ifdef ENABLE_HOST_SERVICES_UDP 118 struct ipv4_revnat_tuple { 119 __u64 cookie; 120 __be32 address; 121 __be16 port; 122 __u16 pad; 123 }; 124 125 struct ipv4_revnat_entry { 126 __be32 address; 127 __be16 port; 128 __u16 rev_nat_index; 129 }; 130 131 struct bpf_elf_map __section_maps LB4_REVERSE_NAT_SK_MAP = { 132 .type = BPF_MAP_TYPE_LRU_HASH, 133 .size_key = sizeof(struct ipv4_revnat_tuple), 134 .size_value = sizeof(struct ipv4_revnat_entry), 135 .pinning = PIN_GLOBAL_NS, 136 .max_elem = 256 * 1024, 137 }; 138 139 static inline int sock4_update_revnat(struct bpf_sock_addr *ctx, 140 struct lb4_backend *backend, 141 struct lb4_key_v2 *lkey, 142 struct lb4_service_v2 *slave_svc) 143 { 144 struct ipv4_revnat_tuple rkey = {}; 145 struct ipv4_revnat_entry rval = {}; 146 147 rkey.cookie = sock_local_cookie(ctx); 148 rkey.address = backend->address; 149 rkey.port = backend->port; 150 151 rval.address = lkey->address; 152 rval.port = lkey->dport; 153 rval.rev_nat_index = slave_svc->rev_nat_index; 154 155 return map_update_elem(&LB4_REVERSE_NAT_SK_MAP, &rkey, 156 &rval, 0); 157 } 158 #else 159 static inline int sock4_update_revnat(struct bpf_sock_addr *ctx, 160 struct lb4_backend *backend, 161 struct lb4_key_v2 *lkey, 162 struct lb4_service_v2 *slave_svc) 163 { 164 return -1; 165 } 166 #endif /* ENABLE_HOST_SERVICES_UDP */ 167 168 static inline void sock4_handle_node_port(struct bpf_sock_addr *ctx, 169 struct lb4_key_v2 *key) 170 { 171 #ifdef ENABLE_NODEPORT 172 struct 
remote_endpoint_info *info; 173 __be32 daddr = ctx->user_ip4; 174 __u16 service_port; 175 176 service_port = bpf_ntohs(key->dport); 177 if (service_port < NODEPORT_PORT_MIN || 178 service_port > NODEPORT_PORT_MAX) 179 goto out_fill_addr; 180 181 /* When connecting to node port services in our cluster that 182 * have either HOST_ID or loopback address, we do a wild-card 183 * lookup with IP of 0. 184 */ 185 if (is_v4_loopback(daddr)) 186 return; 187 188 info = ipcache_lookup4(&IPCACHE_MAP, daddr, V4_CACHE_KEY_LEN); 189 if (info != NULL && info->sec_label == HOST_ID) 190 return; 191 192 /* For everything else in terms of node port, do a direct lookup. */ 193 out_fill_addr: 194 key->address = daddr; 195 #else 196 key->address = ctx->user_ip4; 197 #endif /* ENABLE_NODEPORT */ 198 } 199 200 __section("from-sock4") 201 int sock4_xlate(struct bpf_sock_addr *ctx) 202 { 203 struct lb4_backend *backend; 204 struct lb4_service_v2 *svc; 205 struct lb4_key_v2 key = { 206 .dport = ctx_get_port(ctx), 207 }; 208 struct lb4_service_v2 *slave_svc; 209 210 if (!sock_proto_enabled(ctx)) 211 return CONNECT_PROCEED; 212 213 sock4_handle_node_port(ctx, &key); 214 215 svc = __lb4_lookup_service_v2(&key); 216 if (svc) { 217 key.slave = (sock_local_cookie(ctx) % svc->count) + 1; 218 219 slave_svc = __lb4_lookup_slave_v2(&key); 220 if (!slave_svc) { 221 update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE); 222 return CONNECT_PROCEED; 223 } 224 225 backend = __lb4_lookup_backend(slave_svc->backend_id); 226 if (!backend) { 227 update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND); 228 return CONNECT_PROCEED; 229 } 230 231 if (ctx->protocol != IPPROTO_TCP && 232 sock4_update_revnat(ctx, backend, &key, 233 slave_svc) < 0) { 234 update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE); 235 return CONNECT_PROCEED; 236 } 237 238 ctx->user_ip4 = backend->address; 239 ctx_set_port(ctx, backend->port); 240 } 241 242 return CONNECT_PROCEED; 243 } 244 245 #ifdef ENABLE_HOST_SERVICES_UDP 246 
/* sendmsg(2) hook: translate a v4 service destination to a backend for
 * unconnected UDP sockets (same flow as sock4_xlate(), but runs per
 * sendmsg and always records the reverse NAT entry).
 */
__section("snd-sock4")
int sock4_xlate_snd(struct bpf_sock_addr *ctx)
{
	struct lb4_key_v2 lkey = {
		.dport		= ctx_get_port(ctx),
	};
	struct lb4_backend *backend;
	struct lb4_service_v2 *svc;
	struct lb4_service_v2 *slave_svc;

	sock4_handle_node_port(ctx, &lkey);

	svc = __lb4_lookup_service_v2(&lkey);
	if (svc) {
		/* Slave slots are 1-indexed; pick one via the cookie. */
		lkey.slave = (sock_local_cookie(ctx) % svc->count) + 1;

		slave_svc = __lb4_lookup_slave_v2(&lkey);
		if (!slave_svc) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE);
			return SENDMSG_PROCEED;
		}

		backend = __lb4_lookup_backend(slave_svc->backend_id);
		if (!backend) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
			return SENDMSG_PROCEED;
		}

		/* Datagram sockets need the reverse entry so replies can
		 * be mapped back to the service address in recvmsg().
		 */
		if (sock4_update_revnat(ctx, backend, &lkey,
					slave_svc) < 0) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
			return SENDMSG_PROCEED;
		}

		ctx->user_ip4 = backend->address;
		ctx_set_port(ctx, backend->port);
	}

	return SENDMSG_PROCEED;
}

/* recvmsg(2) hook: reverse-translate a v4 backend source address back to
 * the service address the application originally sent to. Stale entries
 * (service gone or remapped) are deleted on sight.
 */
__section("rcv-sock4")
int sock4_xlate_rcv(struct bpf_sock_addr *ctx)
{
	struct ipv4_revnat_entry *rval;
	struct ipv4_revnat_tuple rkey = {
		.cookie		= sock_local_cookie(ctx),
		.address	= ctx->user_ip4,
		.port		= ctx_get_port(ctx),
	};

	rval = map_lookup_elem(&LB4_REVERSE_NAT_SK_MAP, &rkey);
	if (rval) {
		struct lb4_service_v2 *svc;
		struct lb4_key_v2 lkey = {
			.address	= rval->address,
			.dport		= rval->port,
		};

		/* Validate the entry against the current service table
		 * before trusting it; drop it if the mapping went stale.
		 */
		svc = __lb4_lookup_service_v2(&lkey);
		if (!svc || svc->rev_nat_index != rval->rev_nat_index) {
			map_delete_elem(&LB4_REVERSE_NAT_SK_MAP, &rkey);
			update_metrics(0, METRIC_INGRESS, REASON_LB_REVNAT_STALE);
			return RECVMSG_PROCEED;
		}

		ctx->user_ip4 = rval->address;
		ctx_set_port(ctx, rval->port);
	}

	return RECVMSG_PROCEED;
}
#endif /* ENABLE_HOST_SERVICES_UDP */
#endif /* ENABLE_IPV4 */

#ifdef ENABLE_IPV6
#ifdef ENABLE_HOST_SERVICES_UDP
/* Key: socket cookie + backend address/port the datagram was sent to. */
struct ipv6_revnat_tuple {
	__u64 cookie;
	union v6addr address;
	__be16 port;
	__u16 pad;
};

/* Value: original service address/port plus its rev_nat index for
 * staleness detection.
 */
struct ipv6_revnat_entry {
	union v6addr address;
	__be16 port;
	__u16 rev_nat_index;
};

struct bpf_elf_map __section_maps LB6_REVERSE_NAT_SK_MAP = {
	.type		= BPF_MAP_TYPE_LRU_HASH,
	.size_key	= sizeof(struct ipv6_revnat_tuple),
	.size_value	= sizeof(struct ipv6_revnat_entry),
	.pinning	= PIN_GLOBAL_NS,
	.max_elem	= 256 * 1024,
};

/* Record service -> backend translation so recvmsg() can reverse it
 * for connectionless UDP sockets. Returns map_update_elem() result.
 */
static inline int sock6_update_revnat(struct bpf_sock_addr *ctx,
				      struct lb6_backend *backend,
				      struct lb6_key_v2 *lkey,
				      struct lb6_service_v2 *slave_svc)
{
	struct ipv6_revnat_tuple rkey = {};
	struct ipv6_revnat_entry rval = {};

	rkey.cookie = sock_local_cookie(ctx);
	rkey.address = backend->address;
	rkey.port = backend->port;

	rval.address = lkey->address;
	rval.port = lkey->dport;
	rval.rev_nat_index = slave_svc->rev_nat_index;

	return map_update_elem(&LB6_REVERSE_NAT_SK_MAP, &rkey,
			       &rval, 0);
}
#else
/* UDP services disabled: always signal failure so callers skip xlate. */
static inline int sock6_update_revnat(struct bpf_sock_addr *ctx,
				      struct lb6_backend *backend,
				      struct lb6_key_v2 *lkey,
				      struct lb6_service_v2 *slave_svc)
{
	return -1;
}
#endif /* ENABLE_HOST_SERVICES_UDP */

/* Copy the 128-bit destination address out of the ctx, 32 bits at a
 * time (narrow ctx loads are not available here).
 */
static __always_inline void ctx_get_v6_address(struct bpf_sock_addr *ctx,
					       union v6addr *addr)
{
	addr->p1 = ctx->user_ip6[0];
	addr->p2 = ctx->user_ip6[1];
	addr->p3 = ctx->user_ip6[2];
	addr->p4 = ctx->user_ip6[3];
}

/* Counterpart of ctx_get_v6_address(): write the destination address. */
static __always_inline void ctx_set_v6_address(struct bpf_sock_addr *ctx,
					       union v6addr *addr)
{
	ctx->user_ip6[0] = addr->p1;
	ctx->user_ip6[1] = addr->p2;
	ctx->user_ip6[2] = addr->p3;
	ctx->user_ip6[3] = addr->p4;
}

/* Fill the service lookup key's address from the destination, handling
 * the NodePort wildcard: NodePort-range services bound to the host or
 * loopback are looked up with an all-zero address (key left zeroed).
 */
static inline void sock6_handle_node_port(struct bpf_sock_addr *ctx,
					  struct lb6_key_v2 *key)
{
#ifdef ENABLE_NODEPORT
	struct remote_endpoint_info *info;
	union v6addr daddr;
	__u16 service_port;

	ctx_get_v6_address(ctx, &daddr);

	service_port = bpf_ntohs(key->dport);
	if (service_port < NODEPORT_PORT_MIN ||
	    service_port > NODEPORT_PORT_MAX)
		goto out_fill_addr;

	/* When connecting to node port services in our cluster that
	 * have either HOST_ID or loopback address, we do a wild-card
	 * lookup with IP of 0.
	 */
	if (is_v6_loopback(&daddr))
		return;

	info = ipcache_lookup6(&IPCACHE_MAP, &daddr, V6_CACHE_KEY_LEN);
	if (info != NULL && info->sec_label == HOST_ID)
		return;

	/* For everything else in terms of node port, do a direct lookup. */
out_fill_addr:
	key->address = daddr;
#else
	ctx_get_v6_address(ctx, &key->address);
#endif /* ENABLE_NODEPORT */
}

/* connect(2) hook: translate a v6 service destination to a backend.
 * Always proceeds; on any lookup failure the address is left untouched.
 */
__section("from-sock6")
int sock6_xlate(struct bpf_sock_addr *ctx)
{
	struct lb6_backend *backend;
	struct lb6_service_v2 *svc;
	struct lb6_key_v2 key = {
		.dport		= ctx_get_port(ctx),
	};
	struct lb6_service_v2 *slave_svc;

	if (!sock_proto_enabled(ctx))
		return CONNECT_PROCEED;

	sock6_handle_node_port(ctx, &key);

	svc = __lb6_lookup_service_v2(&key);
	if (svc) {
		/* Slave slots are 1-indexed; pick one via the cookie. */
		key.slave = (sock_local_cookie(ctx) % svc->count) + 1;

		slave_svc = __lb6_lookup_slave_v2(&key);
		if (!slave_svc) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE);
			return CONNECT_PROCEED;
		}

		backend = __lb6_lookup_backend(slave_svc->backend_id);
		if (!backend) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
			return CONNECT_PROCEED;
		}

		/* Only UDP needs the reverse NAT entry; TCP reverse
		 * translation happens via the connected socket itself.
		 */
		if (ctx->protocol != IPPROTO_TCP &&
		    sock6_update_revnat(ctx, backend, &key,
					slave_svc) < 0) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
			return CONNECT_PROCEED;
		}

		ctx_set_v6_address(ctx, &backend->address);
		ctx_set_port(ctx, backend->port);
	}

	return CONNECT_PROCEED;
}

#ifdef ENABLE_HOST_SERVICES_UDP
/* sendmsg(2) hook: translate a v6 service destination to a backend for
 * unconnected UDP sockets, recording the reverse NAT entry.
 */
__section("snd-sock6")
int sock6_xlate_snd(struct bpf_sock_addr *ctx)
{
	struct lb6_backend *backend;
	struct lb6_service_v2 *svc;
	struct lb6_key_v2 lkey = {
		.dport		= ctx_get_port(ctx),
	};
	struct lb6_service_v2 *slave_svc;

	sock6_handle_node_port(ctx, &lkey);

	svc = __lb6_lookup_service_v2(&lkey);
	if (svc) {
		/* Slave slots are 1-indexed; pick one via the cookie. */
		lkey.slave = (sock_local_cookie(ctx) % svc->count) + 1;

		slave_svc = __lb6_lookup_slave_v2(&lkey);
		if (!slave_svc) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE);
			return CONNECT_PROCEED;
		}

		backend = __lb6_lookup_backend(slave_svc->backend_id);
		if (!backend) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
			return CONNECT_PROCEED;
		}

		if (sock6_update_revnat(ctx, backend, &lkey,
					slave_svc) < 0) {
			update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
			return CONNECT_PROCEED;
		}

		ctx_set_v6_address(ctx, &backend->address);
		ctx_set_port(ctx, backend->port);
	}

	return CONNECT_PROCEED;
}

/* recvmsg(2) hook: reverse-translate a v6 backend source address back to
 * the service address the application originally sent to. Stale entries
 * are deleted on sight.
 */
__section("rcv-sock6")
int sock6_xlate_rcv(struct bpf_sock_addr *ctx)
{
	struct ipv6_revnat_tuple rkey = {};
	struct ipv6_revnat_entry *rval;

	rkey.cookie = sock_local_cookie(ctx);
	rkey.port = ctx_get_port(ctx);
	ctx_get_v6_address(ctx, &rkey.address);

	rval = map_lookup_elem(&LB6_REVERSE_NAT_SK_MAP, &rkey);
	if (rval) {
		struct lb6_service_v2 *svc;
		struct lb6_key_v2 lkey = {
			.address	= rval->address,
			.dport		= rval->port,
		};

		/* Validate the entry against the current service table
		 * before trusting it; drop it if the mapping went stale.
		 */
		svc = __lb6_lookup_service_v2(&lkey);
		if (!svc || svc->rev_nat_index != rval->rev_nat_index) {
			map_delete_elem(&LB6_REVERSE_NAT_SK_MAP, &rkey);
			update_metrics(0, METRIC_INGRESS, REASON_LB_REVNAT_STALE);
			return RECVMSG_PROCEED;
		}

		ctx_set_v6_address(ctx, &rval->address);
		ctx_set_port(ctx, rval->port);
	}

	return RECVMSG_PROCEED;
}
#endif /* ENABLE_HOST_SERVICES_UDP */
#endif /* ENABLE_IPV6 */

BPF_LICENSE("GPL");