github.com/fafucoder/cilium@v1.6.11/bpf/bpf_sock.c (about)

     1  /*
     2   *  Copyright (C) 2019 Authors of Cilium
     3   *
     4   *  This program is free software; you can redistribute it and/or modify
     5   *  it under the terms of the GNU General Public License as published by
     6   *  the Free Software Foundation; either version 2 of the License, or
     7   *  (at your option) any later version.
     8   *
     9   *  This program is distributed in the hope that it will be useful,
    10   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
    11   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    12   *  GNU General Public License for more details.
    13   *
    14   *  You should have received a copy of the GNU General Public License
    15   *  along with this program; if not, write to the Free Software
    16   *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    17   */
    18  
    19  #include <node_config.h>
    20  #include <netdev_config.h>
    21  
    22  #include <bpf/api.h>
    23  
    24  #include <stdint.h>
    25  #include <stdio.h>
    26  
/* Both macros are defined ahead of the lib/ includes below. Presumably they
 * tell the lib/ headers not to declare the policy and tail-call maps, which
 * nothing in this file references -- TODO confirm against the lib/ headers.
 */
#define SKIP_POLICY_MAP	1
#define SKIP_CALLS_MAP	1
    29  
    30  #include "lib/utils.h"
    31  #include "lib/common.h"
    32  #include "lib/lb.h"
    33  #include "lib/eps.h"
    34  #include "lib/metrics.h"
    35  
/* Return codes of the socket address programs in this file. All programs
 * shown here return one of the *_PROCEED values (which all alias to 1), also
 * on lookup failures; CONNECT_REJECT is defined but not returned by any of
 * them.
 */
#define CONNECT_REJECT	0
#define CONNECT_PROCEED	1
#define SENDMSG_PROCEED	CONNECT_PROCEED
#define RECVMSG_PROCEED	CONNECT_PROCEED
    40  
    41  static __always_inline __maybe_unused bool is_v4_loopback(__be32 daddr)
    42  {
    43  	/* Check for 127.0.0.0/8 range, RFC3330. */
    44  	return (daddr & bpf_htonl(0x7f000000)) == bpf_htonl(0x7f000000);
    45  }
    46  
    47  static __always_inline __maybe_unused bool is_v6_loopback(union v6addr *daddr)
    48  {
    49  	/* Check for ::1/128, RFC4291. */
    50  	union v6addr loopback = { .addr[15] = 1, };
    51  	return ipv6_addrcmp(&loopback, daddr) == 0;
    52  }
    53  
    54  /* Hack due to missing narrow ctx access. */
    55  static __always_inline __maybe_unused __be16
    56  ctx_get_port(struct bpf_sock_addr *ctx)
    57  {
    58  	volatile __u32 dport = ctx->user_port;
    59  	return (__be16)dport;
    60  }
    61  
    62  static __always_inline __maybe_unused
    63  void ctx_set_port(struct bpf_sock_addr *ctx, __be16 dport)
    64  {
    65  	ctx->user_port = (__u32)dport;
    66  }
    67  
    68  static __always_inline __maybe_unused
    69  __u64 sock_local_cookie(struct bpf_sock_addr *ctx)
    70  {
    71  #ifdef HAVE_GET_SOCK_COOKIE
    72  	/* prandom() breaks down on UDP, hence preference is on
    73  	 * socket cookie as built-in selector. On older kernels,
    74  	 * get_socket_cookie() provides a unique per netns cookie
    75  	 * for the life-time of the socket. For newer kernels this
    76  	 * is fixed to be a unique system _global_ cookie. Older
    77  	 * kernels could have a cookie collision when two pods with
    78  	 * different netns talk to same service backend, but that
    79  	 * is fine since we always reverse translate to the same
    80  	 * service IP/port pair. The only case that could happen
    81  	 * for older kernels is that we have a cookie collision
    82  	 * where one pod talks to the service IP/port and the
    83  	 * other pod talks to that same specific backend IP/port
    84  	 * directly _w/o_ going over service IP/port. Then the
    85  	 * reverse sock addr is translated to the service IP/port.
    86  	 * With a global socket cookie this collision cannot take
    87  	 * place. There, only the even more unlikely case could
    88  	 * happen where the same UDP socket talks first to the
    89  	 * service and then to the same selected backend IP/port
    90  	 * directly which can be considered negligible.
    91  	 */
    92  	return get_socket_cookie(ctx);
    93  #else
    94  	return ctx->protocol == IPPROTO_TCP ? get_prandom_u32() : 0;
    95  #endif
    96  }
    97  
    98  static __always_inline __maybe_unused
    99  bool sock_proto_enabled(const struct bpf_sock_addr *ctx)
   100  {
   101  	switch (ctx->protocol) {
   102  #ifdef ENABLE_HOST_SERVICES_TCP
   103  	case IPPROTO_TCP:
   104  		return true;
   105  #endif /* ENABLE_HOST_SERVICES_TCP */
   106  #ifdef ENABLE_HOST_SERVICES_UDP
   107  	case IPPROTO_UDPLITE:
   108  	case IPPROTO_UDP:
   109  		return true;
   110  #endif /* ENABLE_HOST_SERVICES_UDP */
   111  	default:
   112  		return false;
   113  	}
   114  }
   115  
   116  #ifdef ENABLE_IPV4
   117  #ifdef ENABLE_HOST_SERVICES_UDP
/* Key of LB4_REVERSE_NAT_SK_MAP. Written by sock4_update_revnat() with the
 * selected backend, looked up by sock4_xlate_rcv() with the address and
 * port found in the hook context.
 */
struct ipv4_revnat_tuple {
	__u64 cookie;		/* result of sock_local_cookie() */
	__be32 address;		/* backend IPv4 address */
	__be16 port;		/* backend port */
	__u16 pad;		/* never assigned; zero in every key built in this file */
};
   124  
/* Value of LB4_REVERSE_NAT_SK_MAP: the service address and port that were
 * replaced by a backend, restored by sock4_xlate_rcv().
 */
struct ipv4_revnat_entry {
	__be32 address;		/* service IPv4 address (lookup key address) */
	__be16 port;		/* service port (lookup key dport) */
	__u16 rev_nat_index;	/* rev_nat_index of the selected slave entry,
				 * compared against the service on receive
				 */
};
   130  
   131  struct bpf_elf_map __section_maps LB4_REVERSE_NAT_SK_MAP = {
   132  	.type		= BPF_MAP_TYPE_LRU_HASH,
   133  	.size_key	= sizeof(struct ipv4_revnat_tuple),
   134  	.size_value	= sizeof(struct ipv4_revnat_entry),
   135  	.pinning	= PIN_GLOBAL_NS,
   136  	.max_elem	= 256 * 1024,
   137  };
   138  
   139  static inline int sock4_update_revnat(struct bpf_sock_addr *ctx,
   140  				      struct lb4_backend *backend,
   141  				      struct lb4_key_v2 *lkey,
   142  				      struct lb4_service_v2 *slave_svc)
   143  {
   144  	struct ipv4_revnat_tuple rkey = {};
   145  	struct ipv4_revnat_entry rval = {};
   146  
   147  	rkey.cookie = sock_local_cookie(ctx);
   148  	rkey.address = backend->address;
   149  	rkey.port = backend->port;
   150  
   151  	rval.address = lkey->address;
   152  	rval.port = lkey->dport;
   153  	rval.rev_nat_index = slave_svc->rev_nat_index;
   154  
   155  	return map_update_elem(&LB4_REVERSE_NAT_SK_MAP, &rkey,
   156  			       &rval, 0);
   157  }
   158  #else
   159  static inline int sock4_update_revnat(struct bpf_sock_addr *ctx,
   160  				      struct lb4_backend *backend,
   161  				      struct lb4_key_v2 *lkey,
   162  				      struct lb4_service_v2 *slave_svc)
   163  {
   164  	return -1;
   165  }
   166  #endif /* ENABLE_HOST_SERVICES_UDP */
   167  
   168  static inline void sock4_handle_node_port(struct bpf_sock_addr *ctx,
   169  					  struct lb4_key_v2 *key)
   170  {
   171  #ifdef ENABLE_NODEPORT
   172  	struct remote_endpoint_info *info;
   173  	__be32 daddr = ctx->user_ip4;
   174  	__u16 service_port;
   175  
   176  	service_port = bpf_ntohs(key->dport);
   177  	if (service_port < NODEPORT_PORT_MIN ||
   178  	    service_port > NODEPORT_PORT_MAX)
   179  		goto out_fill_addr;
   180  
   181  	/* When connecting to node port services in our cluster that
   182  	 * have either HOST_ID or loopback address, we do a wild-card
   183  	 * lookup with IP of 0.
   184  	 */
   185  	if (is_v4_loopback(daddr))
   186  		return;
   187  
   188  	info = ipcache_lookup4(&IPCACHE_MAP, daddr, V4_CACHE_KEY_LEN);
   189  	if (info != NULL && info->sec_label == HOST_ID)
   190  		return;
   191  
   192  	/* For everything else in terms of node port, do a direct lookup. */
   193  out_fill_addr:
   194  	key->address = daddr;
   195  #else
   196  	key->address = ctx->user_ip4;
   197  #endif /* ENABLE_NODEPORT */
   198  }
   199  
   200  __section("from-sock4")
   201  int sock4_xlate(struct bpf_sock_addr *ctx)
   202  {
   203  	struct lb4_backend *backend;
   204  	struct lb4_service_v2 *svc;
   205  	struct lb4_key_v2 key = {
   206  		.dport		= ctx_get_port(ctx),
   207  	};
   208  	struct lb4_service_v2 *slave_svc;
   209  
   210  	if (!sock_proto_enabled(ctx))
   211  		return CONNECT_PROCEED;
   212  
   213  	sock4_handle_node_port(ctx, &key);
   214  
   215  	svc = __lb4_lookup_service_v2(&key);
   216  	if (svc) {
   217  		key.slave = (sock_local_cookie(ctx) % svc->count) + 1;
   218  
   219  		slave_svc = __lb4_lookup_slave_v2(&key);
   220  		if (!slave_svc) {
   221  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE);
   222  			return CONNECT_PROCEED;
   223  		}
   224  
   225  		backend = __lb4_lookup_backend(slave_svc->backend_id);
   226  		if (!backend) {
   227  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
   228  			return CONNECT_PROCEED;
   229  		}
   230  
   231  		if (ctx->protocol != IPPROTO_TCP &&
   232  		    sock4_update_revnat(ctx, backend, &key,
   233  					slave_svc) < 0) {
   234  			update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
   235  			return CONNECT_PROCEED;
   236  		}
   237  
   238  		ctx->user_ip4	= backend->address;
   239  		ctx_set_port(ctx, backend->port);
   240  	}
   241  
   242  	return CONNECT_PROCEED;
   243  }
   244  
   245  #ifdef ENABLE_HOST_SERVICES_UDP
   246  __section("snd-sock4")
   247  int sock4_xlate_snd(struct bpf_sock_addr *ctx)
   248  {
   249  	struct lb4_key_v2 lkey = {
   250  		.dport		= ctx_get_port(ctx),
   251  	};
   252  	struct lb4_backend *backend;
   253  	struct lb4_service_v2 *svc;
   254  	struct lb4_service_v2 *slave_svc;
   255  
   256  	sock4_handle_node_port(ctx, &lkey);
   257  
   258  	svc = __lb4_lookup_service_v2(&lkey);
   259  	if (svc) {
   260  		lkey.slave = (sock_local_cookie(ctx) % svc->count) + 1;
   261  
   262  		slave_svc = __lb4_lookup_slave_v2(&lkey);
   263  		if (!slave_svc) {
   264  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE);
   265  			return SENDMSG_PROCEED;
   266  		}
   267  
   268  		backend = __lb4_lookup_backend(slave_svc->backend_id);
   269  		if (!backend) {
   270  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
   271  			return SENDMSG_PROCEED;
   272  		}
   273  
   274  		if (sock4_update_revnat(ctx, backend, &lkey,
   275  					slave_svc) < 0) {
   276  			update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
   277  			return SENDMSG_PROCEED;
   278  		}
   279  
   280  		ctx->user_ip4 = backend->address;
   281  		ctx_set_port(ctx, backend->port);
   282  	}
   283  
   284  	return SENDMSG_PROCEED;
   285  }
   286  
   287  __section("rcv-sock4")
   288  int sock4_xlate_rcv(struct bpf_sock_addr *ctx)
   289  {
   290  	struct ipv4_revnat_entry *rval;
   291  	struct ipv4_revnat_tuple rkey = {
   292  		.cookie		= sock_local_cookie(ctx),
   293  		.address	= ctx->user_ip4,
   294  		.port		= ctx_get_port(ctx),
   295  	};
   296  
   297  	rval = map_lookup_elem(&LB4_REVERSE_NAT_SK_MAP, &rkey);
   298  	if (rval) {
   299  		struct lb4_service_v2 *svc;
   300  		struct lb4_key_v2 lkey = {
   301  			.address	= rval->address,
   302  			.dport		= rval->port,
   303  		};
   304  
   305  		svc = __lb4_lookup_service_v2(&lkey);
   306  		if (!svc || svc->rev_nat_index != rval->rev_nat_index) {
   307  			map_delete_elem(&LB4_REVERSE_NAT_SK_MAP, &rkey);
   308  			update_metrics(0, METRIC_INGRESS, REASON_LB_REVNAT_STALE);
   309  			return RECVMSG_PROCEED;
   310  		}
   311  
   312  		ctx->user_ip4 = rval->address;
   313  		ctx_set_port(ctx, rval->port);
   314  	}
   315  
   316  	return RECVMSG_PROCEED;
   317  }
   318  #endif /* ENABLE_HOST_SERVICES_UDP */
   319  #endif /* ENABLE_IPV4 */
   320  
   321  #ifdef ENABLE_IPV6
   322  #ifdef ENABLE_HOST_SERVICES_UDP
/* Key of LB6_REVERSE_NAT_SK_MAP. Written by sock6_update_revnat() with the
 * selected backend, looked up by sock6_xlate_rcv() with the address and
 * port found in the hook context.
 *
 * NOTE(review): depending on the alignment of union v6addr, sizeof() may
 * include trailing padding after pad -- confirm that keys are always fully
 * zeroed before use.
 */
struct ipv6_revnat_tuple {
	__u64 cookie;		/* result of sock_local_cookie() */
	union v6addr address;	/* backend IPv6 address */
	__be16 port;		/* backend port */
	__u16 pad;		/* never assigned; zero in every key built in this file */
};
   329  
/* Value of LB6_REVERSE_NAT_SK_MAP: the service address and port that were
 * replaced by a backend, restored by sock6_xlate_rcv().
 */
struct ipv6_revnat_entry {
	union v6addr address;	/* service IPv6 address (lookup key address) */
	__be16 port;		/* service port (lookup key dport) */
	__u16 rev_nat_index;	/* rev_nat_index of the selected slave entry,
				 * compared against the service on receive
				 */
};
   335  
   336  struct bpf_elf_map __section_maps LB6_REVERSE_NAT_SK_MAP = {
   337  	.type		= BPF_MAP_TYPE_LRU_HASH,
   338  	.size_key	= sizeof(struct ipv6_revnat_tuple),
   339  	.size_value	= sizeof(struct ipv6_revnat_entry),
   340  	.pinning	= PIN_GLOBAL_NS,
   341  	.max_elem	= 256 * 1024,
   342  };
   343  
   344  static inline int sock6_update_revnat(struct bpf_sock_addr *ctx,
   345  				      struct lb6_backend *backend,
   346  				      struct lb6_key_v2 *lkey,
   347  				      struct lb6_service_v2 *slave_svc)
   348  {
   349  	struct ipv6_revnat_tuple rkey = {};
   350  	struct ipv6_revnat_entry rval = {};
   351  
   352  	rkey.cookie = sock_local_cookie(ctx);
   353  	rkey.address = backend->address;
   354  	rkey.port = backend->port;
   355  
   356  	rval.address = lkey->address;
   357  	rval.port = lkey->dport;
   358  	rval.rev_nat_index = slave_svc->rev_nat_index;
   359  
   360  	return map_update_elem(&LB6_REVERSE_NAT_SK_MAP, &rkey,
   361  			       &rval, 0);
   362  }
   363  #else
   364  static inline int sock6_update_revnat(struct bpf_sock_addr *ctx,
   365  				      struct lb6_backend *backend,
   366  				      struct lb6_key_v2 *lkey,
   367  				      struct lb6_service_v2 *slave_svc)
   368  {
   369  	return -1;
   370  }
   371  #endif /* ENABLE_HOST_SERVICES_UDP */
   372  
   373  static __always_inline void ctx_get_v6_address(struct bpf_sock_addr *ctx,
   374  					       union v6addr *addr)
   375  {
   376  	addr->p1 = ctx->user_ip6[0];
   377  	addr->p2 = ctx->user_ip6[1];
   378  	addr->p3 = ctx->user_ip6[2];
   379  	addr->p4 = ctx->user_ip6[3];
   380  }
   381  
   382  static __always_inline void ctx_set_v6_address(struct bpf_sock_addr *ctx,
   383  					       union v6addr *addr)
   384  {
   385  	ctx->user_ip6[0] = addr->p1;
   386  	ctx->user_ip6[1] = addr->p2;
   387  	ctx->user_ip6[2] = addr->p3;
   388  	ctx->user_ip6[3] = addr->p4;
   389  }
   390  
   391  static inline void sock6_handle_node_port(struct bpf_sock_addr *ctx,
   392  					  struct lb6_key_v2 *key)
   393  {
   394  #ifdef ENABLE_NODEPORT
   395  	struct remote_endpoint_info *info;
   396  	union v6addr daddr;
   397  	__u16 service_port;
   398  
   399  	ctx_get_v6_address(ctx, &daddr);
   400  
   401  	service_port = bpf_ntohs(key->dport);
   402  	if (service_port < NODEPORT_PORT_MIN ||
   403  	    service_port > NODEPORT_PORT_MAX)
   404  		goto out_fill_addr;
   405  
   406  	/* When connecting to node port services in our cluster that
   407  	 * have either HOST_ID or loopback address, we do a wild-card
   408  	 * lookup with IP of 0.
   409  	 */
   410  	if (is_v6_loopback(&daddr))
   411  		return;
   412  
   413  	info = ipcache_lookup6(&IPCACHE_MAP, &daddr, V6_CACHE_KEY_LEN);
   414  	if (info != NULL && info->sec_label == HOST_ID)
   415  		return;
   416  
   417  	/* For everything else in terms of node port, do a direct lookup. */
   418  out_fill_addr:
   419  	key->address = daddr;
   420  #else
   421  	ctx_get_v6_address(ctx, &key->address);
   422  #endif /* ENABLE_NODEPORT */
   423  }
   424  
   425  __section("from-sock6")
   426  int sock6_xlate(struct bpf_sock_addr *ctx)
   427  {
   428  	struct lb6_backend *backend;
   429  	struct lb6_service_v2 *svc;
   430  	struct lb6_key_v2 key = {
   431  		.dport		= ctx_get_port(ctx),
   432  	};
   433  	struct lb6_service_v2 *slave_svc;
   434  
   435  	if (!sock_proto_enabled(ctx))
   436  		return CONNECT_PROCEED;
   437  
   438  	sock6_handle_node_port(ctx, &key);
   439  
   440  	svc = __lb6_lookup_service_v2(&key);
   441  	if (svc) {
   442  		key.slave = (sock_local_cookie(ctx) % svc->count) + 1;
   443  
   444  		slave_svc = __lb6_lookup_slave_v2(&key);
   445  		if (!slave_svc) {
   446  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE);
   447  			return CONNECT_PROCEED;
   448  		}
   449  
   450  		backend = __lb6_lookup_backend(slave_svc->backend_id);
   451  		if (!backend) {
   452  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
   453  			return CONNECT_PROCEED;
   454  		}
   455  
   456  		if (ctx->protocol != IPPROTO_TCP &&
   457  		    sock6_update_revnat(ctx, backend, &key,
   458  				        slave_svc) < 0) {
   459  			update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
   460  			return CONNECT_PROCEED;
   461  		}
   462  
   463  		ctx_set_v6_address(ctx, &backend->address);
   464  		ctx_set_port(ctx, backend->port);
   465  	}
   466  
   467  	return CONNECT_PROCEED;
   468  }
   469  
   470  #ifdef ENABLE_HOST_SERVICES_UDP
   471  __section("snd-sock6")
   472  int sock6_xlate_snd(struct bpf_sock_addr *ctx)
   473  {
   474  	struct lb6_backend *backend;
   475  	struct lb6_service_v2 *svc;
   476  	struct lb6_key_v2 lkey = {
   477  		.dport		= ctx_get_port(ctx),
   478  	};
   479  	struct lb6_service_v2 *slave_svc;
   480  
   481  	sock6_handle_node_port(ctx, &lkey);
   482  
   483  	svc = __lb6_lookup_service_v2(&lkey);
   484  	if (svc) {
   485  		lkey.slave = (sock_local_cookie(ctx) % svc->count) + 1;
   486  
   487  		slave_svc = __lb6_lookup_slave_v2(&lkey);
   488  		if (!slave_svc) {
   489  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_SLAVE);
   490  			return CONNECT_PROCEED;
   491  		}
   492  
   493  		backend = __lb6_lookup_backend(slave_svc->backend_id);
   494  		if (!backend) {
   495  			update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
   496  			return CONNECT_PROCEED;
   497  		}
   498  
   499  		if (sock6_update_revnat(ctx, backend, &lkey,
   500  				        slave_svc) < 0) {
   501  			update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
   502  			return CONNECT_PROCEED;
   503  		}
   504  
   505  		ctx_set_v6_address(ctx, &backend->address);
   506  		ctx_set_port(ctx, backend->port);
   507  	}
   508  
   509  	return CONNECT_PROCEED;
   510  }
   511  
   512  __section("rcv-sock6")
   513  int sock6_xlate_rcv(struct bpf_sock_addr *ctx)
   514  {
   515  	struct ipv6_revnat_tuple rkey = {};
   516  	struct ipv6_revnat_entry *rval;
   517  
   518  	rkey.cookie = sock_local_cookie(ctx);
   519  	rkey.port = ctx_get_port(ctx);
   520  	ctx_get_v6_address(ctx, &rkey.address);
   521  
   522  	rval = map_lookup_elem(&LB6_REVERSE_NAT_SK_MAP, &rkey);
   523  	if (rval) {
   524  		struct lb6_service_v2 *svc;
   525  		struct lb6_key_v2 lkey = {
   526  			.address	= rval->address,
   527  			.dport		= rval->port,
   528  		};
   529  
   530  		svc = __lb6_lookup_service_v2(&lkey);
   531  		if (!svc || svc->rev_nat_index != rval->rev_nat_index) {
   532  			map_delete_elem(&LB6_REVERSE_NAT_SK_MAP, &rkey);
   533  			update_metrics(0, METRIC_INGRESS, REASON_LB_REVNAT_STALE);
   534  			return RECVMSG_PROCEED;
   535  		}
   536  
   537  		ctx_set_v6_address(ctx, &rval->address);
   538  		ctx_set_port(ctx, rval->port);
   539  	}
   540  
   541  	return RECVMSG_PROCEED;
   542  }
   543  #endif /* ENABLE_HOST_SERVICES_UDP */
   544  #endif /* ENABLE_IPV6 */
   545  
/* License declaration for this object, matching the GPL header at the top
 * of the file.
 */
BPF_LICENSE("GPL");