github.com/looshlee/beatles@v0.0.0-20220727174639-742810ab631c/bpf/lib/conntrack.h

/*
 *  Copyright (C) 2016-2019 Authors of Cilium
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#ifndef __LIB_CONNTRACK_H_
#define __LIB_CONNTRACK_H_

#include <linux/icmpv6.h>
#include <linux/icmp.h>

#include "common.h"
#include "utils.h"
#include "ipv6.h"
#include "dbg.h"
#include "l4.h"
#include "nat46.h"

/* CT_REPORT_INTERVAL, when MONITOR_AGGREGATION is >= TRACE_AGGREGATE_ACTIVE_CT,
 * determines how frequently monitor notifications should be sent for active
 * connections. A notification is always triggered on a packet event.
 */
#ifndef CT_REPORT_INTERVAL
# define CT_REPORT_INTERVAL		5	/* 5 seconds */
#endif
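/* The interval is expressed in seconds and is compared against the
 * bpf_ktime_get_sec() timestamps that __ct_update_timeout() stores per
 * direction in entry->last_rx_report / entry->last_tx_report below.
 */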

#ifdef CONNTRACK
enum {
	ACTION_UNSPEC,
	ACTION_CREATE,
	ACTION_CLOSE,
};

/* conn_is_dns returns true if the connection is DNS, false otherwise.
 *
 * @dport: Connection destination port.
 *
 * To reduce program complexity, we ignore nexthdr and dir here:
 * nexthdr: The parser will not fill dport if nexthdr is not TCP/UDP.
 * dir:     Ideally we would only consider responses, but requests are likely
 *          to be small anyway.
 */
static inline bool conn_is_dns(__u16 dport)
{
	if (dport == bpf_htons(53)) {
		relax_verifier();
		return true;
	}
	return false;
}

union tcp_flags {
	struct {
		__u8 upper_bits;
		__u8 lower_bits;
		__u16 pad;
	};
	__u32 value;
};
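/* When filled from a TCP header via skb_load_bytes(skb, l4_off + 12,
 * &tcp_flags, 2) below, 'upper_bits' receives the data-offset/reserved byte
 * and 'lower_bits' the flags byte (FIN 0x01, SYN 0x02, RST 0x04, PSH 0x08,
 * ACK 0x10, URG 0x20). Tests such as 'value & TCP_FLAG_SYN' assume TCP_FLAG_*
 * constants defined in network byte order for this word, as in linux/tcp.h.
 */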

/**
 * Update the CT timeout and TCP flags for the specified entry.
 *
 * We track the OR'd accumulation of seen tcp flags in the entry, and the
 * last time that a notification was sent. Multiple CPUs may enter this
 * function with packets for the same connection, in which case it is possible
 * for the CPUs to race to update the entry. In such a case, the critical
 * update section may be entered in quick succession, leading to multiple
 * updates of the entry and a non-zero return for each CPU. The BPF
 * architecture guarantees that entire 8-bit or 32-bit values will be set
 * within the entry, so although the CPUs may race, the worst result is that
 * multiple executions of this function return non-zero for the same
 * connection in short succession, leading to multiple trace notifications
 * being sent when one might otherwise expect such notifications to be
 * aggregated.
 *
 * Returns how many bytes of the packet should be monitored:
 * - Zero if this flow was recently monitored.
 * - Non-zero if this flow has not been monitored recently.
 */
static inline __u32 __inline__ __ct_update_timeout(struct ct_entry *entry,
						   __u32 lifetime, int dir,
						   union tcp_flags flags)
{
	__u32 now = bpf_ktime_get_sec();
	__u8 accumulated_flags;
	__u8 seen_flags = flags.lower_bits;
	__u32 last_report;

#ifdef NEEDS_TIMEOUT
	entry->lifetime = now + lifetime;
#endif
	if (dir == CT_INGRESS) {
		accumulated_flags = READ_ONCE(entry->rx_flags_seen);
		last_report = READ_ONCE(entry->last_rx_report);
	} else {
		accumulated_flags = READ_ONCE(entry->tx_flags_seen);
		last_report = READ_ONCE(entry->last_tx_report);
	}
	seen_flags |= accumulated_flags;

	/* It's possible for multiple CPUs to execute the branch statement here
	 * one after another, before the first CPU is able to execute the entry
	 * modifications within this branch. This is somewhat unlikely because
	 * packets for the same connection are typically steered towards the
	 * same CPU, but is possible in theory.
	 *
	 * If the branch is taken by multiple CPUs because of '*last_report',
	 * then this merely causes multiple notifications to be sent after
	 * CT_REPORT_INTERVAL rather than a single notification. '*last_report'
	 * will be updated by all CPUs and subsequent checks should not take
	 * this branch until the next CT_REPORT_INTERVAL. As such, the trace
	 * aggregation that uses the result of this function may reduce the
	 * number of packets per interval to a small integer value (max N_CPUS)
	 * rather than 1 notification per packet throughout the interval.
	 *
	 * Similar behaviour may happen with tcp_flags. The worst case race
	 * here would be that two or more CPUs argue over which flags have been
	 * seen and overwrite each other, with each CPU interleaving different
	 * values for which flags were seen. In practice, realistic connections
	 * are likely to progressively set SYN, ACK, then much later perhaps
	 * FIN and/or RST. Furthermore, unless such a traffic pattern were
	 * constantly received, this should self-correct as the stored
	 * tcp_flags is an OR'd set of flags and each time the above code is
	 * executed, it pulls the latest set of accumulated flags. Therefore
	 * even in the worst case such a conflict is likely only to cause a
	 * small number of additional notifications, which is still likely to
	 * be significantly less under this MONITOR_AGGREGATION mode than would
	 * otherwise be sent if the MONITOR_AGGREGATION level is set to none
	 * (i.e., sending a notification for every packet).
	 */
	if (last_report + CT_REPORT_INTERVAL < now ||
	    accumulated_flags != seen_flags) {
		/* verifier workaround: we don't use a reference here. */
		if (dir == CT_INGRESS) {
			WRITE_ONCE(entry->rx_flags_seen, seen_flags);
			WRITE_ONCE(entry->last_rx_report, now);
		} else {
			WRITE_ONCE(entry->tx_flags_seen, seen_flags);
			WRITE_ONCE(entry->last_tx_report, now);
		}
		return TRACE_PAYLOAD_LEN;
	}
	return 0;
}

/**
 * Update the CT timeouts for the specified entry.
 *
 * Picks the lifetime based on the protocol, direction and the TCP flags seen
 * so far, then delegates to __ct_update_timeout(). Returns the number of
 * bytes of the packet to monitor (see __ct_update_timeout()).
 */
static inline __u32 __inline__ ct_update_timeout(struct ct_entry *entry,
						 bool tcp, int dir,
						 union tcp_flags seen_flags)
{
	__u32 lifetime = dir == CT_SERVICE ?
			 CT_SERVICE_LIFETIME_NONTCP :
			 CT_CONNECTION_LIFETIME_NONTCP;
	bool syn = seen_flags.value & TCP_FLAG_SYN;

	if (tcp) {
		entry->seen_non_syn |= !syn;
		if (entry->seen_non_syn) {
			lifetime = dir == CT_SERVICE ?
				   CT_SERVICE_LIFETIME_TCP :
				   CT_CONNECTION_LIFETIME_TCP;
		} else {
			lifetime = CT_SYN_TIMEOUT;
		}
	}

	return __ct_update_timeout(entry, lifetime, dir, seen_flags);
}
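/* For illustration: a TCP flow that has only ever seen SYN segments keeps the
 * short CT_SYN_TIMEOUT; once any segment without SYN is observed,
 * seen_non_syn latches and the lifetime used for the entry becomes
 * CT_CONNECTION_LIFETIME_TCP (or CT_SERVICE_LIFETIME_TCP for
 * dir == CT_SERVICE). Non-TCP flows always use the *_NONTCP lifetimes.
 */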

static inline void __inline__ ct_reset_closing(struct ct_entry *entry)
{
	entry->rx_closing = 0;
	entry->tx_closing = 0;
}

static inline bool __inline__ ct_entry_alive(const struct ct_entry *entry)
{
	return !entry->rx_closing || !entry->tx_closing;
}

static inline __u8 __inline__ __ct_lookup(void *map, struct __sk_buff *skb,
					  void *tuple, int action, int dir,
					  struct ct_state *ct_state,
					  bool is_tcp, union tcp_flags seen_flags,
					  __u32 *monitor)
{
	struct ct_entry *entry;
	int reopen;

	if ((entry = map_lookup_elem(map, tuple))) {
		cilium_dbg(skb, DBG_CT_MATCH, entry->lifetime, entry->rev_nat_index);
		if (ct_entry_alive(entry)) {
			*monitor = ct_update_timeout(entry, is_tcp, dir, seen_flags);
		}
		if (ct_state) {
			ct_state->rev_nat_index = entry->rev_nat_index;
			ct_state->loopback = entry->lb_loopback;
			ct_state->node_port = entry->node_port;
			ct_state->proxy_redirect = entry->proxy_redirect;
			/* To support seamless upgrade from an earlier service
			 * implementation, we store references to the backend
			 * in the "ct_entry.rx_bytes" field.
			 * Previously, the field "ct_entry.backend_id" was used
			 * for legacy services, so we co-opted the field
			 * "ct_entry.rx_bytes" to store the services v2
			 * backend (as it is not used with dir=CT_SERVICE).
			 *
			 * As of v1.6, "ct_entry.backend_id" is zeroed so that
			 * users who migrate to v1.6 will end up with CT
			 * entries that assign no meaning to this field.
			 * In v1.7 it will be safe to reuse this field for
			 * other purposes. Current plans are to expand the
			 * backend_id to 32 bits, which would involve creating
			 * a union across the backend_id and [rt]x_bytes fields.
			 * For now, just retrieve the backend out of rx_bytes.
			 *
			 * TODO (1.7+): Switch to entry->backend_id
			 */
			if (dir == CT_SERVICE) {
				ct_state->backend_id = entry->rx_bytes;
			}
		}

#ifdef ENABLE_NAT46
		/* This packet needs nat46 translation */
		if (entry->nat46 && !skb->cb[CB_NAT46_STATE])
			skb->cb[CB_NAT46_STATE] = NAT46;
#endif

#ifdef CONNTRACK_ACCOUNTING
		/* FIXME: This is slow, per-cpu counters? */
		if (dir == CT_INGRESS) {
			__sync_fetch_and_add(&entry->rx_packets, 1);
			__sync_fetch_and_add(&entry->rx_bytes, skb->len);
		} else if (dir == CT_EGRESS) {
			__sync_fetch_and_add(&entry->tx_packets, 1);
			__sync_fetch_and_add(&entry->tx_bytes, skb->len);
		}
#endif

		switch (action) {
		case ACTION_CREATE:
			reopen = entry->rx_closing | entry->tx_closing;
			reopen |= seen_flags.value & TCP_FLAG_SYN;
			if (unlikely(reopen == (TCP_FLAG_SYN|0x1))) {
				ct_reset_closing(entry);
				*monitor = ct_update_timeout(entry, is_tcp, dir, seen_flags);
			}
			break;
		case ACTION_CLOSE:
			/* RST or similar, immediately delete ct entry */
			if (dir == CT_INGRESS)
				entry->rx_closing = 1;
			else
				entry->tx_closing = 1;

			*monitor = TRACE_PAYLOAD_LEN;
			if (ct_entry_alive(entry))
				break;
			__ct_update_timeout(entry, CT_CLOSE_TIMEOUT, dir, seen_flags);
			break;
		}

		return CT_ESTABLISHED;
	}

	*monitor = TRACE_PAYLOAD_LEN;
	return CT_NEW;
}

static inline void __inline__ ct_flip_tuple_dir6(struct ipv6_ct_tuple *tuple)
{
	if (tuple->flags & TUPLE_F_IN)
		tuple->flags &= ~TUPLE_F_IN;
	else
		tuple->flags |= TUPLE_F_IN;
}

static inline void __inline__ ipv6_ct_tuple_reverse(struct ipv6_ct_tuple *tuple)
{
	union v6addr tmp_addr = {};
	__be16 tmp;

	ipv6_addr_copy(&tmp_addr, &tuple->saddr);
	ipv6_addr_copy(&tuple->saddr, &tuple->daddr);
	ipv6_addr_copy(&tuple->daddr, &tmp_addr);

	tmp = tuple->sport;
	tuple->sport = tuple->dport;
	tuple->dport = tmp;

	ct_flip_tuple_dir6(tuple);
}

/* Offset must point to IPv6 */
static inline int __inline__ ct_lookup6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int l4_off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	int ret = CT_NEW, action = ACTION_UNSPEC;
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags tcp_flags = { .value = 0 };

	/* The tuple is created in reverse order initially to find a
	 * potential reverse flow. This is required because the RELATED
	 * or REPLY state takes precedence over ESTABLISHED due to
	 * policy requirements.
	 *
	 * tuple->flags separates entries that could otherwise be overlapping.
	 */
	if (dir == CT_INGRESS)
		tuple->flags = TUPLE_F_OUT;
	else if (dir == CT_EGRESS)
		tuple->flags = TUPLE_F_IN;
	else if (dir == CT_SERVICE)
		tuple->flags = TUPLE_F_SERVICE;
	else
		return DROP_CT_INVALID_HDR;

	switch (tuple->nexthdr) {
	case IPPROTO_ICMPV6:
		if (1) {
			__be16 identifier = 0;
			__u8 type;

			if (skb_load_bytes(skb, l4_off, &type, 1) < 0)
				return DROP_CT_INVALID_HDR;
			if ((type == ICMPV6_ECHO_REQUEST || type == ICMPV6_ECHO_REPLY) &&
			     skb_load_bytes(skb, l4_off + offsetof(struct icmp6hdr,
								   icmp6_dataun.u_echo.identifier),
					    &identifier, 2) < 0)
				return DROP_CT_INVALID_HDR;

			tuple->sport = 0;
			tuple->dport = 0;

			switch (type) {
			case ICMPV6_DEST_UNREACH:
			case ICMPV6_PKT_TOOBIG:
			case ICMPV6_TIME_EXCEED:
			case ICMPV6_PARAMPROB:
				tuple->flags |= TUPLE_F_RELATED;
				break;

			case ICMPV6_ECHO_REPLY:
				tuple->sport = identifier;
				break;

			case ICMPV6_ECHO_REQUEST:
				tuple->dport = identifier;
				/* fall through */
			default:
				action = ACTION_CREATE;
				break;
			}
		}
		break;

	case IPPROTO_TCP:
		if (1) {
			if (skb_load_bytes(skb, l4_off + 12, &tcp_flags, 2) < 0)
				return DROP_CT_INVALID_HDR;

			if (unlikely(tcp_flags.value & (TCP_FLAG_RST|TCP_FLAG_FIN)))
				action = ACTION_CLOSE;
			else
				action = ACTION_CREATE;
		}

		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, l4_off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;
		break;

	case IPPROTO_UDP:
		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, l4_off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;

		action = ACTION_CREATE;
		break;

	default:
		/* Can't handle extension headers yet */
		relax_verifier();
		return DROP_CT_UNKNOWN_PROTO;
	}

	/* Lookup the reverse direction
	 *
	 * This will find an existing flow in the reverse direction.
	 * The reverse direction is the one where reverse nat index is stored.
	 */
	cilium_dbg3(skb, DBG_CT_LOOKUP6_1, (__u32) tuple->saddr.p4, (__u32) tuple->daddr.p4,
		      (bpf_ntohs(tuple->sport) << 16) | bpf_ntohs(tuple->dport));
	cilium_dbg3(skb, DBG_CT_LOOKUP6_2, (tuple->nexthdr << 8) | tuple->flags, 0, 0);
	ret = __ct_lookup(map, skb, tuple, action, dir, ct_state, is_tcp,
			  tcp_flags, monitor);
	if (ret != CT_NEW) {
		if (likely(ret == CT_ESTABLISHED)) {
			if (unlikely(tuple->flags & TUPLE_F_RELATED))
				ret = CT_RELATED;
			else
				ret = CT_REPLY;
		}
		goto out;
	}

	/* Lookup entry in forward direction */
	if (dir != CT_SERVICE) {
		ipv6_ct_tuple_reverse(tuple);
		ret = __ct_lookup(map, skb, tuple, action, dir, ct_state,
				  is_tcp, tcp_flags, monitor);
	}

#ifdef ENABLE_NAT46
	skb->cb[CB_NAT46_STATE] = NAT46_CLEAR;
#endif
out:
	cilium_dbg(skb, DBG_CT_VERDICT, ret < 0 ? -ret : ret, ct_state->rev_nat_index);
	if (conn_is_dns(tuple->dport))
		*monitor = MTU;
	return ret;
}
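/* Illustrative caller sketch (not part of this header; 'ct_map', 'tuple'
 * (assumed already initialized with addresses and nexthdr), 'l4_off' and the
 * surrounding policy handling are placeholders, actual callers live in the
 * datapath programs):
 *
 *	struct ct_state ct_state = {};
 *	__u32 monitor = 0;
 *	int ret;
 *
 *	ret = ct_lookup6(ct_map, &tuple, skb, l4_off, CT_EGRESS,
 *			 &ct_state, &monitor);
 *	if (ret < 0)
 *		return ret;
 *	if (ret == CT_NEW)
 *		ret = ct_create6(ct_map, &tuple, skb, CT_EGRESS, &ct_state,
 *				 false);
 *
 * CT_ESTABLISHED, CT_REPLY and CT_RELATED results then feed the policy
 * decision, with 'monitor' controlling how much of the packet is passed to
 * the trace notification.
 */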

static inline void __inline__ ct_flip_tuple_dir4(struct ipv4_ct_tuple *tuple)
{
	if (tuple->flags & TUPLE_F_IN)
		tuple->flags &= ~TUPLE_F_IN;
	else
		tuple->flags |= TUPLE_F_IN;
}

static inline void __inline__ ipv4_ct_tuple_reverse(struct ipv4_ct_tuple *tuple)
{
	__be32 tmp_addr = tuple->saddr;
	__be16 tmp;

	tuple->saddr = tuple->daddr;
	tuple->daddr = tmp_addr;

	tmp = tuple->sport;
	tuple->sport = tuple->dport;
	tuple->dport = tmp;

	ct_flip_tuple_dir4(tuple);
}

static inline void ct4_cilium_dbg_tuple(struct __sk_buff *skb, __u8 type,
					  const struct ipv4_ct_tuple *tuple,
					  __u32 rev_nat_index, int dir)
{
	__be32 addr = (dir == CT_INGRESS) ? tuple->saddr : tuple->daddr;
	cilium_dbg(skb, type, addr, rev_nat_index);
}

/* Offset must point to IPv4 header */
static inline int __inline__ ct_lookup4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	int ret = CT_NEW, action = ACTION_UNSPEC;
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags tcp_flags = { .value = 0 };

	/* The tuple is created in reverse order initially to find a
	 * potential reverse flow. This is required because the RELATED
	 * or REPLY state takes precedence over ESTABLISHED due to
	 * policy requirements.
	 *
	 * tuple->flags separates entries that could otherwise be overlapping.
	 */
	if (dir == CT_INGRESS)
		tuple->flags = TUPLE_F_OUT;
	else if (dir == CT_EGRESS)
		tuple->flags = TUPLE_F_IN;
	else if (dir == CT_SERVICE)
		tuple->flags = TUPLE_F_SERVICE;
	else
		return DROP_CT_INVALID_HDR;

	switch (tuple->nexthdr) {
	case IPPROTO_ICMP:
		if (1) {
			__be16 identifier = 0;
			__u8 type;

			if (skb_load_bytes(skb, off, &type, 1) < 0)
				return DROP_CT_INVALID_HDR;
			if ((type == ICMP_ECHO || type == ICMP_ECHOREPLY) &&
			     skb_load_bytes(skb, off + offsetof(struct icmphdr, un.echo.id),
					    &identifier, 2) < 0)
				return DROP_CT_INVALID_HDR;

			tuple->sport = 0;
			tuple->dport = 0;

			switch (type) {
			case ICMP_DEST_UNREACH:
			case ICMP_TIME_EXCEEDED:
			case ICMP_PARAMETERPROB:
				tuple->flags |= TUPLE_F_RELATED;
				break;

			case ICMP_ECHOREPLY:
				tuple->sport = identifier;
				break;

			case ICMP_ECHO:
				tuple->dport = identifier;
				/* fall through */
			default:
				action = ACTION_CREATE;
				break;
			}
		}
		break;

	case IPPROTO_TCP:
		if (1) {
			if (skb_load_bytes(skb, off + 12, &tcp_flags, 2) < 0)
				return DROP_CT_INVALID_HDR;

			if (unlikely(tcp_flags.value & (TCP_FLAG_RST|TCP_FLAG_FIN)))
				action = ACTION_CLOSE;
			else
				action = ACTION_CREATE;
		}

		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;
		break;

	case IPPROTO_UDP:
		/* load sport + dport into tuple */
		if (skb_load_bytes(skb, off, &tuple->dport, 4) < 0)
			return DROP_CT_INVALID_HDR;

		action = ACTION_CREATE;
		break;

	default:
		/* Can't handle extension headers yet */
		relax_verifier();
		return DROP_CT_UNKNOWN_PROTO;
	}

	/* Lookup the reverse direction
	 *
	 * This will find an existing flow in the reverse direction.
	 */
#ifndef QUIET_CT
	cilium_dbg3(skb, DBG_CT_LOOKUP4_1, tuple->saddr, tuple->daddr,
		      (bpf_ntohs(tuple->sport) << 16) | bpf_ntohs(tuple->dport));
	cilium_dbg3(skb, DBG_CT_LOOKUP4_2, (tuple->nexthdr << 8) | tuple->flags, 0, 0);
#endif
	ret = __ct_lookup(map, skb, tuple, action, dir, ct_state, is_tcp,
			  tcp_flags, monitor);
	if (ret != CT_NEW) {
		if (likely(ret == CT_ESTABLISHED)) {
			if (unlikely(tuple->flags & TUPLE_F_RELATED))
				ret = CT_RELATED;
			else
				ret = CT_REPLY;
		}
		goto out;
	}

	/* Lookup entry in forward direction */
	if (dir != CT_SERVICE) {
		ipv4_ct_tuple_reverse(tuple);
		ret = __ct_lookup(map, skb, tuple, action, dir, ct_state,
				  is_tcp, tcp_flags, monitor);
	}
out:
	cilium_dbg(skb, DBG_CT_VERDICT, ret < 0 ? -ret : ret, ct_state->rev_nat_index);
	if (conn_is_dns(tuple->dport))
		*monitor = MTU;
	return ret;
}

static inline void __inline__ ct_update6_backend_id(void *map,
						    struct ipv6_ct_tuple *tuple,
						    struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	/* See the ct_create4 comments re the rx_bytes hack */
	entry->backend_id = 0;
	entry->rx_bytes = state->backend_id;
	return;
}

static inline void __inline__
ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple,
			 struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	entry->rev_nat_index = state->rev_nat_index;
	return;
}

/* Offset must point to IPv6 */
static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool proxy_redirect)
{
	/* Create entry in original direction */
	struct ct_entry entry = { };
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags seen_flags = { .value = 0 };

	/* Note if this is a proxy connection so that replies can be redirected back to the proxy. */
	entry.proxy_redirect = proxy_redirect;

	/* See the ct_create4 comments re the rx_bytes hack */
	if (dir == CT_SERVICE) {
		entry.backend_id = 0;
		entry.rx_bytes = ct_state->backend_id;
	}

	entry.lb_loopback = ct_state->loopback;
	entry.node_port = ct_state->node_port;

	entry.rev_nat_index = ct_state->rev_nat_index;
	seen_flags.value |= is_tcp ? TCP_FLAG_SYN : 0;
	ct_update_timeout(&entry, is_tcp, dir, seen_flags);

	if (dir == CT_INGRESS) {
		entry.rx_packets = 1;
		entry.rx_bytes = skb->len;
	} else if (dir == CT_EGRESS) {
		entry.tx_packets = 1;
		entry.tx_bytes = skb->len;
	}

	cilium_dbg3(skb, DBG_CT_CREATED6, entry.rev_nat_index, ct_state->src_sec_id, 0);

	entry.src_sec_id = ct_state->src_sec_id;
	if (map_update_elem(map, tuple, &entry, 0) < 0)
		return DROP_CT_CREATE_FAILED;

	/* Create an ICMPv6 entry to relate errors */
	struct ipv6_ct_tuple icmp_tuple = {
		.nexthdr = IPPROTO_ICMPV6,
		.sport = 0,
		.dport = 0,
		.flags = tuple->flags | TUPLE_F_RELATED,
	};

	entry.seen_non_syn = true; /* For ICMP, there is no SYN. */

	ipv6_addr_copy(&icmp_tuple.daddr, &tuple->daddr);
	ipv6_addr_copy(&icmp_tuple.saddr, &tuple->saddr);

	/* FIXME: We could do a lookup and check if an L3 entry already exists */
	if (map_update_elem(map, &icmp_tuple, &entry, 0) < 0) {
		/* Previous map update succeeded, we could delete it
		 * but we might as well just let it time out.
		 */
		return DROP_CT_CREATE_FAILED;
	}

	return 0;
}

static inline void __inline__ ct_update4_backend_id(void *map,
						    struct ipv4_ct_tuple *tuple,
						    struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	/* See the ct_create4 comments re the rx_bytes hack */
	entry->backend_id = 0;
	entry->rx_bytes = state->backend_id;
	return;
}

static inline void __inline__
ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple,
			 struct ct_state *state)
{
	struct ct_entry *entry;

	entry = map_lookup_elem(map, tuple);
	if (!entry)
		return;

	entry->rev_nat_index = state->rev_nat_index;
	return;
}

static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool proxy_redirect)
{
	/* Create entry in original direction */
	struct ct_entry entry = { };
	bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
	union tcp_flags seen_flags = { .value = 0 };

	/* Note if this is a proxy connection so that replies can be redirected back to the proxy. */
	entry.proxy_redirect = proxy_redirect;

	entry.lb_loopback = ct_state->loopback;
	entry.node_port = ct_state->node_port;

	/* We need to store the backend_id (points to a svc v2 endpoint) while
	 * handling migration for users upgrading from prior releases, where
	 * the "ct_entry.backend_id" field was used for legacy services.
	 *
	 * Previously, the rx_bytes field was not used for entries with
	 * dir=CT_SERVICE (see GH#7060). Therefore, we can safely abuse
	 * this field to save the backend_id. The hack will go away once we stop
	 * supporting the legacy svc (in v1.6 we will zero the backend_id
	 * field, in v1.7 we can remove the rx_bytes hack).
	 */
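	/* (__ct_lookup() reads the value back for dir == CT_SERVICE lookups
	 * via ct_state->backend_id = entry->rx_bytes, see above.)
	 */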
	if (dir == CT_SERVICE) {
		entry.backend_id = 0;
		entry.rx_bytes = ct_state->backend_id;
	}
	entry.rev_nat_index = ct_state->rev_nat_index;
	seen_flags.value |= is_tcp ? TCP_FLAG_SYN : 0;
	ct_update_timeout(&entry, is_tcp, dir, seen_flags);

	if (dir == CT_INGRESS) {
		entry.rx_packets = 1;
		entry.rx_bytes = skb->len;
	} else if (dir == CT_EGRESS) {
		entry.tx_packets = 1;
		entry.tx_bytes = skb->len;
	}

#ifdef ENABLE_NAT46
	if (skb->cb[CB_NAT46_STATE] == NAT64)
		entry.nat46 = dir == CT_EGRESS;
#endif

	cilium_dbg3(skb, DBG_CT_CREATED4, entry.rev_nat_index, ct_state->src_sec_id, ct_state->addr);

	entry.src_sec_id = ct_state->src_sec_id;
	if (map_update_elem(map, tuple, &entry, 0) < 0)
		return DROP_CT_CREATE_FAILED;

	if (ct_state->addr && ct_state->loopback) {
		__u8 flags = tuple->flags;
		__be32 saddr, daddr;

		saddr = tuple->saddr;
		daddr = tuple->daddr;

		/* We are looping back into the origin endpoint through a
		 * service, so set up a conntrack tuple for the reply to
		 * ensure we do rev NAT before attempting to route the
		 * destination address, which will not point back to the
		 * right source. */
		tuple->flags = TUPLE_F_IN;
		if (dir == CT_INGRESS) {
			tuple->saddr = ct_state->addr;
			tuple->daddr = ct_state->svc_addr;
		} else {
			tuple->saddr = ct_state->svc_addr;
			tuple->daddr = ct_state->addr;
		}

		if (map_update_elem(map, tuple, &entry, 0) < 0)
			return DROP_CT_CREATE_FAILED;
		tuple->saddr = saddr;
		tuple->daddr = daddr;
		tuple->flags = flags;
	}

	/* Create an ICMP entry to relate errors */
	struct ipv4_ct_tuple icmp_tuple = {
		.daddr = tuple->daddr,
		.saddr = tuple->saddr,
		.nexthdr = IPPROTO_ICMP,
		.sport = 0,
		.dport = 0,
		.flags = tuple->flags | TUPLE_F_RELATED,
	};

	entry.seen_non_syn = true; /* For ICMP, there is no SYN. */

	/* FIXME: We could do a lookup and check if an L3 entry already exists */
	if (map_update_elem(map, &icmp_tuple, &entry, 0) < 0)
		return DROP_CT_CREATE_FAILED;

	return 0;
}

#else /* !CONNTRACK */
static inline int __inline__ ct_lookup6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	return 0;
}

static inline int __inline__ ct_lookup4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int off, int dir,
					struct ct_state *ct_state, __u32 *monitor)
{
	return 0;
}

static inline void __inline__ ct_update6_backend_id(void *map,
						    struct ipv6_ct_tuple *tuple,
						    struct ct_state *state)
{
}

static inline void __inline__
ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple,
			 struct ct_state *state)
{
}

static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool from_proxy)
{
	return 0;
}

static inline void __inline__ ct_update4_backend_id(void *map,
						    struct ipv4_ct_tuple *tuple,
						    struct ct_state *state)
{
}

static inline void __inline__
ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple,
			 struct ct_state *state)
{
}

static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple,
					struct __sk_buff *skb, int dir,
					struct ct_state *ct_state, bool from_proxy)
{
	return 0;
}

#endif

#endif