github.com/cilium/cilium@v1.16.2/bpf/include/bpf/builtins.h

     1  /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
     2  /* Copyright Authors of Cilium */
     3  
     4  #pragma once
     5  
     6  #include "compiler.h"
     7  
     8  #ifndef lock_xadd
     9  # define lock_xadd(P, V)	((void) __sync_fetch_and_add((P), (V)))
    10  #endif
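
/* lock_xadd() is just __sync_fetch_and_add() with the result discarded,
 * i.e. an atomic add, which LLVM can lower to a BPF atomic add
 * instruction. A minimal usage sketch follows; struct counters and
 * count_packet() are hypothetical and not part of this header.
 */
struct counters {
	__u64 packets;
	__u64 bytes;
};

static __always_inline void count_packet(struct counters *c, __u64 bytes)
{
	lock_xadd(&c->packets, 1);
	lock_xadd(&c->bytes, bytes);
}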
    11  
    12  /* Unfortunately the verifier forces aligned stack access while other
    13   * memory (map, pkt, etc) does not have to be aligned. Mark objects
    14   * > 8 bytes on the /stack/ to force-align such memcpy candidates
    15   * when we really need them to be aligned. This is not needed for
    16   * objects of size <= 8 bytes and, for larger objects, only when
    17   * 8 bytes is not already their natural alignment (e.g. __u8 foo[12]).
    18   */
    19  #define __align_stack_8		__aligned(8)
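
/* Usage sketch for __align_stack_8 (hypothetical, not part of this
 * header): a 12-byte on-stack array only has 1-byte natural alignment,
 * so it is marked to keep the 8-byte wide accesses emitted by the
 * unrolled helpers below acceptable to the verifier. memset() here is
 * the wrapper defined further down in this file.
 */
static __always_inline void clear_two_macs(void)
{
	__u8 macs[12] __align_stack_8;

	memset(macs, 0, sizeof(macs));	/* constant len, c == 0 */
}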
    20  
    21  /* Memory iterators used below. */
    22  #define __it_bwd(x, op) (x -= sizeof(__u##op))
    23  #define __it_fwd(x, op) (x += sizeof(__u##op))
    24  
    25  /* Memory operators used below. */
    26  #define __it_set(a, op) (*(__u##op *)__it_bwd(a, op)) = 0
    27  #define __it_xor(a, b, r, op) r |= (*(__u##op *)__it_bwd(a, op)) ^ (*(__u##op *)__it_bwd(b, op))
    28  #define __it_mob(a, b, op) (*(__u##op *)__it_bwd(a, op)) = (*(__u##op *)__it_bwd(b, op))
    29  #define __it_mof(a, b, op)				\
    30  	do {						\
    31  		*(__u##op *)a = *(__u##op *)b;		\
    32  		__it_fwd(a, op); __it_fwd(b, op);	\
    33  	} while (0)
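
/* Illustration (hypothetical, not part of this header) of how the
 * backward-walking operators compose: __it_set(d, 64) first rewinds d
 * by 8 bytes and then zeroes those 8 bytes, so with d starting one past
 * the end the buffer is cleared back to front. __it_mof() is the only
 * forward walker and backs __bpf_memmove_fwd() further below.
 */
static __always_inline void zero_16_by_hand(__u8 *buf)
{
	void *d = buf + 16;

	__it_set(d, 64);	/* clears bytes 8..15 */
	__it_set(d, 64);	/* clears bytes 0..7  */
}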
    34  
    35  static __always_inline __maybe_unused void
    36  __bpf_memset_builtin(void *d, __u8 c, __u64 len)
    37  {
    38  	/* Everything with a non-zero or non-constant (the latter is
    39  	 * currently unsupported) c gets handled here.
    40  	 */
    41  	__builtin_memset(d, c, len);
    42  }
    43  
    44  static __always_inline void __bpf_memzero(void *d, __u64 len)
    45  {
    46  #if __clang_major__ >= 10
    47  	if (!__builtin_constant_p(len))
    48  		__throw_build_bug();
    49  
    50  	d += len;
    51  
    52  	if (len > 1 && len % 2 == 1) {
    53  		__it_set(d, 8);
    54  		len -= 1;
    55  	}
    56  
    57  	switch (len) {
    58  	case 96:         __it_set(d, 64); fallthrough;
    59  	case 88: jmp_88: __it_set(d, 64); fallthrough;
    60  	case 80: jmp_80: __it_set(d, 64); fallthrough;
    61  	case 72: jmp_72: __it_set(d, 64); fallthrough;
    62  	case 64: jmp_64: __it_set(d, 64); fallthrough;
    63  	case 56: jmp_56: __it_set(d, 64); fallthrough;
    64  	case 48: jmp_48: __it_set(d, 64); fallthrough;
    65  	case 40: jmp_40: __it_set(d, 64); fallthrough;
    66  	case 32: jmp_32: __it_set(d, 64); fallthrough;
    67  	case 24: jmp_24: __it_set(d, 64); fallthrough;
    68  	case 16: jmp_16: __it_set(d, 64); fallthrough;
    69  	case  8: jmp_8:  __it_set(d, 64);
    70  		break;
    71  
    72  	case 94: __it_set(d, 16); __it_set(d, 32); goto jmp_88;
    73  	case 86: __it_set(d, 16); __it_set(d, 32); goto jmp_80;
    74  	case 78: __it_set(d, 16); __it_set(d, 32); goto jmp_72;
    75  	case 70: __it_set(d, 16); __it_set(d, 32); goto jmp_64;
    76  	case 62: __it_set(d, 16); __it_set(d, 32); goto jmp_56;
    77  	case 54: __it_set(d, 16); __it_set(d, 32); goto jmp_48;
    78  	case 46: __it_set(d, 16); __it_set(d, 32); goto jmp_40;
    79  	case 38: __it_set(d, 16); __it_set(d, 32); goto jmp_32;
    80  	case 30: __it_set(d, 16); __it_set(d, 32); goto jmp_24;
    81  	case 22: __it_set(d, 16); __it_set(d, 32); goto jmp_16;
    82  	case 14: __it_set(d, 16); __it_set(d, 32); goto jmp_8;
    83  	case  6: __it_set(d, 16); __it_set(d, 32);
    84  		break;
    85  
    86  	case 92: __it_set(d, 32); goto jmp_88;
    87  	case 84: __it_set(d, 32); goto jmp_80;
    88  	case 76: __it_set(d, 32); goto jmp_72;
    89  	case 68: __it_set(d, 32); goto jmp_64;
    90  	case 60: __it_set(d, 32); goto jmp_56;
    91  	case 52: __it_set(d, 32); goto jmp_48;
    92  	case 44: __it_set(d, 32); goto jmp_40;
    93  	case 36: __it_set(d, 32); goto jmp_32;
    94  	case 28: __it_set(d, 32); goto jmp_24;
    95  	case 20: __it_set(d, 32); goto jmp_16;
    96  	case 12: __it_set(d, 32); goto jmp_8;
    97  	case  4: __it_set(d, 32);
    98  		break;
    99  
   100  	case 90: __it_set(d, 16); goto jmp_88;
   101  	case 82: __it_set(d, 16); goto jmp_80;
   102  	case 74: __it_set(d, 16); goto jmp_72;
   103  	case 66: __it_set(d, 16); goto jmp_64;
   104  	case 58: __it_set(d, 16); goto jmp_56;
   105  	case 50: __it_set(d, 16); goto jmp_48;
   106  	case 42: __it_set(d, 16); goto jmp_40;
   107  	case 34: __it_set(d, 16); goto jmp_32;
   108  	case 26: __it_set(d, 16); goto jmp_24;
   109  	case 18: __it_set(d, 16); goto jmp_16;
   110  	case 10: __it_set(d, 16); goto jmp_8;
   111  	case  2: __it_set(d, 16);
   112  		break;
   113  
   114  	case  1: __it_set(d, 8);
   115  		break;
   116  
   117  	default:
   118  		/* __builtin_memset() is crappy slow since it cannot
   119  		 * make any assumptions about alignment & underlying
   120  		 * efficient unaligned access on the target we're
   121  		 * running on.
   122  		 */
   123  		__throw_build_bug();
   124  	}
   125  #else
   126  	__bpf_memset_builtin(d, 0, len);
   127  #endif
   128  }
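
/* Worked example (hypothetical caller): __bpf_memzero(buf, 13). After
 * d += 13, the odd-length pre-step stores 1 byte at buf[12] and drops
 * len to 12; case 12 stores 4 bytes at buf[8..11] and jumps to jmp_8,
 * which stores the last 8 bytes at buf[0..7]. Three stores, no loop;
 * any constant size from 1 to 97 bytes unrolls the same way, while on
 * clang >= 10 anything else (or a non-constant length) hits
 * __throw_build_bug().
 */
static __always_inline void zero_13_bytes(__u8 *buf)
{
	__bpf_memzero(buf, 13);
}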
   129  
   130  static __always_inline __maybe_unused void
   131  __bpf_no_builtin_memset(void *d __maybe_unused, __u8 c __maybe_unused,
   132  			__u64 len __maybe_unused)
   133  {
   134  	__throw_build_bug();
   135  }
   136  
   137  /* Redirect any direct use in our code to throw an error. */
   138  #define __builtin_memset	__bpf_no_builtin_memset
   139  
   140  static __always_inline __nobuiltin("memset") void memset(void *d, int c,
   141  							 __u64 len)
   142  {
   143  	if (__builtin_constant_p(len) && __builtin_constant_p(c) && c == 0)
   144  		__bpf_memzero(d, len);
   145  	else
   146  		__bpf_memset_builtin(d, (__u8)c, len);
   147  }
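
/* Dispatch sketch for the memset() wrapper above (hypothetical caller;
 * struct conn_val is illustrative only): a constant length together
 * with a constant zero byte takes the unrolled __bpf_memzero() path,
 * anything else goes through __bpf_memset_builtin().
 */
struct conn_val {
	__u32 ifindex;
	__u32 flags;
	__u64 last_seen;
};

static __always_inline void reset_conn_val(struct conn_val *v)
{
	memset(v, 0, sizeof(*v));			/* -> __bpf_memzero(), unrolled */
	memset(&v->flags, 0xff, sizeof(v->flags));	/* -> __bpf_memset_builtin()    */
}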
   148  
   149  static __always_inline __maybe_unused void
   150  __bpf_memcpy_builtin(void *d, const void *s, __u64 len)
   151  {
   152  	/* Explicit opt-in for __builtin_memcpy(). */
   153  	__builtin_memcpy(d, s, len);
   154  }
   155  
   156  static __always_inline void __bpf_memcpy(void *d, const void *s, __u64 len)
   157  {
   158  #if __clang_major__ >= 10
   159  	if (!__builtin_constant_p(len))
   160  		__throw_build_bug();
   161  
   162  	d += len;
   163  	s += len;
   164  
   165  	if (len > 1 && len % 2 == 1) {
   166  		__it_mob(d, s, 8);
   167  		len -= 1;
   168  	}
   169  
   170  	switch (len) {
   171  	case 96:         __it_mob(d, s, 64); fallthrough;
   172  	case 88: jmp_88: __it_mob(d, s, 64); fallthrough;
   173  	case 80: jmp_80: __it_mob(d, s, 64); fallthrough;
   174  	case 72: jmp_72: __it_mob(d, s, 64); fallthrough;
   175  	case 64: jmp_64: __it_mob(d, s, 64); fallthrough;
   176  	case 56: jmp_56: __it_mob(d, s, 64); fallthrough;
   177  	case 48: jmp_48: __it_mob(d, s, 64); fallthrough;
   178  	case 40: jmp_40: __it_mob(d, s, 64); fallthrough;
   179  	case 32: jmp_32: __it_mob(d, s, 64); fallthrough;
   180  	case 24: jmp_24: __it_mob(d, s, 64); fallthrough;
   181  	case 16: jmp_16: __it_mob(d, s, 64); fallthrough;
   182  	case  8: jmp_8:  __it_mob(d, s, 64);
   183  		break;
   184  
   185  	case 94: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_88;
   186  	case 86: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_80;
   187  	case 78: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_72;
   188  	case 70: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_64;
   189  	case 62: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_56;
   190  	case 54: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_48;
   191  	case 46: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_40;
   192  	case 38: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_32;
   193  	case 30: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_24;
   194  	case 22: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_16;
   195  	case 14: __it_mob(d, s, 16); __it_mob(d, s, 32); goto jmp_8;
   196  	case  6: __it_mob(d, s, 16); __it_mob(d, s, 32);
   197  		break;
   198  
   199  	case 92: __it_mob(d, s, 32); goto jmp_88;
   200  	case 84: __it_mob(d, s, 32); goto jmp_80;
   201  	case 76: __it_mob(d, s, 32); goto jmp_72;
   202  	case 68: __it_mob(d, s, 32); goto jmp_64;
   203  	case 60: __it_mob(d, s, 32); goto jmp_56;
   204  	case 52: __it_mob(d, s, 32); goto jmp_48;
   205  	case 44: __it_mob(d, s, 32); goto jmp_40;
   206  	case 36: __it_mob(d, s, 32); goto jmp_32;
   207  	case 28: __it_mob(d, s, 32); goto jmp_24;
   208  	case 20: __it_mob(d, s, 32); goto jmp_16;
   209  	case 12: __it_mob(d, s, 32); goto jmp_8;
   210  	case  4: __it_mob(d, s, 32);
   211  		break;
   212  
   213  	case 90: __it_mob(d, s, 16); goto jmp_88;
   214  	case 82: __it_mob(d, s, 16); goto jmp_80;
   215  	case 74: __it_mob(d, s, 16); goto jmp_72;
   216  	case 66: __it_mob(d, s, 16); goto jmp_64;
   217  	case 58: __it_mob(d, s, 16); goto jmp_56;
   218  	case 50: __it_mob(d, s, 16); goto jmp_48;
   219  	case 42: __it_mob(d, s, 16); goto jmp_40;
   220  	case 34: __it_mob(d, s, 16); goto jmp_32;
   221  	case 26: __it_mob(d, s, 16); goto jmp_24;
   222  	case 18: __it_mob(d, s, 16); goto jmp_16;
   223  	case 10: __it_mob(d, s, 16); goto jmp_8;
   224  	case  2: __it_mob(d, s, 16);
   225  		break;
   226  
   227  	case  1: __it_mob(d, s, 8);
   228  		break;
   229  
   230  	default:
   231  		/* __builtin_memcpy() is crappy slow since it cannot
   232  		 * make any assumptions about alignment & underlying
   233  		 * efficient unaligned access on the target we're
   234  		 * running on.
   235  		 */
   236  		__throw_build_bug();
   237  	}
   238  #else
   239  	__bpf_memcpy_builtin(d, s, len);
   240  #endif
   241  }
   242  
   243  static __always_inline __maybe_unused void
   244  __bpf_no_builtin_memcpy(void *d __maybe_unused, const void *s __maybe_unused,
   245  			__u64 len __maybe_unused)
   246  {
   247  	__throw_build_bug();
   248  }
   249  
   250  /* Redirect any direct use in our code to throw an error. */
   251  #define __builtin_memcpy	__bpf_no_builtin_memcpy
   252  
   253  static __always_inline __nobuiltin("memcpy") void memcpy(void *d, const void *s,
   254  							 __u64 len)
   255  {
   256  	return __bpf_memcpy(d, s, len);
   257  }
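
/* Usage sketch for the memcpy() wrapper above (hypothetical caller):
 * a constant 14-byte length unrolls into one 2-, one 4- and one 8-byte
 * copy (case 14 above), and the on-stack staging buffer is marked
 * __align_stack_8 so the wide stack accesses stay verifier-safe.
 */
static __always_inline void copy_14_via_stack(void *dst, const void *src)
{
	__u8 tmp[14] __align_stack_8;

	memcpy(tmp, src, sizeof(tmp));
	memcpy(dst, tmp, sizeof(tmp));
}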
   258  
   259  static __always_inline __maybe_unused __u64
   260  __bpf_memcmp_builtin(const void *x, const void *y, __u64 len)
   261  {
   262  	/* Explicit opt-in for __builtin_memcmp(). We use the bcmp builtin
   263  	 * here for two reasons: i) we only need to know equal or non-equal
   264  	 * similar to __bpf_memcmp(), and ii) if __bpf_memcmp() ends up
   265  	 * selecting __bpf_memcmp_builtin(), clang generates a memcmp loop.
   266  	 * That is, (*) -> __bpf_memcmp() -> __bpf_memcmp_builtin() ->
   267  	 * __builtin_memcmp() -> memcmp() -> (*), meaning it will end up
   268  	 * selecting our memcmp() from here. Remapping to __builtin_bcmp()
   269  	 * breaks this loop and resolves both needs at once.
   270  	 */
   271  	return __builtin_bcmp(x, y, len);
   272  }
   273  
   274  static __always_inline __u64 __bpf_memcmp(const void *x, const void *y,
   275  					  __u64 len)
   276  {
   277  #if __clang_major__ >= 10
   278  	__u64 r = 0;
   279  
   280  	if (!__builtin_constant_p(len))
   281  		__throw_build_bug();
   282  
   283  	x += len;
   284  	y += len;
   285  
   286  	if (len > 1 && len % 2 == 1) {
   287  		__it_xor(x, y, r, 8);
   288  		len -= 1;
   289  	}
   290  
   291  	switch (len) {
   292  	case 72:         __it_xor(x, y, r, 64); fallthrough;
   293  	case 64: jmp_64: __it_xor(x, y, r, 64); fallthrough;
   294  	case 56: jmp_56: __it_xor(x, y, r, 64); fallthrough;
   295  	case 48: jmp_48: __it_xor(x, y, r, 64); fallthrough;
   296  	case 40: jmp_40: __it_xor(x, y, r, 64); fallthrough;
   297  	case 32: jmp_32: __it_xor(x, y, r, 64); fallthrough;
   298  	case 24: jmp_24: __it_xor(x, y, r, 64); fallthrough;
   299  	case 16: jmp_16: __it_xor(x, y, r, 64); fallthrough;
   300  	case  8: jmp_8:  __it_xor(x, y, r, 64);
   301  		break;
   302  
   303  	case 70: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_64;
   304  	case 62: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_56;
   305  	case 54: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_48;
   306  	case 46: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_40;
   307  	case 38: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_32;
   308  	case 30: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_24;
   309  	case 22: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_16;
   310  	case 14: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32); goto jmp_8;
   311  	case  6: __it_xor(x, y, r, 16); __it_xor(x, y, r, 32);
   312  		break;
   313  
   314  	case 68: __it_xor(x, y, r, 32); goto jmp_64;
   315  	case 60: __it_xor(x, y, r, 32); goto jmp_56;
   316  	case 52: __it_xor(x, y, r, 32); goto jmp_48;
   317  	case 44: __it_xor(x, y, r, 32); goto jmp_40;
   318  	case 36: __it_xor(x, y, r, 32); goto jmp_32;
   319  	case 28: __it_xor(x, y, r, 32); goto jmp_24;
   320  	case 20: __it_xor(x, y, r, 32); goto jmp_16;
   321  	case 12: __it_xor(x, y, r, 32); goto jmp_8;
   322  	case  4: __it_xor(x, y, r, 32);
   323  		break;
   324  
   325  	case 66: __it_xor(x, y, r, 16); goto jmp_64;
   326  	case 58: __it_xor(x, y, r, 16); goto jmp_56;
   327  	case 50: __it_xor(x, y, r, 16); goto jmp_48;
   328  	case 42: __it_xor(x, y, r, 16); goto jmp_40;
   329  	case 34: __it_xor(x, y, r, 16); goto jmp_32;
   330  	case 26: __it_xor(x, y, r, 16); goto jmp_24;
   331  	case 18: __it_xor(x, y, r, 16); goto jmp_16;
   332  	case 10: __it_xor(x, y, r, 16); goto jmp_8;
   333  	case  2: __it_xor(x, y, r, 16);
   334  		break;
   335  
   336  	case  1: __it_xor(x, y, r, 8);
   337  		break;
   338  
   339  	default:
   340  		__throw_build_bug();
   341  	}
   342  
   343  	return r;
   344  #else
   345  	return __bpf_memcmp_builtin(x, y, len);
   346  #endif
   347  }
   348  
   349  static __always_inline __maybe_unused __u64
   350  __bpf_no_builtin_memcmp(const void *x __maybe_unused,
   351  			const void *y __maybe_unused, __u64 len __maybe_unused)
   352  {
   353  	__throw_build_bug();
   354  	return 0;
   355  }
   356  
   357  /* Redirect any direct use in our code to throw an error. */
   358  #define __builtin_memcmp	__bpf_no_builtin_memcmp
   359  
   360  /* Modified for our needs in that we only return either zero (x and y
   361   * are equal) or non-zero (x and y are non-equal).
   362   */
   363  static __always_inline __nobuiltin("memcmp") __u64 memcmp(const void *x,
   364  							  const void *y,
   365  							  __u64 len)
   366  {
   367  	return __bpf_memcmp(x, y, len);
   368  }
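
/* Usage sketch for the memcmp() wrapper above (hypothetical caller):
 * because the result is only zero vs. non-zero, compare it against zero
 * and never rely on sign or ordering the way libc memcmp() allows.
 */
static __always_inline int mac_addrs_equal(const __u8 *a, const __u8 *b)
{
	return memcmp(a, b, 6) == 0;	/* 6 == one 2-byte + one 4-byte xor */
}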
   369  
   370  static __always_inline __maybe_unused void
   371  __bpf_memmove_builtin(void *d, const void *s, __u64 len)
   372  {
   373  	/* Explicit opt-in for __builtin_memmove(). */
   374  	__builtin_memmove(d, s, len);
   375  }
   376  
   377  static __always_inline void __bpf_memmove_bwd(void *d, const void *s, __u64 len)
   378  {
   379  	/* Our internal memcpy implementation walks backwards by default. */
   380  	__bpf_memcpy(d, s, len);
   381  }
   382  
   383  static __always_inline void __bpf_memmove_fwd(void *d, const void *s, __u64 len)
   384  {
   385  #if __clang_major__ >= 10
   386  	if (!__builtin_constant_p(len))
   387  		__throw_build_bug();
   388  
   389  	switch (len) {
   390  	case 96:         __it_mof(d, s, 64); fallthrough;
   391  	case 88: jmp_88: __it_mof(d, s, 64); fallthrough;
   392  	case 80: jmp_80: __it_mof(d, s, 64); fallthrough;
   393  	case 72: jmp_72: __it_mof(d, s, 64); fallthrough;
   394  	case 64: jmp_64: __it_mof(d, s, 64); fallthrough;
   395  	case 56: jmp_56: __it_mof(d, s, 64); fallthrough;
   396  	case 48: jmp_48: __it_mof(d, s, 64); fallthrough;
   397  	case 40: jmp_40: __it_mof(d, s, 64); fallthrough;
   398  	case 32: jmp_32: __it_mof(d, s, 64); fallthrough;
   399  	case 24: jmp_24: __it_mof(d, s, 64); fallthrough;
   400  	case 16: jmp_16: __it_mof(d, s, 64); fallthrough;
   401  	case  8: jmp_8:  __it_mof(d, s, 64);
   402  		break;
   403  
   404  	case 94: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_88;
   405  	case 86: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_80;
   406  	case 78: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_72;
   407  	case 70: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_64;
   408  	case 62: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_56;
   409  	case 54: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_48;
   410  	case 46: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_40;
   411  	case 38: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_32;
   412  	case 30: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_24;
   413  	case 22: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_16;
   414  	case 14: __it_mof(d, s, 16); __it_mof(d, s, 32); goto jmp_8;
   415  	case  6: __it_mof(d, s, 16); __it_mof(d, s, 32);
   416  		break;
   417  
   418  	case 92: __it_mof(d, s, 32); goto jmp_88;
   419  	case 84: __it_mof(d, s, 32); goto jmp_80;
   420  	case 76: __it_mof(d, s, 32); goto jmp_72;
   421  	case 68: __it_mof(d, s, 32); goto jmp_64;
   422  	case 60: __it_mof(d, s, 32); goto jmp_56;
   423  	case 52: __it_mof(d, s, 32); goto jmp_48;
   424  	case 44: __it_mof(d, s, 32); goto jmp_40;
   425  	case 36: __it_mof(d, s, 32); goto jmp_32;
   426  	case 28: __it_mof(d, s, 32); goto jmp_24;
   427  	case 20: __it_mof(d, s, 32); goto jmp_16;
   428  	case 12: __it_mof(d, s, 32); goto jmp_8;
   429  	case  4: __it_mof(d, s, 32);
   430  		break;
   431  
   432  	case 90: __it_mof(d, s, 16); goto jmp_88;
   433  	case 82: __it_mof(d, s, 16); goto jmp_80;
   434  	case 74: __it_mof(d, s, 16); goto jmp_72;
   435  	case 66: __it_mof(d, s, 16); goto jmp_64;
   436  	case 58: __it_mof(d, s, 16); goto jmp_56;
   437  	case 50: __it_mof(d, s, 16); goto jmp_48;
   438  	case 42: __it_mof(d, s, 16); goto jmp_40;
   439  	case 34: __it_mof(d, s, 16); goto jmp_32;
   440  	case 26: __it_mof(d, s, 16); goto jmp_24;
   441  	case 18: __it_mof(d, s, 16); goto jmp_16;
   442  	case 10: __it_mof(d, s, 16); goto jmp_8;
   443  	case  2: __it_mof(d, s, 16);
   444  		break;
   445  
   446  	case  1: __it_mof(d, s, 8);
   447  		break;
   448  
   449  	default:
   450  		/* __builtin_memmove() is crappy slow since it cannot
   451  		 * make any assumptions about alignment & underlying
   452  		 * efficient unaligned access on the target we're
   453  		 * running on.
   454  		 */
   455  		__throw_build_bug();
   456  	}
   457  #else
   458  	__bpf_memmove_builtin(d, s, len);
   459  #endif
   460  }
   461  
   462  static __always_inline __maybe_unused void
   463  __bpf_no_builtin_memmove(void *d __maybe_unused, const void *s __maybe_unused,
   464  			 __u64 len __maybe_unused)
   465  {
   466  	__throw_build_bug();
   467  }
   468  
   469  /* Redirect any direct use in our code to throw an error. */
   470  #define __builtin_memmove	__bpf_no_builtin_memmove
   471  
   472  static __always_inline void __bpf_memmove(void *d, const void *s, __u64 len)
   473  {
   474  	/* Note that the forward-walking memmove() might not work with on-stack data
   475  	 * since we'll end up walking the memory unaligned even when __align_stack_8
   476  	 * is set. Should not matter much since we'll use memmove() mostly or only
   477  	 * on pkt data.
   478  	 *
   479  	 * Example with d, s, len = 12 bytes:
   480  	 *   * __bpf_memmove_fwd() emits: mov_32 d[0],s[0]; mov_64 d[4],s[4]
   481  	 *   * __bpf_memmove_bwd() emits: mov_32 d[8],s[8]; mov_64 d[0],s[0]
   482  	 */
   483  	if (d <= s)
   484  		return __bpf_memmove_fwd(d, s, len);
   485  	else
   486  		return __bpf_memmove_bwd(d, s, len);
   487  }
   488  
   489  static __always_inline __nobuiltin("memmove") void memmove(void *d,
   490  							   const void *s,
   491  							   __u64 len)
   492  {
   493  	return __bpf_memmove(d, s, len);
   494  }
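
/* Usage sketch for memmove() with overlapping ranges in one buffer
 * (hypothetical caller, buf is assumed to hold at least 20 bytes):
 * for d <= s the forward walker is picked, for d > s the backward
 * walker, so source bytes are always read before they get overwritten.
 */
static __always_inline void shift_within(__u8 *buf)
{
	memmove(buf, buf + 4, 16);	/* d < s: forward copy is safe  */
	memmove(buf + 4, buf, 16);	/* d > s: backward copy is safe */
}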