github.com/golang/gofrontend@v0.0.0-20240429183944-60f985a78526/libgo/runtime/aeshash.c

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Hash code using AES intrinsics.
     6  
     7  #include "runtime.h"
     8  
     9  uintptr aeshashbody(void*, uintptr, uintptr, Slice)
    10  	__asm__(GOSYM_PREFIX "runtime.aeshashbody");
    11  
    12  uintptr aeshashbody(void*, uintptr, uintptr, Slice)
    13  	__attribute__((no_split_stack));
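// The __asm__ name above makes this C function visible to Go code as
// runtime.aeshashbody, and no_split_stack omits the split-stack prologue;
// the function is a leaf, so it runs on whatever stack the caller provides.
// The key material arrives as a libgo Slice, whose __values field is the
// backing-array pointer.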
    14  
    15  #if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_AS_X86_AES)
    16  
    17  #include <emmintrin.h>
    18  #include <tmmintrin.h>
    19  #include <wmmintrin.h>
    20  
    21  // Force appropriate CPU level.  We won't call here unless the CPU
    22  // supports it.
    23  
    24  #pragma GCC target("ssse3", "aes")
    25  
    26  #ifdef __x86_64__
    27  
    28  // aeshashbody implements a hash function using AES instructions
    29  // available in recent x86 processors. Note this is not encryption,
    30  // just hashing.
    31  //
    32  // This is written to produce exactly the same results as the gc
    33  // implementation, not because that matters, but just to ensure that
    34  // this does something reasonable.
    35  uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
    36  	__m128i mseed, mseed2, mseed3, mseed4, mseed5, mseed6, mseed7, mseed8;
    37  	__m128i mval, mval2, mval3, mval4, mval5, mval6, mval7, mval8;
    38  
    39  	// Start with hash seed.
    40  	mseed = _mm_cvtsi64_si128(seed);
    41  	// Get 16 bits of length.
    42  	mseed = _mm_insert_epi16(mseed, size, 4);
    43  	// Repeat length 4 times total.
    44  	mseed = _mm_shufflehi_epi16(mseed, 0);
    45  	// Save unscrambled seed.
    46  	mseed2 = mseed;
    47  	// XOR in per-process seed.
    48  	mseed ^= _mm_loadu_si128(aeskeysched.__values);
    49  	// Scramble seed.
    50  	mseed = _mm_aesenc_si128(mseed, mseed);
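	// mseed was built from the caller's hash seed (low 64 bits) and the
	// low 16 bits of size repeated across the four high 16-bit lanes; it
	// has now been XORed with the first 16 bytes of the per-process
	// random key material in aeskeysched and scrambled with one AES
	// round.  mseed2 still holds the unscrambled seed/length combination,
	// from which further per-block seeds are derived below.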
    51  
    52  	if (size <= 16) {
    53  		if (size == 0) {
    54  			// Return scrambled input seed.
    55  			return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed, mseed));
    56  		} else if (size < 16) {
    57  			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
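				// The test above is zero only when p lies in
				// the last 16 bytes before a 4 KiB boundary,
				// i.e. only when an unaligned 16-byte load at
				// p might run onto the next (possibly
				// unmapped) page; that case takes the
				// shuffle-based path below instead.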
    58  				static const uint64 masks[32]
    59  				  __attribute__ ((aligned(16))) =
    60  				  {
    61  				    0x0000000000000000, 0x0000000000000000,
    62  				    0x00000000000000ff, 0x0000000000000000,
    63  				    0x000000000000ffff, 0x0000000000000000,
    64  				    0x0000000000ffffff, 0x0000000000000000,
    65  				    0x00000000ffffffff, 0x0000000000000000,
    66  				    0x000000ffffffffff, 0x0000000000000000,
    67  				    0x0000ffffffffffff, 0x0000000000000000,
    68  				    0x00ffffffffffffff, 0x0000000000000000,
    69  				    0xffffffffffffffff, 0x0000000000000000,
    70  				    0xffffffffffffffff, 0x00000000000000ff,
    71  				    0xffffffffffffffff, 0x000000000000ffff,
    72  				    0xffffffffffffffff, 0x0000000000ffffff,
    73  				    0xffffffffffffffff, 0x00000000ffffffff,
    74  				    0xffffffffffffffff, 0x000000ffffffffff,
    75  				    0xffffffffffffffff, 0x0000ffffffffffff,
    76  				    0xffffffffffffffff, 0x00ffffffffffffff
    77  				  };
    78  
    79  				// 16 bytes loaded at p won't cross a page
    80  				// boundary, so we can load directly.
    81  				mval = _mm_loadu_si128(p);
    82  				mval &= *(const __m128i*)(&masks[size*2]);
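				// For example, with size == 3 the pair
				// masks[6], masks[7] is {0x0000000000ffffff, 0},
				// so the AND keeps the three data bytes and
				// zeroes the 13 bytes that were loaded from
				// beyond the object.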
    83  			} else {
    84  				static const uint64 shifts[32]
    85  				  __attribute__ ((aligned(16))) =
    86  				  {
    87  				    0x0000000000000000, 0x0000000000000000,
    88  				    0xffffffffffffff0f, 0xffffffffffffffff,
    89  				    0xffffffffffff0f0e, 0xffffffffffffffff,
    90  				    0xffffffffff0f0e0d, 0xffffffffffffffff,
    91  				    0xffffffff0f0e0d0c, 0xffffffffffffffff,
    92  				    0xffffff0f0e0d0c0b, 0xffffffffffffffff,
    93  				    0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
    94  				    0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
    95  				    0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
    96  				    0x0e0d0c0b0a090807, 0xffffffffffffff0f,
    97  				    0x0d0c0b0a09080706, 0xffffffffffff0f0e,
    98  				    0x0c0b0a0908070605, 0xffffffffff0f0e0d,
    99  				    0x0b0a090807060504, 0xffffffff0f0e0d0c,
   100  				    0x0a09080706050403, 0xffffff0f0e0d0c0b,
   101  				    0x0908070605040302, 0xffff0f0e0d0c0b0a,
   102  				    0x0807060504030201, 0xff0f0e0d0c0b0a09,
   103  				  };
   104  
   105  				// address ends in 1111xxxx. Might be
   106  				// up against a page boundary, so load
   107  				// ending at last byte.  Then shift
   108  				// bytes down using pshufb.
   109  				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
   110  				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
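				// For example, with size == 3 the load starts
				// at p-13 and the control bytes
				// 0x0d,0x0e,0x0f,0xff,... move source bytes
				// 13-15 (the data) into bytes 0-2 of the
				// result while the 0xff entries zero the rest,
				// matching what the masked load above produces.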
   111  			}
   112  		} else {
   113  			mval = _mm_loadu_si128(p);
   114  		}
   115  
   116  		// XOR data with seed.
   117  		mval ^= mseed;
   118  		// Scramble combo 3 times.
   119  		mval = _mm_aesenc_si128(mval, mval);
   120  		mval = _mm_aesenc_si128(mval, mval);
   121  		mval = _mm_aesenc_si128(mval, mval);
   122  		return _mm_cvtsi128_si64(mval);
   123  	} else if (size <= 32) {
   124  		// Make second starting seed.
   125  		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
   126  		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
   127  		// Load data to be hashed.
   128  		mval = _mm_loadu_si128(p);
   129  		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
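		// For sizes between 17 and 31 the two loads overlap, so some
		// bytes are hashed twice; that is fine, since the result only
		// needs to be a deterministic, well-mixed function of the
		// bytes and the length, and the length is already folded into
		// the seed.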
   130  		// XOR with seed.
   131  		mval ^= mseed;
   132  		mval2 ^= mseed2;
   133  		// Scramble 3 times.
   134  		mval = _mm_aesenc_si128(mval, mval);
   135  		mval2 = _mm_aesenc_si128(mval2, mval2);
   136  		mval = _mm_aesenc_si128(mval, mval);
   137  		mval2 = _mm_aesenc_si128(mval2, mval2);
   138  		mval = _mm_aesenc_si128(mval, mval);
   139  		mval2 = _mm_aesenc_si128(mval2, mval2);
   140  		// Combine results.
   141  		mval ^= mval2;
   142  		return _mm_cvtsi128_si64(mval);
   143  	} else if (size <= 64) {
   144  		// Make 3 more starting seeds.
   145  		mseed3 = mseed2;
   146  		mseed4 = mseed2;
   147  		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
   148  		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
   149  		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
   150  		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
   151  		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
   152  		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
   153  
   154  		mval = _mm_loadu_si128(p);
   155  		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
   156  		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
   157  		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
   158  
   159  		mval ^= mseed;
   160  		mval2 ^= mseed2;
   161  		mval3 ^= mseed3;
   162  		mval4 ^= mseed4;
   163  
   164  		mval = _mm_aesenc_si128(mval, mval);
   165  		mval2 = _mm_aesenc_si128(mval2, mval2);
   166  		mval3 = _mm_aesenc_si128(mval3, mval3);
   167  		mval4 = _mm_aesenc_si128(mval4, mval4);
   168  
   169  		mval = _mm_aesenc_si128(mval, mval);
   170  		mval2 = _mm_aesenc_si128(mval2, mval2);
   171  		mval3 = _mm_aesenc_si128(mval3, mval3);
   172  		mval4 = _mm_aesenc_si128(mval4, mval4);
   173  
   174  		mval = _mm_aesenc_si128(mval, mval);
   175  		mval2 = _mm_aesenc_si128(mval2, mval2);
   176  		mval3 = _mm_aesenc_si128(mval3, mval3);
   177  		mval4 = _mm_aesenc_si128(mval4, mval4);
   178  
   179  		mval ^= mval3;
   180  		mval2 ^= mval4;
   181  		mval ^= mval2;
   182  		return _mm_cvtsi128_si64(mval);
   183  	} else if (size <= 128) {
   184  		// Make 7 more starting seeds.
   185  		mseed3 = mseed2;
   186  		mseed4 = mseed2;
   187  		mseed5 = mseed2;
   188  		mseed6 = mseed2;
   189  		mseed7 = mseed2;
   190  		mseed8 = mseed2;
   191  		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
   192  		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
   193  		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
   194  		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
   195  		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
   196  		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
   197  		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
   198  		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
   199  		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
   200  		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
   201  		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
   202  		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
   203  		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
   204  		mseed8 = _mm_aesenc_si128(mseed8, mseed8);
   205  
   206  		// Load data.
   207  		mval = _mm_loadu_si128(p);
   208  		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
   209  		mval3 = _mm_loadu_si128((void*)((char*)p + 32));
   210  		mval4 = _mm_loadu_si128((void*)((char*)p + 48));
   211  		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
   212  		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
   213  		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
   214  		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));
   215  
   216  		// XOR with seed.
   217  		mval ^= mseed;
   218  		mval2 ^= mseed2;
   219  		mval3 ^= mseed3;
   220  		mval4 ^= mseed4;
   221  		mval5 ^= mseed5;
   222  		mval6 ^= mseed6;
   223  		mval7 ^= mseed7;
   224  		mval8 ^= mseed8;
   225  
   226  		// Scramble 3 times.
   227  		mval = _mm_aesenc_si128(mval, mval);
   228  		mval2 = _mm_aesenc_si128(mval2, mval2);
   229  		mval3 = _mm_aesenc_si128(mval3, mval3);
   230  		mval4 = _mm_aesenc_si128(mval4, mval4);
   231  		mval5 = _mm_aesenc_si128(mval5, mval5);
   232  		mval6 = _mm_aesenc_si128(mval6, mval6);
   233  		mval7 = _mm_aesenc_si128(mval7, mval7);
   234  		mval8 = _mm_aesenc_si128(mval8, mval8);
   235  
   236  		mval = _mm_aesenc_si128(mval, mval);
   237  		mval2 = _mm_aesenc_si128(mval2, mval2);
   238  		mval3 = _mm_aesenc_si128(mval3, mval3);
   239  		mval4 = _mm_aesenc_si128(mval4, mval4);
   240  		mval5 = _mm_aesenc_si128(mval5, mval5);
   241  		mval6 = _mm_aesenc_si128(mval6, mval6);
   242  		mval7 = _mm_aesenc_si128(mval7, mval7);
   243  		mval8 = _mm_aesenc_si128(mval8, mval8);
   244  
   245  		mval = _mm_aesenc_si128(mval, mval);
   246  		mval2 = _mm_aesenc_si128(mval2, mval2);
   247  		mval3 = _mm_aesenc_si128(mval3, mval3);
   248  		mval4 = _mm_aesenc_si128(mval4, mval4);
   249  		mval5 = _mm_aesenc_si128(mval5, mval5);
   250  		mval6 = _mm_aesenc_si128(mval6, mval6);
   251  		mval7 = _mm_aesenc_si128(mval7, mval7);
   252  		mval8 = _mm_aesenc_si128(mval8, mval8);
   253  
   254  		// Combine results.
   255  		mval ^= mval5;
   256  		mval2 ^= mval6;
   257  		mval3 ^= mval7;
   258  		mval4 ^= mval8;
   259  		mval ^= mval3;
   260  		mval2 ^= mval4;
   261  		mval ^= mval2;
   262  		return _mm_cvtsi128_si64(mval);
   263  	} else {
   264  		// Make 7 more starting seeds.
   265  		mseed3 = mseed2;
   266  		mseed4 = mseed2;
   267  		mseed5 = mseed2;
   268  		mseed6 = mseed2;
   269  		mseed7 = mseed2;
   270  		mseed8 = mseed2;
   271  		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
   272  		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
   273  		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
   274  		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
   275  		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
   276  		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
   277  		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
   278  		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
   279  		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
   280  		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
   281  		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
   282  		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
   283  		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
   284  		mseed8 = _mm_aesenc_si128(mseed8, mseed8);
   285  
   286  		// Start with last (possibly overlapping) block.
   287  		mval = _mm_loadu_si128((void*)((char*)p + size - 128));
   288  		mval2 = _mm_loadu_si128((void*)((char*)p + size - 112));
   289  		mval3 = _mm_loadu_si128((void*)((char*)p + size - 96));
   290  		mval4 = _mm_loadu_si128((void*)((char*)p + size - 80));
   291  		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
   292  		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
   293  		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
   294  		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));
   295  
   296  		// XOR in seed.
   297  		mval ^= mseed;
   298  		mval2 ^= mseed2;
   299  		mval3 ^= mseed3;
   300  		mval4 ^= mseed4;
   301  		mval5 ^= mseed5;
   302  		mval6 ^= mseed6;
   303  		mval7 ^= mseed7;
   304  		mval8 ^= mseed8;
   305  
   306  		// Compute number of remaining 128-byte blocks.
   307  		size--;
   308  		size >>= 7;
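		// For example, with size == 200 the tail loads above covered
		// bytes 72..199 and (200-1)>>7 == 1, so the loop below makes
		// a single pass over bytes 0..127; bytes 72..127 are simply
		// mixed in twice.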
   309  		do {
   310  			// Scramble state.
   311  			mval = _mm_aesenc_si128(mval, mval);
   312  			mval2 = _mm_aesenc_si128(mval2, mval2);
   313  			mval3 = _mm_aesenc_si128(mval3, mval3);
   314  			mval4 = _mm_aesenc_si128(mval4, mval4);
   315  			mval5 = _mm_aesenc_si128(mval5, mval5);
   316  			mval6 = _mm_aesenc_si128(mval6, mval6);
   317  			mval7 = _mm_aesenc_si128(mval7, mval7);
   318  			mval8 = _mm_aesenc_si128(mval8, mval8);
   319  
   320  			// Scramble state, XOR in a block.
   321  			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
   322  			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
   323  			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
   324  			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
   325  			mval5 = _mm_aesenc_si128(mval5, _mm_loadu_si128((void*)((char*)p + 64)));
   326  			mval6 = _mm_aesenc_si128(mval6, _mm_loadu_si128((void*)((char*)p + 80)));
   327  			mval7 = _mm_aesenc_si128(mval7, _mm_loadu_si128((void*)((char*)p + 96)));
   328  			mval8 = _mm_aesenc_si128(mval8, _mm_loadu_si128((void*)((char*)p + 112)));
   329  
   330  			p = (void*)((char*)p + 128);
   331  		} while (--size > 0);
   332  
   333  		// 3 more scrambles to finish.
   334  		mval = _mm_aesenc_si128(mval, mval);
   335  		mval2 = _mm_aesenc_si128(mval2, mval2);
   336  		mval3 = _mm_aesenc_si128(mval3, mval3);
   337  		mval4 = _mm_aesenc_si128(mval4, mval4);
   338  		mval5 = _mm_aesenc_si128(mval5, mval5);
   339  		mval6 = _mm_aesenc_si128(mval6, mval6);
   340  		mval7 = _mm_aesenc_si128(mval7, mval7);
   341  		mval8 = _mm_aesenc_si128(mval8, mval8);
   342  		mval = _mm_aesenc_si128(mval, mval);
   343  		mval2 = _mm_aesenc_si128(mval2, mval2);
   344  		mval3 = _mm_aesenc_si128(mval3, mval3);
   345  		mval4 = _mm_aesenc_si128(mval4, mval4);
   346  		mval5 = _mm_aesenc_si128(mval5, mval5);
   347  		mval6 = _mm_aesenc_si128(mval6, mval6);
   348  		mval7 = _mm_aesenc_si128(mval7, mval7);
   349  		mval8 = _mm_aesenc_si128(mval8, mval8);
   350  		mval = _mm_aesenc_si128(mval, mval);
   351  		mval2 = _mm_aesenc_si128(mval2, mval2);
   352  		mval3 = _mm_aesenc_si128(mval3, mval3);
   353  		mval4 = _mm_aesenc_si128(mval4, mval4);
   354  		mval5 = _mm_aesenc_si128(mval5, mval5);
   355  		mval6 = _mm_aesenc_si128(mval6, mval6);
   356  		mval7 = _mm_aesenc_si128(mval7, mval7);
   357  		mval8 = _mm_aesenc_si128(mval8, mval8);
   358  
   359  		mval ^= mval5;
   360  		mval2 ^= mval6;
   361  		mval3 ^= mval7;
   362  		mval4 ^= mval8;
   363  		mval ^= mval3;
   364  		mval2 ^= mval4;
   365  		mval ^= mval2;
   366  		return _mm_cvtsi128_si64(mval);
   367  	}
   368  }
   369  
   370  #else // !defined(__x86_64__)
   371  
   372  // The 32-bit version of aeshashbody.
   373  
   374  uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
   375  	__m128i mseed, mseed2, mseed3, mseed4;
   376  	__m128i mval, mval2, mval3, mval4;
   377  
   378  	// Start with hash seed.
   379  	mseed = _mm_cvtsi32_si128(seed);
   380  	// Get 16 bits of length.
   381  	mseed = _mm_insert_epi16(mseed, size, 4);
   382  	// Replace size with its low 2 bytes repeated 4 times.
   383  	mseed = _mm_shufflehi_epi16(mseed, 0);
   384  	// Save unscrambled seed.
   385  	mseed2 = mseed;
   386  	// XOR in per-process seed.
   387  	mseed ^= _mm_loadu_si128(aeskeysched.__values);
   388  	// Scramble seed.
   389  	mseed = _mm_aesenc_si128(mseed, mseed);
   390  
   391  	if (size <= 16) {
   392  		if (size == 0) {
   393  			// Return scrambled input seed.
   394  			return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed, mseed));
   395  		} else if (size < 16) {
   396  			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
   397  				static const uint64 masks[32]
   398  				  __attribute__ ((aligned(16))) =
   399  				  {
   400  				    0x0000000000000000, 0x0000000000000000,
   401  				    0x00000000000000ff, 0x0000000000000000,
   402  				    0x000000000000ffff, 0x0000000000000000,
   403  				    0x0000000000ffffff, 0x0000000000000000,
   404  				    0x00000000ffffffff, 0x0000000000000000,
   405  				    0x000000ffffffffff, 0x0000000000000000,
   406  				    0x0000ffffffffffff, 0x0000000000000000,
   407  				    0x00ffffffffffffff, 0x0000000000000000,
   408  				    0xffffffffffffffff, 0x0000000000000000,
   409  				    0xffffffffffffffff, 0x00000000000000ff,
   410  				    0xffffffffffffffff, 0x000000000000ffff,
   411  				    0xffffffffffffffff, 0x0000000000ffffff,
   412  				    0xffffffffffffffff, 0x00000000ffffffff,
   413  				    0xffffffffffffffff, 0x000000ffffffffff,
   414  				    0xffffffffffffffff, 0x0000ffffffffffff,
   415  				    0xffffffffffffffff, 0x00ffffffffffffff
   416  				  };
   417  
   418  				// 16 bytes loaded at p won't cross a page
   419  				// boundary, so we can load it directly.
   420  				mval = _mm_loadu_si128(p);
   421  				mval &= *(const __m128i*)(&masks[size*2]);
   422  			} else {
   423  				static const uint64 shifts[32]
   424  				  __attribute__ ((aligned(16))) =
   425  				  {
   426  				    0x0000000000000000, 0x0000000000000000,
   427  				    0xffffffffffffff0f, 0xffffffffffffffff,
   428  				    0xffffffffffff0f0e, 0xffffffffffffffff,
   429  				    0xffffffffff0f0e0d, 0xffffffffffffffff,
   430  				    0xffffffff0f0e0d0c, 0xffffffffffffffff,
   431  				    0xffffff0f0e0d0c0b, 0xffffffffffffffff,
   432  				    0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
   433  				    0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
   434  				    0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
   435  				    0x0e0d0c0b0a090807, 0xffffffffffffff0f,
   436  				    0x0d0c0b0a09080706, 0xffffffffffff0f0e,
   437  				    0x0c0b0a0908070605, 0xffffffffff0f0e0d,
   438  				    0x0b0a090807060504, 0xffffffff0f0e0d0c,
   439  				    0x0a09080706050403, 0xffffff0f0e0d0c0b,
   440  				    0x0908070605040302, 0xffff0f0e0d0c0b0a,
   441  				    0x0807060504030201, 0xff0f0e0d0c0b0a09,
   442  				  };
   443  
   444  				// address ends in 1111xxxx. Might be
   445  				// up against a page boundary, so load
   446  				// ending at last byte.  Then shift
   447  				// bytes down using pshufb.
   448  				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
   449  				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
   450  			}
   451  		} else {
   452  			mval = _mm_loadu_si128(p);
   453  		}
   454  
   455  		// Scramble input, XOR in seed.
   456  		mval = _mm_aesenc_si128(mval, mseed);
   457  		mval = _mm_aesenc_si128(mval, mval);
   458  		mval = _mm_aesenc_si128(mval, mval);
   459  		return _mm_cvtsi128_si32(mval);
   460  	} else if (size <= 32) {
   461  		// Make second starting seed.
   462  		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
   463  		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
   464  		// Load data to be hashed.
   465  		mval = _mm_loadu_si128(p);
   466  		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
   467  
   468  		// Scramble 3 times.
   469  		mval = _mm_aesenc_si128(mval, mseed);
   470  		mval2 = _mm_aesenc_si128(mval2, mseed2);
   471  		mval = _mm_aesenc_si128(mval, mval);
   472  		mval2 = _mm_aesenc_si128(mval2, mval2);
   473  		mval = _mm_aesenc_si128(mval, mval);
   474  		mval2 = _mm_aesenc_si128(mval2, mval2);
   475  
   476  		// Combine results.
   477  		mval ^= mval2;
   478  		return _mm_cvtsi128_si32(mval);
   479  	} else if (size <= 64) {
   480  		// Make 3 more starting seeds.
   481  		mseed3 = mseed2;
   482  		mseed4 = mseed2;
   483  		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
   484  		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
   485  		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
   486  		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
   487  		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
   488  		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
   489  
   490  		mval = _mm_loadu_si128(p);
   491  		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
   492  		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
   493  		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
   494  
   495  		mval = _mm_aesenc_si128(mval, mseed);
   496  		mval2 = _mm_aesenc_si128(mval2, mseed2);
   497  		mval3 = _mm_aesenc_si128(mval3, mseed3);
   498  		mval4 = _mm_aesenc_si128(mval4, mseed4);
   499  
   500  		mval = _mm_aesenc_si128(mval, mval);
   501  		mval2 = _mm_aesenc_si128(mval2, mval2);
   502  		mval3 = _mm_aesenc_si128(mval3, mval3);
   503  		mval4 = _mm_aesenc_si128(mval4, mval4);
   504  
   505  		mval = _mm_aesenc_si128(mval, mval);
   506  		mval2 = _mm_aesenc_si128(mval2, mval2);
   507  		mval3 = _mm_aesenc_si128(mval3, mval3);
   508  		mval4 = _mm_aesenc_si128(mval4, mval4);
   509  
   510  		mval ^= mval3;
   511  		mval2 ^= mval4;
   512  		mval ^= mval2;
   513  		return _mm_cvtsi128_si32(mval);
   514  	} else {
   515  		// Make 3 more starting seeds.
   516  		mseed3 = mseed2;
   517  		mseed4 = mseed2;
   518  		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
   519  		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
   520  		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
   521  		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
   522  		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
   523  		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
   524  
   525  		// Start with last (possibly overlapping) block.
   526  		mval = _mm_loadu_si128((void*)((char*)p + size - 64));
   527  		mval2 = _mm_loadu_si128((void*)((char*)p + size - 48));
   528  		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
   529  		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
   530  
   531  		// Scramble state once.
   532  		mval = _mm_aesenc_si128(mval, mseed);
   533  		mval2 = _mm_aesenc_si128(mval2, mseed2);
   534  		mval3 = _mm_aesenc_si128(mval3, mseed3);
   535  		mval4 = _mm_aesenc_si128(mval4, mseed4);
   536  
   537  		// Compute number of remaining 64-byte blocks.
   538  		size--;
   539  		size >>= 6;
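		// As in the 64-bit code, but with 64-byte blocks: for
		// example, with size == 100 the tail loads covered bytes
		// 36..99 and (100-1)>>6 == 1, so the loop makes one pass
		// over bytes 0..63.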
   540  		do {
   541  			// Scramble state, XOR in a block.
   542  			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
   543  			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
   544  			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
   545  			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
   546  
   547  			// Scramble state.
   548  			mval = _mm_aesenc_si128(mval, mval);
   549  			mval2 = _mm_aesenc_si128(mval2, mval2);
   550  			mval3 = _mm_aesenc_si128(mval3, mval3);
   551  			mval4 = _mm_aesenc_si128(mval4, mval4);
   552  
   553  			p = (void*)((char*)p + 64);
   554  		} while (--size > 0);
   555  
   556  		// 2 more scrambles to finish.
   557  		mval = _mm_aesenc_si128(mval, mval);
   558  		mval2 = _mm_aesenc_si128(mval2, mval2);
   559  		mval3 = _mm_aesenc_si128(mval3, mval3);
   560  		mval4 = _mm_aesenc_si128(mval4, mval4);
   561  
   562  		mval = _mm_aesenc_si128(mval, mval);
   563  		mval2 = _mm_aesenc_si128(mval2, mval2);
   564  		mval3 = _mm_aesenc_si128(mval3, mval3);
   565  		mval4 = _mm_aesenc_si128(mval4, mval4);
   566  
   567  		mval ^= mval3;
   568  		mval2 ^= mval4;
   569  		mval ^= mval2;
   570  		return _mm_cvtsi128_si32(mval);
   571  	}
   572  }
   573  
   574  #endif // !defined(__x86_64__)
   575  
   576  #elif defined(__aarch64__)
   577  
   578  // Undefine some identifiers that we pick up from the Go runtime package that
   579  // are used in arm_neon.h.
   580  
   581  #undef t1
   582  #undef tx
   583  #undef t2
   584  #undef t3
   585  #undef t4
   586  #undef t5
   587  
   588  #include <arm_neon.h>
   589  
   590  // Force appropriate CPU level.  We won't call here unless the CPU
   591  // supports it.
   592  
   593  #pragma GCC target("+crypto")
   594  
   595  // The arm64 version of aeshashbody.
   596  
   597  uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
   598  	uint8x16_t *pseed;
   599  	uint64x2_t vinit64;
   600  	uint8x16_t vinit;
   601  	uint8x16_t vseed, vseed2, vseed3, vseed4;
   602  	uint8x16_t vseed5, vseed6, vseed7, vseed8;
   603  	uint8x16_t vval, vval2, vval3, vval4;
   604  	uint8x16_t vval5, vval6, vval7, vval8;
   605  	uint8x16_t vvalLoop, vvalLoop2, vvalLoop3, vvalLoop4;
   606  	uint8x16_t vvalLoop5, vvalLoop6, vvalLoop7, vvalLoop8;
   607  	uint8x16x2_t avval2;
   608  	uint8x16x3_t avseed3;
   609  
   610  	pseed = (uint8x16_t*)(aeskeysched.__values);
   611  
   612  	// Combined hash seed and length.
   613  	vinit64 = vdupq_n_u64(0);
   614  	vinit64[0] = (uint64)seed;
   615  	vinit64[1] = (uint64)size;
   616  	vinit = vreinterpretq_u8_u64(vinit64);
   617  
   618  	// Mix in per-process seed.
   619  	vseed = vaeseq_u8(*pseed, vinit);
   620  	++pseed;
   621  	// Scramble seed.
   622  	vseed = vaesmcq_u8(vseed);
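	// Unlike the x86 code above, the full 64-bit seed and the full size
	// are mixed in via vinit.  vaeseq_u8 XORs its two operands and then
	// applies the AES SubBytes/ShiftRows step, and vaesmcq_u8 applies
	// MixColumns, so each aese+aesmc pair is roughly one AES encryption
	// round (x86's AESENC does the same work but adds the round key at
	// the end of the round rather than at the start).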
   623  
   624  	if (size <= 16) {
   625  		if (size == 0) {
   626  			// Return 64 bits of scrambled input seed.
   627  			return vreinterpretq_u64_u8(vseed)[0];
   628  		} else if (size < 16) {
   629  			vval = vreinterpretq_u8_u64(vdupq_n_u64(0));
   630  			if ((size & 8) != 0) {
   631  				vval = vreinterpretq_u8_u64(vld1q_lane_u64((uint64_t*)(p), vreinterpretq_u64_u8(vval), 0));
   632  				p = (void*)((uint64_t*)(p) + 1);
   633  			}
   634  			if ((size & 4) != 0) {
   635  				vval = vreinterpretq_u8_u32(vld1q_lane_u32((uint32_t*)(p), vreinterpretq_u32_u8(vval), 2));
   636  				p = (void*)((uint32_t*)(p) + 1);
   637  			}
   638  			if ((size & 2) != 0) {
   639  				vval = vreinterpretq_u8_u16(vld1q_lane_u16((uint16_t*)(p), vreinterpretq_u16_u8(vval), 6));
   640  				p = (void*)((uint16_t*)(p) + 1);
   641  			}
   642  			if ((size & 1) != 0) {
   643  				vval = vld1q_lane_u8((uint8*)(p), vval, 14);
   644  			}
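			// The lane positions are fixed per size bit, so for
			// size == 13 (8+4+1) bytes 0-7 land in vector bytes
			// 0-7, bytes 8-11 in vector bytes 8-11, and the last
			// byte in vector byte 14, leaving bytes 12-13 zero.
			// The layout only has to be deterministic and
			// lossless for a given size, and the size is already
			// mixed into vseed.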
   645  		} else {
   646  			vval = *(uint8x16_t*)(p);
   647  		}
   648  		vval = vaeseq_u8(vval, vseed);
   649  		vval = vaesmcq_u8(vval);
   650  		vval = vaeseq_u8(vval, vseed);
   651  		vval = vaesmcq_u8(vval);
   652  		vval = vaeseq_u8(vval, vseed);
   653  		return vreinterpretq_u64_u8(vval)[0];
   654  	} else if (size <= 32) {
   655  		// Make a second seed.
   656  		vseed2 = vaeseq_u8(*pseed, vinit);
   657  		vseed2 = vaesmcq_u8(vseed2);
   658  		vval = *(uint8x16_t*)(p);
   659  		vval2 = *(uint8x16_t*)((char*)(p) + (size - 16));
   660  
   661  		vval = vaeseq_u8(vval, vseed);
   662  		vval = vaesmcq_u8(vval);
   663  		vval2 = vaeseq_u8(vval2, vseed2);
   664  		vval2 = vaesmcq_u8(vval2);
   665  
   666  		vval = vaeseq_u8(vval, vseed);
   667  		vval = vaesmcq_u8(vval);
   668  		vval2 = vaeseq_u8(vval2, vseed2);
   669  		vval2 = vaesmcq_u8(vval2);
   670  
   671  		vval = vaeseq_u8(vval, vseed);
   672  		vval2 = vaeseq_u8(vval2, vseed2);
   673  
   674  		vval ^= vval2;
   675  
   676  		return vreinterpretq_u64_u8(vval)[0];
   677  	} else if (size <= 64) {
   678  		avseed3 = vld1q_u8_x3((uint8*)(pseed));
   679  		vseed2 = avseed3.val[0];
   680  		vseed3 = avseed3.val[1];
   681  		vseed4 = avseed3.val[2];
   682  
   683  		vseed2 = vaeseq_u8(vseed2, vinit);
   684  		vseed2 = vaesmcq_u8(vseed2);
   685  		vseed3 = vaeseq_u8(vseed3, vinit);
   686  		vseed3 = vaesmcq_u8(vseed3);
   687  		vseed4 = vaeseq_u8(vseed4, vinit);
   688  		vseed4 = vaesmcq_u8(vseed4);
   689  
   690  		avval2 = vld1q_u8_x2((uint8*)(p));
   691  		vval = avval2.val[0];
   692  		vval2 = avval2.val[1];
   693  		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
   694  		vval3 = avval2.val[0];
   695  		vval4 = avval2.val[1];
   696  
   697  		vval = vaeseq_u8(vval, vseed);
   698  		vval = vaesmcq_u8(vval);
   699  		vval2 = vaeseq_u8(vval2, vseed2);
   700  		vval2 = vaesmcq_u8(vval2);
   701  		vval3 = vaeseq_u8(vval3, vseed3);
   702  		vval3 = vaesmcq_u8(vval3);
   703  		vval4 = vaeseq_u8(vval4, vseed4);
   704  		vval4 = vaesmcq_u8(vval4);
   705  
   706  		vval = vaeseq_u8(vval, vseed);
   707  		vval = vaesmcq_u8(vval);
   708  		vval2 = vaeseq_u8(vval2, vseed2);
   709  		vval2 = vaesmcq_u8(vval2);
   710  		vval3 = vaeseq_u8(vval3, vseed3);
   711  		vval3 = vaesmcq_u8(vval3);
   712  		vval4 = vaeseq_u8(vval4, vseed4);
   713  		vval4 = vaesmcq_u8(vval4);
   714  
   715  		vval = vaeseq_u8(vval, vseed);
   716  		vval2 = vaeseq_u8(vval2, vseed2);
   717  		vval3 = vaeseq_u8(vval3, vseed3);
   718  		vval4 = vaeseq_u8(vval4, vseed4);
   719  
   720  		vval ^= vval3;
   721  		vval2 ^= vval4;
   722  		vval ^= vval2;
   723  
   724  		return vreinterpretq_u64_u8(vval)[0];
   725  	} else if (size <= 128) {
   726  		// For some reason vld1q_u8_x4 is missing.
   727  		avseed3 = vld1q_u8_x3((uint8*)(pseed));
   728  		vseed2 = avseed3.val[0];
   729  		vseed3 = avseed3.val[1];
   730  		vseed4 = avseed3.val[2];
   731  		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
   732  		vseed5 = avseed3.val[0];
   733  		vseed6 = avseed3.val[1];
   734  		vseed7 = avseed3.val[2];
   735  		vseed8 = *(pseed + 6);
   736  
   737  		vseed2 = vaeseq_u8(vseed2, vinit);
   738  		vseed2 = vaesmcq_u8(vseed2);
   739  		vseed3 = vaeseq_u8(vseed3, vinit);
   740  		vseed3 = vaesmcq_u8(vseed3);
   741  		vseed4 = vaeseq_u8(vseed4, vinit);
   742  		vseed4 = vaesmcq_u8(vseed4);
   743  		vseed5 = vaeseq_u8(vseed5, vinit);
   744  		vseed5 = vaesmcq_u8(vseed5);
   745  		vseed6 = vaeseq_u8(vseed6, vinit);
   746  		vseed6 = vaesmcq_u8(vseed6);
   747  		vseed7 = vaeseq_u8(vseed7, vinit);
   748  		vseed7 = vaesmcq_u8(vseed7);
   749  		vseed8 = vaeseq_u8(vseed8, vinit);
   750  		vseed8 = vaesmcq_u8(vseed8);
   751  
   752  		avval2 = vld1q_u8_x2((uint8*)(p));
   753  		vval = avval2.val[0];
   754  		vval2 = avval2.val[1];
   755  		avval2 = vld1q_u8_x2((uint8*)(p) + 32);
   756  		vval3 = avval2.val[0];
   757  		vval4 = avval2.val[1];
   758  		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
   759  		vval5 = avval2.val[0];
   760  		vval6 = avval2.val[1];
   761  		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
   762  		vval7 = avval2.val[0];
   763  		vval8 = avval2.val[1];
   764  
   765  		vval = vaeseq_u8(vval, vseed);
   766  		vval = vaesmcq_u8(vval);
   767  		vval2 = vaeseq_u8(vval2, vseed2);
   768  		vval2 = vaesmcq_u8(vval2);
   769  		vval3 = vaeseq_u8(vval3, vseed3);
   770  		vval3 = vaesmcq_u8(vval3);
   771  		vval4 = vaeseq_u8(vval4, vseed4);
   772  		vval4 = vaesmcq_u8(vval4);
   773  		vval5 = vaeseq_u8(vval5, vseed5);
   774  		vval5 = vaesmcq_u8(vval5);
   775  		vval6 = vaeseq_u8(vval6, vseed6);
   776  		vval6 = vaesmcq_u8(vval6);
   777  		vval7 = vaeseq_u8(vval7, vseed7);
   778  		vval7 = vaesmcq_u8(vval7);
   779  		vval8 = vaeseq_u8(vval8, vseed8);
   780  		vval8 = vaesmcq_u8(vval8);
   781  
   782  		vval = vaeseq_u8(vval, vseed);
   783  		vval = vaesmcq_u8(vval);
   784  		vval2 = vaeseq_u8(vval2, vseed2);
   785  		vval2 = vaesmcq_u8(vval2);
   786  		vval3 = vaeseq_u8(vval3, vseed3);
   787  		vval3 = vaesmcq_u8(vval3);
   788  		vval4 = vaeseq_u8(vval4, vseed4);
   789  		vval4 = vaesmcq_u8(vval4);
   790  		vval5 = vaeseq_u8(vval5, vseed5);
   791  		vval5 = vaesmcq_u8(vval5);
   792  		vval6 = vaeseq_u8(vval6, vseed6);
   793  		vval6 = vaesmcq_u8(vval6);
   794  		vval7 = vaeseq_u8(vval7, vseed7);
   795  		vval7 = vaesmcq_u8(vval7);
   796  		vval8 = vaeseq_u8(vval8, vseed8);
   797  		vval8 = vaesmcq_u8(vval8);
   798  
   799  		vval = vaeseq_u8(vval, vseed);
   800  		vval2 = vaeseq_u8(vval2, vseed2);
   801  		vval3 = vaeseq_u8(vval3, vseed3);
   802  		vval4 = vaeseq_u8(vval4, vseed4);
   803  		vval5 = vaeseq_u8(vval5, vseed5);
   804  		vval6 = vaeseq_u8(vval6, vseed6);
   805  		vval7 = vaeseq_u8(vval7, vseed7);
   806  		vval8 = vaeseq_u8(vval8, vseed8);
   807  
   808  		vval ^= vval5;
   809  		vval2 ^= vval6;
   810  		vval3 ^= vval7;
   811  		vval4 ^= vval8;
   812  		vval ^= vval3;
   813  		vval2 ^= vval4;
   814  		vval ^= vval2;
   815  
   816  		return vreinterpretq_u64_u8(vval)[0];
   817  	} else {
   818  		// For some reason vld1q_u8_x4 is missing.
   819  		avseed3 = vld1q_u8_x3((uint8*)(pseed));
   820  		vseed2 = avseed3.val[0];
   821  		vseed3 = avseed3.val[1];
   822  		vseed4 = avseed3.val[2];
   823  		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
   824  		vseed5 = avseed3.val[0];
   825  		vseed6 = avseed3.val[1];
   826  		vseed7 = avseed3.val[2];
   827  		vseed8 = *(pseed + 6);
   828  
   829  		vseed2 = vaeseq_u8(vseed2, vinit);
   830  		vseed2 = vaesmcq_u8(vseed2);
   831  		vseed3 = vaeseq_u8(vseed3, vinit);
   832  		vseed3 = vaesmcq_u8(vseed3);
   833  		vseed4 = vaeseq_u8(vseed4, vinit);
   834  		vseed4 = vaesmcq_u8(vseed4);
   835  		vseed5 = vaeseq_u8(vseed5, vinit);
   836  		vseed5 = vaesmcq_u8(vseed5);
   837  		vseed6 = vaeseq_u8(vseed6, vinit);
   838  		vseed6 = vaesmcq_u8(vseed6);
   839  		vseed7 = vaeseq_u8(vseed7, vinit);
   840  		vseed7 = vaesmcq_u8(vseed7);
   841  		vseed8 = vaeseq_u8(vseed8, vinit);
   842  		vseed8 = vaesmcq_u8(vseed8);
   843  
   844  		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 128));
   845  		vval = avval2.val[0];
   846  		vval2 = avval2.val[1];
   847  		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 96));
   848  		vval3 = avval2.val[0];
   849  		vval4 = avval2.val[1];
   850  		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
   851  		vval5 = avval2.val[0];
   852  		vval6 = avval2.val[1];
   853  		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
   854  		vval7 = avval2.val[0];
   855  		vval8 = avval2.val[1];
   856  
   857  		vvalLoop = vseed;
   858  		vvalLoop2 = vseed2;
   859  		vvalLoop3 = vseed3;
   860  		vvalLoop4 = vseed4;
   861  		vvalLoop5 = vseed5;
   862  		vvalLoop6 = vseed6;
   863  		vvalLoop7 = vseed7;
   864  		vvalLoop8 = vseed8;
   865  
   866  		size--;
   867  		size >>= 7;
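		// Unlike the x86-64 loop, which scrambles the state and then
		// folds each block in with AESENC, this loop keeps the next
		// round key in vvalLoop..vvalLoop8 (the expanded seeds on the
		// first pass, afterwards the most recently loaded data): each
		// pass runs one aese/aesmc round with that key, loads the
		// next 128 bytes, and runs a second round keyed with the
		// fresh data, so the loads can overlap the AES work.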
   868  		do {
   869  			vval = vaeseq_u8(vval, vvalLoop);
   870  			vval = vaesmcq_u8(vval);
   871  			vval2 = vaeseq_u8(vval2, vvalLoop2);
   872  			vval2 = vaesmcq_u8(vval2);
   873  			vval3 = vaeseq_u8(vval3, vvalLoop3);
   874  			vval3 = vaesmcq_u8(vval3);
   875  			vval4 = vaeseq_u8(vval4, vvalLoop4);
   876  			vval4 = vaesmcq_u8(vval4);
   877  			vval5 = vaeseq_u8(vval5, vvalLoop5);
   878  			vval5 = vaesmcq_u8(vval5);
   879  			vval6 = vaeseq_u8(vval6, vvalLoop6);
   880  			vval6 = vaesmcq_u8(vval6);
   881  			vval7 = vaeseq_u8(vval7, vvalLoop7);
   882  			vval7 = vaesmcq_u8(vval7);
   883  			vval8 = vaeseq_u8(vval8, vvalLoop8);
   884  			vval8 = vaesmcq_u8(vval8);
   885  
   886  			avval2 = vld1q_u8_x2((uint8*)(p));
   887  			vvalLoop = avval2.val[0];
   888  			vvalLoop2 = avval2.val[1];
   889  			avval2 = vld1q_u8_x2((uint8*)(p) + 32);
   890  			vvalLoop3 = avval2.val[0];
   891  			vvalLoop4 = avval2.val[1];
   892  			avval2 = vld1q_u8_x2((uint8*)(p) + 64);
   893  			vvalLoop5 = avval2.val[0];
   894  			vvalLoop6 = avval2.val[1];
   895  			avval2 = vld1q_u8_x2((uint8*)(p) + 96);
   896  			vvalLoop7 = avval2.val[0];
   897  			vvalLoop8 = avval2.val[1];
   898  
   899  			p = (void *)((uint8*)(p) + 128);
   900  
   901  			vval = vaeseq_u8(vval, vvalLoop);
   902  			vval = vaesmcq_u8(vval);
   903  			vval2 = vaeseq_u8(vval2, vvalLoop2);
   904  			vval2 = vaesmcq_u8(vval2);
   905  			vval3 = vaeseq_u8(vval3, vvalLoop3);
   906  			vval3 = vaesmcq_u8(vval3);
   907  			vval4 = vaeseq_u8(vval4, vvalLoop4);
   908  			vval4 = vaesmcq_u8(vval4);
   909  			vval5 = vaeseq_u8(vval5, vvalLoop5);
   910  			vval5 = vaesmcq_u8(vval5);
   911  			vval6 = vaeseq_u8(vval6, vvalLoop6);
   912  			vval6 = vaesmcq_u8(vval6);
   913  			vval7 = vaeseq_u8(vval7, vvalLoop7);
   914  			vval7 = vaesmcq_u8(vval7);
   915  			vval8 = vaeseq_u8(vval8, vvalLoop8);
   916  			vval8 = vaesmcq_u8(vval8);
   917  		} while (--size > 0);
   918  
   919  		vval = vaeseq_u8(vval, vvalLoop);
   920  		vval = vaesmcq_u8(vval);
   921  		vval2 = vaeseq_u8(vval2, vvalLoop2);
   922  		vval2 = vaesmcq_u8(vval2);
   923  		vval3 = vaeseq_u8(vval3, vvalLoop3);
   924  		vval3 = vaesmcq_u8(vval3);
   925  		vval4 = vaeseq_u8(vval4, vvalLoop4);
   926  		vval4 = vaesmcq_u8(vval4);
   927  		vval5 = vaeseq_u8(vval5, vvalLoop5);
   928  		vval5 = vaesmcq_u8(vval5);
   929  		vval6 = vaeseq_u8(vval6, vvalLoop6);
   930  		vval6 = vaesmcq_u8(vval6);
   931  		vval7 = vaeseq_u8(vval7, vvalLoop7);
   932  		vval7 = vaesmcq_u8(vval7);
   933  		vval8 = vaeseq_u8(vval8, vvalLoop8);
   934  		vval8 = vaesmcq_u8(vval8);
   935  
   936  
   937  		vval = vaeseq_u8(vval, vvalLoop);
   938  		vval = vaesmcq_u8(vval);
   939  		vval2 = vaeseq_u8(vval2, vvalLoop2);
   940  		vval2 = vaesmcq_u8(vval2);
   941  		vval3 = vaeseq_u8(vval3, vvalLoop3);
   942  		vval3 = vaesmcq_u8(vval3);
   943  		vval4 = vaeseq_u8(vval4, vvalLoop4);
   944  		vval4 = vaesmcq_u8(vval4);
   945  		vval5 = vaeseq_u8(vval5, vvalLoop5);
   946  		vval5 = vaesmcq_u8(vval5);
   947  		vval6 = vaeseq_u8(vval6, vvalLoop6);
   948  		vval6 = vaesmcq_u8(vval6);
   949  		vval7 = vaeseq_u8(vval7, vvalLoop7);
   950  		vval7 = vaesmcq_u8(vval7);
   951  		vval8 = vaeseq_u8(vval8, vvalLoop8);
   952  		vval8 = vaesmcq_u8(vval8);
   953  
   954  		vval = vaeseq_u8(vval, vvalLoop);
   955  		vval2 = vaeseq_u8(vval2, vvalLoop2);
   956  		vval3 = vaeseq_u8(vval3, vvalLoop3);
   957  		vval4 = vaeseq_u8(vval4, vvalLoop4);
   958  		vval5 = vaeseq_u8(vval5, vvalLoop5);
   959  		vval6 = vaeseq_u8(vval6, vvalLoop6);
   960  		vval7 = vaeseq_u8(vval7, vvalLoop7);
   961  		vval8 = vaeseq_u8(vval8, vvalLoop8);
   962  
   963  		vval ^= vval5;
   964  		vval2 ^= vval6;
   965  		vval3 ^= vval7;
   966  		vval4 ^= vval8;
   967  		vval ^= vval3;
   968  		vval2 ^= vval4;
   969  		vval ^= vval2;
   970  
   971  		return vreinterpretq_u64_u8(vval)[0];
   972  	}
   973  }
   974  
   975  #else // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)
   976  
   977  uintptr aeshashbody(void* p __attribute__((unused)),
   978  		    uintptr seed __attribute__((unused)),
   979  		    uintptr size __attribute__((unused)),
   980  		    Slice aeskeysched __attribute__((unused))) {
   981  	// We should never get here on a non-x86, non-arm64 system.
   982  	runtime_throw("impossible call to aeshashbody");
   983  }
   984  
   985  #endif // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)