github.com/insolar/x-crypto@v0.0.0-20191031140942-75fab8a325f6/sha512/sha512block_amd64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // SHA512 block routine. See sha512block.go for Go equivalent.
     8  //
     9  // The algorithm is detailed in FIPS 180-4:
    10  //
    11  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12  //
    13  // Wt = Mt; for 0 <= t <= 15
     14  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    15  //
    16  // a = H0
    17  // b = H1
    18  // c = H2
    19  // d = H3
    20  // e = H4
    21  // f = H5
    22  // g = H6
    23  // h = H7
    24  //
    25  // for t = 0 to 79 {
    26  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    27  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    28  //    h = g
    29  //    g = f
    30  //    f = e
    31  //    e = d + T1
    32  //    d = c
    33  //    c = b
    34  //    b = a
    35  //    a = T1 + T2
    36  // }
    37  //
    38  // H0 = a + H0
    39  // H1 = b + H1
    40  // H2 = c + H2
    41  // H3 = d + H3
    42  // H4 = e + H4
    43  // H5 = f + H5
    44  // H6 = g + H6
    45  // H7 = h + H7
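//
// As a cross-check, here is a compact Go sketch of the same block function
// (illustrative only: see sha512block.go for the package's actual Go code;
// the digest is simplified to a *[8]uint64, _K is the package's 80-entry
// round constant table referenced as ·_K below, and the imports used are
// encoding/binary and math/bits):
//
//	func blockSketch(h *[8]uint64, p []byte) {
//		var w [80]uint64
//		for len(p) >= 128 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint64(p[t*8:])
//			}
//			for t := 16; t < 80; t++ {
//				v1, v2 := w[t-2], w[t-15]
//				sigma1 := bits.RotateLeft64(v1, -19) ^ bits.RotateLeft64(v1, -61) ^ (v1 >> 6)
//				sigma0 := bits.RotateLeft64(v2, -1) ^ bits.RotateLeft64(v2, -8) ^ (v2 >> 7)
//				w[t] = sigma1 + w[t-7] + sigma0 + w[t-16]
//			}
//			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//			for t := 0; t < 80; t++ {
//				t1 := hh + (bits.RotateLeft64(e, -14) ^ bits.RotateLeft64(e, -18) ^ bits.RotateLeft64(e, -41)) +
//					((e & f) ^ (^e & g)) + _K[t] + w[t]
//				t2 := (bits.RotateLeft64(a, -28) ^ bits.RotateLeft64(a, -34) ^ bits.RotateLeft64(a, -39)) +
//					((a & b) ^ (a & c) ^ (b & c))
//				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			h[0] += a; h[1] += b; h[2] += c; h[3] += d
//			h[4] += e; h[5] += f; h[6] += g; h[7] += hh
//			p = p[128:]
//		}
//	}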
    46  
    47  // Wt = Mt; for 0 <= t <= 15
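// In Go terms this is just a big-endian load of message word number index
// (SI holds the message pointer, BP the on-stack schedule), roughly
// w[index] = binary.BigEndian.Uint64(msg[index*8:]).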
    48  #define MSGSCHEDULE0(index) \
    49  	MOVQ	(index*8)(SI), AX; \
    50  	BSWAPQ	AX; \
    51  	MOVQ	AX, (index*8)(BP)
    52  
    53  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    54  //   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
    55  //   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
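// Uses AX, BX, CX and DX as scratch; reads the four earlier schedule words
// from the stack at (BP) and writes the new Wt to (index*8)(BP).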
    56  #define MSGSCHEDULE1(index) \
    57  	MOVQ	((index-2)*8)(BP), AX; \
    58  	MOVQ	AX, CX; \
    59  	RORQ	$19, AX; \
    60  	MOVQ	CX, DX; \
    61  	RORQ	$61, CX; \
    62  	SHRQ	$6, DX; \
    63  	MOVQ	((index-15)*8)(BP), BX; \
    64  	XORQ	CX, AX; \
    65  	MOVQ	BX, CX; \
    66  	XORQ	DX, AX; \
    67  	RORQ	$1, BX; \
    68  	MOVQ	CX, DX; \
    69  	SHRQ	$7, DX; \
    70  	RORQ	$8, CX; \
    71  	ADDQ	((index-7)*8)(BP), AX; \
    72  	XORQ	CX, BX; \
    73  	XORQ	DX, BX; \
    74  	ADDQ	((index-16)*8)(BP), BX; \
    75  	ADDQ	BX, AX; \
    76  	MOVQ	AX, ((index)*8)(BP)
    77  
    78  // Calculate T1 in AX - uses AX, CX and DX registers.
    79  // h is also used as an accumulator. Wt is passed in AX.
    80  //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    81  //     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
    82  //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    83  #define SHA512T1(const, e, f, g, h) \
    84  	MOVQ	$const, DX; \
    85  	ADDQ	AX, h; \
    86  	MOVQ	e, AX; \
    87  	ADDQ	DX, h; \
    88  	MOVQ	e, CX; \
    89  	RORQ	$14, AX; \
    90  	MOVQ	e, DX; \
    91  	RORQ	$18, CX; \
    92  	XORQ	CX, AX; \
    93  	MOVQ	e, CX; \
    94  	RORQ	$41, DX; \
    95  	ANDQ	f, CX; \
    96  	XORQ	AX, DX; \
    97  	MOVQ	e, AX; \
    98  	NOTQ	AX; \
    99  	ADDQ	DX, h; \
   100  	ANDQ	g, AX; \
   101  	XORQ	CX, AX; \
   102  	ADDQ	h, AX
   103  
   104  // Calculate T2 in BX - uses BX, CX, DX and DI registers.
   105  //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   106  //     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
   107  //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   108  #define SHA512T2(a, b, c) \
   109  	MOVQ	a, DI; \
   110  	MOVQ	c, BX; \
   111  	RORQ	$28, DI; \
   112  	MOVQ	a, DX; \
   113  	ANDQ	b, BX; \
   114  	RORQ	$34, DX; \
   115  	MOVQ	a, CX; \
   116  	ANDQ	c, CX; \
   117  	XORQ	DX, DI; \
   118  	XORQ	CX, BX; \
   119  	MOVQ	a, DX; \
   120  	MOVQ	b, CX; \
   121  	RORQ	$39, DX; \
   122  	ANDQ	a, CX; \
   123  	XORQ	CX, BX; \
   124  	XORQ	DX, DI; \
   125  	ADDQ	DI, BX
   126  
   127  // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   128  // The values for e and a are stored in d and h, ready for rotation.
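// The h = g, g = f, ... shuffling from the pseudocode above costs no moves:
// each call site simply passes the eight registers rotated by one position,
// so only d (the next e) and h (the next a) have to be written here.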
   129  #define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
   130  	SHA512T1(const, e, f, g, h); \
   131  	SHA512T2(a, b, c); \
   132  	MOVQ	BX, h; \
   133  	ADDQ	AX, d; \
   134  	ADDQ	AX, h
   135  
   136  #define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
   137  	MSGSCHEDULE0(index); \
   138  	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   139  
   140  #define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
   141  	MSGSCHEDULE1(index); \
   142  	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   143  
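// The 648-byte frame holds the 80-qword (640-byte) message schedule at 0(SP)
// plus the saved end-of-input pointer at 640(SP).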
   144  TEXT ·blockAMD64(SB),0,$648-32
   145  	MOVQ	p_base+8(FP), SI
   146  	MOVQ	p_len+16(FP), DX
   147  	SHRQ	$7, DX
    148  	SHLQ	$7, DX			// DX = len(p) rounded down to a whole number of 128-byte blocks
   149  
    150  	LEAQ	(SI)(DX*1), DI		// DI = pointer just past the last full block
    151  	MOVQ	DI, 640(SP)		// save the end pointer above the message schedule
   152  	CMPQ	SI, DI
   153  	JEQ	end
   154  
   155  	MOVQ	dig+0(FP), BP
   156  	MOVQ	(0*8)(BP), R8		// a = H0
   157  	MOVQ	(1*8)(BP), R9		// b = H1
   158  	MOVQ	(2*8)(BP), R10		// c = H2
   159  	MOVQ	(3*8)(BP), R11		// d = H3
   160  	MOVQ	(4*8)(BP), R12		// e = H4
   161  	MOVQ	(5*8)(BP), R13		// f = H5
   162  	MOVQ	(6*8)(BP), R14		// g = H6
   163  	MOVQ	(7*8)(BP), R15		// h = H7
   164  
   165  loop:
   166  	MOVQ	SP, BP			// message schedule
   167  
   168  	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
   169  	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
   170  	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
   171  	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
   172  	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
   173  	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
   174  	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
   175  	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
   176  	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
   177  	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
   178  	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
   179  	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
   180  	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
   181  	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
   182  	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
   183  	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
   184  
   185  	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
   186  	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
   187  	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
   188  	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
   189  	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
   190  	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
   191  	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
   192  	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
   193  	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
   194  	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
   195  	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
   196  	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
   197  	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
   198  	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
   199  	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
   200  	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
   201  	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
   202  	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
   203  	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
   204  	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
   205  	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
   206  	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
   207  	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
   208  	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
   209  	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
   210  	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
   211  	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
   212  	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
   213  	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
   214  	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
   215  	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
   216  	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
   217  	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
   218  	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
   219  	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
   220  	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
   221  	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
   222  	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
   223  	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
   224  	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
   225  	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
   226  	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
   227  	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
   228  	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
   229  	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
   230  	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
   231  	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
   232  	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
   233  	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
   234  	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
   235  	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
   236  	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
   237  	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
   238  	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
   239  	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
   240  	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
   241  	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
   242  	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
   243  	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
   244  	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
   245  	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
   246  	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
   247  	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
   248  	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
   249  
   250  	MOVQ	dig+0(FP), BP
   251  	ADDQ	(0*8)(BP), R8	// H0 = a + H0
   252  	MOVQ	R8, (0*8)(BP)
   253  	ADDQ	(1*8)(BP), R9	// H1 = b + H1
   254  	MOVQ	R9, (1*8)(BP)
   255  	ADDQ	(2*8)(BP), R10	// H2 = c + H2
   256  	MOVQ	R10, (2*8)(BP)
   257  	ADDQ	(3*8)(BP), R11	// H3 = d + H3
   258  	MOVQ	R11, (3*8)(BP)
   259  	ADDQ	(4*8)(BP), R12	// H4 = e + H4
   260  	MOVQ	R12, (4*8)(BP)
   261  	ADDQ	(5*8)(BP), R13	// H5 = f + H5
   262  	MOVQ	R13, (5*8)(BP)
   263  	ADDQ	(6*8)(BP), R14	// H6 = g + H6
   264  	MOVQ	R14, (6*8)(BP)
   265  	ADDQ	(7*8)(BP), R15	// H7 = h + H7
   266  	MOVQ	R15, (7*8)(BP)
   267  
   268  	ADDQ	$128, SI
   269  	CMPQ	SI, 640(SP)
   270  	JB	loop
   271  
   272  end:
   273  	RET
   274  
   275  // Version below is based on "Fast SHA512 Implementations on Intel
   276  // Architecture Processors" White-paper
   277  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
   278  // AVX2 version by Intel, same algorithm in Linux kernel:
   279  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
   280  
   281  // James Guilford <james.guilford@intel.com>
   282  // Kirk Yap <kirk.s.yap@intel.com>
   283  // Tim Chen <tim.c.chen@linux.intel.com>
   284  // David Cote <david.m.cote@intel.com>
   285  // Aleksey Sidorov <aleksey.sidorov@intel.com>
   286  
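// Stack frame layout for blockAVX2: 32 bytes of round-constant-added schedule
// words (frame_YFER), the round-group counter (frame_SRND), the current input
// pointer (frame_INP) and the end-of-input pointer (frame_INPEND), 56 bytes in
// total, matching the frame size declared below.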
   287  #define YFER_SIZE (4*8)
   288  #define SRND_SIZE (1*8)
   289  #define INP_SIZE (1*8)
   290  
   291  #define frame_YFER (0)
   292  #define frame_SRND (frame_YFER + YFER_SIZE)
   293  #define frame_INP (frame_SRND + SRND_SIZE)
   294  #define frame_INPEND (frame_INP + INP_SIZE)
   295  
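// addm(p1, p2): p2 += p1, then p1 = p2. At the call sites below p1 is a digest
// word in memory and p2 the corresponding working register, so this folds the
// register into the digest and keeps the updated value in the register.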
   296  #define addm(p1, p2) \
   297  	ADDQ p1, p2; \
   298  	MOVQ p2, p1
   299  
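// COPY_YMM_AND_BSWAP(p1, p2, p3): load 32 bytes of message from p2 into the
// YMM register p1 and byte-swap each 64-bit lane using the shuffle mask p3.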
   300  #define COPY_YMM_AND_BSWAP(p1, p2, p3) \
   301  	VMOVDQU p2, p1;    \
   302  	VPSHUFB p3, p1, p1
   303  
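// MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL): YDST = {YSRC1, YSRC2} >> (RVAL*8),
// i.e. the low 32 bytes of the 64-byte value formed by YSRC1 (high half) and
// YSRC2 (low half) shifted right by RVAL bytes.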
   304  #define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
   305  	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
   306  	VPALIGNR   $RVAL, YSRC2, YDST, YDST
   307  
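// VPSHUFB mask that reverses the byte order within each 64-bit lane,
// converting big-endian message words to host order.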
   308  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
   309  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
   310  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
   311  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
   312  
   313  GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
   314  
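// Mask with the low 128 bits clear and the high 128 bits set; ANDing a YMM
// register with it keeps only the upper two qwords.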
   315  DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
   316  DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
   317  DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   318  DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   319  
   320  GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
   321  
   322  TEXT ·blockAVX2(SB), NOSPLIT, $56-32
   323  	MOVQ dig+0(FP), SI
   324  	MOVQ p_base+8(FP), DI
   325  	MOVQ p_len+16(FP), DX
   326  
    327  	SHRQ $7, DX
    328  	SHLQ $7, DX			// DX = len(p) rounded down to a whole number of 128-byte blocks
    329  
    330  	JZ   done_hash			// nothing to do for less than one full block
    331  	ADDQ DI, DX
    332  	MOVQ DX, frame_INPEND(SP)	// end-of-input pointer
   333  
    334  	MOVQ (0*8)(SI), AX	// a = H0
    335  	MOVQ (1*8)(SI), BX	// b = H1
    336  	MOVQ (2*8)(SI), CX	// c = H2
    337  	MOVQ (3*8)(SI), R8	// d = H3
    338  	MOVQ (4*8)(SI), DX	// e = H4
    339  	MOVQ (5*8)(SI), R9	// f = H5
    340  	MOVQ (6*8)(SI), R10	// g = H6
    341  	MOVQ (7*8)(SI), R11	// h = H7
   342  
   343  	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
   344  
   345  loop0:
    346  	MOVQ ·_K+0(SB), BP	// BP walks the round constant table _K
   347  
    348  	// byte swap first 16 qwords
   349  	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
   350  	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
   351  	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
   352  	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
   353  
   354  	MOVQ DI, frame_INP(SP)
   355  
    356  	// schedule 64 input qwords, by doing 4 iterations of 16 rounds each
   357  	MOVQ $4, frame_SRND(SP)
   358  
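// Each loop1 iteration adds the round constants for the next 16 schedule
// words, runs the corresponding 16 rounds, and interleaves the vector code
// that computes the 16 schedule words needed 16 rounds later, rotating the
// schedule through Y4-Y7.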
   359  loop1:
   360  	VPADDQ  (BP), Y4, Y0
   361  	VMOVDQU Y0, frame_YFER(SP)
   362  
   363  	MY_VPALIGNR(Y0, Y7, Y6, 8)
   364  
   365  	VPADDQ Y4, Y0, Y0
   366  
   367  	MY_VPALIGNR(Y1, Y5, Y4, 8)
   368  
   369  	VPSRLQ $1, Y1, Y2
   370  	VPSLLQ $(64-1), Y1, Y3
   371  	VPOR   Y2, Y3, Y3
   372  
   373  	VPSRLQ $7, Y1, Y8
   374  
   375  	MOVQ  AX, DI
   376  	RORXQ $41, DX, R13
   377  	RORXQ $18, DX, R14
   378  	ADDQ  frame_YFER(SP), R11
   379  	ORQ   CX, DI
   380  	MOVQ  R9, R15
   381  	RORXQ $34, AX, R12
   382  
   383  	XORQ  R14, R13
   384  	XORQ  R10, R15
   385  	RORXQ $14, DX, R14
   386  
   387  	ANDQ  DX, R15
   388  	XORQ  R14, R13
   389  	RORXQ $39, AX, R14
   390  	ADDQ  R11, R8
   391  
   392  	ANDQ  BX, DI
   393  	XORQ  R12, R14
   394  	RORXQ $28, AX, R12
   395  
   396  	XORQ R10, R15
   397  	XORQ R12, R14
   398  	MOVQ AX, R12
   399  	ANDQ CX, R12
   400  
   401  	ADDQ R13, R15
   402  	ORQ  R12, DI
   403  	ADDQ R14, R11
   404  
   405  	ADDQ R15, R8
   406  
   407  	ADDQ R15, R11
   408  	ADDQ DI, R11
   409  
   410  	VPSRLQ $8, Y1, Y2
   411  	VPSLLQ $(64-8), Y1, Y1
   412  	VPOR   Y2, Y1, Y1
   413  
   414  	VPXOR Y8, Y3, Y3
   415  	VPXOR Y1, Y3, Y1
   416  
   417  	VPADDQ Y1, Y0, Y0
   418  
   419  	VPERM2F128 $0x0, Y0, Y0, Y4
   420  
   421  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   422  
   423  	VPERM2F128 $0x11, Y7, Y7, Y2
   424  	VPSRLQ     $6, Y2, Y8
   425  
   426  	MOVQ  R11, DI
   427  	RORXQ $41, R8, R13
   428  	RORXQ $18, R8, R14
   429  	ADDQ  1*8+frame_YFER(SP), R10
   430  	ORQ   BX, DI
   431  
   432  	MOVQ  DX, R15
   433  	RORXQ $34, R11, R12
   434  	XORQ  R14, R13
   435  	XORQ  R9, R15
   436  
   437  	RORXQ $14, R8, R14
   438  	XORQ  R14, R13
   439  	RORXQ $39, R11, R14
   440  	ANDQ  R8, R15
   441  	ADDQ  R10, CX
   442  
   443  	ANDQ AX, DI
   444  	XORQ R12, R14
   445  
   446  	RORXQ $28, R11, R12
   447  	XORQ  R9, R15
   448  
   449  	XORQ R12, R14
   450  	MOVQ R11, R12
   451  	ANDQ BX, R12
   452  	ADDQ R13, R15
   453  
   454  	ORQ  R12, DI
   455  	ADDQ R14, R10
   456  
   457  	ADDQ R15, CX
   458  	ADDQ R15, R10
   459  	ADDQ DI, R10
   460  
   461  	VPSRLQ $19, Y2, Y3
   462  	VPSLLQ $(64-19), Y2, Y1
   463  	VPOR   Y1, Y3, Y3
   464  	VPXOR  Y3, Y8, Y8
   465  	VPSRLQ $61, Y2, Y3
   466  	VPSLLQ $(64-61), Y2, Y1
   467  	VPOR   Y1, Y3, Y3
   468  	VPXOR  Y3, Y8, Y8
   469  
   470  	VPADDQ Y8, Y4, Y4
   471  
   472  	VPSRLQ $6, Y4, Y8
   473  
   474  	MOVQ  R10, DI
   475  	RORXQ $41, CX, R13
   476  	ADDQ  2*8+frame_YFER(SP), R9
   477  
   478  	RORXQ $18, CX, R14
   479  	ORQ   AX, DI
   480  	MOVQ  R8, R15
   481  	XORQ  DX, R15
   482  
   483  	RORXQ $34, R10, R12
   484  	XORQ  R14, R13
   485  	ANDQ  CX, R15
   486  
   487  	RORXQ $14, CX, R14
   488  	ADDQ  R9, BX
   489  	ANDQ  R11, DI
   490  
   491  	XORQ  R14, R13
   492  	RORXQ $39, R10, R14
   493  	XORQ  DX, R15
   494  
   495  	XORQ  R12, R14
   496  	RORXQ $28, R10, R12
   497  
   498  	XORQ R12, R14
   499  	MOVQ R10, R12
   500  	ANDQ AX, R12
   501  	ADDQ R13, R15
   502  
   503  	ORQ  R12, DI
   504  	ADDQ R14, R9
   505  	ADDQ R15, BX
   506  	ADDQ R15, R9
   507  
   508  	ADDQ DI, R9
   509  
   510  	VPSRLQ $19, Y4, Y3
   511  	VPSLLQ $(64-19), Y4, Y1
   512  	VPOR   Y1, Y3, Y3
   513  	VPXOR  Y3, Y8, Y8
   514  	VPSRLQ $61, Y4, Y3
   515  	VPSLLQ $(64-61), Y4, Y1
   516  	VPOR   Y1, Y3, Y3
   517  	VPXOR  Y3, Y8, Y8
   518  
   519  	VPADDQ Y8, Y0, Y2
   520  
   521  	VPBLENDD $0xF0, Y2, Y4, Y4
   522  
   523  	MOVQ  R9, DI
   524  	RORXQ $41, BX, R13
   525  	RORXQ $18, BX, R14
   526  	ADDQ  3*8+frame_YFER(SP), DX
   527  	ORQ   R11, DI
   528  
   529  	MOVQ  CX, R15
   530  	RORXQ $34, R9, R12
   531  	XORQ  R14, R13
   532  	XORQ  R8, R15
   533  
   534  	RORXQ $14, BX, R14
   535  	ANDQ  BX, R15
   536  	ADDQ  DX, AX
   537  	ANDQ  R10, DI
   538  
   539  	XORQ R14, R13
   540  	XORQ R8, R15
   541  
   542  	RORXQ $39, R9, R14
   543  	ADDQ  R13, R15
   544  
   545  	XORQ R12, R14
   546  	ADDQ R15, AX
   547  
   548  	RORXQ $28, R9, R12
   549  
   550  	XORQ R12, R14
   551  	MOVQ R9, R12
   552  	ANDQ R11, R12
   553  	ORQ  R12, DI
   554  
   555  	ADDQ R14, DX
   556  	ADDQ R15, DX
   557  	ADDQ DI, DX
   558  
   559  	VPADDQ  1*32(BP), Y5, Y0
   560  	VMOVDQU Y0, frame_YFER(SP)
   561  
   562  	MY_VPALIGNR(Y0, Y4, Y7, 8)
   563  
   564  	VPADDQ Y5, Y0, Y0
   565  
   566  	MY_VPALIGNR(Y1, Y6, Y5, 8)
   567  
   568  	VPSRLQ $1, Y1, Y2
   569  	VPSLLQ $(64-1), Y1, Y3
   570  	VPOR   Y2, Y3, Y3
   571  
   572  	VPSRLQ $7, Y1, Y8
   573  
   574  	MOVQ  DX, DI
   575  	RORXQ $41, AX, R13
   576  	RORXQ $18, AX, R14
   577  	ADDQ  frame_YFER(SP), R8
   578  	ORQ   R10, DI
   579  	MOVQ  BX, R15
   580  	RORXQ $34, DX, R12
   581  
   582  	XORQ  R14, R13
   583  	XORQ  CX, R15
   584  	RORXQ $14, AX, R14
   585  
   586  	ANDQ  AX, R15
   587  	XORQ  R14, R13
   588  	RORXQ $39, DX, R14
   589  	ADDQ  R8, R11
   590  
   591  	ANDQ  R9, DI
   592  	XORQ  R12, R14
   593  	RORXQ $28, DX, R12
   594  
   595  	XORQ CX, R15
   596  	XORQ R12, R14
   597  	MOVQ DX, R12
   598  	ANDQ R10, R12
   599  
   600  	ADDQ R13, R15
   601  	ORQ  R12, DI
   602  	ADDQ R14, R8
   603  
   604  	ADDQ R15, R11
   605  
   606  	ADDQ R15, R8
   607  	ADDQ DI, R8
   608  
   609  	VPSRLQ $8, Y1, Y2
   610  	VPSLLQ $(64-8), Y1, Y1
   611  	VPOR   Y2, Y1, Y1
   612  
   613  	VPXOR Y8, Y3, Y3
   614  	VPXOR Y1, Y3, Y1
   615  
   616  	VPADDQ Y1, Y0, Y0
   617  
   618  	VPERM2F128 $0x0, Y0, Y0, Y5
   619  
   620  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   621  
   622  	VPERM2F128 $0x11, Y4, Y4, Y2
   623  	VPSRLQ     $6, Y2, Y8
   624  
   625  	MOVQ  R8, DI
   626  	RORXQ $41, R11, R13
   627  	RORXQ $18, R11, R14
   628  	ADDQ  1*8+frame_YFER(SP), CX
   629  	ORQ   R9, DI
   630  
   631  	MOVQ  AX, R15
   632  	RORXQ $34, R8, R12
   633  	XORQ  R14, R13
   634  	XORQ  BX, R15
   635  
   636  	RORXQ $14, R11, R14
   637  	XORQ  R14, R13
   638  	RORXQ $39, R8, R14
   639  	ANDQ  R11, R15
   640  	ADDQ  CX, R10
   641  
   642  	ANDQ DX, DI
   643  	XORQ R12, R14
   644  
   645  	RORXQ $28, R8, R12
   646  	XORQ  BX, R15
   647  
   648  	XORQ R12, R14
   649  	MOVQ R8, R12
   650  	ANDQ R9, R12
   651  	ADDQ R13, R15
   652  
   653  	ORQ  R12, DI
   654  	ADDQ R14, CX
   655  
   656  	ADDQ R15, R10
   657  	ADDQ R15, CX
   658  	ADDQ DI, CX
   659  
   660  	VPSRLQ $19, Y2, Y3
   661  	VPSLLQ $(64-19), Y2, Y1
   662  	VPOR   Y1, Y3, Y3
   663  	VPXOR  Y3, Y8, Y8
   664  	VPSRLQ $61, Y2, Y3
   665  	VPSLLQ $(64-61), Y2, Y1
   666  	VPOR   Y1, Y3, Y3
   667  	VPXOR  Y3, Y8, Y8
   668  
   669  	VPADDQ Y8, Y5, Y5
   670  
   671  	VPSRLQ $6, Y5, Y8
   672  
   673  	MOVQ  CX, DI
   674  	RORXQ $41, R10, R13
   675  	ADDQ  2*8+frame_YFER(SP), BX
   676  
   677  	RORXQ $18, R10, R14
   678  	ORQ   DX, DI
   679  	MOVQ  R11, R15
   680  	XORQ  AX, R15
   681  
   682  	RORXQ $34, CX, R12
   683  	XORQ  R14, R13
   684  	ANDQ  R10, R15
   685  
   686  	RORXQ $14, R10, R14
   687  	ADDQ  BX, R9
   688  	ANDQ  R8, DI
   689  
   690  	XORQ  R14, R13
   691  	RORXQ $39, CX, R14
   692  	XORQ  AX, R15
   693  
   694  	XORQ  R12, R14
   695  	RORXQ $28, CX, R12
   696  
   697  	XORQ R12, R14
   698  	MOVQ CX, R12
   699  	ANDQ DX, R12
   700  	ADDQ R13, R15
   701  
   702  	ORQ  R12, DI
   703  	ADDQ R14, BX
   704  	ADDQ R15, R9
   705  	ADDQ R15, BX
   706  
   707  	ADDQ DI, BX
   708  
   709  	VPSRLQ $19, Y5, Y3
   710  	VPSLLQ $(64-19), Y5, Y1
   711  	VPOR   Y1, Y3, Y3
   712  	VPXOR  Y3, Y8, Y8
   713  	VPSRLQ $61, Y5, Y3
   714  	VPSLLQ $(64-61), Y5, Y1
   715  	VPOR   Y1, Y3, Y3
   716  	VPXOR  Y3, Y8, Y8
   717  
   718  	VPADDQ Y8, Y0, Y2
   719  
   720  	VPBLENDD $0xF0, Y2, Y5, Y5
   721  
   722  	MOVQ  BX, DI
   723  	RORXQ $41, R9, R13
   724  	RORXQ $18, R9, R14
   725  	ADDQ  3*8+frame_YFER(SP), AX
   726  	ORQ   R8, DI
   727  
   728  	MOVQ  R10, R15
   729  	RORXQ $34, BX, R12
   730  	XORQ  R14, R13
   731  	XORQ  R11, R15
   732  
   733  	RORXQ $14, R9, R14
   734  	ANDQ  R9, R15
   735  	ADDQ  AX, DX
   736  	ANDQ  CX, DI
   737  
   738  	XORQ R14, R13
   739  	XORQ R11, R15
   740  
   741  	RORXQ $39, BX, R14
   742  	ADDQ  R13, R15
   743  
   744  	XORQ R12, R14
   745  	ADDQ R15, DX
   746  
   747  	RORXQ $28, BX, R12
   748  
   749  	XORQ R12, R14
   750  	MOVQ BX, R12
   751  	ANDQ R8, R12
   752  	ORQ  R12, DI
   753  
   754  	ADDQ R14, AX
   755  	ADDQ R15, AX
   756  	ADDQ DI, AX
   757  
   758  	VPADDQ  2*32(BP), Y6, Y0
   759  	VMOVDQU Y0, frame_YFER(SP)
   760  
   761  	MY_VPALIGNR(Y0, Y5, Y4, 8)
   762  
   763  	VPADDQ Y6, Y0, Y0
   764  
   765  	MY_VPALIGNR(Y1, Y7, Y6, 8)
   766  
   767  	VPSRLQ $1, Y1, Y2
   768  	VPSLLQ $(64-1), Y1, Y3
   769  	VPOR   Y2, Y3, Y3
   770  
   771  	VPSRLQ $7, Y1, Y8
   772  
   773  	MOVQ  AX, DI
   774  	RORXQ $41, DX, R13
   775  	RORXQ $18, DX, R14
   776  	ADDQ  frame_YFER(SP), R11
   777  	ORQ   CX, DI
   778  	MOVQ  R9, R15
   779  	RORXQ $34, AX, R12
   780  
   781  	XORQ  R14, R13
   782  	XORQ  R10, R15
   783  	RORXQ $14, DX, R14
   784  
   785  	ANDQ  DX, R15
   786  	XORQ  R14, R13
   787  	RORXQ $39, AX, R14
   788  	ADDQ  R11, R8
   789  
   790  	ANDQ  BX, DI
   791  	XORQ  R12, R14
   792  	RORXQ $28, AX, R12
   793  
   794  	XORQ R10, R15
   795  	XORQ R12, R14
   796  	MOVQ AX, R12
   797  	ANDQ CX, R12
   798  
   799  	ADDQ R13, R15
   800  	ORQ  R12, DI
   801  	ADDQ R14, R11
   802  
   803  	ADDQ R15, R8
   804  
   805  	ADDQ R15, R11
   806  	ADDQ DI, R11
   807  
   808  	VPSRLQ $8, Y1, Y2
   809  	VPSLLQ $(64-8), Y1, Y1
   810  	VPOR   Y2, Y1, Y1
   811  
   812  	VPXOR Y8, Y3, Y3
   813  	VPXOR Y1, Y3, Y1
   814  
   815  	VPADDQ Y1, Y0, Y0
   816  
   817  	VPERM2F128 $0x0, Y0, Y0, Y6
   818  
   819  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   820  
   821  	VPERM2F128 $0x11, Y5, Y5, Y2
   822  	VPSRLQ     $6, Y2, Y8
   823  
   824  	MOVQ  R11, DI
   825  	RORXQ $41, R8, R13
   826  	RORXQ $18, R8, R14
   827  	ADDQ  1*8+frame_YFER(SP), R10
   828  	ORQ   BX, DI
   829  
   830  	MOVQ  DX, R15
   831  	RORXQ $34, R11, R12
   832  	XORQ  R14, R13
   833  	XORQ  R9, R15
   834  
   835  	RORXQ $14, R8, R14
   836  	XORQ  R14, R13
   837  	RORXQ $39, R11, R14
   838  	ANDQ  R8, R15
   839  	ADDQ  R10, CX
   840  
   841  	ANDQ AX, DI
   842  	XORQ R12, R14
   843  
   844  	RORXQ $28, R11, R12
   845  	XORQ  R9, R15
   846  
   847  	XORQ R12, R14
   848  	MOVQ R11, R12
   849  	ANDQ BX, R12
   850  	ADDQ R13, R15
   851  
   852  	ORQ  R12, DI
   853  	ADDQ R14, R10
   854  
   855  	ADDQ R15, CX
   856  	ADDQ R15, R10
   857  	ADDQ DI, R10
   858  
   859  	VPSRLQ $19, Y2, Y3
   860  	VPSLLQ $(64-19), Y2, Y1
   861  	VPOR   Y1, Y3, Y3
   862  	VPXOR  Y3, Y8, Y8
   863  	VPSRLQ $61, Y2, Y3
   864  	VPSLLQ $(64-61), Y2, Y1
   865  	VPOR   Y1, Y3, Y3
   866  	VPXOR  Y3, Y8, Y8
   867  
   868  	VPADDQ Y8, Y6, Y6
   869  
   870  	VPSRLQ $6, Y6, Y8
   871  
   872  	MOVQ  R10, DI
   873  	RORXQ $41, CX, R13
   874  	ADDQ  2*8+frame_YFER(SP), R9
   875  
   876  	RORXQ $18, CX, R14
   877  	ORQ   AX, DI
   878  	MOVQ  R8, R15
   879  	XORQ  DX, R15
   880  
   881  	RORXQ $34, R10, R12
   882  	XORQ  R14, R13
   883  	ANDQ  CX, R15
   884  
   885  	RORXQ $14, CX, R14
   886  	ADDQ  R9, BX
   887  	ANDQ  R11, DI
   888  
   889  	XORQ  R14, R13
   890  	RORXQ $39, R10, R14
   891  	XORQ  DX, R15
   892  
   893  	XORQ  R12, R14
   894  	RORXQ $28, R10, R12
   895  
   896  	XORQ R12, R14
   897  	MOVQ R10, R12
   898  	ANDQ AX, R12
   899  	ADDQ R13, R15
   900  
   901  	ORQ  R12, DI
   902  	ADDQ R14, R9
   903  	ADDQ R15, BX
   904  	ADDQ R15, R9
   905  
   906  	ADDQ DI, R9
   907  
   908  	VPSRLQ $19, Y6, Y3
   909  	VPSLLQ $(64-19), Y6, Y1
   910  	VPOR   Y1, Y3, Y3
   911  	VPXOR  Y3, Y8, Y8
   912  	VPSRLQ $61, Y6, Y3
   913  	VPSLLQ $(64-61), Y6, Y1
   914  	VPOR   Y1, Y3, Y3
   915  	VPXOR  Y3, Y8, Y8
   916  
   917  	VPADDQ Y8, Y0, Y2
   918  
   919  	VPBLENDD $0xF0, Y2, Y6, Y6
   920  
   921  	MOVQ  R9, DI
   922  	RORXQ $41, BX, R13
   923  	RORXQ $18, BX, R14
   924  	ADDQ  3*8+frame_YFER(SP), DX
   925  	ORQ   R11, DI
   926  
   927  	MOVQ  CX, R15
   928  	RORXQ $34, R9, R12
   929  	XORQ  R14, R13
   930  	XORQ  R8, R15
   931  
   932  	RORXQ $14, BX, R14
   933  	ANDQ  BX, R15
   934  	ADDQ  DX, AX
   935  	ANDQ  R10, DI
   936  
   937  	XORQ R14, R13
   938  	XORQ R8, R15
   939  
   940  	RORXQ $39, R9, R14
   941  	ADDQ  R13, R15
   942  
   943  	XORQ R12, R14
   944  	ADDQ R15, AX
   945  
   946  	RORXQ $28, R9, R12
   947  
   948  	XORQ R12, R14
   949  	MOVQ R9, R12
   950  	ANDQ R11, R12
   951  	ORQ  R12, DI
   952  
   953  	ADDQ R14, DX
   954  	ADDQ R15, DX
   955  	ADDQ DI, DX
   956  
   957  	VPADDQ  3*32(BP), Y7, Y0
   958  	VMOVDQU Y0, frame_YFER(SP)
   959  	ADDQ    $(4*32), BP
   960  
   961  	MY_VPALIGNR(Y0, Y6, Y5, 8)
   962  
   963  	VPADDQ Y7, Y0, Y0
   964  
   965  	MY_VPALIGNR(Y1, Y4, Y7, 8)
   966  
   967  	VPSRLQ $1, Y1, Y2
   968  	VPSLLQ $(64-1), Y1, Y3
   969  	VPOR   Y2, Y3, Y3
   970  
   971  	VPSRLQ $7, Y1, Y8
   972  
   973  	MOVQ  DX, DI
   974  	RORXQ $41, AX, R13
   975  	RORXQ $18, AX, R14
   976  	ADDQ  frame_YFER(SP), R8
   977  	ORQ   R10, DI
   978  	MOVQ  BX, R15
   979  	RORXQ $34, DX, R12
   980  
   981  	XORQ  R14, R13
   982  	XORQ  CX, R15
   983  	RORXQ $14, AX, R14
   984  
   985  	ANDQ  AX, R15
   986  	XORQ  R14, R13
   987  	RORXQ $39, DX, R14
   988  	ADDQ  R8, R11
   989  
   990  	ANDQ  R9, DI
   991  	XORQ  R12, R14
   992  	RORXQ $28, DX, R12
   993  
   994  	XORQ CX, R15
   995  	XORQ R12, R14
   996  	MOVQ DX, R12
   997  	ANDQ R10, R12
   998  
   999  	ADDQ R13, R15
  1000  	ORQ  R12, DI
  1001  	ADDQ R14, R8
  1002  
  1003  	ADDQ R15, R11
  1004  
  1005  	ADDQ R15, R8
  1006  	ADDQ DI, R8
  1007  
  1008  	VPSRLQ $8, Y1, Y2
  1009  	VPSLLQ $(64-8), Y1, Y1
  1010  	VPOR   Y2, Y1, Y1
  1011  
  1012  	VPXOR Y8, Y3, Y3
  1013  	VPXOR Y1, Y3, Y1
  1014  
  1015  	VPADDQ Y1, Y0, Y0
  1016  
  1017  	VPERM2F128 $0x0, Y0, Y0, Y7
  1018  
  1019  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
  1020  
  1021  	VPERM2F128 $0x11, Y6, Y6, Y2
  1022  	VPSRLQ     $6, Y2, Y8
  1023  
  1024  	MOVQ  R8, DI
  1025  	RORXQ $41, R11, R13
  1026  	RORXQ $18, R11, R14
  1027  	ADDQ  1*8+frame_YFER(SP), CX
  1028  	ORQ   R9, DI
  1029  
  1030  	MOVQ  AX, R15
  1031  	RORXQ $34, R8, R12
  1032  	XORQ  R14, R13
  1033  	XORQ  BX, R15
  1034  
  1035  	RORXQ $14, R11, R14
  1036  	XORQ  R14, R13
  1037  	RORXQ $39, R8, R14
  1038  	ANDQ  R11, R15
  1039  	ADDQ  CX, R10
  1040  
  1041  	ANDQ DX, DI
  1042  	XORQ R12, R14
  1043  
  1044  	RORXQ $28, R8, R12
  1045  	XORQ  BX, R15
  1046  
  1047  	XORQ R12, R14
  1048  	MOVQ R8, R12
  1049  	ANDQ R9, R12
  1050  	ADDQ R13, R15
  1051  
  1052  	ORQ  R12, DI
  1053  	ADDQ R14, CX
  1054  
  1055  	ADDQ R15, R10
  1056  	ADDQ R15, CX
  1057  	ADDQ DI, CX
  1058  
  1059  	VPSRLQ $19, Y2, Y3
  1060  	VPSLLQ $(64-19), Y2, Y1
  1061  	VPOR   Y1, Y3, Y3
  1062  	VPXOR  Y3, Y8, Y8
  1063  	VPSRLQ $61, Y2, Y3
  1064  	VPSLLQ $(64-61), Y2, Y1
  1065  	VPOR   Y1, Y3, Y3
  1066  	VPXOR  Y3, Y8, Y8
  1067  
  1068  	VPADDQ Y8, Y7, Y7
  1069  
  1070  	VPSRLQ $6, Y7, Y8
  1071  
  1072  	MOVQ  CX, DI
  1073  	RORXQ $41, R10, R13
  1074  	ADDQ  2*8+frame_YFER(SP), BX
  1075  
  1076  	RORXQ $18, R10, R14
  1077  	ORQ   DX, DI
  1078  	MOVQ  R11, R15
  1079  	XORQ  AX, R15
  1080  
  1081  	RORXQ $34, CX, R12
  1082  	XORQ  R14, R13
  1083  	ANDQ  R10, R15
  1084  
  1085  	RORXQ $14, R10, R14
  1086  	ADDQ  BX, R9
  1087  	ANDQ  R8, DI
  1088  
  1089  	XORQ  R14, R13
  1090  	RORXQ $39, CX, R14
  1091  	XORQ  AX, R15
  1092  
  1093  	XORQ  R12, R14
  1094  	RORXQ $28, CX, R12
  1095  
  1096  	XORQ R12, R14
  1097  	MOVQ CX, R12
  1098  	ANDQ DX, R12
  1099  	ADDQ R13, R15
  1100  
  1101  	ORQ  R12, DI
  1102  	ADDQ R14, BX
  1103  	ADDQ R15, R9
  1104  	ADDQ R15, BX
  1105  
  1106  	ADDQ DI, BX
  1107  
  1108  	VPSRLQ $19, Y7, Y3
  1109  	VPSLLQ $(64-19), Y7, Y1
  1110  	VPOR   Y1, Y3, Y3
  1111  	VPXOR  Y3, Y8, Y8
  1112  	VPSRLQ $61, Y7, Y3
  1113  	VPSLLQ $(64-61), Y7, Y1
  1114  	VPOR   Y1, Y3, Y3
  1115  	VPXOR  Y3, Y8, Y8
  1116  
  1117  	VPADDQ Y8, Y0, Y2
  1118  
  1119  	VPBLENDD $0xF0, Y2, Y7, Y7
  1120  
  1121  	MOVQ  BX, DI
  1122  	RORXQ $41, R9, R13
  1123  	RORXQ $18, R9, R14
  1124  	ADDQ  3*8+frame_YFER(SP), AX
  1125  	ORQ   R8, DI
  1126  
  1127  	MOVQ  R10, R15
  1128  	RORXQ $34, BX, R12
  1129  	XORQ  R14, R13
  1130  	XORQ  R11, R15
  1131  
  1132  	RORXQ $14, R9, R14
  1133  	ANDQ  R9, R15
  1134  	ADDQ  AX, DX
  1135  	ANDQ  CX, DI
  1136  
  1137  	XORQ R14, R13
  1138  	XORQ R11, R15
  1139  
  1140  	RORXQ $39, BX, R14
  1141  	ADDQ  R13, R15
  1142  
  1143  	XORQ R12, R14
  1144  	ADDQ R15, DX
  1145  
  1146  	RORXQ $28, BX, R12
  1147  
  1148  	XORQ R12, R14
  1149  	MOVQ BX, R12
  1150  	ANDQ R8, R12
  1151  	ORQ  R12, DI
  1152  
  1153  	ADDQ R14, AX
  1154  	ADDQ R15, AX
  1155  	ADDQ DI, AX
  1156  
  1157  	SUBQ $1, frame_SRND(SP)
  1158  	JNE  loop1
  1159  
  1160  	MOVQ $2, frame_SRND(SP)
  1161  
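// The last 16 rounds need no further message scheduling: loop2 runs twice,
// consuming the constants for 8 rounds per iteration and shifting Y6/Y7 down
// into Y4/Y5.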
  1162  loop2:
  1163  	VPADDQ  (BP), Y4, Y0
  1164  	VMOVDQU Y0, frame_YFER(SP)
  1165  
  1166  	MOVQ  R9, R15
  1167  	RORXQ $41, DX, R13
  1168  	RORXQ $18, DX, R14
  1169  	XORQ  R10, R15
  1170  
  1171  	XORQ  R14, R13
  1172  	RORXQ $14, DX, R14
  1173  	ANDQ  DX, R15
  1174  
  1175  	XORQ  R14, R13
  1176  	RORXQ $34, AX, R12
  1177  	XORQ  R10, R15
  1178  	RORXQ $39, AX, R14
  1179  	MOVQ  AX, DI
  1180  
  1181  	XORQ  R12, R14
  1182  	RORXQ $28, AX, R12
  1183  	ADDQ  frame_YFER(SP), R11
  1184  	ORQ   CX, DI
  1185  
  1186  	XORQ R12, R14
  1187  	MOVQ AX, R12
  1188  	ANDQ BX, DI
  1189  	ANDQ CX, R12
  1190  	ADDQ R13, R15
  1191  
  1192  	ADDQ R11, R8
  1193  	ORQ  R12, DI
  1194  	ADDQ R14, R11
  1195  
  1196  	ADDQ R15, R8
  1197  
  1198  	ADDQ  R15, R11
  1199  	MOVQ  DX, R15
  1200  	RORXQ $41, R8, R13
  1201  	RORXQ $18, R8, R14
  1202  	XORQ  R9, R15
  1203  
  1204  	XORQ  R14, R13
  1205  	RORXQ $14, R8, R14
  1206  	ANDQ  R8, R15
  1207  	ADDQ  DI, R11
  1208  
  1209  	XORQ  R14, R13
  1210  	RORXQ $34, R11, R12
  1211  	XORQ  R9, R15
  1212  	RORXQ $39, R11, R14
  1213  	MOVQ  R11, DI
  1214  
  1215  	XORQ  R12, R14
  1216  	RORXQ $28, R11, R12
  1217  	ADDQ  8*1+frame_YFER(SP), R10
  1218  	ORQ   BX, DI
  1219  
  1220  	XORQ R12, R14
  1221  	MOVQ R11, R12
  1222  	ANDQ AX, DI
  1223  	ANDQ BX, R12
  1224  	ADDQ R13, R15
  1225  
  1226  	ADDQ R10, CX
  1227  	ORQ  R12, DI
  1228  	ADDQ R14, R10
  1229  
  1230  	ADDQ R15, CX
  1231  
  1232  	ADDQ  R15, R10
  1233  	MOVQ  R8, R15
  1234  	RORXQ $41, CX, R13
  1235  	RORXQ $18, CX, R14
  1236  	XORQ  DX, R15
  1237  
  1238  	XORQ  R14, R13
  1239  	RORXQ $14, CX, R14
  1240  	ANDQ  CX, R15
  1241  	ADDQ  DI, R10
  1242  
  1243  	XORQ  R14, R13
  1244  	RORXQ $34, R10, R12
  1245  	XORQ  DX, R15
  1246  	RORXQ $39, R10, R14
  1247  	MOVQ  R10, DI
  1248  
  1249  	XORQ  R12, R14
  1250  	RORXQ $28, R10, R12
  1251  	ADDQ  8*2+frame_YFER(SP), R9
  1252  	ORQ   AX, DI
  1253  
  1254  	XORQ R12, R14
  1255  	MOVQ R10, R12
  1256  	ANDQ R11, DI
  1257  	ANDQ AX, R12
  1258  	ADDQ R13, R15
  1259  
  1260  	ADDQ R9, BX
  1261  	ORQ  R12, DI
  1262  	ADDQ R14, R9
  1263  
  1264  	ADDQ R15, BX
  1265  
  1266  	ADDQ  R15, R9
  1267  	MOVQ  CX, R15
  1268  	RORXQ $41, BX, R13
  1269  	RORXQ $18, BX, R14
  1270  	XORQ  R8, R15
  1271  
  1272  	XORQ  R14, R13
  1273  	RORXQ $14, BX, R14
  1274  	ANDQ  BX, R15
  1275  	ADDQ  DI, R9
  1276  
  1277  	XORQ  R14, R13
  1278  	RORXQ $34, R9, R12
  1279  	XORQ  R8, R15
  1280  	RORXQ $39, R9, R14
  1281  	MOVQ  R9, DI
  1282  
  1283  	XORQ  R12, R14
  1284  	RORXQ $28, R9, R12
  1285  	ADDQ  8*3+frame_YFER(SP), DX
  1286  	ORQ   R11, DI
  1287  
  1288  	XORQ R12, R14
  1289  	MOVQ R9, R12
  1290  	ANDQ R10, DI
  1291  	ANDQ R11, R12
  1292  	ADDQ R13, R15
  1293  
  1294  	ADDQ DX, AX
  1295  	ORQ  R12, DI
  1296  	ADDQ R14, DX
  1297  
  1298  	ADDQ R15, AX
  1299  
  1300  	ADDQ R15, DX
  1301  
  1302  	ADDQ DI, DX
  1303  
  1304  	VPADDQ  1*32(BP), Y5, Y0
  1305  	VMOVDQU Y0, frame_YFER(SP)
  1306  	ADDQ    $(2*32), BP
  1307  
  1308  	MOVQ  BX, R15
  1309  	RORXQ $41, AX, R13
  1310  	RORXQ $18, AX, R14
  1311  	XORQ  CX, R15
  1312  
  1313  	XORQ  R14, R13
  1314  	RORXQ $14, AX, R14
  1315  	ANDQ  AX, R15
  1316  
  1317  	XORQ  R14, R13
  1318  	RORXQ $34, DX, R12
  1319  	XORQ  CX, R15
  1320  	RORXQ $39, DX, R14
  1321  	MOVQ  DX, DI
  1322  
  1323  	XORQ  R12, R14
  1324  	RORXQ $28, DX, R12
  1325  	ADDQ  frame_YFER(SP), R8
  1326  	ORQ   R10, DI
  1327  
  1328  	XORQ R12, R14
  1329  	MOVQ DX, R12
  1330  	ANDQ R9, DI
  1331  	ANDQ R10, R12
  1332  	ADDQ R13, R15
  1333  
  1334  	ADDQ R8, R11
  1335  	ORQ  R12, DI
  1336  	ADDQ R14, R8
  1337  
  1338  	ADDQ R15, R11
  1339  
  1340  	ADDQ  R15, R8
  1341  	MOVQ  AX, R15
  1342  	RORXQ $41, R11, R13
  1343  	RORXQ $18, R11, R14
  1344  	XORQ  BX, R15
  1345  
  1346  	XORQ  R14, R13
  1347  	RORXQ $14, R11, R14
  1348  	ANDQ  R11, R15
  1349  	ADDQ  DI, R8
  1350  
  1351  	XORQ  R14, R13
  1352  	RORXQ $34, R8, R12
  1353  	XORQ  BX, R15
  1354  	RORXQ $39, R8, R14
  1355  	MOVQ  R8, DI
  1356  
  1357  	XORQ  R12, R14
  1358  	RORXQ $28, R8, R12
  1359  	ADDQ  8*1+frame_YFER(SP), CX
  1360  	ORQ   R9, DI
  1361  
  1362  	XORQ R12, R14
  1363  	MOVQ R8, R12
  1364  	ANDQ DX, DI
  1365  	ANDQ R9, R12
  1366  	ADDQ R13, R15
  1367  
  1368  	ADDQ CX, R10
  1369  	ORQ  R12, DI
  1370  	ADDQ R14, CX
  1371  
  1372  	ADDQ R15, R10
  1373  
  1374  	ADDQ  R15, CX
  1375  	MOVQ  R11, R15
  1376  	RORXQ $41, R10, R13
  1377  	RORXQ $18, R10, R14
  1378  	XORQ  AX, R15
  1379  
  1380  	XORQ  R14, R13
  1381  	RORXQ $14, R10, R14
  1382  	ANDQ  R10, R15
  1383  	ADDQ  DI, CX
  1384  
  1385  	XORQ  R14, R13
  1386  	RORXQ $34, CX, R12
  1387  	XORQ  AX, R15
  1388  	RORXQ $39, CX, R14
  1389  	MOVQ  CX, DI
  1390  
  1391  	XORQ  R12, R14
  1392  	RORXQ $28, CX, R12
  1393  	ADDQ  8*2+frame_YFER(SP), BX
  1394  	ORQ   DX, DI
  1395  
  1396  	XORQ R12, R14
  1397  	MOVQ CX, R12
  1398  	ANDQ R8, DI
  1399  	ANDQ DX, R12
  1400  	ADDQ R13, R15
  1401  
  1402  	ADDQ BX, R9
  1403  	ORQ  R12, DI
  1404  	ADDQ R14, BX
  1405  
  1406  	ADDQ R15, R9
  1407  
  1408  	ADDQ  R15, BX
  1409  	MOVQ  R10, R15
  1410  	RORXQ $41, R9, R13
  1411  	RORXQ $18, R9, R14
  1412  	XORQ  R11, R15
  1413  
  1414  	XORQ  R14, R13
  1415  	RORXQ $14, R9, R14
  1416  	ANDQ  R9, R15
  1417  	ADDQ  DI, BX
  1418  
  1419  	XORQ  R14, R13
  1420  	RORXQ $34, BX, R12
  1421  	XORQ  R11, R15
  1422  	RORXQ $39, BX, R14
  1423  	MOVQ  BX, DI
  1424  
  1425  	XORQ  R12, R14
  1426  	RORXQ $28, BX, R12
  1427  	ADDQ  8*3+frame_YFER(SP), AX
  1428  	ORQ   R8, DI
  1429  
  1430  	XORQ R12, R14
  1431  	MOVQ BX, R12
  1432  	ANDQ CX, DI
  1433  	ANDQ R8, R12
  1434  	ADDQ R13, R15
  1435  
  1436  	ADDQ AX, DX
  1437  	ORQ  R12, DI
  1438  	ADDQ R14, AX
  1439  
  1440  	ADDQ R15, DX
  1441  
  1442  	ADDQ R15, AX
  1443  
  1444  	ADDQ DI, AX
  1445  
  1446  	VMOVDQU Y6, Y4
  1447  	VMOVDQU Y7, Y5
  1448  
  1449  	SUBQ $1, frame_SRND(SP)
  1450  	JNE  loop2
  1451  
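// Fold the working variables back into the digest pointed to by SI
// (a..h live in AX, BX, CX, R8, DX, R9, R10, R11).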
  1452  	addm(8*0(SI),AX)
  1453  	addm(8*1(SI),BX)
  1454  	addm(8*2(SI),CX)
  1455  	addm(8*3(SI),R8)
  1456  	addm(8*4(SI),DX)
  1457  	addm(8*5(SI),R9)
  1458  	addm(8*6(SI),R10)
  1459  	addm(8*7(SI),R11)
  1460  
  1461  	MOVQ frame_INP(SP), DI
  1462  	ADDQ $128, DI
  1463  	CMPQ DI, frame_INPEND(SP)
  1464  	JNE  loop0
  1465  
  1466  done_hash:
  1467  	VZEROUPPER
  1468  	RET