golang.org/toolchain@v0.0.1-go1.9rc2.windows-amd64/src/crypto/sha512/sha512block_amd64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // SHA512 block routine. See sha512block.go for Go equivalent.
     8  //
     9  // The algorithm is detailed in FIPS 180-4:
    10  //
    11  //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12  //
    13  // Wt = Mt; for 0 <= t <= 15
    14  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    15  //
    16  // a = H0
    17  // b = H1
    18  // c = H2
    19  // d = H3
    20  // e = H4
    21  // f = H5
    22  // g = H6
    23  // h = H7
    24  //
    25  // for t = 0 to 79 {
    26  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    27  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    28  //    h = g
    29  //    g = f
    30  //    f = e
    31  //    e = d + T1
    32  //    d = c
    33  //    c = b
    34  //    b = a
    35  //    a = T1 + T2
    36  // }
    37  //
    38  // H0 = a + H0
    39  // H1 = b + H1
    40  // H2 = c + H2
    41  // H3 = d + H3
    42  // H4 = e + H4
    43  // H5 = f + H5
    44  // H6 = g + H6
    45  // H7 = h + H7
    46  
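        // For orientation, a rough Go sketch of the computation this file
        // implements. It is only a sketch: sigma0, sigma1, bigSigma0, bigSigma1,
        // ch and maj are hypothetical helpers (spelled out next to the matching
        // macros below), encoding/binary and math/bits are assumed imports, and
        // dig.h and _K are this package's hash state and round-constant table.
        // The real scalar fallback is in sha512block.go.
        //
        //	func blockSketch(dig *digest, p []byte) {
        //		var w [80]uint64
        //		h := &dig.h // running hash state H0..H7
        //		for len(p) >= 128 {
        //			// Wt = Mt (MSGSCHEDULE0)
        //			for t := 0; t < 16; t++ {
        //				w[t] = binary.BigEndian.Uint64(p[t*8:])
        //			}
        //			// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16 (MSGSCHEDULE1)
        //			for t := 16; t < 80; t++ {
        //				w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
        //			}
        //			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
        //			for t := 0; t < 80; t++ {
        //				t1 := hh + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t] // SHA512T1
        //				t2 := bigSigma0(a) + maj(a, b, c)                    // SHA512T2
        //				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
        //			}
        //			h[0] += a; h[1] += b; h[2] += c; h[3] += d
        //			h[4] += e; h[5] += f; h[6] += g; h[7] += hh
        //			p = p[128:]
        //		}
        //	}
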
    47  // Wt = Mt; for 0 <= t <= 15
    48  #define MSGSCHEDULE0(index) \
    49  	MOVQ	(index*8)(SI), AX; \
    50  	BSWAPQ	AX; \
    51  	MOVQ	AX, (index*8)(BP)
    52  
    53  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    54  //   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
    55  //   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
    56  #define MSGSCHEDULE1(index) \
    57  	MOVQ	((index-2)*8)(BP), AX; \
    58  	MOVQ	AX, CX; \
    59  	RORQ	$19, AX; \
    60  	MOVQ	CX, DX; \
    61  	RORQ	$61, CX; \
    62  	SHRQ	$6, DX; \
    63  	MOVQ	((index-15)*8)(BP), BX; \
    64  	XORQ	CX, AX; \
    65  	MOVQ	BX, CX; \
    66  	XORQ	DX, AX; \
    67  	RORQ	$1, BX; \
    68  	MOVQ	CX, DX; \
    69  	SHRQ	$7, DX; \
    70  	RORQ	$8, CX; \
    71  	ADDQ	((index-7)*8)(BP), AX; \
    72  	XORQ	CX, BX; \
    73  	XORQ	DX, BX; \
    74  	ADDQ	((index-16)*8)(BP), BX; \
    75  	ADDQ	BX, AX; \
    76  	MOVQ	AX, ((index)*8)(BP)
    77  
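        // A Go sketch of the small sigma functions used by MSGSCHEDULE1 above
        // (hypothetical helpers; bits.RotateLeft64 from math/bits with a negative
        // count rotates right, matching the RORQ amounts):
        //
        //	func sigma0(x uint64) uint64 {
        //		return bits.RotateLeft64(x, -1) ^ bits.RotateLeft64(x, -8) ^ (x >> 7)
        //	}
        //
        //	func sigma1(x uint64) uint64 {
        //		return bits.RotateLeft64(x, -19) ^ bits.RotateLeft64(x, -61) ^ (x >> 6)
        //	}
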
    78  // Calculate T1 in AX - uses AX, CX and DX registers.
    79  // h is also used as an accumulator. Wt is passed in AX.
    80  //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    81  //     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
    82  //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    83  #define SHA512T1(const, e, f, g, h) \
    84  	MOVQ	$const, DX; \
    85  	ADDQ	AX, h; \
    86  	MOVQ	e, AX; \
    87  	ADDQ	DX, h; \
    88  	MOVQ	e, CX; \
    89  	RORQ	$14, AX; \
    90  	MOVQ	e, DX; \
    91  	RORQ	$18, CX; \
    92  	XORQ	CX, AX; \
    93  	MOVQ	e, CX; \
    94  	RORQ	$41, DX; \
    95  	ANDQ	f, CX; \
    96  	XORQ	AX, DX; \
    97  	MOVQ	e, AX; \
    98  	NOTQ	AX; \
    99  	ADDQ	DX, h; \
   100  	ANDQ	g, AX; \
   101  	XORQ	CX, AX; \
   102  	ADDQ	h, AX
   103  
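        // The same computation as SHA512T1, sketched in Go (bigSigma1 and ch are
        // hypothetical helpers; Kt and Wt are the macro's const and AX inputs):
        //
        //	func bigSigma1(x uint64) uint64 {
        //		return bits.RotateLeft64(x, -14) ^ bits.RotateLeft64(x, -18) ^ bits.RotateLeft64(x, -41)
        //	}
        //
        //	func ch(x, y, z uint64) uint64 { return (x & y) ^ (^x & z) }
        //
        //	// t1 := h + bigSigma1(e) + ch(e, f, g) + Kt + Wt
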
   104  // Calculate T2 in BX - uses BX, CX, DX and DI registers.
   105  //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   106  //     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
   107  //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   108  #define SHA512T2(a, b, c) \
   109  	MOVQ	a, DI; \
   110  	MOVQ	c, BX; \
   111  	RORQ	$28, DI; \
   112  	MOVQ	a, DX; \
   113  	ANDQ	b, BX; \
   114  	RORQ	$34, DX; \
   115  	MOVQ	a, CX; \
   116  	ANDQ	c, CX; \
   117  	XORQ	DX, DI; \
   118  	XORQ	CX, BX; \
   119  	MOVQ	a, DX; \
   120  	MOVQ	b, CX; \
   121  	RORQ	$39, DX; \
   122  	ANDQ	a, CX; \
   123  	XORQ	CX, BX; \
   124  	XORQ	DX, DI; \
   125  	ADDQ	DI, BX
   126  
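        // The same computation as SHA512T2, sketched in Go (bigSigma0 and maj are
        // hypothetical helpers):
        //
        //	func bigSigma0(x uint64) uint64 {
        //		return bits.RotateLeft64(x, -28) ^ bits.RotateLeft64(x, -34) ^ bits.RotateLeft64(x, -39)
        //	}
        //
        //	func maj(x, y, z uint64) uint64 { return (x & y) ^ (x & z) ^ (y & z) }
        //
        //	// t2 := bigSigma0(a) + maj(a, b, c)
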
   127  // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   128  // The values for e and a are stored in d and h, ready for rotation.
   129  #define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
   130  	SHA512T1(const, e, f, g, h); \
   131  	SHA512T2(a, b, c); \
   132  	MOVQ	BX, h; \
   133  	ADDQ	AX, d; \
   134  	ADDQ	AX, h
   135  
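        // In Go terms a whole round is therefore the multiple assignment below;
        // the unrolled rounds get the same effect by renaming registers between
        // SHA512ROUND invocations instead of moving values:
        //
        //	h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
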
   136  #define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
   137  	MSGSCHEDULE0(index); \
   138  	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   139  
   140  #define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
   141  	MSGSCHEDULE1(index); \
   142  	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   143  
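        // blockAMD64 is the scalar implementation. Its 648-byte frame holds the
        // 80-entry message schedule W (80*8 = 640 bytes at 0(SP)) followed by the
        // saved end-of-input pointer at 640(SP).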
   144  TEXT ·blockAMD64(SB),0,$648-32
   145  	MOVQ	p_base+8(FP), SI
   146  	MOVQ	p_len+16(FP), DX
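        	// Round p_len down to a multiple of 128, the SHA-512 block size.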
   147  	SHRQ	$7, DX
   148  	SHLQ	$7, DX
   149  
   150  	LEAQ	(SI)(DX*1), DI
   151  	MOVQ	DI, 640(SP)
   152  	CMPQ	SI, DI
   153  	JEQ	end
   154  
   155  	MOVQ	dig+0(FP), BP
   156  	MOVQ	(0*8)(BP), R8		// a = H0
   157  	MOVQ	(1*8)(BP), R9		// b = H1
   158  	MOVQ	(2*8)(BP), R10		// c = H2
   159  	MOVQ	(3*8)(BP), R11		// d = H3
   160  	MOVQ	(4*8)(BP), R12		// e = H4
   161  	MOVQ	(5*8)(BP), R13		// f = H5
   162  	MOVQ	(6*8)(BP), R14		// g = H6
   163  	MOVQ	(7*8)(BP), R15		// h = H7
   164  
   165  loop:
   166  	MOVQ	SP, BP			// message schedule
   167  
   168  	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
   169  	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
   170  	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
   171  	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
   172  	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
   173  	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
   174  	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
   175  	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
   176  	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
   177  	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
   178  	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
   179  	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
   180  	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
   181  	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
   182  	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
   183  	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
   184  
   185  	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
   186  	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
   187  	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
   188  	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
   189  	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
   190  	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
   191  	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
   192  	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
   193  	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
   194  	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
   195  	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
   196  	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
   197  	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
   198  	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
   199  	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
   200  	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
   201  	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
   202  	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
   203  	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
   204  	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
   205  	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
   206  	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
   207  	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
   208  	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
   209  	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
   210  	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
   211  	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
   212  	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
   213  	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
   214  	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
   215  	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
   216  	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
   217  	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
   218  	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
   219  	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
   220  	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
   221  	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
   222  	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
   223  	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
   224  	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
   225  	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
   226  	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
   227  	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
   228  	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
   229  	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
   230  	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
   231  	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
   232  	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
   233  	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
   234  	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
   235  	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
   236  	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
   237  	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
   238  	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
   239  	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
   240  	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
   241  	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
   242  	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
   243  	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
   244  	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
   245  	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
   246  	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
   247  	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
   248  	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
   249  
   250  	MOVQ	dig+0(FP), BP
   251  	ADDQ	(0*8)(BP), R8	// H0 = a + H0
   252  	MOVQ	R8, (0*8)(BP)
   253  	ADDQ	(1*8)(BP), R9	// H1 = b + H1
   254  	MOVQ	R9, (1*8)(BP)
   255  	ADDQ	(2*8)(BP), R10	// H2 = c + H2
   256  	MOVQ	R10, (2*8)(BP)
   257  	ADDQ	(3*8)(BP), R11	// H3 = d + H3
   258  	MOVQ	R11, (3*8)(BP)
   259  	ADDQ	(4*8)(BP), R12	// H4 = e + H4
   260  	MOVQ	R12, (4*8)(BP)
   261  	ADDQ	(5*8)(BP), R13	// H5 = f + H5
   262  	MOVQ	R13, (5*8)(BP)
   263  	ADDQ	(6*8)(BP), R14	// H6 = g + H6
   264  	MOVQ	R14, (6*8)(BP)
   265  	ADDQ	(7*8)(BP), R15	// H7 = h + H7
   266  	MOVQ	R15, (7*8)(BP)
   267  
   268  	ADDQ	$128, SI
   269  	CMPQ	SI, 640(SP)
   270  	JB	loop
   271  
   272  end:
   273  	RET
   274  
   275  // The version below is based on the "Fast SHA512 Implementations on Intel
   276  // Architecture Processors" white paper:
   277  // http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
   278  // It is Intel's AVX2 version; the same algorithm is used in the Linux kernel:
   279  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
   280  
   281  // James Guilford <james.guilford@intel.com>
   282  // Kirk Yap <kirk.s.yap@intel.com>
   283  // Tim Chen <tim.c.chen@linux.intel.com>
   284  // David Cote <david.m.cote@intel.com>
   285  // Aleksey Sidorov <aleksey.sidorov@intel.com>
   286  
   287  #define YFER_SIZE (4*8)
   288  #define SRND_SIZE (1*8)
   289  #define INP_SIZE (1*8)
   290  
   291  #define frame_YFER (0)
   292  #define frame_SRND (frame_YFER + YFER_SIZE)
   293  #define frame_INP (frame_SRND + SRND_SIZE)
   294  #define frame_INPEND (frame_INP + INP_SIZE)
   295  
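        // addm adds the value at memory location p1 into register p2 and writes
        // the sum back to p1; used at the end of each block for H[i] += working
        // variable.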
   296  #define addm(p1, p2) \
   297  	ADDQ p1, p2; \
   298  	MOVQ p2, p1
   299  
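        // COPY_YMM_AND_BSWAP loads 32 bytes of message from p2 into the Y register
        // p1 and byte-swaps each 64-bit lane using the VPSHUFB mask p3.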
   300  #define COPY_YMM_AND_BSWAP(p1, p2, p3) \
   301  	VMOVDQU p2, p1;    \
   302  	VPSHUFB p3, p1, p1
   303  
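        // MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) emulates a 256-bit byte-align:
        // YDST gets 32 bytes of the concatenation YSRC1:YSRC2 starting RVAL bytes
        // into YSRC2. With RVAL = 8 this gathers four consecutive message qwords
        // that straddle two Y registers (the W[t-15] and W[t-7] schedule inputs).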
   304  #define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
   305  	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
   306  	VPALIGNR   $RVAL, YSRC2, YDST, YDST
   307  
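        // PSHUFFLE_BYTE_FLIP_MASK is the VPSHUFB control that reverses the byte
        // order within each 64-bit lane, converting big-endian message words to
        // host order.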
   308  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
   309  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
   310  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
   311  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
   312  
   313  GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
   314  
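        // MASK_YMM_LO, used with VPAND, zeroes the low 128-bit lane of a Y register
        // and keeps the high lane.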
   315  DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
   316  DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
   317  DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   318  DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   319  
   320  GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
   321  
   322  TEXT ·blockAVX2(SB), NOSPLIT, $56-32
   323  	MOVQ dig+0(FP), SI
   324  	MOVQ p_base+8(FP), DI
   325  	MOVQ p_len+16(FP), DX
   326  
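        	// Round p_len down to whole 128-byte blocks; the JZ below (on the
        	// SHLQ result) skips the hash loop when there is no full block.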
   327  	SHRQ $7, DX
   328  	SHLQ $7, DX
   329  
   330  	JZ   done_hash
   331  	ADDQ DI, DX
   332  	MOVQ DX, frame_INPEND(SP)
   333  
   334  	MOVQ (0*8)(SI), AX
   335  	MOVQ (1*8)(SI), BX
   336  	MOVQ (2*8)(SI), CX
   337  	MOVQ (3*8)(SI), R8
   338  	MOVQ (4*8)(SI), DX
   339  	MOVQ (5*8)(SI), R9
   340  	MOVQ (6*8)(SI), R10
   341  	MOVQ (7*8)(SI), R11
   342  
   343  	MOVQ    $PSHUFFLE_BYTE_FLIP_MASK<>(SB), R12
   344  	VMOVDQU (R12), Y9
   345  
   346  loop0:
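        	// BP walks the 80-entry round-constant table _K; loop1 advances it by
        	// 4*32 bytes per iteration and loop2 by 2*32.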
   347  	MOVQ ·_K+0(SB), BP
   348  
   349  	// byte swap the block's 16 message qwords
   350  	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
   351  	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
   352  	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
   353  	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
   354  
   355  	MOVQ DI, frame_INP(SP)
   356  
   357  	// schedule the remaining 64 message qwords: 4 iterations of loop1, 16 rounds each
   358  	MOVQ $4, frame_SRND(SP)
   359  
   360  loop1:
   361  	VPADDQ  (BP), Y4, Y0
   362  	VMOVDQU Y0, frame_YFER(SP)
   363  
   364  	MY_VPALIGNR(Y0, Y7, Y6, 8)
   365  
   366  	VPADDQ Y4, Y0, Y0
   367  
   368  	MY_VPALIGNR(Y1, Y5, Y4, 8)
   369  
   370  	VPSRLQ $1, Y1, Y2
   371  	VPSLLQ $(64-1), Y1, Y3
   372  	VPOR   Y2, Y3, Y3
   373  
   374  	VPSRLQ $7, Y1, Y8
   375  
   376  	MOVQ  AX, DI
   377  	RORXQ $41, DX, R13
   378  	RORXQ $18, DX, R14
   379  	ADDQ  frame_YFER(SP), R11
   380  	ORQ   CX, DI
   381  	MOVQ  R9, R15
   382  	RORXQ $34, AX, R12
   383  
   384  	XORQ  R14, R13
   385  	XORQ  R10, R15
   386  	RORXQ $14, DX, R14
   387  
   388  	ANDQ  DX, R15
   389  	XORQ  R14, R13
   390  	RORXQ $39, AX, R14
   391  	ADDQ  R11, R8
   392  
   393  	ANDQ  BX, DI
   394  	XORQ  R12, R14
   395  	RORXQ $28, AX, R12
   396  
   397  	XORQ R10, R15
   398  	XORQ R12, R14
   399  	MOVQ AX, R12
   400  	ANDQ CX, R12
   401  
   402  	ADDQ R13, R15
   403  	ORQ  R12, DI
   404  	ADDQ R14, R11
   405  
   406  	ADDQ R15, R8
   407  
   408  	ADDQ R15, R11
   409  	ADDQ DI, R11
   410  
   411  	VPSRLQ $8, Y1, Y2
   412  	VPSLLQ $(64-8), Y1, Y1
   413  	VPOR   Y2, Y1, Y1
   414  
   415  	VPXOR Y8, Y3, Y3
   416  	VPXOR Y1, Y3, Y1
   417  
   418  	VPADDQ Y1, Y0, Y0
   419  
   420  	VPERM2F128 $0x0, Y0, Y0, Y4
   421  
   422  	MOVQ $MASK_YMM_LO<>(SB), R13
   423  
   424  	VPAND (R13), Y0, Y0
   425  
   426  	VPERM2F128 $0x11, Y7, Y7, Y2
   427  	VPSRLQ     $6, Y2, Y8
   428  
   429  	MOVQ  R11, DI
   430  	RORXQ $41, R8, R13
   431  	RORXQ $18, R8, R14
   432  	ADDQ  1*8+frame_YFER(SP), R10
   433  	ORQ   BX, DI
   434  
   435  	MOVQ  DX, R15
   436  	RORXQ $34, R11, R12
   437  	XORQ  R14, R13
   438  	XORQ  R9, R15
   439  
   440  	RORXQ $14, R8, R14
   441  	XORQ  R14, R13
   442  	RORXQ $39, R11, R14
   443  	ANDQ  R8, R15
   444  	ADDQ  R10, CX
   445  
   446  	ANDQ AX, DI
   447  	XORQ R12, R14
   448  
   449  	RORXQ $28, R11, R12
   450  	XORQ  R9, R15
   451  
   452  	XORQ R12, R14
   453  	MOVQ R11, R12
   454  	ANDQ BX, R12
   455  	ADDQ R13, R15
   456  
   457  	ORQ  R12, DI
   458  	ADDQ R14, R10
   459  
   460  	ADDQ R15, CX
   461  	ADDQ R15, R10
   462  	ADDQ DI, R10
   463  
   464  	VPSRLQ $19, Y2, Y3
   465  	VPSLLQ $(64-19), Y2, Y1
   466  	VPOR   Y1, Y3, Y3
   467  	VPXOR  Y3, Y8, Y8
   468  	VPSRLQ $61, Y2, Y3
   469  	VPSLLQ $(64-61), Y2, Y1
   470  	VPOR   Y1, Y3, Y3
   471  	VPXOR  Y3, Y8, Y8
   472  
   473  	VPADDQ Y8, Y4, Y4
   474  
   475  	VPSRLQ $6, Y4, Y8
   476  
   477  	MOVQ  R10, DI
   478  	RORXQ $41, CX, R13
   479  	ADDQ  2*8+frame_YFER(SP), R9
   480  
   481  	RORXQ $18, CX, R14
   482  	ORQ   AX, DI
   483  	MOVQ  R8, R15
   484  	XORQ  DX, R15
   485  
   486  	RORXQ $34, R10, R12
   487  	XORQ  R14, R13
   488  	ANDQ  CX, R15
   489  
   490  	RORXQ $14, CX, R14
   491  	ADDQ  R9, BX
   492  	ANDQ  R11, DI
   493  
   494  	XORQ  R14, R13
   495  	RORXQ $39, R10, R14
   496  	XORQ  DX, R15
   497  
   498  	XORQ  R12, R14
   499  	RORXQ $28, R10, R12
   500  
   501  	XORQ R12, R14
   502  	MOVQ R10, R12
   503  	ANDQ AX, R12
   504  	ADDQ R13, R15
   505  
   506  	ORQ  R12, DI
   507  	ADDQ R14, R9
   508  	ADDQ R15, BX
   509  	ADDQ R15, R9
   510  
   511  	ADDQ DI, R9
   512  
   513  	VPSRLQ $19, Y4, Y3
   514  	VPSLLQ $(64-19), Y4, Y1
   515  	VPOR   Y1, Y3, Y3
   516  	VPXOR  Y3, Y8, Y8
   517  	VPSRLQ $61, Y4, Y3
   518  	VPSLLQ $(64-61), Y4, Y1
   519  	VPOR   Y1, Y3, Y3
   520  	VPXOR  Y3, Y8, Y8
   521  
   522  	VPADDQ Y8, Y0, Y2
   523  
   524  	VPBLENDD $0xF0, Y2, Y4, Y4
   525  
   526  	MOVQ  R9, DI
   527  	RORXQ $41, BX, R13
   528  	RORXQ $18, BX, R14
   529  	ADDQ  3*8+frame_YFER(SP), DX
   530  	ORQ   R11, DI
   531  
   532  	MOVQ  CX, R15
   533  	RORXQ $34, R9, R12
   534  	XORQ  R14, R13
   535  	XORQ  R8, R15
   536  
   537  	RORXQ $14, BX, R14
   538  	ANDQ  BX, R15
   539  	ADDQ  DX, AX
   540  	ANDQ  R10, DI
   541  
   542  	XORQ R14, R13
   543  	XORQ R8, R15
   544  
   545  	RORXQ $39, R9, R14
   546  	ADDQ  R13, R15
   547  
   548  	XORQ R12, R14
   549  	ADDQ R15, AX
   550  
   551  	RORXQ $28, R9, R12
   552  
   553  	XORQ R12, R14
   554  	MOVQ R9, R12
   555  	ANDQ R11, R12
   556  	ORQ  R12, DI
   557  
   558  	ADDQ R14, DX
   559  	ADDQ R15, DX
   560  	ADDQ DI, DX
   561  
   562  	VPADDQ  1*32(BP), Y5, Y0
   563  	VMOVDQU Y0, frame_YFER(SP)
   564  
   565  	MY_VPALIGNR(Y0, Y4, Y7, 8)
   566  
   567  	VPADDQ Y5, Y0, Y0
   568  
   569  	MY_VPALIGNR(Y1, Y6, Y5, 8)
   570  
   571  	VPSRLQ $1, Y1, Y2
   572  	VPSLLQ $(64-1), Y1, Y3
   573  	VPOR   Y2, Y3, Y3
   574  
   575  	VPSRLQ $7, Y1, Y8
   576  
   577  	MOVQ  DX, DI
   578  	RORXQ $41, AX, R13
   579  	RORXQ $18, AX, R14
   580  	ADDQ  frame_YFER(SP), R8
   581  	ORQ   R10, DI
   582  	MOVQ  BX, R15
   583  	RORXQ $34, DX, R12
   584  
   585  	XORQ  R14, R13
   586  	XORQ  CX, R15
   587  	RORXQ $14, AX, R14
   588  
   589  	ANDQ  AX, R15
   590  	XORQ  R14, R13
   591  	RORXQ $39, DX, R14
   592  	ADDQ  R8, R11
   593  
   594  	ANDQ  R9, DI
   595  	XORQ  R12, R14
   596  	RORXQ $28, DX, R12
   597  
   598  	XORQ CX, R15
   599  	XORQ R12, R14
   600  	MOVQ DX, R12
   601  	ANDQ R10, R12
   602  
   603  	ADDQ R13, R15
   604  	ORQ  R12, DI
   605  	ADDQ R14, R8
   606  
   607  	ADDQ R15, R11
   608  
   609  	ADDQ R15, R8
   610  	ADDQ DI, R8
   611  
   612  	VPSRLQ $8, Y1, Y2
   613  	VPSLLQ $(64-8), Y1, Y1
   614  	VPOR   Y2, Y1, Y1
   615  
   616  	VPXOR Y8, Y3, Y3
   617  	VPXOR Y1, Y3, Y1
   618  
   619  	VPADDQ Y1, Y0, Y0
   620  
   621  	VPERM2F128 $0x0, Y0, Y0, Y5
   622  
   623  	MOVQ  $MASK_YMM_LO<>(SB), R13
   624  	VPAND (R13), Y0, Y0
   625  
   626  	VPERM2F128 $0x11, Y4, Y4, Y2
   627  	VPSRLQ     $6, Y2, Y8
   628  
   629  	MOVQ  R8, DI
   630  	RORXQ $41, R11, R13
   631  	RORXQ $18, R11, R14
   632  	ADDQ  1*8+frame_YFER(SP), CX
   633  	ORQ   R9, DI
   634  
   635  	MOVQ  AX, R15
   636  	RORXQ $34, R8, R12
   637  	XORQ  R14, R13
   638  	XORQ  BX, R15
   639  
   640  	RORXQ $14, R11, R14
   641  	XORQ  R14, R13
   642  	RORXQ $39, R8, R14
   643  	ANDQ  R11, R15
   644  	ADDQ  CX, R10
   645  
   646  	ANDQ DX, DI
   647  	XORQ R12, R14
   648  
   649  	RORXQ $28, R8, R12
   650  	XORQ  BX, R15
   651  
   652  	XORQ R12, R14
   653  	MOVQ R8, R12
   654  	ANDQ R9, R12
   655  	ADDQ R13, R15
   656  
   657  	ORQ  R12, DI
   658  	ADDQ R14, CX
   659  
   660  	ADDQ R15, R10
   661  	ADDQ R15, CX
   662  	ADDQ DI, CX
   663  
   664  	VPSRLQ $19, Y2, Y3
   665  	VPSLLQ $(64-19), Y2, Y1
   666  	VPOR   Y1, Y3, Y3
   667  	VPXOR  Y3, Y8, Y8
   668  	VPSRLQ $61, Y2, Y3
   669  	VPSLLQ $(64-61), Y2, Y1
   670  	VPOR   Y1, Y3, Y3
   671  	VPXOR  Y3, Y8, Y8
   672  
   673  	VPADDQ Y8, Y5, Y5
   674  
   675  	VPSRLQ $6, Y5, Y8
   676  
   677  	MOVQ  CX, DI
   678  	RORXQ $41, R10, R13
   679  	ADDQ  2*8+frame_YFER(SP), BX
   680  
   681  	RORXQ $18, R10, R14
   682  	ORQ   DX, DI
   683  	MOVQ  R11, R15
   684  	XORQ  AX, R15
   685  
   686  	RORXQ $34, CX, R12
   687  	XORQ  R14, R13
   688  	ANDQ  R10, R15
   689  
   690  	RORXQ $14, R10, R14
   691  	ADDQ  BX, R9
   692  	ANDQ  R8, DI
   693  
   694  	XORQ  R14, R13
   695  	RORXQ $39, CX, R14
   696  	XORQ  AX, R15
   697  
   698  	XORQ  R12, R14
   699  	RORXQ $28, CX, R12
   700  
   701  	XORQ R12, R14
   702  	MOVQ CX, R12
   703  	ANDQ DX, R12
   704  	ADDQ R13, R15
   705  
   706  	ORQ  R12, DI
   707  	ADDQ R14, BX
   708  	ADDQ R15, R9
   709  	ADDQ R15, BX
   710  
   711  	ADDQ DI, BX
   712  
   713  	VPSRLQ $19, Y5, Y3
   714  	VPSLLQ $(64-19), Y5, Y1
   715  	VPOR   Y1, Y3, Y3
   716  	VPXOR  Y3, Y8, Y8
   717  	VPSRLQ $61, Y5, Y3
   718  	VPSLLQ $(64-61), Y5, Y1
   719  	VPOR   Y1, Y3, Y3
   720  	VPXOR  Y3, Y8, Y8
   721  
   722  	VPADDQ Y8, Y0, Y2
   723  
   724  	VPBLENDD $0xF0, Y2, Y5, Y5
   725  
   726  	MOVQ  BX, DI
   727  	RORXQ $41, R9, R13
   728  	RORXQ $18, R9, R14
   729  	ADDQ  3*8+frame_YFER(SP), AX
   730  	ORQ   R8, DI
   731  
   732  	MOVQ  R10, R15
   733  	RORXQ $34, BX, R12
   734  	XORQ  R14, R13
   735  	XORQ  R11, R15
   736  
   737  	RORXQ $14, R9, R14
   738  	ANDQ  R9, R15
   739  	ADDQ  AX, DX
   740  	ANDQ  CX, DI
   741  
   742  	XORQ R14, R13
   743  	XORQ R11, R15
   744  
   745  	RORXQ $39, BX, R14
   746  	ADDQ  R13, R15
   747  
   748  	XORQ R12, R14
   749  	ADDQ R15, DX
   750  
   751  	RORXQ $28, BX, R12
   752  
   753  	XORQ R12, R14
   754  	MOVQ BX, R12
   755  	ANDQ R8, R12
   756  	ORQ  R12, DI
   757  
   758  	ADDQ R14, AX
   759  	ADDQ R15, AX
   760  	ADDQ DI, AX
   761  
   762  	VPADDQ  2*32(BP), Y6, Y0
   763  	VMOVDQU Y0, frame_YFER(SP)
   764  
   765  	MY_VPALIGNR(Y0, Y5, Y4, 8)
   766  
   767  	VPADDQ Y6, Y0, Y0
   768  
   769  	MY_VPALIGNR(Y1, Y7, Y6, 8)
   770  
   771  	VPSRLQ $1, Y1, Y2
   772  	VPSLLQ $(64-1), Y1, Y3
   773  	VPOR   Y2, Y3, Y3
   774  
   775  	VPSRLQ $7, Y1, Y8
   776  
   777  	MOVQ  AX, DI
   778  	RORXQ $41, DX, R13
   779  	RORXQ $18, DX, R14
   780  	ADDQ  frame_YFER(SP), R11
   781  	ORQ   CX, DI
   782  	MOVQ  R9, R15
   783  	RORXQ $34, AX, R12
   784  
   785  	XORQ  R14, R13
   786  	XORQ  R10, R15
   787  	RORXQ $14, DX, R14
   788  
   789  	ANDQ  DX, R15
   790  	XORQ  R14, R13
   791  	RORXQ $39, AX, R14
   792  	ADDQ  R11, R8
   793  
   794  	ANDQ  BX, DI
   795  	XORQ  R12, R14
   796  	RORXQ $28, AX, R12
   797  
   798  	XORQ R10, R15
   799  	XORQ R12, R14
   800  	MOVQ AX, R12
   801  	ANDQ CX, R12
   802  
   803  	ADDQ R13, R15
   804  	ORQ  R12, DI
   805  	ADDQ R14, R11
   806  
   807  	ADDQ R15, R8
   808  
   809  	ADDQ R15, R11
   810  	ADDQ DI, R11
   811  
   812  	VPSRLQ $8, Y1, Y2
   813  	VPSLLQ $(64-8), Y1, Y1
   814  	VPOR   Y2, Y1, Y1
   815  
   816  	VPXOR Y8, Y3, Y3
   817  	VPXOR Y1, Y3, Y1
   818  
   819  	VPADDQ Y1, Y0, Y0
   820  
   821  	VPERM2F128 $0x0, Y0, Y0, Y6
   822  
   823  	MOVQ  $MASK_YMM_LO<>(SB), R13
   824  	VPAND (R13), Y0, Y0
   825  
   826  	VPERM2F128 $0x11, Y5, Y5, Y2
   827  	VPSRLQ     $6, Y2, Y8
   828  
   829  	MOVQ  R11, DI
   830  	RORXQ $41, R8, R13
   831  	RORXQ $18, R8, R14
   832  	ADDQ  1*8+frame_YFER(SP), R10
   833  	ORQ   BX, DI
   834  
   835  	MOVQ  DX, R15
   836  	RORXQ $34, R11, R12
   837  	XORQ  R14, R13
   838  	XORQ  R9, R15
   839  
   840  	RORXQ $14, R8, R14
   841  	XORQ  R14, R13
   842  	RORXQ $39, R11, R14
   843  	ANDQ  R8, R15
   844  	ADDQ  R10, CX
   845  
   846  	ANDQ AX, DI
   847  	XORQ R12, R14
   848  
   849  	RORXQ $28, R11, R12
   850  	XORQ  R9, R15
   851  
   852  	XORQ R12, R14
   853  	MOVQ R11, R12
   854  	ANDQ BX, R12
   855  	ADDQ R13, R15
   856  
   857  	ORQ  R12, DI
   858  	ADDQ R14, R10
   859  
   860  	ADDQ R15, CX
   861  	ADDQ R15, R10
   862  	ADDQ DI, R10
   863  
   864  	VPSRLQ $19, Y2, Y3
   865  	VPSLLQ $(64-19), Y2, Y1
   866  	VPOR   Y1, Y3, Y3
   867  	VPXOR  Y3, Y8, Y8
   868  	VPSRLQ $61, Y2, Y3
   869  	VPSLLQ $(64-61), Y2, Y1
   870  	VPOR   Y1, Y3, Y3
   871  	VPXOR  Y3, Y8, Y8
   872  
   873  	VPADDQ Y8, Y6, Y6
   874  
   875  	VPSRLQ $6, Y6, Y8
   876  
   877  	MOVQ  R10, DI
   878  	RORXQ $41, CX, R13
   879  	ADDQ  2*8+frame_YFER(SP), R9
   880  
   881  	RORXQ $18, CX, R14
   882  	ORQ   AX, DI
   883  	MOVQ  R8, R15
   884  	XORQ  DX, R15
   885  
   886  	RORXQ $34, R10, R12
   887  	XORQ  R14, R13
   888  	ANDQ  CX, R15
   889  
   890  	RORXQ $14, CX, R14
   891  	ADDQ  R9, BX
   892  	ANDQ  R11, DI
   893  
   894  	XORQ  R14, R13
   895  	RORXQ $39, R10, R14
   896  	XORQ  DX, R15
   897  
   898  	XORQ  R12, R14
   899  	RORXQ $28, R10, R12
   900  
   901  	XORQ R12, R14
   902  	MOVQ R10, R12
   903  	ANDQ AX, R12
   904  	ADDQ R13, R15
   905  
   906  	ORQ  R12, DI
   907  	ADDQ R14, R9
   908  	ADDQ R15, BX
   909  	ADDQ R15, R9
   910  
   911  	ADDQ DI, R9
   912  
   913  	VPSRLQ $19, Y6, Y3
   914  	VPSLLQ $(64-19), Y6, Y1
   915  	VPOR   Y1, Y3, Y3
   916  	VPXOR  Y3, Y8, Y8
   917  	VPSRLQ $61, Y6, Y3
   918  	VPSLLQ $(64-61), Y6, Y1
   919  	VPOR   Y1, Y3, Y3
   920  	VPXOR  Y3, Y8, Y8
   921  
   922  	VPADDQ Y8, Y0, Y2
   923  
   924  	VPBLENDD $0xF0, Y2, Y6, Y6
   925  
   926  	MOVQ  R9, DI
   927  	RORXQ $41, BX, R13
   928  	RORXQ $18, BX, R14
   929  	ADDQ  3*8+frame_YFER(SP), DX
   930  	ORQ   R11, DI
   931  
   932  	MOVQ  CX, R15
   933  	RORXQ $34, R9, R12
   934  	XORQ  R14, R13
   935  	XORQ  R8, R15
   936  
   937  	RORXQ $14, BX, R14
   938  	ANDQ  BX, R15
   939  	ADDQ  DX, AX
   940  	ANDQ  R10, DI
   941  
   942  	XORQ R14, R13
   943  	XORQ R8, R15
   944  
   945  	RORXQ $39, R9, R14
   946  	ADDQ  R13, R15
   947  
   948  	XORQ R12, R14
   949  	ADDQ R15, AX
   950  
   951  	RORXQ $28, R9, R12
   952  
   953  	XORQ R12, R14
   954  	MOVQ R9, R12
   955  	ANDQ R11, R12
   956  	ORQ  R12, DI
   957  
   958  	ADDQ R14, DX
   959  	ADDQ R15, DX
   960  	ADDQ DI, DX
   961  
   962  	VPADDQ  3*32(BP), Y7, Y0
   963  	VMOVDQU Y0, frame_YFER(SP)
   964  	ADDQ    $(4*32), BP
   965  
   966  	MY_VPALIGNR(Y0, Y6, Y5, 8)
   967  
   968  	VPADDQ Y7, Y0, Y0
   969  
   970  	MY_VPALIGNR(Y1, Y4, Y7, 8)
   971  
   972  	VPSRLQ $1, Y1, Y2
   973  	VPSLLQ $(64-1), Y1, Y3
   974  	VPOR   Y2, Y3, Y3
   975  
   976  	VPSRLQ $7, Y1, Y8
   977  
   978  	MOVQ  DX, DI
   979  	RORXQ $41, AX, R13
   980  	RORXQ $18, AX, R14
   981  	ADDQ  frame_YFER(SP), R8
   982  	ORQ   R10, DI
   983  	MOVQ  BX, R15
   984  	RORXQ $34, DX, R12
   985  
   986  	XORQ  R14, R13
   987  	XORQ  CX, R15
   988  	RORXQ $14, AX, R14
   989  
   990  	ANDQ  AX, R15
   991  	XORQ  R14, R13
   992  	RORXQ $39, DX, R14
   993  	ADDQ  R8, R11
   994  
   995  	ANDQ  R9, DI
   996  	XORQ  R12, R14
   997  	RORXQ $28, DX, R12
   998  
   999  	XORQ CX, R15
  1000  	XORQ R12, R14
  1001  	MOVQ DX, R12
  1002  	ANDQ R10, R12
  1003  
  1004  	ADDQ R13, R15
  1005  	ORQ  R12, DI
  1006  	ADDQ R14, R8
  1007  
  1008  	ADDQ R15, R11
  1009  
  1010  	ADDQ R15, R8
  1011  	ADDQ DI, R8
  1012  
  1013  	VPSRLQ $8, Y1, Y2
  1014  	VPSLLQ $(64-8), Y1, Y1
  1015  	VPOR   Y2, Y1, Y1
  1016  
  1017  	VPXOR Y8, Y3, Y3
  1018  	VPXOR Y1, Y3, Y1
  1019  
  1020  	VPADDQ Y1, Y0, Y0
  1021  
  1022  	VPERM2F128 $0x0, Y0, Y0, Y7
  1023  
  1024  	MOVQ  $MASK_YMM_LO<>(SB), R13
  1025  	VPAND (R13), Y0, Y0
  1026  
  1027  	VPERM2F128 $0x11, Y6, Y6, Y2
  1028  	VPSRLQ     $6, Y2, Y8
  1029  
  1030  	MOVQ  R8, DI
  1031  	RORXQ $41, R11, R13
  1032  	RORXQ $18, R11, R14
  1033  	ADDQ  1*8+frame_YFER(SP), CX
  1034  	ORQ   R9, DI
  1035  
  1036  	MOVQ  AX, R15
  1037  	RORXQ $34, R8, R12
  1038  	XORQ  R14, R13
  1039  	XORQ  BX, R15
  1040  
  1041  	RORXQ $14, R11, R14
  1042  	XORQ  R14, R13
  1043  	RORXQ $39, R8, R14
  1044  	ANDQ  R11, R15
  1045  	ADDQ  CX, R10
  1046  
  1047  	ANDQ DX, DI
  1048  	XORQ R12, R14
  1049  
  1050  	RORXQ $28, R8, R12
  1051  	XORQ  BX, R15
  1052  
  1053  	XORQ R12, R14
  1054  	MOVQ R8, R12
  1055  	ANDQ R9, R12
  1056  	ADDQ R13, R15
  1057  
  1058  	ORQ  R12, DI
  1059  	ADDQ R14, CX
  1060  
  1061  	ADDQ R15, R10
  1062  	ADDQ R15, CX
  1063  	ADDQ DI, CX
  1064  
  1065  	VPSRLQ $19, Y2, Y3
  1066  	VPSLLQ $(64-19), Y2, Y1
  1067  	VPOR   Y1, Y3, Y3
  1068  	VPXOR  Y3, Y8, Y8
  1069  	VPSRLQ $61, Y2, Y3
  1070  	VPSLLQ $(64-61), Y2, Y1
  1071  	VPOR   Y1, Y3, Y3
  1072  	VPXOR  Y3, Y8, Y8
  1073  
  1074  	VPADDQ Y8, Y7, Y7
  1075  
  1076  	VPSRLQ $6, Y7, Y8
  1077  
  1078  	MOVQ  CX, DI
  1079  	RORXQ $41, R10, R13
  1080  	ADDQ  2*8+frame_YFER(SP), BX
  1081  
  1082  	RORXQ $18, R10, R14
  1083  	ORQ   DX, DI
  1084  	MOVQ  R11, R15
  1085  	XORQ  AX, R15
  1086  
  1087  	RORXQ $34, CX, R12
  1088  	XORQ  R14, R13
  1089  	ANDQ  R10, R15
  1090  
  1091  	RORXQ $14, R10, R14
  1092  	ADDQ  BX, R9
  1093  	ANDQ  R8, DI
  1094  
  1095  	XORQ  R14, R13
  1096  	RORXQ $39, CX, R14
  1097  	XORQ  AX, R15
  1098  
  1099  	XORQ  R12, R14
  1100  	RORXQ $28, CX, R12
  1101  
  1102  	XORQ R12, R14
  1103  	MOVQ CX, R12
  1104  	ANDQ DX, R12
  1105  	ADDQ R13, R15
  1106  
  1107  	ORQ  R12, DI
  1108  	ADDQ R14, BX
  1109  	ADDQ R15, R9
  1110  	ADDQ R15, BX
  1111  
  1112  	ADDQ DI, BX
  1113  
  1114  	VPSRLQ $19, Y7, Y3
  1115  	VPSLLQ $(64-19), Y7, Y1
  1116  	VPOR   Y1, Y3, Y3
  1117  	VPXOR  Y3, Y8, Y8
  1118  	VPSRLQ $61, Y7, Y3
  1119  	VPSLLQ $(64-61), Y7, Y1
  1120  	VPOR   Y1, Y3, Y3
  1121  	VPXOR  Y3, Y8, Y8
  1122  
  1123  	VPADDQ Y8, Y0, Y2
  1124  
  1125  	VPBLENDD $0xF0, Y2, Y7, Y7
  1126  
  1127  	MOVQ  BX, DI
  1128  	RORXQ $41, R9, R13
  1129  	RORXQ $18, R9, R14
  1130  	ADDQ  3*8+frame_YFER(SP), AX
  1131  	ORQ   R8, DI
  1132  
  1133  	MOVQ  R10, R15
  1134  	RORXQ $34, BX, R12
  1135  	XORQ  R14, R13
  1136  	XORQ  R11, R15
  1137  
  1138  	RORXQ $14, R9, R14
  1139  	ANDQ  R9, R15
  1140  	ADDQ  AX, DX
  1141  	ANDQ  CX, DI
  1142  
  1143  	XORQ R14, R13
  1144  	XORQ R11, R15
  1145  
  1146  	RORXQ $39, BX, R14
  1147  	ADDQ  R13, R15
  1148  
  1149  	XORQ R12, R14
  1150  	ADDQ R15, DX
  1151  
  1152  	RORXQ $28, BX, R12
  1153  
  1154  	XORQ R12, R14
  1155  	MOVQ BX, R12
  1156  	ANDQ R8, R12
  1157  	ORQ  R12, DI
  1158  
  1159  	ADDQ R14, AX
  1160  	ADDQ R15, AX
  1161  	ADDQ DI, AX
  1162  
  1163  	SUBQ $1, frame_SRND(SP)
  1164  	JNE  loop1
  1165  
  1166  	MOVQ $2, frame_SRND(SP)
  1167  
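        	// The remaining 16 rounds (t = 64..79) use the already-scheduled words
        	// left in Y4..Y7: loop2 runs twice, doing 8 rounds per iteration with no
        	// further message scheduling.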
  1168  loop2:
  1169  	VPADDQ  (BP), Y4, Y0
  1170  	VMOVDQU Y0, frame_YFER(SP)
  1171  
  1172  	MOVQ  R9, R15
  1173  	RORXQ $41, DX, R13
  1174  	RORXQ $18, DX, R14
  1175  	XORQ  R10, R15
  1176  
  1177  	XORQ  R14, R13
  1178  	RORXQ $14, DX, R14
  1179  	ANDQ  DX, R15
  1180  
  1181  	XORQ  R14, R13
  1182  	RORXQ $34, AX, R12
  1183  	XORQ  R10, R15
  1184  	RORXQ $39, AX, R14
  1185  	MOVQ  AX, DI
  1186  
  1187  	XORQ  R12, R14
  1188  	RORXQ $28, AX, R12
  1189  	ADDQ  frame_YFER(SP), R11
  1190  	ORQ   CX, DI
  1191  
  1192  	XORQ R12, R14
  1193  	MOVQ AX, R12
  1194  	ANDQ BX, DI
  1195  	ANDQ CX, R12
  1196  	ADDQ R13, R15
  1197  
  1198  	ADDQ R11, R8
  1199  	ORQ  R12, DI
  1200  	ADDQ R14, R11
  1201  
  1202  	ADDQ R15, R8
  1203  
  1204  	ADDQ  R15, R11
  1205  	MOVQ  DX, R15
  1206  	RORXQ $41, R8, R13
  1207  	RORXQ $18, R8, R14
  1208  	XORQ  R9, R15
  1209  
  1210  	XORQ  R14, R13
  1211  	RORXQ $14, R8, R14
  1212  	ANDQ  R8, R15
  1213  	ADDQ  DI, R11
  1214  
  1215  	XORQ  R14, R13
  1216  	RORXQ $34, R11, R12
  1217  	XORQ  R9, R15
  1218  	RORXQ $39, R11, R14
  1219  	MOVQ  R11, DI
  1220  
  1221  	XORQ  R12, R14
  1222  	RORXQ $28, R11, R12
  1223  	ADDQ  8*1+frame_YFER(SP), R10
  1224  	ORQ   BX, DI
  1225  
  1226  	XORQ R12, R14
  1227  	MOVQ R11, R12
  1228  	ANDQ AX, DI
  1229  	ANDQ BX, R12
  1230  	ADDQ R13, R15
  1231  
  1232  	ADDQ R10, CX
  1233  	ORQ  R12, DI
  1234  	ADDQ R14, R10
  1235  
  1236  	ADDQ R15, CX
  1237  
  1238  	ADDQ  R15, R10
  1239  	MOVQ  R8, R15
  1240  	RORXQ $41, CX, R13
  1241  	RORXQ $18, CX, R14
  1242  	XORQ  DX, R15
  1243  
  1244  	XORQ  R14, R13
  1245  	RORXQ $14, CX, R14
  1246  	ANDQ  CX, R15
  1247  	ADDQ  DI, R10
  1248  
  1249  	XORQ  R14, R13
  1250  	RORXQ $34, R10, R12
  1251  	XORQ  DX, R15
  1252  	RORXQ $39, R10, R14
  1253  	MOVQ  R10, DI
  1254  
  1255  	XORQ  R12, R14
  1256  	RORXQ $28, R10, R12
  1257  	ADDQ  8*2+frame_YFER(SP), R9
  1258  	ORQ   AX, DI
  1259  
  1260  	XORQ R12, R14
  1261  	MOVQ R10, R12
  1262  	ANDQ R11, DI
  1263  	ANDQ AX, R12
  1264  	ADDQ R13, R15
  1265  
  1266  	ADDQ R9, BX
  1267  	ORQ  R12, DI
  1268  	ADDQ R14, R9
  1269  
  1270  	ADDQ R15, BX
  1271  
  1272  	ADDQ  R15, R9
  1273  	MOVQ  CX, R15
  1274  	RORXQ $41, BX, R13
  1275  	RORXQ $18, BX, R14
  1276  	XORQ  R8, R15
  1277  
  1278  	XORQ  R14, R13
  1279  	RORXQ $14, BX, R14
  1280  	ANDQ  BX, R15
  1281  	ADDQ  DI, R9
  1282  
  1283  	XORQ  R14, R13
  1284  	RORXQ $34, R9, R12
  1285  	XORQ  R8, R15
  1286  	RORXQ $39, R9, R14
  1287  	MOVQ  R9, DI
  1288  
  1289  	XORQ  R12, R14
  1290  	RORXQ $28, R9, R12
  1291  	ADDQ  8*3+frame_YFER(SP), DX
  1292  	ORQ   R11, DI
  1293  
  1294  	XORQ R12, R14
  1295  	MOVQ R9, R12
  1296  	ANDQ R10, DI
  1297  	ANDQ R11, R12
  1298  	ADDQ R13, R15
  1299  
  1300  	ADDQ DX, AX
  1301  	ORQ  R12, DI
  1302  	ADDQ R14, DX
  1303  
  1304  	ADDQ R15, AX
  1305  
  1306  	ADDQ R15, DX
  1307  
  1308  	ADDQ DI, DX
  1309  
  1310  	VPADDQ  1*32(BP), Y5, Y0
  1311  	VMOVDQU Y0, frame_YFER(SP)
  1312  	ADDQ    $(2*32), BP
  1313  
  1314  	MOVQ  BX, R15
  1315  	RORXQ $41, AX, R13
  1316  	RORXQ $18, AX, R14
  1317  	XORQ  CX, R15
  1318  
  1319  	XORQ  R14, R13
  1320  	RORXQ $14, AX, R14
  1321  	ANDQ  AX, R15
  1322  
  1323  	XORQ  R14, R13
  1324  	RORXQ $34, DX, R12
  1325  	XORQ  CX, R15
  1326  	RORXQ $39, DX, R14
  1327  	MOVQ  DX, DI
  1328  
  1329  	XORQ  R12, R14
  1330  	RORXQ $28, DX, R12
  1331  	ADDQ  frame_YFER(SP), R8
  1332  	ORQ   R10, DI
  1333  
  1334  	XORQ R12, R14
  1335  	MOVQ DX, R12
  1336  	ANDQ R9, DI
  1337  	ANDQ R10, R12
  1338  	ADDQ R13, R15
  1339  
  1340  	ADDQ R8, R11
  1341  	ORQ  R12, DI
  1342  	ADDQ R14, R8
  1343  
  1344  	ADDQ R15, R11
  1345  
  1346  	ADDQ  R15, R8
  1347  	MOVQ  AX, R15
  1348  	RORXQ $41, R11, R13
  1349  	RORXQ $18, R11, R14
  1350  	XORQ  BX, R15
  1351  
  1352  	XORQ  R14, R13
  1353  	RORXQ $14, R11, R14
  1354  	ANDQ  R11, R15
  1355  	ADDQ  DI, R8
  1356  
  1357  	XORQ  R14, R13
  1358  	RORXQ $34, R8, R12
  1359  	XORQ  BX, R15
  1360  	RORXQ $39, R8, R14
  1361  	MOVQ  R8, DI
  1362  
  1363  	XORQ  R12, R14
  1364  	RORXQ $28, R8, R12
  1365  	ADDQ  8*1+frame_YFER(SP), CX
  1366  	ORQ   R9, DI
  1367  
  1368  	XORQ R12, R14
  1369  	MOVQ R8, R12
  1370  	ANDQ DX, DI
  1371  	ANDQ R9, R12
  1372  	ADDQ R13, R15
  1373  
  1374  	ADDQ CX, R10
  1375  	ORQ  R12, DI
  1376  	ADDQ R14, CX
  1377  
  1378  	ADDQ R15, R10
  1379  
  1380  	ADDQ  R15, CX
  1381  	MOVQ  R11, R15
  1382  	RORXQ $41, R10, R13
  1383  	RORXQ $18, R10, R14
  1384  	XORQ  AX, R15
  1385  
  1386  	XORQ  R14, R13
  1387  	RORXQ $14, R10, R14
  1388  	ANDQ  R10, R15
  1389  	ADDQ  DI, CX
  1390  
  1391  	XORQ  R14, R13
  1392  	RORXQ $34, CX, R12
  1393  	XORQ  AX, R15
  1394  	RORXQ $39, CX, R14
  1395  	MOVQ  CX, DI
  1396  
  1397  	XORQ  R12, R14
  1398  	RORXQ $28, CX, R12
  1399  	ADDQ  8*2+frame_YFER(SP), BX
  1400  	ORQ   DX, DI
  1401  
  1402  	XORQ R12, R14
  1403  	MOVQ CX, R12
  1404  	ANDQ R8, DI
  1405  	ANDQ DX, R12
  1406  	ADDQ R13, R15
  1407  
  1408  	ADDQ BX, R9
  1409  	ORQ  R12, DI
  1410  	ADDQ R14, BX
  1411  
  1412  	ADDQ R15, R9
  1413  
  1414  	ADDQ  R15, BX
  1415  	MOVQ  R10, R15
  1416  	RORXQ $41, R9, R13
  1417  	RORXQ $18, R9, R14
  1418  	XORQ  R11, R15
  1419  
  1420  	XORQ  R14, R13
  1421  	RORXQ $14, R9, R14
  1422  	ANDQ  R9, R15
  1423  	ADDQ  DI, BX
  1424  
  1425  	XORQ  R14, R13
  1426  	RORXQ $34, BX, R12
  1427  	XORQ  R11, R15
  1428  	RORXQ $39, BX, R14
  1429  	MOVQ  BX, DI
  1430  
  1431  	XORQ  R12, R14
  1432  	RORXQ $28, BX, R12
  1433  	ADDQ  8*3+frame_YFER(SP), AX
  1434  	ORQ   R8, DI
  1435  
  1436  	XORQ R12, R14
  1437  	MOVQ BX, R12
  1438  	ANDQ CX, DI
  1439  	ANDQ R8, R12
  1440  	ADDQ R13, R15
  1441  
  1442  	ADDQ AX, DX
  1443  	ORQ  R12, DI
  1444  	ADDQ R14, AX
  1445  
  1446  	ADDQ R15, DX
  1447  
  1448  	ADDQ R15, AX
  1449  
  1450  	ADDQ DI, AX
  1451  
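        	// Move the already-scheduled words for rounds 72..79 down from Y6, Y7
        	// into Y4, Y5 for the second pass of loop2.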
  1452  	VMOVDQU Y6, Y4
  1453  	VMOVDQU Y7, Y5
  1454  
  1455  	SUBQ $1, frame_SRND(SP)
  1456  	JNE  loop2
  1457  
  1458  	addm(8*0(SI),AX)
  1459  	addm(8*1(SI),BX)
  1460  	addm(8*2(SI),CX)
  1461  	addm(8*3(SI),R8)
  1462  	addm(8*4(SI),DX)
  1463  	addm(8*5(SI),R9)
  1464  	addm(8*6(SI),R10)
  1465  	addm(8*7(SI),R11)
  1466  
  1467  	MOVQ frame_INP(SP), DI
  1468  	ADDQ $128, DI
  1469  	CMPQ DI, frame_INPEND(SP)
  1470  	JNE  loop0
  1471  
  1472  done_hash:
  1473  	VZEROUPPER
  1474  	RET