github.com/insolar/x-crypto@v0.0.0-20191031140942-75fab8a325f6/sha1/sha1block_amd64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // AVX2 version by Intel, same algorithm as code in Linux kernel:
     6  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
     7  // Authors:
     8  // Ilya Albrekht <ilya.albrekht@intel.com>
     9  // Maxim Locktyukhin <maxim.locktyukhin@intel.com>
    10  // Ronen Zohar <ronen.zohar@intel.com>
    11  // Chandramouli Narayanan <mouli@linux.intel.com>
    12  
    13  
    14  #include "textflag.h"
    15  
    16  // SHA-1 block routine. See sha1block.go for Go equivalent.
    17  //
    18  // There are 80 rounds of 4 types:
    19  //   - rounds 0-15 are type 1 and load data (ROUND1 macro).
    20  //   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
    21  //   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
    22  //   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
    23  //   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
    24  //
    25  // Each round loads or shuffles the data, then computes a per-round
    26  // function of b, c, d, and then mixes the result into and rotates the
    27  // five registers a, b, c, d, e holding the intermediate results.
    28  //
    29  // The register rotation is implemented by rotating the arguments to
    30  // the round macros instead of by explicit move instructions.
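        //
        // A scalar Go sketch of this round structure (reference only, not part of
        // this file; the portable Go version lives in sha1block.go). roundFunc is an
        // illustrative helper standing in for FUNC1..FUNC4 and the matching constant:
        //
        //	for i := 0; i < 80; i++ {
        //		f, k := roundFunc(i, b, c, d)
        //		t := bits.RotateLeft32(a, 5) + f + e + k + w[i&0xf]
        //		a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
        //	}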
    31  
    32  #define LOAD(index) \
    33  	MOVL	(index*4)(SI), R10; \
    34  	BSWAPL	R10; \
    35  	MOVL	R10, (index*4)(SP)
    36  
    37  #define SHUFFLE(index) \
    38  	MOVL	(((index)&0xf)*4)(SP), R10; \
    39  	XORL	(((index-3)&0xf)*4)(SP), R10; \
    40  	XORL	(((index-8)&0xf)*4)(SP), R10; \
    41  	XORL	(((index-14)&0xf)*4)(SP), R10; \
    42  	ROLL	$1, R10; \
    43  	MOVL	R10, (((index)&0xf)*4)(SP)
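
        // Go-equivalent sketch of LOAD/SHUFFLE (reference only): the message schedule
        // lives in a 16-word circular buffer on the stack, so only w[i&0xf] is stored.
        //
        //	// LOAD, rounds 0-15: big-endian message words
        //	w[i&0xf] = binary.BigEndian.Uint32(p[i*4:])
        //	// SHUFFLE, rounds 16+: w[i&0xf] still holds w[i-16] at this point
        //	t := w[i&0xf] ^ w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf]
        //	w[i&0xf] = bits.RotateLeft32(t, 1)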
    44  
    45  #define FUNC1(a, b, c, d, e) \
    46  	MOVL	d, R9; \
    47  	XORL	c, R9; \
    48  	ANDL	b, R9; \
    49  	XORL	d, R9
    50  
    51  #define FUNC2(a, b, c, d, e) \
    52  	MOVL	b, R9; \
    53  	XORL	c, R9; \
    54  	XORL	d, R9
    55  
    56  #define FUNC3(a, b, c, d, e) \
    57  	MOVL	b, R8; \
    58  	ORL	c, R8; \
    59  	ANDL	d, R8; \
    60  	MOVL	b, R9; \
    61  	ANDL	c, R9; \
    62  	ORL	R8, R9
    63  	
    64  #define FUNC4 FUNC2
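
        // Go-equivalent sketch of the round functions (reference only):
        //
        //	f1 := (b & c) | (^b & d)          // rounds  0-19; FUNC1 computes it as ((c^d)&b)^d
        //	f2 := b ^ c ^ d                   // rounds 20-39 and 60-79; FUNC2/FUNC4
        //	f3 := (b & c) | (b & d) | (c & d) // rounds 40-59; FUNC3 computes it as ((b|c)&d)|(b&c)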
    65  
    66  #define MIX(a, b, c, d, e, const) \
    67  	ROLL	$30, b; \
    68  	ADDL	R9, e; \
    69  	MOVL	a, R8; \
    70  	ROLL	$5, R8; \
    71  	LEAL	const(e)(R10*1), e; \
    72  	ADDL	R8, e
    73  
    74  #define ROUND1(a, b, c, d, e, index) \
    75  	LOAD(index); \
    76  	FUNC1(a, b, c, d, e); \
    77  	MIX(a, b, c, d, e, 0x5A827999)
    78  
    79  #define ROUND1x(a, b, c, d, e, index) \
    80  	SHUFFLE(index); \
    81  	FUNC1(a, b, c, d, e); \
    82  	MIX(a, b, c, d, e, 0x5A827999)
    83  
    84  #define ROUND2(a, b, c, d, e, index) \
    85  	SHUFFLE(index); \
    86  	FUNC2(a, b, c, d, e); \
    87  	MIX(a, b, c, d, e, 0x6ED9EBA1)
    88  
    89  #define ROUND3(a, b, c, d, e, index) \
    90  	SHUFFLE(index); \
    91  	FUNC3(a, b, c, d, e); \
    92  	MIX(a, b, c, d, e, 0x8F1BBCDC)
    93  
    94  #define ROUND4(a, b, c, d, e, index) \
    95  	SHUFFLE(index); \
    96  	FUNC4(a, b, c, d, e); \
    97  	MIX(a, b, c, d, e, 0xCA62C1D6)
    98  
    99  TEXT ·blockAMD64(SB),NOSPLIT,$64-32
   100  	MOVQ	dig+0(FP),	BP
   101  	MOVQ	p_base+8(FP),	SI
   102  	MOVQ	p_len+16(FP),	DX
   103  	SHRQ	$6,		DX
   104  	SHLQ	$6,		DX
   105  	
   106  	LEAQ	(SI)(DX*1),	DI
   107  	MOVL	(0*4)(BP),	AX
   108  	MOVL	(1*4)(BP),	BX
   109  	MOVL	(2*4)(BP),	CX
   110  	MOVL	(3*4)(BP),	DX
   111  	MOVL	(4*4)(BP),	BP
   112  
   113  	CMPQ	SI,		DI
   114  	JEQ	end
   115  
   116  loop:
   117  	MOVL	AX,	R11
   118  	MOVL	BX,	R12
   119  	MOVL	CX,	R13
   120  	MOVL	DX,	R14
   121  	MOVL	BP,	R15
   122  
   123  	ROUND1(AX, BX, CX, DX, BP, 0)
   124  	ROUND1(BP, AX, BX, CX, DX, 1)
   125  	ROUND1(DX, BP, AX, BX, CX, 2)
   126  	ROUND1(CX, DX, BP, AX, BX, 3)
   127  	ROUND1(BX, CX, DX, BP, AX, 4)
   128  	ROUND1(AX, BX, CX, DX, BP, 5)
   129  	ROUND1(BP, AX, BX, CX, DX, 6)
   130  	ROUND1(DX, BP, AX, BX, CX, 7)
   131  	ROUND1(CX, DX, BP, AX, BX, 8)
   132  	ROUND1(BX, CX, DX, BP, AX, 9)
   133  	ROUND1(AX, BX, CX, DX, BP, 10)
   134  	ROUND1(BP, AX, BX, CX, DX, 11)
   135  	ROUND1(DX, BP, AX, BX, CX, 12)
   136  	ROUND1(CX, DX, BP, AX, BX, 13)
   137  	ROUND1(BX, CX, DX, BP, AX, 14)
   138  	ROUND1(AX, BX, CX, DX, BP, 15)
   139  
   140  	ROUND1x(BP, AX, BX, CX, DX, 16)
   141  	ROUND1x(DX, BP, AX, BX, CX, 17)
   142  	ROUND1x(CX, DX, BP, AX, BX, 18)
   143  	ROUND1x(BX, CX, DX, BP, AX, 19)
   144  	
   145  	ROUND2(AX, BX, CX, DX, BP, 20)
   146  	ROUND2(BP, AX, BX, CX, DX, 21)
   147  	ROUND2(DX, BP, AX, BX, CX, 22)
   148  	ROUND2(CX, DX, BP, AX, BX, 23)
   149  	ROUND2(BX, CX, DX, BP, AX, 24)
   150  	ROUND2(AX, BX, CX, DX, BP, 25)
   151  	ROUND2(BP, AX, BX, CX, DX, 26)
   152  	ROUND2(DX, BP, AX, BX, CX, 27)
   153  	ROUND2(CX, DX, BP, AX, BX, 28)
   154  	ROUND2(BX, CX, DX, BP, AX, 29)
   155  	ROUND2(AX, BX, CX, DX, BP, 30)
   156  	ROUND2(BP, AX, BX, CX, DX, 31)
   157  	ROUND2(DX, BP, AX, BX, CX, 32)
   158  	ROUND2(CX, DX, BP, AX, BX, 33)
   159  	ROUND2(BX, CX, DX, BP, AX, 34)
   160  	ROUND2(AX, BX, CX, DX, BP, 35)
   161  	ROUND2(BP, AX, BX, CX, DX, 36)
   162  	ROUND2(DX, BP, AX, BX, CX, 37)
   163  	ROUND2(CX, DX, BP, AX, BX, 38)
   164  	ROUND2(BX, CX, DX, BP, AX, 39)
   165  	
   166  	ROUND3(AX, BX, CX, DX, BP, 40)
   167  	ROUND3(BP, AX, BX, CX, DX, 41)
   168  	ROUND3(DX, BP, AX, BX, CX, 42)
   169  	ROUND3(CX, DX, BP, AX, BX, 43)
   170  	ROUND3(BX, CX, DX, BP, AX, 44)
   171  	ROUND3(AX, BX, CX, DX, BP, 45)
   172  	ROUND3(BP, AX, BX, CX, DX, 46)
   173  	ROUND3(DX, BP, AX, BX, CX, 47)
   174  	ROUND3(CX, DX, BP, AX, BX, 48)
   175  	ROUND3(BX, CX, DX, BP, AX, 49)
   176  	ROUND3(AX, BX, CX, DX, BP, 50)
   177  	ROUND3(BP, AX, BX, CX, DX, 51)
   178  	ROUND3(DX, BP, AX, BX, CX, 52)
   179  	ROUND3(CX, DX, BP, AX, BX, 53)
   180  	ROUND3(BX, CX, DX, BP, AX, 54)
   181  	ROUND3(AX, BX, CX, DX, BP, 55)
   182  	ROUND3(BP, AX, BX, CX, DX, 56)
   183  	ROUND3(DX, BP, AX, BX, CX, 57)
   184  	ROUND3(CX, DX, BP, AX, BX, 58)
   185  	ROUND3(BX, CX, DX, BP, AX, 59)
   186  	
   187  	ROUND4(AX, BX, CX, DX, BP, 60)
   188  	ROUND4(BP, AX, BX, CX, DX, 61)
   189  	ROUND4(DX, BP, AX, BX, CX, 62)
   190  	ROUND4(CX, DX, BP, AX, BX, 63)
   191  	ROUND4(BX, CX, DX, BP, AX, 64)
   192  	ROUND4(AX, BX, CX, DX, BP, 65)
   193  	ROUND4(BP, AX, BX, CX, DX, 66)
   194  	ROUND4(DX, BP, AX, BX, CX, 67)
   195  	ROUND4(CX, DX, BP, AX, BX, 68)
   196  	ROUND4(BX, CX, DX, BP, AX, 69)
   197  	ROUND4(AX, BX, CX, DX, BP, 70)
   198  	ROUND4(BP, AX, BX, CX, DX, 71)
   199  	ROUND4(DX, BP, AX, BX, CX, 72)
   200  	ROUND4(CX, DX, BP, AX, BX, 73)
   201  	ROUND4(BX, CX, DX, BP, AX, 74)
   202  	ROUND4(AX, BX, CX, DX, BP, 75)
   203  	ROUND4(BP, AX, BX, CX, DX, 76)
   204  	ROUND4(DX, BP, AX, BX, CX, 77)
   205  	ROUND4(CX, DX, BP, AX, BX, 78)
   206  	ROUND4(BX, CX, DX, BP, AX, 79)
   207  
   208  	ADDL	R11, AX
   209  	ADDL	R12, BX
   210  	ADDL	R13, CX
   211  	ADDL	R14, DX
   212  	ADDL	R15, BP
   213  
   214  	ADDQ	$64, SI
   215  	CMPQ	SI, DI
   216  	JB	loop
   217  
   218  end:
   219  	MOVQ	dig+0(FP), DI
   220  	MOVL	AX, (0*4)(DI)
   221  	MOVL	BX, (1*4)(DI)
   222  	MOVL	CX, (2*4)(DI)
   223  	MOVL	DX, (3*4)(DI)
   224  	MOVL	BP, (4*4)(DI)
   225  	RET
   226  
   227  
   228  // This is the implementation using AVX2, BMI1 and BMI2. It is based on:
   229  // "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
   230  // From http://software.intel.com/en-us/articles
   231  // (look for improving-the-performance-of-the-secure-hash-algorithm-1)
   232  // This implementation is unrolled 2x and interleaves the vector
   233  // instructions used to precompute W with the scalar computation of
   234  // the current round, for better instruction scheduling.
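        //
        // Layout of the precomputed schedule (reference-only sketch): W+K values for a
        // pair of blocks are written to a 2*4*80-byte stack buffer in 32-byte rows, the
        // low 16 bytes of each row belonging to the first block of the pair and the
        // high 16 bytes to the second. The scalar rounds then load W+K for round i at
        //
        //	off := 0x20*(i/4) + 4*(i%4) // first block; +0x10 for the second block
        //
        // Two such buffers are used in flip-flop fashion (see XCHGQ R15, R14 below).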
   235  
   236  // Trivial helper macros.
   237  #define UPDATE_HASH(A,TB,C,D,E) \
   238  	ADDL	(R9), A \
   239  	MOVL	A, (R9) \
   240  	ADDL	4(R9), TB \
   241  	MOVL	TB, 4(R9) \
   242  	ADDL	8(R9), C \
   243  	MOVL	C, 8(R9) \
   244  	ADDL	12(R9), D \
   245  	MOVL	D, 12(R9) \
   246  	ADDL	16(R9), E \
   247  	MOVL	E, 16(R9)
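
        // Go-equivalent sketch of UPDATE_HASH (reference only): each working value is
        // folded into the digest that R9 points at, and the register keeps the updated
        // word as the starting state for the next block:
        //
        //	for i, v := range []uint32{a, b, c, d, e} {
        //		dig[i] += v // the corresponding register now also holds dig[i]
        //	}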
   248  
   249  
   250  
   251  // Helper macros for PRECALC, which precomputes the W+K values for a pair of blocks
   252  #define PRECALC_0(OFFSET) \
   253  	VMOVDQU   OFFSET(R10),X0
   254  
   255  #define PRECALC_1(OFFSET) \
   256  	VINSERTI128 $1, OFFSET(R13), Y0, Y0
   257  
   258  #define PRECALC_2(YREG) \
   259  	VPSHUFB Y10, Y0, YREG
   260  
   261  #define PRECALC_4(YREG,K_OFFSET) \
   262  	VPADDD K_OFFSET(R8), YREG, Y0
   263  
   264  #define PRECALC_7(OFFSET) \
   265  	VMOVDQU Y0, (OFFSET*2)(R14)
   266  
   267  
   268  // Message scheduling pre-computation for rounds 0-15.
   269  // R13 is a pointer to the even 64-byte block
   270  // R10 is a pointer to the odd 64-byte block
   271  // R14 is a pointer to the temp buffer
   272  // X0 is used as a temp register
   273  // YREG is clobbered as part of the computation
   274  // OFFSET chooses a 16-byte chunk within a block
   275  // R8 is a pointer to the constants block
   276  // K_OFFSET chooses the K constants relevant to this round
   277  // Y10 holds the byte-swap shuffle mask
   278  #define PRECALC_00_15(OFFSET,YREG) \
   279  	PRECALC_0(OFFSET) \
   280  	PRECALC_1(OFFSET) \
   281  	PRECALC_2(YREG) \
   282  	PRECALC_4(YREG,0x0) \
   283  	PRECALC_7(OFFSET)
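
        // Scalar Go sketch of what PRECALC_00_15 produces for one 16-byte chunk
        // (reference only; block, buf and row are illustrative names): four message
        // words of each block are byte-swapped and stored with K already added:
        //
        //	for j := 0; j < 4; j++ {
        //		w := binary.BigEndian.Uint32(block[OFFSET+4*j:])
        //		buf[row+j] = w + 0x5A827999 // K for rounds 0-19
        //	}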
   284  
   285  
   286  // Helper macros for PRECALC_16_31
   287  #define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   288  	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
   289  	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
   290  
   291  #define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   292  	VPXOR  REG_SUB_8, REG, REG \
   293  	VPXOR  REG_SUB_16, Y0, Y0
   294  
   295  #define PRECALC_18(REG) \
   296  	VPXOR Y0, REG, REG \
   297  	VPSLLDQ $12, REG, Y9
   298  
   299  #define PRECALC_19(REG) \
   300  	VPSLLD $1, REG, Y0 \
   301  	VPSRLD $31, REG, REG
   302  
   303  #define PRECALC_20(REG) \
   304  	VPOR REG, Y0, Y0 \
   305  	VPSLLD $2, Y9,  REG
   306  
   307  #define PRECALC_21(REG) \
   308  	VPSRLD $30, Y9, Y9 \
   309  	VPXOR REG, Y0, Y0
   310  
   311  #define PRECALC_23(REG,K_OFFSET,OFFSET) \
   312  	VPXOR Y9, Y0, REG \
   313  	VPADDD K_OFFSET(R8), REG, Y0 \
   314  	VMOVDQU Y0, (OFFSET)(R14)
   315  
   316  // Message scheduling pre-computation for rounds 16-31:
   317  // calculates the last 32 w[i] values in 8 YMM registers and
   318  // pre-calculates the K+w[i] values, storing them to memory
   319  // for a later load by the ALU add instruction.
   320  // "Brute force" vectorization is used for rounds 16-31 only,
   321  // due to the w[i]->w[i-3] dependency.
   322  // Clobbers the 5 input YMM registers REG_SUB*;
   323  // uses Y0 and Y9 as temp registers.
   324  // As always, R8 is a pointer to the constants block
   325  // and R14 is a pointer to the temp buffer.
   326  #define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
   327  	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   328  	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   329  	PRECALC_18(REG) \
   330  	PRECALC_19(REG) \
   331  	PRECALC_20(REG) \
   332  	PRECALC_21(REG) \
   333  	PRECALC_23(REG,K_OFFSET,OFFSET)
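
        // Scalar Go sketch of the values PRECALC_16_31 computes per element (reference
        // only); the vector code produces four consecutive w values at once, which is
        // why the w[i-3] term needs the extra shift/OR fix-up steps above:
        //
        //	w[i] = bits.RotateLeft32(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1)
        //	buf[i] = w[i] + k // k selected from K_XMM_AR by K_OFFSET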
   334  
   335  
   336  // Helper macros for PRECALC_32_79
   337  #define PRECALC_32(REG_SUB_8,REG_SUB_4) \
   338  	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
   339  
   340  #define PRECALC_33(REG_SUB_28,REG) \
   341  	VPXOR REG_SUB_28, REG, REG
   342  
   343  #define PRECALC_34(REG_SUB_16) \
   344  	VPXOR REG_SUB_16, Y0, Y0
   345  
   346  #define PRECALC_35(REG) \
   347  	VPXOR Y0, REG, REG
   348  
   349  #define PRECALC_36(REG) \
   350  	VPSLLD $2, REG, Y0
   351  
   352  #define PRECALC_37(REG) \
   353  	VPSRLD $30, REG, REG \
   354  	VPOR REG, Y0, REG
   355  
   356  #define PRECALC_39(REG,K_OFFSET,OFFSET) \
   357  	VPADDD K_OFFSET(R8), REG, Y0 \
   358  	VMOVDQU Y0, (OFFSET)(R14)
   359  
   360  // Message scheduling pre-computation for rounds 32-79.
   361  // In the SHA-1 specification we have:
   362  // w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
   363  // which, once i >= 32, is the same as:
   364  // w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
   365  // This allows for more efficient vectorization,
   366  // since the w[i]->w[i-3] dependency is broken.
   367  #define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
   368  	PRECALC_32(REG_SUB_8,REG_SUB_4) \
   369  	PRECALC_33(REG_SUB_28,REG) \
   370  	PRECALC_34(REG_SUB_16) \
   371  	PRECALC_35(REG) \
   372  	PRECALC_36(REG) \
   373  	PRECALC_37(REG) \
   374  	PRECALC_39(REG,K_OFFSET,OFFSET)
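
        // Scalar Go sketch of the rewritten recurrence used by PRECALC_32_79
        // (reference only):
        //
        //	w[i] = bits.RotateLeft32(w[i-6]^w[i-16]^w[i-28]^w[i-32], 2)
        //	buf[i] = w[i] + k // k selected from K_XMM_AR by K_OFFSET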
   375  
   376  #define PRECALC \
   377  	PRECALC_00_15(0,Y15) \
   378  	PRECALC_00_15(0x10,Y14) \
   379  	PRECALC_00_15(0x20,Y13) \
   380  	PRECALC_00_15(0x30,Y12) \
   381  	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
   382  	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
   383  	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
   384  	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
   385  	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
   386  	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
   387  	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
   388  	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
   389  	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
   390  	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
   391  	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
   392  	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
   393  	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
   394  	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
   395  	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
   396  	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
   397  
   398  // Macros calculating individual rounds have the general form
   399  // CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST.
   400  // The CALC_ROUND_{PRE,POST} macros follow.
   401  
   402  #define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
   403  	ADDL OFFSET(R15),REG_E \
   404  	ANDNL REG_C,REG_A,BP \
   405  	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   406  	RORXL $0x1b, REG_A, R12 \
   407  	RORXL $2, REG_A, REG_B         // for next round
   408  
   409  // Calculate F for the next round
   410  #define CALC_F1_POST(REG_A,REG_B,REG_E) \
   411  	ANDL REG_B,REG_A \             // b&c
   412  	XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
   413  	LEAL (REG_E)(R12*1), REG_E     // E += A rol 5
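
        // Ignoring the software pipelining (F for round i+1 is prepared while round i
        // finishes), each F1 round performs the same scalar update as the portable
        // code; a reference-only Go sketch, with wk the precomputed W+K buffer and the
        // a..e renaming handled by the macro arguments:
        //
        //	f := (b & c) | (^b & d)
        //	e += wk[i] + f + bits.RotateLeft32(a, 5) // RORXL $0x1b is a left-rotate by 5
        //	b = bits.RotateLeft32(b, 30)             // RORXL $2 is a left-rotate by 30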
   414  
   415  
   416  // Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
   417  #define CALC_0 \
   418  	MOVL SI, BX \ // Precalculating first round
   419  	RORXL $2, SI, SI \
   420  	ANDNL AX, BX, BP \
   421  	ANDL DI, BX \
   422  	XORL BP, BX \
   423  	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
   424  	PRECALC_0(0x80) \
   425  	CALC_F1_POST(CX,SI,DX)
   426  
   427  #define CALC_1 \
   428  	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
   429  	PRECALC_1(0x80) \
   430  	CALC_F1_POST(DX,BX,AX)
   431  
   432  #define CALC_2 \
   433  	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
   434  	PRECALC_2(Y15) \
   435  	CALC_F1_POST(AX,CX,DI)
   436  
   437  #define CALC_3 \
   438  	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
   439  	CALC_F1_POST(DI,DX,SI)
   440  
   441  #define CALC_4 \
   442  	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
   443  	PRECALC_4(Y15,0x0) \
   444  	CALC_F1_POST(SI,AX,BX)
   445  
   446  #define CALC_5 \
   447  	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
   448  	CALC_F1_POST(BX,DI,CX)
   449  
   450  #define CALC_6 \
   451  	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
   452  	CALC_F1_POST(CX,SI,DX)
   453  
   454  #define CALC_7 \
   455  	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
   456  	PRECALC_7(0x0) \
   457  	CALC_F1_POST(DX,BX,AX)
   458  
   459  #define CALC_8 \
   460  	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
   461  	PRECALC_0(0x90) \
   462  	CALC_F1_POST(AX,CX,DI)
   463  
   464  #define CALC_9 \
   465  	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
   466  	PRECALC_1(0x90) \
   467  	CALC_F1_POST(DI,DX,SI)
   468  
   469  #define CALC_10 \
   470  	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
   471  	PRECALC_2(Y14) \
   472  	CALC_F1_POST(SI,AX,BX)
   473  
   474  #define CALC_11 \
   475  	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
   476  	CALC_F1_POST(BX,DI,CX)
   477  
   478  #define CALC_12 \
   479  	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
   480  	PRECALC_4(Y14,0x0) \
   481  	CALC_F1_POST(CX,SI,DX)
   482  
   483  #define CALC_13 \
   484  	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
   485  	CALC_F1_POST(DX,BX,AX)
   486  
   487  #define CALC_14 \
   488  	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
   489  	CALC_F1_POST(AX,CX,DI)
   490  
   491  #define CALC_15 \
   492  	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
   493  	PRECALC_7(0x10) \
   494  	CALC_F1_POST(DI,DX,SI)
   495  
   496  #define CALC_16 \
   497  	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
   498  	PRECALC_0(0xa0) \
   499  	CALC_F1_POST(SI,AX,BX)
   500  
   501  #define CALC_17 \
   502  	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
   503  	PRECALC_1(0xa0) \
   504  	CALC_F1_POST(BX,DI,CX)
   505  
   506  #define CALC_18 \
   507  	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
   508  	PRECALC_2(Y13) \
   509  	CALC_F1_POST(CX,SI,DX)
   510  
   511  
   512  #define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
   513  	ADDL OFFSET(R15),REG_E \
   514  	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   515  	RORXL $0x1b, REG_A, R12 \
   516  	RORXL $2, REG_A, REG_B         // for next round
   517  
   518  #define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
   519  	XORL REG_B, REG_A \
   520  	ADDL R12, REG_E \
   521  	XORL REG_C, REG_A
   522  
   523  #define CALC_19 \
   524  	CALC_F2_PRE(0x8c,DX,CX,AX) \
   525  	CALC_F2_POST(DX,BX,SI,AX)
   526  
   527  #define CALC_20 \
   528  	CALC_F2_PRE(0xa0,AX,DX,DI) \
   529  	PRECALC_4(Y13,0x0) \
   530  	CALC_F2_POST(AX,CX,BX,DI)
   531  
   532  #define CALC_21 \
   533  	CALC_F2_PRE(0xa4,DI,AX,SI) \
   534  	CALC_F2_POST(DI,DX,CX,SI)
   535  
   536  #define CALC_22 \
   537  	CALC_F2_PRE(0xa8,SI,DI,BX) \
   538  	CALC_F2_POST(SI,AX,DX,BX)
   539  
   540  #define CALC_23 \
   541  	CALC_F2_PRE(0xac,BX,SI,CX) \
   542  	PRECALC_7(0x20) \
   543  	CALC_F2_POST(BX,DI,AX,CX)
   544  
   545  #define CALC_24 \
   546  	CALC_F2_PRE(0xc0,CX,BX,DX) \
   547  	PRECALC_0(0xb0) \
   548  	CALC_F2_POST(CX,SI,DI,DX)
   549  
   550  #define CALC_25 \
   551  	CALC_F2_PRE(0xc4,DX,CX,AX) \
   552  	PRECALC_1(0xb0) \
   553  	CALC_F2_POST(DX,BX,SI,AX)
   554  
   555  #define CALC_26 \
   556  	CALC_F2_PRE(0xc8,AX,DX,DI) \
   557  	PRECALC_2(Y12) \
   558  	CALC_F2_POST(AX,CX,BX,DI)
   559  
   560  #define CALC_27 \
   561  	CALC_F2_PRE(0xcc,DI,AX,SI) \
   562  	CALC_F2_POST(DI,DX,CX,SI)
   563  
   564  #define CALC_28 \
   565  	CALC_F2_PRE(0xe0,SI,DI,BX) \
   566  	PRECALC_4(Y12,0x0) \
   567  	CALC_F2_POST(SI,AX,DX,BX)
   568  
   569  #define CALC_29 \
   570  	CALC_F2_PRE(0xe4,BX,SI,CX) \
   571  	CALC_F2_POST(BX,DI,AX,CX)
   572  
   573  #define CALC_30 \
   574  	CALC_F2_PRE(0xe8,CX,BX,DX) \
   575  	CALC_F2_POST(CX,SI,DI,DX)
   576  
   577  #define CALC_31 \
   578  	CALC_F2_PRE(0xec,DX,CX,AX) \
   579  	PRECALC_7(0x30) \
   580  	CALC_F2_POST(DX,BX,SI,AX)
   581  
   582  #define CALC_32 \
   583  	CALC_F2_PRE(0x100,AX,DX,DI) \
   584  	PRECALC_16(Y15,Y14,Y12,Y8) \
   585  	CALC_F2_POST(AX,CX,BX,DI)
   586  
   587  #define CALC_33 \
   588  	CALC_F2_PRE(0x104,DI,AX,SI) \
   589  	PRECALC_17(Y15,Y13,Y8) \
   590  	CALC_F2_POST(DI,DX,CX,SI)
   591  
   592  #define CALC_34 \
   593  	CALC_F2_PRE(0x108,SI,DI,BX) \
   594  	PRECALC_18(Y8) \
   595  	CALC_F2_POST(SI,AX,DX,BX)
   596  
   597  #define CALC_35 \
   598  	CALC_F2_PRE(0x10c,BX,SI,CX) \
   599  	PRECALC_19(Y8) \
   600  	CALC_F2_POST(BX,DI,AX,CX)
   601  
   602  #define CALC_36 \
   603  	CALC_F2_PRE(0x120,CX,BX,DX) \
   604  	PRECALC_20(Y8) \
   605  	CALC_F2_POST(CX,SI,DI,DX)
   606  
   607  #define CALC_37 \
   608  	CALC_F2_PRE(0x124,DX,CX,AX) \
   609  	PRECALC_21(Y8) \
   610  	CALC_F2_POST(DX,BX,SI,AX)
   611  
   612  #define CALC_38 \
   613  	CALC_F2_PRE(0x128,AX,DX,DI) \
   614  	CALC_F2_POST(AX,CX,BX,DI)
   615  
   616  
   617  #define CALC_F3_PRE(OFFSET,REG_E) \
   618  	ADDL OFFSET(R15),REG_E
   619  
   620  #define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
   621  	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
   622  	MOVL REG_B, BP \
   623  	ORL  REG_A, BP \
   624  	RORXL $0x1b, REG_A, R12 \
   625  	RORXL $2, REG_A, REG_TB \
   626  	ANDL REG_C, BP \		// Calculate F for the next round
   627  	ANDL REG_B, REG_A \
   628  	ORL  BP, REG_A \
   629  	ADDL R12, REG_E
   630  
   631  #define CALC_39 \
   632  	CALC_F3_PRE(0x12c,SI) \
   633  	PRECALC_23(Y8,0x0,0x80) \
   634  	CALC_F3_POST(DI,DX,CX,SI,AX)
   635  
   636  #define CALC_40 \
   637  	CALC_F3_PRE(0x140,BX) \
   638  	PRECALC_16(Y14,Y13,Y8,Y7) \
   639  	CALC_F3_POST(SI,AX,DX,BX,DI)
   640  
   641  #define CALC_41 \
   642  	CALC_F3_PRE(0x144,CX) \
   643  	PRECALC_17(Y14,Y12,Y7) \
   644  	CALC_F3_POST(BX,DI,AX,CX,SI)
   645  
   646  #define CALC_42 \
   647  	CALC_F3_PRE(0x148,DX) \
   648  	PRECALC_18(Y7) \
   649  	CALC_F3_POST(CX,SI,DI,DX,BX)
   650  
   651  #define CALC_43 \
   652  	CALC_F3_PRE(0x14c,AX) \
   653  	PRECALC_19(Y7) \
   654  	CALC_F3_POST(DX,BX,SI,AX,CX)
   655  
   656  #define CALC_44 \
   657  	CALC_F3_PRE(0x160,DI) \
   658  	PRECALC_20(Y7) \
   659  	CALC_F3_POST(AX,CX,BX,DI,DX)
   660  
   661  #define CALC_45 \
   662  	CALC_F3_PRE(0x164,SI) \
   663  	PRECALC_21(Y7) \
   664  	CALC_F3_POST(DI,DX,CX,SI,AX)
   665  
   666  #define CALC_46 \
   667  	CALC_F3_PRE(0x168,BX) \
   668  	CALC_F3_POST(SI,AX,DX,BX,DI)
   669  
   670  #define CALC_47 \
   671  	CALC_F3_PRE(0x16c,CX) \
   672  	VPXOR Y9, Y0, Y7 \
   673  	VPADDD 0x20(R8), Y7, Y0 \
   674  	VMOVDQU Y0, 0xa0(R14) \
   675  	CALC_F3_POST(BX,DI,AX,CX,SI)
   676  
   677  #define CALC_48 \
   678  	CALC_F3_PRE(0x180,DX) \
   679  	PRECALC_16(Y13,Y12,Y7,Y5) \
   680  	CALC_F3_POST(CX,SI,DI,DX,BX)
   681  
   682  #define CALC_49 \
   683  	CALC_F3_PRE(0x184,AX) \
   684  	PRECALC_17(Y13,Y8,Y5) \
   685  	CALC_F3_POST(DX,BX,SI,AX,CX)
   686  
   687  #define CALC_50 \
   688  	CALC_F3_PRE(0x188,DI) \
   689  	PRECALC_18(Y5) \
   690  	CALC_F3_POST(AX,CX,BX,DI,DX)
   691  
   692  #define CALC_51 \
   693  	CALC_F3_PRE(0x18c,SI) \
   694  	PRECALC_19(Y5) \
   695  	CALC_F3_POST(DI,DX,CX,SI,AX)
   696  
   697  #define CALC_52 \
   698  	CALC_F3_PRE(0x1a0,BX) \
   699  	PRECALC_20(Y5) \
   700  	CALC_F3_POST(SI,AX,DX,BX,DI)
   701  
   702  #define CALC_53 \
   703  	CALC_F3_PRE(0x1a4,CX) \
   704  	PRECALC_21(Y5) \
   705  	CALC_F3_POST(BX,DI,AX,CX,SI)
   706  
   707  #define CALC_54 \
   708  	CALC_F3_PRE(0x1a8,DX) \
   709  	CALC_F3_POST(CX,SI,DI,DX,BX)
   710  
   711  #define CALC_55 \
   712  	CALC_F3_PRE(0x1ac,AX) \
   713  	PRECALC_23(Y5,0x20,0xc0) \
   714  	CALC_F3_POST(DX,BX,SI,AX,CX)
   715  
   716  #define CALC_56 \
   717  	CALC_F3_PRE(0x1c0,DI) \
   718  	PRECALC_16(Y12,Y8,Y5,Y3) \
   719  	CALC_F3_POST(AX,CX,BX,DI,DX)
   720  
   721  #define CALC_57 \
   722  	CALC_F3_PRE(0x1c4,SI) \
   723  	PRECALC_17(Y12,Y7,Y3) \
   724  	CALC_F3_POST(DI,DX,CX,SI,AX)
   725  
   726  #define CALC_58 \
   727  	CALC_F3_PRE(0x1c8,BX) \
   728  	PRECALC_18(Y3) \
   729  	CALC_F3_POST(SI,AX,DX,BX,DI)
   730  
   731  #define CALC_59 \
   732  	CALC_F2_PRE(0x1cc,BX,SI,CX) \
   733  	PRECALC_19(Y3) \
   734  	CALC_F2_POST(BX,DI,AX,CX)
   735  
   736  #define CALC_60 \
   737  	CALC_F2_PRE(0x1e0,CX,BX,DX) \
   738  	PRECALC_20(Y3) \
   739  	CALC_F2_POST(CX,SI,DI,DX)
   740  
   741  #define CALC_61 \
   742  	CALC_F2_PRE(0x1e4,DX,CX,AX) \
   743  	PRECALC_21(Y3) \
   744  	CALC_F2_POST(DX,BX,SI,AX)
   745  
   746  #define CALC_62 \
   747  	CALC_F2_PRE(0x1e8,AX,DX,DI) \
   748  	CALC_F2_POST(AX,CX,BX,DI)
   749  
   750  #define CALC_63 \
   751  	CALC_F2_PRE(0x1ec,DI,AX,SI) \
   752  	PRECALC_23(Y3,0x20,0xe0) \
   753  	CALC_F2_POST(DI,DX,CX,SI)
   754  
   755  #define CALC_64 \
   756  	CALC_F2_PRE(0x200,SI,DI,BX) \
   757  	PRECALC_32(Y5,Y3) \
   758  	CALC_F2_POST(SI,AX,DX,BX)
   759  
   760  #define CALC_65 \
   761  	CALC_F2_PRE(0x204,BX,SI,CX) \
   762  	PRECALC_33(Y14,Y15) \
   763  	CALC_F2_POST(BX,DI,AX,CX)
   764  
   765  #define CALC_66 \
   766  	CALC_F2_PRE(0x208,CX,BX,DX) \
   767  	PRECALC_34(Y8) \
   768  	CALC_F2_POST(CX,SI,DI,DX)
   769  
   770  #define CALC_67 \
   771  	CALC_F2_PRE(0x20c,DX,CX,AX) \
   772  	PRECALC_35(Y15) \
   773  	CALC_F2_POST(DX,BX,SI,AX)
   774  
   775  #define CALC_68 \
   776  	CALC_F2_PRE(0x220,AX,DX,DI) \
   777  	PRECALC_36(Y15) \
   778  	CALC_F2_POST(AX,CX,BX,DI)
   779  
   780  #define CALC_69 \
   781  	CALC_F2_PRE(0x224,DI,AX,SI) \
   782  	PRECALC_37(Y15) \
   783  	CALC_F2_POST(DI,DX,CX,SI)
   784  
   785  #define CALC_70 \
   786  	CALC_F2_PRE(0x228,SI,DI,BX) \
   787  	CALC_F2_POST(SI,AX,DX,BX)
   788  
   789  #define CALC_71 \
   790  	CALC_F2_PRE(0x22c,BX,SI,CX) \
   791  	PRECALC_39(Y15,0x20,0x100) \
   792  	CALC_F2_POST(BX,DI,AX,CX)
   793  
   794  #define CALC_72 \
   795  	CALC_F2_PRE(0x240,CX,BX,DX) \
   796  	PRECALC_32(Y3,Y15) \
   797  	CALC_F2_POST(CX,SI,DI,DX)
   798  
   799  #define CALC_73 \
   800  	CALC_F2_PRE(0x244,DX,CX,AX) \
   801  	PRECALC_33(Y13,Y14) \
   802  	CALC_F2_POST(DX,BX,SI,AX)
   803  
   804  #define CALC_74 \
   805  	CALC_F2_PRE(0x248,AX,DX,DI) \
   806  	PRECALC_34(Y7) \
   807  	CALC_F2_POST(AX,CX,BX,DI)
   808  
   809  #define CALC_75 \
   810  	CALC_F2_PRE(0x24c,DI,AX,SI) \
   811  	PRECALC_35(Y14) \
   812  	CALC_F2_POST(DI,DX,CX,SI)
   813  
   814  #define CALC_76 \
   815  	CALC_F2_PRE(0x260,SI,DI,BX) \
   816  	PRECALC_36(Y14) \
   817  	CALC_F2_POST(SI,AX,DX,BX)
   818  
   819  #define CALC_77 \
   820  	CALC_F2_PRE(0x264,BX,SI,CX) \
   821  	PRECALC_37(Y14) \
   822  	CALC_F2_POST(BX,DI,AX,CX)
   823  
   824  #define CALC_78 \
   825  	CALC_F2_PRE(0x268,CX,BX,DX) \
   826  	CALC_F2_POST(CX,SI,DI,DX)
   827  
   828  #define CALC_79 \
   829  	ADDL 0x26c(R15), AX \
   830  	LEAL (AX)(CX*1), AX \
   831  	RORXL $0x1b, DX, R12 \
   832  	PRECALC_39(Y14,0x20,0x120) \
   833  	ADDL R12, AX
   834  
   835  // Similar to CALC_0
   836  #define CALC_80 \
   837  	MOVL CX, DX \
   838  	RORXL $2, CX, CX \
   839  	ANDNL SI, DX, BP \
   840  	ANDL BX, DX \
   841  	XORL BP, DX \
   842  	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
   843  	PRECALC_32(Y15,Y14) \
   844  	CALC_F1_POST(AX,CX,DI)
   845  
   846  #define CALC_81 \
   847  	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
   848  	PRECALC_33(Y12,Y13) \
   849  	CALC_F1_POST(DI,DX,SI)
   850  
   851  #define CALC_82 \
   852  	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
   853  	PRECALC_34(Y5) \
   854  	CALC_F1_POST(SI,AX,BX)
   855  
   856  #define CALC_83 \
   857  	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
   858  	PRECALC_35(Y13) \
   859  	CALC_F1_POST(BX,DI,CX)
   860  
   861  #define CALC_84 \
   862  	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
   863  	PRECALC_36(Y13) \
   864  	CALC_F1_POST(CX,SI,DX)
   865  
   866  #define CALC_85 \
   867  	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
   868  	PRECALC_37(Y13) \
   869  	CALC_F1_POST(DX,BX,AX)
   870  
   871  #define CALC_86 \
   872  	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
   873  	CALC_F1_POST(AX,CX,DI)
   874  
   875  #define CALC_87 \
   876  	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
   877  	PRECALC_39(Y13,0x40,0x140) \
   878  	CALC_F1_POST(DI,DX,SI)
   879  
   880  #define CALC_88 \
   881  	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
   882  	PRECALC_32(Y14,Y13) \
   883  	CALC_F1_POST(SI,AX,BX)
   884  
   885  #define CALC_89 \
   886  	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
   887  	PRECALC_33(Y8,Y12) \
   888  	CALC_F1_POST(BX,DI,CX)
   889  
   890  #define CALC_90 \
   891  	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
   892  	PRECALC_34(Y3) \
   893  	CALC_F1_POST(CX,SI,DX)
   894  
   895  #define CALC_91 \
   896  	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
   897  	PRECALC_35(Y12) \
   898  	CALC_F1_POST(DX,BX,AX)
   899  
   900  #define CALC_92 \
   901  	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
   902  	PRECALC_36(Y12) \
   903  	CALC_F1_POST(AX,CX,DI)
   904  
   905  #define CALC_93 \
   906  	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
   907  	PRECALC_37(Y12) \
   908  	CALC_F1_POST(DI,DX,SI)
   909  
   910  #define CALC_94 \
   911  	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
   912  	CALC_F1_POST(SI,AX,BX)
   913  
   914  #define CALC_95 \
   915  	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
   916  	PRECALC_39(Y12,0x40,0x160) \
   917  	CALC_F1_POST(BX,DI,CX)
   918  
   919  #define CALC_96 \
   920  	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
   921  	PRECALC_32(Y13,Y12) \
   922  	CALC_F1_POST(CX,SI,DX)
   923  
   924  #define CALC_97 \
   925  	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
   926  	PRECALC_33(Y7,Y8) \
   927  	CALC_F1_POST(DX,BX,AX)
   928  
   929  #define CALC_98 \
   930  	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
   931  	PRECALC_34(Y15) \
   932  	CALC_F1_POST(AX,CX,DI)
   933  
   934  #define CALC_99 \
   935  	CALC_F2_PRE(0x9c,DI,AX,SI) \
   936  	PRECALC_35(Y8) \
   937  	CALC_F2_POST(DI,DX,CX,SI)
   938  
   939  #define CALC_100 \
   940  	CALC_F2_PRE(0xb0,SI,DI,BX) \
   941  	PRECALC_36(Y8) \
   942  	CALC_F2_POST(SI,AX,DX,BX)
   943  
   944  #define CALC_101 \
   945  	CALC_F2_PRE(0xb4,BX,SI,CX) \
   946  	PRECALC_37(Y8) \
   947  	CALC_F2_POST(BX,DI,AX,CX)
   948  
   949  #define CALC_102 \
   950  	CALC_F2_PRE(0xb8,CX,BX,DX) \
   951  	CALC_F2_POST(CX,SI,DI,DX)
   952  
   953  #define CALC_103 \
   954  	CALC_F2_PRE(0xbc,DX,CX,AX) \
   955  	PRECALC_39(Y8,0x40,0x180) \
   956  	CALC_F2_POST(DX,BX,SI,AX)
   957  
   958  #define CALC_104 \
   959  	CALC_F2_PRE(0xd0,AX,DX,DI) \
   960  	PRECALC_32(Y12,Y8) \
   961  	CALC_F2_POST(AX,CX,BX,DI)
   962  
   963  #define CALC_105 \
   964  	CALC_F2_PRE(0xd4,DI,AX,SI) \
   965  	PRECALC_33(Y5,Y7) \
   966  	CALC_F2_POST(DI,DX,CX,SI)
   967  
   968  #define CALC_106 \
   969  	CALC_F2_PRE(0xd8,SI,DI,BX) \
   970  	PRECALC_34(Y14) \
   971  	CALC_F2_POST(SI,AX,DX,BX)
   972  
   973  #define CALC_107 \
   974  	CALC_F2_PRE(0xdc,BX,SI,CX) \
   975  	PRECALC_35(Y7) \
   976  	CALC_F2_POST(BX,DI,AX,CX)
   977  
   978  #define CALC_108 \
   979  	CALC_F2_PRE(0xf0,CX,BX,DX) \
   980  	PRECALC_36(Y7) \
   981  	CALC_F2_POST(CX,SI,DI,DX)
   982  
   983  #define CALC_109 \
   984  	CALC_F2_PRE(0xf4,DX,CX,AX) \
   985  	PRECALC_37(Y7) \
   986  	CALC_F2_POST(DX,BX,SI,AX)
   987  
   988  #define CALC_110 \
   989  	CALC_F2_PRE(0xf8,AX,DX,DI) \
   990  	CALC_F2_POST(AX,CX,BX,DI)
   991  
   992  #define CALC_111 \
   993  	CALC_F2_PRE(0xfc,DI,AX,SI) \
   994  	PRECALC_39(Y7,0x40,0x1a0) \
   995  	CALC_F2_POST(DI,DX,CX,SI)
   996  
   997  #define CALC_112 \
   998  	CALC_F2_PRE(0x110,SI,DI,BX) \
   999  	PRECALC_32(Y8,Y7) \
  1000  	CALC_F2_POST(SI,AX,DX,BX)
  1001  
  1002  #define CALC_113 \
  1003  	CALC_F2_PRE(0x114,BX,SI,CX) \
  1004  	PRECALC_33(Y3,Y5) \
  1005  	CALC_F2_POST(BX,DI,AX,CX)
  1006  
  1007  #define CALC_114 \
  1008  	CALC_F2_PRE(0x118,CX,BX,DX) \
  1009  	PRECALC_34(Y13) \
  1010  	CALC_F2_POST(CX,SI,DI,DX)
  1011  
  1012  #define CALC_115 \
  1013  	CALC_F2_PRE(0x11c,DX,CX,AX) \
  1014  	PRECALC_35(Y5) \
  1015  	CALC_F2_POST(DX,BX,SI,AX)
  1016  
  1017  #define CALC_116 \
  1018  	CALC_F2_PRE(0x130,AX,DX,DI) \
  1019  	PRECALC_36(Y5) \
  1020  	CALC_F2_POST(AX,CX,BX,DI)
  1021  
  1022  #define CALC_117 \
  1023  	CALC_F2_PRE(0x134,DI,AX,SI) \
  1024  	PRECALC_37(Y5) \
  1025  	CALC_F2_POST(DI,DX,CX,SI)
  1026  
  1027  #define CALC_118 \
  1028  	CALC_F2_PRE(0x138,SI,DI,BX) \
  1029  	CALC_F2_POST(SI,AX,DX,BX)
  1030  
  1031  #define CALC_119 \
  1032  	CALC_F3_PRE(0x13c,CX) \
  1033  	PRECALC_39(Y5,0x40,0x1c0) \
  1034  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1035  
  1036  #define CALC_120 \
  1037  	CALC_F3_PRE(0x150,DX) \
  1038  	PRECALC_32(Y7,Y5) \
  1039  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1040  
  1041  #define CALC_121 \
  1042  	CALC_F3_PRE(0x154,AX) \
  1043  	PRECALC_33(Y15,Y3) \
  1044  	CALC_F3_POST(DX,BX,SI,AX,CX)
  1045  
  1046  #define CALC_122 \
  1047  	CALC_F3_PRE(0x158,DI) \
  1048  	PRECALC_34(Y12) \
  1049  	CALC_F3_POST(AX,CX,BX,DI,DX)
  1050  
  1051  #define CALC_123 \
  1052  	CALC_F3_PRE(0x15c,SI) \
  1053  	PRECALC_35(Y3) \
  1054  	CALC_F3_POST(DI,DX,CX,SI,AX)
  1055  
  1056  #define CALC_124 \
  1057  	CALC_F3_PRE(0x170,BX) \
  1058  	PRECALC_36(Y3) \
  1059  	CALC_F3_POST(SI,AX,DX,BX,DI)
  1060  
  1061  #define CALC_125 \
  1062  	CALC_F3_PRE(0x174,CX) \
  1063  	PRECALC_37(Y3) \
  1064  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1065  
  1066  #define CALC_126 \
  1067  	CALC_F3_PRE(0x178,DX) \
  1068  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1069  
  1070  #define CALC_127 \
  1071  	CALC_F3_PRE(0x17c,AX) \
  1072  	PRECALC_39(Y3,0x60,0x1e0) \
  1073  	CALC_F3_POST(DX,BX,SI,AX,CX)
  1074  
  1075  #define CALC_128 \
  1076  	CALC_F3_PRE(0x190,DI) \
  1077  	PRECALC_32(Y5,Y3) \
  1078  	CALC_F3_POST(AX,CX,BX,DI,DX)
  1079  
  1080  #define CALC_129 \
  1081  	CALC_F3_PRE(0x194,SI) \
  1082  	PRECALC_33(Y14,Y15) \
  1083  	CALC_F3_POST(DI,DX,CX,SI,AX)
  1084  
  1085  #define CALC_130 \
  1086  	CALC_F3_PRE(0x198,BX) \
  1087  	PRECALC_34(Y8) \
  1088  	CALC_F3_POST(SI,AX,DX,BX,DI)
  1089  
  1090  #define CALC_131 \
  1091  	CALC_F3_PRE(0x19c,CX) \
  1092  	PRECALC_35(Y15) \
  1093  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1094  
  1095  #define CALC_132 \
  1096  	CALC_F3_PRE(0x1b0,DX) \
  1097  	PRECALC_36(Y15) \
  1098  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1099  
  1100  #define CALC_133 \
  1101  	CALC_F3_PRE(0x1b4,AX) \
  1102  	PRECALC_37(Y15) \
  1103  	CALC_F3_POST(DX,BX,SI,AX,CX)
  1104  
  1105  #define CALC_134 \
  1106  	CALC_F3_PRE(0x1b8,DI) \
  1107  	CALC_F3_POST(AX,CX,BX,DI,DX)
  1108  
  1109  #define CALC_135 \
  1110  	CALC_F3_PRE(0x1bc,SI) \
  1111  	PRECALC_39(Y15,0x60,0x200) \
  1112  	CALC_F3_POST(DI,DX,CX,SI,AX)
  1113  
  1114  #define CALC_136 \
  1115  	CALC_F3_PRE(0x1d0,BX) \
  1116  	PRECALC_32(Y3,Y15) \
  1117  	CALC_F3_POST(SI,AX,DX,BX,DI)
  1118  
  1119  #define CALC_137 \
  1120  	CALC_F3_PRE(0x1d4,CX) \
  1121  	PRECALC_33(Y13,Y14) \
  1122  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1123  
  1124  #define CALC_138 \
  1125  	CALC_F3_PRE(0x1d8,DX) \
  1126  	PRECALC_34(Y7) \
  1127  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1128  
  1129  #define CALC_139 \
  1130  	CALC_F2_PRE(0x1dc,DX,CX,AX) \
  1131  	PRECALC_35(Y14) \
  1132  	CALC_F2_POST(DX,BX,SI,AX)
  1133  
  1134  #define CALC_140 \
  1135  	CALC_F2_PRE(0x1f0,AX,DX,DI) \
  1136  	PRECALC_36(Y14) \
  1137  	CALC_F2_POST(AX,CX,BX,DI)
  1138  
  1139  #define CALC_141 \
  1140  	CALC_F2_PRE(0x1f4,DI,AX,SI) \
  1141  	PRECALC_37(Y14) \
  1142  	CALC_F2_POST(DI,DX,CX,SI)
  1143  
  1144  #define CALC_142 \
  1145  	CALC_F2_PRE(0x1f8,SI,DI,BX) \
  1146  	CALC_F2_POST(SI,AX,DX,BX)
  1147  
  1148  #define CALC_143 \
  1149  	CALC_F2_PRE(0x1fc,BX,SI,CX) \
  1150  	PRECALC_39(Y14,0x60,0x220) \
  1151  	CALC_F2_POST(BX,DI,AX,CX)
  1152  
  1153  #define CALC_144 \
  1154  	CALC_F2_PRE(0x210,CX,BX,DX) \
  1155  	PRECALC_32(Y15,Y14) \
  1156  	CALC_F2_POST(CX,SI,DI,DX)
  1157  
  1158  #define CALC_145 \
  1159  	CALC_F2_PRE(0x214,DX,CX,AX) \
  1160  	PRECALC_33(Y12,Y13) \
  1161  	CALC_F2_POST(DX,BX,SI,AX)
  1162  
  1163  #define CALC_146 \
  1164  	CALC_F2_PRE(0x218,AX,DX,DI) \
  1165  	PRECALC_34(Y5) \
  1166  	CALC_F2_POST(AX,CX,BX,DI)
  1167  
  1168  #define CALC_147 \
  1169  	CALC_F2_PRE(0x21c,DI,AX,SI) \
  1170  	PRECALC_35(Y13) \
  1171  	CALC_F2_POST(DI,DX,CX,SI)
  1172  
  1173  #define CALC_148 \
  1174  	CALC_F2_PRE(0x230,SI,DI,BX) \
  1175  	PRECALC_36(Y13) \
  1176  	CALC_F2_POST(SI,AX,DX,BX)
  1177  
  1178  #define CALC_149 \
  1179  	CALC_F2_PRE(0x234,BX,SI,CX) \
  1180  	PRECALC_37(Y13) \
  1181  	CALC_F2_POST(BX,DI,AX,CX)
  1182  
  1183  #define CALC_150 \
  1184  	CALC_F2_PRE(0x238,CX,BX,DX) \
  1185  	CALC_F2_POST(CX,SI,DI,DX)
  1186  
  1187  #define CALC_151 \
  1188  	CALC_F2_PRE(0x23c,DX,CX,AX) \
  1189  	PRECALC_39(Y13,0x60,0x240) \
  1190  	CALC_F2_POST(DX,BX,SI,AX)
  1191  
  1192  #define CALC_152 \
  1193  	CALC_F2_PRE(0x250,AX,DX,DI) \
  1194  	PRECALC_32(Y14,Y13) \
  1195  	CALC_F2_POST(AX,CX,BX,DI)
  1196  
  1197  #define CALC_153 \
  1198  	CALC_F2_PRE(0x254,DI,AX,SI) \
  1199  	PRECALC_33(Y8,Y12) \
  1200  	CALC_F2_POST(DI,DX,CX,SI)
  1201  
  1202  #define CALC_154 \
  1203  	CALC_F2_PRE(0x258,SI,DI,BX) \
  1204  	PRECALC_34(Y3) \
  1205  	CALC_F2_POST(SI,AX,DX,BX)
  1206  
  1207  #define CALC_155 \
  1208  	CALC_F2_PRE(0x25c,BX,SI,CX) \
  1209  	PRECALC_35(Y12) \
  1210  	CALC_F2_POST(BX,DI,AX,CX)
  1211  
  1212  #define CALC_156 \
  1213  	CALC_F2_PRE(0x270,CX,BX,DX) \
  1214  	PRECALC_36(Y12) \
  1215  	CALC_F2_POST(CX,SI,DI,DX)
  1216  
  1217  #define CALC_157 \
  1218  	CALC_F2_PRE(0x274,DX,CX,AX) \
  1219  	PRECALC_37(Y12) \
  1220  	CALC_F2_POST(DX,BX,SI,AX)
  1221  
  1222  #define CALC_158 \
  1223  	CALC_F2_PRE(0x278,AX,DX,DI) \
  1224  	CALC_F2_POST(AX,CX,BX,DI)
  1225  
  1226  #define CALC_159 \
  1227  	ADDL 0x27c(R15),SI \
  1228  	LEAL (SI)(AX*1), SI \
  1229  	RORXL $0x1b, DI, R12 \
  1230  	PRECALC_39(Y12,0x60,0x260) \
  1231  	ADDL R12, SI
  1232  
  1233  
  1234  
  1235  #define CALC \
  1236  	MOVL	(R9), CX \
  1237  	MOVL	4(R9), SI \
  1238  	MOVL	8(R9), DI \
  1239  	MOVL	12(R9), AX \
  1240  	MOVL	16(R9), DX \
  1241  	MOVQ    SP, R14 \
  1242  	LEAQ    (2*4*80+32)(SP), R15 \
  1243  	PRECALC \ // Precalc WK for first 2 blocks
  1244  	XCHGQ   R15, R14 \
  1245  loop: \  // this loop is unrolled
  1246  	CMPQ    R10, R8 \ // we use the R8 value (set below) as a sentinel for the last block
  1247  	JNE	begin \
  1248  	VZEROUPPER \
  1249  	RET \
  1250  begin: \
  1251  	CALC_0 \
  1252  	CALC_1 \
  1253  	CALC_2 \
  1254  	CALC_3 \
  1255  	CALC_4 \
  1256  	CALC_5 \
  1257  	CALC_6 \
  1258  	CALC_7 \
  1259  	CALC_8 \
  1260  	CALC_9 \
  1261  	CALC_10 \
  1262  	CALC_11 \
  1263  	CALC_12 \
  1264  	CALC_13 \
  1265  	CALC_14 \
  1266  	CALC_15 \
  1267  	CALC_16 \
  1268  	CALC_17 \
  1269  	CALC_18 \
  1270  	CALC_19 \
  1271  	CALC_20 \
  1272  	CALC_21 \
  1273  	CALC_22 \
  1274  	CALC_23 \
  1275  	CALC_24 \
  1276  	CALC_25 \
  1277  	CALC_26 \
  1278  	CALC_27 \
  1279  	CALC_28 \
  1280  	CALC_29 \
  1281  	CALC_30 \
  1282  	CALC_31 \
  1283  	CALC_32 \
  1284  	CALC_33 \
  1285  	CALC_34 \
  1286  	CALC_35 \
  1287  	CALC_36 \
  1288  	CALC_37 \
  1289  	CALC_38 \
  1290  	CALC_39 \
  1291  	CALC_40 \
  1292  	CALC_41 \
  1293  	CALC_42 \
  1294  	CALC_43 \
  1295  	CALC_44 \
  1296  	CALC_45 \
  1297  	CALC_46 \
  1298  	CALC_47 \
  1299  	CALC_48 \
  1300  	CALC_49 \
  1301  	CALC_50 \
  1302  	CALC_51 \
  1303  	CALC_52 \
  1304  	CALC_53 \
  1305  	CALC_54 \
  1306  	CALC_55 \
  1307  	CALC_56 \
  1308  	CALC_57 \
  1309  	CALC_58 \
  1310  	CALC_59 \
  1311  	ADDQ $128, R10 \ // move to next even-64-byte block
  1312  	CMPQ R10, R11 \ // is current block the last one?
  1313  	CMOVQCC R8, R10 \ // signal the last iteration smartly
  1314  	CALC_60 \
  1315  	CALC_61 \
  1316  	CALC_62 \
  1317  	CALC_63 \
  1318  	CALC_64 \
  1319  	CALC_65 \
  1320  	CALC_66 \
  1321  	CALC_67 \
  1322  	CALC_68 \
  1323  	CALC_69 \
  1324  	CALC_70 \
  1325  	CALC_71 \
  1326  	CALC_72 \
  1327  	CALC_73 \
  1328  	CALC_74 \
  1329  	CALC_75 \
  1330  	CALC_76 \
  1331  	CALC_77 \
  1332  	CALC_78 \
  1333  	CALC_79 \
  1334  	UPDATE_HASH(AX,DX,BX,SI,DI) \
  1335  	CMPQ R10, R8 \ // is current block the last one?
  1336  	JE loop\
  1337  	MOVL DX, CX \
  1338  	CALC_80 \
  1339  	CALC_81 \
  1340  	CALC_82 \
  1341  	CALC_83 \
  1342  	CALC_84 \
  1343  	CALC_85 \
  1344  	CALC_86 \
  1345  	CALC_87 \
  1346  	CALC_88 \
  1347  	CALC_89 \
  1348  	CALC_90 \
  1349  	CALC_91 \
  1350  	CALC_92 \
  1351  	CALC_93 \
  1352  	CALC_94 \
  1353  	CALC_95 \
  1354  	CALC_96 \
  1355  	CALC_97 \
  1356  	CALC_98 \
  1357  	CALC_99 \
  1358  	CALC_100 \
  1359  	CALC_101 \
  1360  	CALC_102 \
  1361  	CALC_103 \
  1362  	CALC_104 \
  1363  	CALC_105 \
  1364  	CALC_106 \
  1365  	CALC_107 \
  1366  	CALC_108 \
  1367  	CALC_109 \
  1368  	CALC_110 \
  1369  	CALC_111 \
  1370  	CALC_112 \
  1371  	CALC_113 \
  1372  	CALC_114 \
  1373  	CALC_115 \
  1374  	CALC_116 \
  1375  	CALC_117 \
  1376  	CALC_118 \
  1377  	CALC_119 \
  1378  	CALC_120 \
  1379  	CALC_121 \
  1380  	CALC_122 \
  1381  	CALC_123 \
  1382  	CALC_124 \
  1383  	CALC_125 \
  1384  	CALC_126 \
  1385  	CALC_127 \
  1386  	CALC_128 \
  1387  	CALC_129 \
  1388  	CALC_130 \
  1389  	CALC_131 \
  1390  	CALC_132 \
  1391  	CALC_133 \
  1392  	CALC_134 \
  1393  	CALC_135 \
  1394  	CALC_136 \
  1395  	CALC_137 \
  1396  	CALC_138 \
  1397  	CALC_139 \
  1398  	ADDQ $128, R13 \ // move to next even-64-byte block
  1399  	CMPQ R13, R11 \ // is current block the last one?
  1400  	CMOVQCC R8, R10 \
  1401  	CALC_140 \
  1402  	CALC_141 \
  1403  	CALC_142 \
  1404  	CALC_143 \
  1405  	CALC_144 \
  1406  	CALC_145 \
  1407  	CALC_146 \
  1408  	CALC_147 \
  1409  	CALC_148 \
  1410  	CALC_149 \
  1411  	CALC_150 \
  1412  	CALC_151 \
  1413  	CALC_152 \
  1414  	CALC_153 \
  1415  	CALC_154 \
  1416  	CALC_155 \
  1417  	CALC_156 \
  1418  	CALC_157 \
  1419  	CALC_158 \
  1420  	CALC_159 \
  1421  	UPDATE_HASH(SI,DI,DX,CX,BX) \
  1422  	MOVL	SI, R12 \ // Reset state for AVX2 reg permutation
  1423  	MOVL	DI, SI \
  1424  	MOVL	DX, DI \
  1425  	MOVL	BX, DX \
  1426  	MOVL	CX, AX \
  1427  	MOVL	R12, CX \
  1428  	XCHGQ   R15, R14 \
  1429  	JMP     loop
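
        // Control-flow sketch of the CALC macro above in reference-only Go-style
        // pseudocode (precalcWK, calcRounds*, updateHash, sentinel and limit are
        // illustrative names; R10/R13 are the even/odd block pointers, R8 doubles
        // as the stop sentinel, and R14/R15 are the two W+K buffers):
        //
        //	precalcWK(write, evenPtr, oddPtr)       // PRECALC before the loop
        //	read, write = write, read               // XCHGQ R15, R14
        //	for evenPtr != sentinel {
        //		calcRounds0to59(read, 0x00)     // CALC_0..59, interleaved with
        //		                                // precalcWK(write, next pair)
        //		evenPtr += 128                  // ADDQ $128, R10
        //		if evenPtr >= limit {           // CMPQ / CMOVQCC
        //			evenPtr = sentinel
        //		}
        //		calcRounds60to79(read, 0x00)
        //		updateHash()
        //		if evenPtr == sentinel {        // CMPQ R10, R8; JE loop
        //			break
        //		}
        //		calcRounds0to79(read, 0x10)     // CALC_80..159: the odd block
        //		// oddPtr += 128 and the matching clamp happen between CALC_139/140
        //		updateHash()
        //		permuteStateRegs()              // the MOVL shuffle before JMP loop
        //		read, write = write, read       // XCHGQ R15, R14
        //	}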
  1430  
  1431  
  1432  
  1433  TEXT ·blockAVX2(SB),$1408-32
  1434  
  1435  	MOVQ	dig+0(FP),	DI
  1436  	MOVQ	p_base+8(FP),	SI
  1437  	MOVQ	p_len+16(FP),	DX
  1438  	SHRQ	$6,		DX
  1439  	SHLQ	$6,		DX
  1440  
  1441  	MOVQ	$K_XMM_AR<>(SB), R8
  1442  
  1443  	MOVQ	DI, R9
  1444  	MOVQ	SI, R10
  1445  	LEAQ	64(SI), R13
  1446  
  1447  	ADDQ	SI, DX
  1448  	ADDQ	$64, DX
  1449  	MOVQ	DX, R11
  1450  
  1451  	CMPQ	R13, R11
  1452  	CMOVQCC	R8, R13
  1453  
  1454  	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10
  1455  
  1456  	CALC // RET is inside macros
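
        // Register roles at this point (reference-only summary):
        //
        //	R9  = &dig[0]                 // hash state, read/written by UPDATE_HASH
        //	R10 = &p[0], R13 = &p[64]     // the two 64-byte blocks of the current pair
        //	R11 = &p[0] + (len&^63) + 64  // limit used by the CMOVQCC clamps
        //	R8  = &K_XMM_AR[0]            // round constants; the same address doubles
        //	                              // as the "stop" sentinel for the block pointers
        //	Y10 = BSWAP_SHUFB_CTL         // per-lane byte-swap mask for VPSHUFB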
  1457  
  1458  DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
  1459  DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
  1460  DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
  1461  DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
  1462  DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
  1463  DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
  1464  DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
  1465  DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
  1466  DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
  1467  DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
  1468  DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
  1469  DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
  1470  DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
  1471  DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
  1472  DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
  1473  DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
  1474  DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
  1475  DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
  1476  DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
  1477  DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
  1478  DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
  1479  DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
  1480  DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
  1481  DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
  1482  DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
  1483  DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
  1484  DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
  1485  DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
  1486  DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
  1487  DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
  1488  DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
  1489  DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
  1490  GLOBL K_XMM_AR<>(SB),RODATA,$128
  1491  
  1492  DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
  1493  DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
  1494  DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
  1495  DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
  1496  DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
  1497  DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
  1498  DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
  1499  DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
  1500  GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
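
        // The shuffle control above makes VPSHUFB (PRECALC_2) reverse the bytes within
        // each 32-bit lane; per lane it is equivalent to the reference-only Go
        //
        //	w := binary.BigEndian.Uint32(chunk[lane*4:])
        //
        // i.e. the same big-endian load the scalar LOAD macro performs with BSWAPL.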