github.com/hellobchain/newcryptosm@v0.0.0-20221019060107-edb949a317e9/sm3/sm3block_amd64.s

     1  #include "textflag.h"
     2  
     3  #define xorm(P1, P2) \
     4  	XORL P2, P1; \
     5  	MOVL P1, P2
     6  
     7  #define XDWORD0 Y4
     8  #define XDWORD1 Y5
     9  #define XDWORD2 Y6
    10  #define XDWORD3 Y7
    11  #define XDWORD4 Y8
    12  
    13  #define XWORD0 X4
    14  #define XWORD1 X5
    15  #define XWORD2 X6
    16  #define XWORD3 X7
    17  #define XWORD4 X8
    18  
    19  #define XTMP0 Y0
    20  #define XTMP1 Y1
    21  #define XTMP2 Y2
    22  #define XTMP3 Y3
    23  #define XTMP4 Y10
    24  #define XTMP5 Y11
    25  
    26  #define a AX
    27  #define b BX
    28  #define c CX
    29  #define d R8
    30  #define e DX
    31  #define f R9
    32  #define g R10
    33  #define h R11
    34  
    35  #define T1 R12
    36  #define y0 R13
    37  #define y1 R14
    38  #define y2 R15
    39  #define y3 DI
    40  
    41  // mask to convert LE -> BE
    42  #define BYTE_FLIP_MASK 	Y13
    43  #define X_BYTE_FLIP_MASK X13    //low half of Y13
    44  
    45  #define NUM_BYTES DX
    46  #define INP	DI
    47  
    48  #define CTX	SI
    49  #define SRND SI
    50  #define TBL BP
    51  
    52  // Offsets
    53  #define XFER_SIZE 2*64*4
    54  #define INP_END_SIZE 8
    55  #define INP_SIZE 8
    56  
    57  #define _XFER	0
    58  #define _INP_END _XFER + XFER_SIZE
    59  #define _INP _INP_END + INP_END_SIZE
    60  #define STACK_SIZE _INP + INP_SIZE
    61  
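        // Reference: the SM3 compression round (a sketch of the standard description,
        // included here to make the ROUND_* macros below easier to read):
        //
        //   SS1 = ((A <<< 12) + E + T'[j]) <<< 7          // T'[j] = T[j] <<< j, read from TBL
        //   SS2 = SS1 ^ (A <<< 12)
        //   TT1 = FF(A, B, C) + D + SS2 + W'[j]           // W'[j] = W[j] ^ W[j+4]  ("wj2")
        //   TT2 = GG(E, F, G) + H + SS1 + W[j]            //                         ("wj")
        //   D = C; C = B <<< 9; B = A; A = TT1
        //   H = G; G = F <<< 19; F = E; E = P0(TT2)       // P0(x) = x ^ (x<<<9) ^ (x<<<17)
        //
        //   rounds  0-15: FF(x,y,z) = GG(x,y,z) = x ^ y ^ z
        //   rounds 16-63: FF(x,y,z) = (x&y)|(x&z)|(y&z), evaluated below as ((x|z)&y)|(x&z)
        //                 GG(x,y,z) = (x&y)|(^x&z), evaluated with ANDL/ANDNL
        //
        // The macros skip the B=A, D=C, F=E, H=G copies by rotating the register
        // argument list between calls: on exit the register bound to h holds TT1 (the
        // next A), the one bound to d holds P0(TT2) (the next E), and b and f are
        // rotated in place (RORXL $23 = <<< 9, RORXL $13 = <<< 19).
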
    62  #define ROUND_AND_SCHED_0_15_0(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    63  	MOVL	e, y2;						  \ // y2=E
    64  	RORXL	$20, a, y1;					\ // y1=A<<<12
    65  	ADDL	0*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
    66    VPALIGNR $12, XDWORD0, XDWORD1, XTMP0;\ //XTMP0 = W[-13]
    67  	ADDL	y1, y2;						  \ // y2=(A<<<12)+E+Ti
    68  	RORXL	$25, y2, y0;				\ // y0=((A<<<12)+E+Ti)<<<7=SS1
    69  	XORL	y0, y1;						  \ // y1=SS1^(A<<<12)=SS2
    70    VPSLLD   $7, XTMP0, XTMP1;            \
    71  	;									        \
    72  	ADDL	(wj2 + 0*4)(SP)(SRND*1), d;						\
    73  	MOVL	a, T1; 						\
    74  	XORL	b, T1; 						\
    75    VPSRLD   $(32-7), XTMP0, XTMP2;       \
    76  	XORL	c, T1; 						\
    77  	ADDL	T1, d; 						\
    78  	ADDL	y1, d;						\ // d=TT1
    79  	VPOR     XTMP1, XTMP2, XTMP3;         \ // XTMP3 = W[-13] <<< 7
    80  	;									      \
    81  	ADDL	(wj + 0*4)(SP)(SRND*1), h;						\
    82  	MOVL	e, y3;						\
    83  	XORL	f, y3;						\
    84    VPALIGNR $8, XDWORD2, XDWORD3, XTMP1; \ // XTMP1 = W[-6]
    85  	XORL	g, y3;						\
    86  	ADDL	y3, h;						\
    87  	ADDL	y0, h;						\ // h=TT2
    88  	VPXOR    XTMP3, XTMP1, XTMP1;         \ // XTMP1 = W[-6] ^ (W[-13]<<<7), the term outside P1
    89  	;									      \
    90  	RORXL	$23, h, y2;					\
    91  	RORXL	$15, h, y3;					\
    92  	XORL	h, y2;    					\
    93    VPALIGNR $12, XDWORD1, XDWORD2, XTMP0;\ // XTMP0 = W[-9]
    94  	;                 				\
    95  	MOVL	d, h;     					\
    96  	XORL	y2, y3;   					\
    97  	MOVL	y3, d;    					\
    98  	VPXOR    XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-9] ^ W[-16], the term inside P1
    99  	;                 				\
   100  	RORXL	$23, b, b; 					\
   101  	RORXL	$13, f, f;          \
   102    VPSHUFD $0xA5, XDWORD3, XTMP2        // XTMP2 = W[-3] {BBAA}, to be expanded
   103  
   104  #define ROUND_AND_SCHED_0_15_1(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   105  	MOVL	e, y2;						\ // y2=E
   106  	RORXL	$20, a, y1;					\ // y1=A<<<12
   107  	ADDL	1*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
   108    VPSLLQ  $15, XTMP2, XTMP3;           \ // XTMP3 = W[-3] <<< 15 {BxAx}
   109  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   110  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   111  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   112  	VPSHUFB shuff_00BA<>(SB), XTMP3, XTMP3;\ // XTMP3 = W[-3] <<< 15 {00BA}
   113  	;									\
   114  	ADDL	(wj2 + 1*4)(SP)(SRND*1), d;						\
   115  	MOVL	a, T1; 						\
   116  	XORL	b, T1; 						\
   117    VPXOR   XTMP0, XTMP3, XTMP3;         \ // XTMP3 = x {xxBA}  store to use
   118  	XORL	c, T1; 						\
   119  	ADDL	T1, d; 						\
   120  	ADDL	y1, d;						\ // d=TT1
   121    VPSLLD  $15, XTMP3, XTMP2;            \ // XTMP2 = x << 15
   122  	;									\
   123  	ADDL	(wj + 1*4)(SP)(SRND*1), h;						\
   124  	MOVL	e, y3;						\
   125  	XORL	f, y3;						\
   126  	VPSRLD  $(32-15), XTMP3, XTMP4;       \ // XTMP4 = x >> (32-15)
   127  	XORL	g, y3;						\
   128  	ADDL	y3, h;						\
   129  	ADDL	y0, h;						\ // h=TT2
   130  	VPOR    XTMP2, XTMP4, XTMP5;         \ // XTMP5 = x <<< 15 (xxBA)
   131  	;									\
   132  	RORXL	$23, h, y2;					\
   133  	RORXL	$15, h, y3;					\
   134  	XORL	h, y2;    					\
   135    VPXOR   XTMP3, XTMP5, XTMP5;         \ // XTMP5 = x ^ (x <<< 15) (xxBA)
   136  	;                 					\
   137  	MOVL	d, h;     					\
   138  	XORL	y2, y3;   					\
   139  	MOVL	y3, d;    					\
   140    VPSLLD  $23, XTMP3, XTMP2;           \ // XTMP2 = XTMP3 << 23
   141  	;                 					\
   142  	RORXL	$23, b, b; 					\
   143  	RORXL	$13, f, f;          \
   144  	VPSRLD  $(32-23), XTMP3, XTMP4       // XTMP4 = XTMP3 >> (32-23)
   145  
   146  #define ROUND_AND_SCHED_0_15_2(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   147  	MOVL	e, y2;						\ // y2=E
   148  	RORXL	$20, a, y1;					\ // y1=A<<<12
   149  	ADDL	2*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
   150  	VPOR    XTMP2, XTMP4, XTMP4;         \ // XTMP4 = x <<< 23 (xxBA)
   151  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   152  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   153  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   154    VPXOR   XTMP5, XTMP4, XTMP4;         \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (xxBA)
   155  	;									\
   156  	ADDL	(wj2 + 2*4)(SP)(SRND*1), d;						\
   157  	MOVL	a, T1; 						\
   158  	XORL	b, T1; 						\
   159    VPXOR   XTMP4, XTMP1, XTMP2;         \ // XTMP2 = {. ,. , w1, w0}
   160  	XORL	c, T1; 						\
   161  	ADDL	T1, d; 						\
   162  	ADDL	y1, d;						\ // d=TT1
   163    VPALIGNR $4, XDWORD3, XTMP2, XTMP3;  \ // XTMP3 = DCBA
   164  	;									\
   165  	ADDL	(wj + 2*4)(SP)(SRND*1), h;						\
   166  	MOVL	e, y3;						\
   167  	XORL	f, y3;						\
   168    VPSLLD  $15, XTMP3, XTMP4;           \ // XTMP4 = W[-3] << 15
   169  	XORL	g, y3;						\
   170  	ADDL	y3, h;						\
   171  	ADDL	y0, h;						\ // h=TT2
   172    VPSRLD  $(32-15), XTMP3, XTMP5;      \ // XTMP5 = W[-3] >> (32-15)
   173  	;									\
   174  	RORXL	$23, h, y2;					\
   175  	RORXL	$15, h, y3;					\
   176  	XORL	h, y2;    					\
   177    VPOR    XTMP4, XTMP5, XTMP5;         \ // XTMP5 = W[-3] <<< 15 {DCBA}
   178  	;                 					\
   179  	MOVL	d, h;     					\
   180  	XORL	y2, y3;   					\
   181  	MOVL	y3, d;    					\
   182    VPXOR   XTMP0, XTMP5, XTMP3;         \ // XTMP3 = x {DCBA}
   183  	;                 					\
   184  	RORXL	$23, b, b; 					\
   185  	RORXL	$13, f, f;          \
   186    VPSLLD  $15, XTMP3, XTMP4            // XTMP4 = XTMP3 << 15
   187  
   188  #define ROUND_AND_SCHED_0_15_3(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   189  	MOVL	e, y2;						\ // y2=E
   190  	RORXL	$20, a, y1;					\ // y1=A<<<12
   191  	ADDL	3*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
   192    VPSRLD  $(32-15), XTMP3, XTMP5;      \ // XTMP5 = XTMP3 >> (32-15)
   193  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   194  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   195  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   196    VPOR    XTMP5, XTMP4, XTMP4;         \ // XTMP4 = x <<< 15 (DCBA)
   197  	;									\
   198  	ADDL	(wj2 + 3*4)(SP)(SRND*1), d;						\
   199  	MOVL	a, T1; 						\
   200  	XORL	b, T1; 						\
   201    VPXOR   XTMP3, XTMP4, XTMP4;         \ // XTMP4 = x ^ (x <<< 15) (DCBA)
   202  	XORL	c, T1; 						\
   203  	ADDL	T1, d; 						\
   204  	ADDL	y1, d;						\ // d=TT1
   205    VPSLLD  $23, XTMP3, XTMP5;           \ // XTMP5 = XTMP3 << 23
   206  	;									\
   207  	ADDL	(wj + 3*4)(SP)(SRND*1), h;						\
   208  	MOVL	e, y3;						\
   209  	XORL	f, y3;						\
   210    VPSRLD  $(32-23), XTMP3, XTMP3;      \ // XTMP3 >>= (32-23); the unshifted value is no longer needed
   211  	XORL	g, y3;						\
   212  	ADDL	y3, h;						\
   213  	ADDL	y0, h;						\ // h=TT2
   214    VPOR    XTMP3, XTMP5, XTMP5;         \ // XTMP5 = x <<< 23
   215  	;									\
   216  	RORXL	$23, h, y2;					\
   217  	RORXL	$15, h, y3;					\
   218  	XORL	h, y2;    					\
   219    VPXOR   XTMP5, XTMP4, XTMP4;         \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (DCBA)
   220  	;                 					\
   221  	MOVL	d, h;     					\
   222  	XORL	y2, y3;   					\
   223  	MOVL	y3, d;    					\
   224    VPXOR   XTMP4, XTMP1, XDWORD0;       \ // XDWORD0 = {W3, W2, W1, W0}
   225  	;                 					\
   226  	RORXL	$23, b, b; 					\
   227  	RORXL	$13, f, f
   228  
   229  
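        // Message expansion reference (interleaved across the four *_0.._3 macros
        // above and the 16_63 variants below); for j = 16..67:
        //
        //   W[j]  = P1(W[j-16] ^ W[j-9] ^ (W[j-3] <<< 15)) ^ (W[j-13] <<< 7) ^ W[j-6]
        //   P1(x) = x ^ (x <<< 15) ^ (x <<< 23)
        //
        // Four new words are produced per 128-bit lane (one message block per lane).
        // The first two only need words that already exist, so W[-3] <<< 15 is built
        // for them with the {BBAA} shuffle + VPSLLQ $15 + shuff_00BA trick; once w0
        // and w1 are known, VPALIGNR $4 assembles the full {DCBA} W[-3] vector and
        // the last two words are finished the same way.
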
   230  #define ROUND_AND_SCHED_16_63_0(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   231    MOVL	e, y2;						  \ // y2=E
   232    RORXL	$20, a, y1;					\ // y1=A<<<12
   233    VPALIGNR $12, XDWORD0, XDWORD1, XTMP0;\ //XTMP0 = W[-13]
   234    ADDL	0*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
   235    ADDL	y1, y2;						  \ // y2=(A<<<12)+E+Ti
   236    RORXL	$25, y2, y0;				\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   237    XORL	y0, y1;						  \ // y1=SS1^(A<<<12)=SS2
   238    VPSLLD   $7, XTMP0, XTMP1;            \
   239    ;									        \
   240    ADDL	(wj2 + 0*4)(SP)(SRND*1), d;						\
   241    MOVL	a, T1; 						\
   242    ORL		c, T1; 						\ // a|c
   243    ANDL	b, T1; 						\ //(a|c)&b
   244    VPSRLD   $(32-7), XTMP0, XTMP2;       \
   245    MOVL	c, y2;						\
   246    ANDL	a, y2;						\
   247    ORL		y2, T1;						\ // (a|c)&b | a&c
   248    VPOR     XTMP1, XTMP2, XTMP3;         \ // XTMP3 = W[-13] <<< 7
   249    ADDL	T1, d; 						\
   250    ADDL	y1, d;						\ // d=TT1
   251    ;									      \
   252    ADDL	(wj + 0*4)(SP)(SRND*1), h;						\
   253    VPALIGNR $8, XDWORD2, XDWORD3, XTMP1; \ // XTMP1 = W[-6]
   254    MOVL	e, y3;						\
   255    ANDL	f, y3;						\
   256    ANDNL	g, e, y2;					\
   257    VPXOR    XTMP3, XTMP1, XTMP1;         \ // XTMP1 = W[-6] ^ (W[-13]<<<7), the term outside P1
   258    ORL		y2, y3;						\
   259    ADDL	y3, h;						\
   260    ADDL	y0, h;						\ // h=TT2
   261    ;									      \
   262    VPALIGNR $12, XDWORD1, XDWORD2, XTMP0;\ // XTMP0 = W[-9]
   263    RORXL	$23, h, y2;					\
   264    RORXL	$15, h, y3;					\
   265    XORL	h, y2;    					\
   266    ;                 				\
   267    VPXOR    XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-9] ^ W[-16], the term inside P1
   268    MOVL	d, h;     					\
   269    XORL	y2, y3;   					\
   270    MOVL	y3, d;    					\
   271    ;                 				\
   272    VPSHUFD $0xA5, XDWORD3, XTMP2;       \ // XTMP2 = W[-3] {BBAA}, to be expanded
   273    RORXL	$23, b, b; 					\
   274    RORXL	$13, f, f
   275  
   276  #define ROUND_AND_SCHED_16_63_1(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   277  	MOVL	e, y2;						\ // y2=E
   278  	RORXL	$20, a, y1;					\ // y1=A<<<12
   279    VPSLLQ  $15, XTMP2, XTMP3;           \ // XTMP3 = W[-3] <<< 15 {BxAx}
   280  	ADDL	1*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
   281  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   282  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   283  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   284  	VPSHUFB shuff_00BA<>(SB), XTMP3, XTMP3;\ // XTMP3 = W[-3] <<< 15 {00BA}
   285  	;									\
   286  	ADDL	(wj2 + 1*4)(SP)(SRND*1), d;						\
   287  	MOVL	a, T1; 						\
   288    ORL		c, T1; 						\ // a|c
   289  	ANDL	b, T1; 						\ //(a|c)&b
   290    VPXOR   XTMP0, XTMP3, XTMP3;         \ // XTMP3 = x {xxBA}  store to use
   291  	MOVL	c, y2;						\
   292  	ANDL	a, y2;						\
   293  	ORL		y2, T1;						\ // (a|c)&b | a&c
   294    VPSLLD  $15, XTMP3, XTMP2;            \ // XTMP2 = x << 15
   295  	ADDL	T1, d; 						\
   296  	ADDL	y1, d;						\ // d=TT1
   297  	;									\
   298  	ADDL	(wj + 1*4)(SP)(SRND*1), h;						\
   299    VPSRLD  $(32-15), XTMP3, XTMP4;       \ // XTMP4 = x >> (32-15)
   300  	MOVL	e, y3;						\
   301    ANDL	f, y3;						\
   302  	ANDNL	g, e, y2;					\
   303    VPOR    XTMP2, XTMP4, XTMP5;         \ // XTMP5 = x <<< 15 (xxBA)
   304  	ORL		y2, y3;						\
   305  	ADDL	y3, h;						\
   306  	ADDL	y0, h;						\ // h=TT2
   307  	;									\
   308    VPXOR   XTMP3, XTMP5, XTMP5;         \ // XTMP5 = x ^ (x <<< 15) (xxBA)
   309  	RORXL	$23, h, y2;					\
   310  	RORXL	$15, h, y3;					\
   311  	XORL	h, y2;    					\
   312  	;                 					\
   313    VPSLLD  $23, XTMP3, XTMP2;           \ // XTMP2 = XTMP3 << 23
   314  	MOVL	d, h;     					\
   315  	XORL	y2, y3;   					\
   316  	MOVL	y3, d;    					\
   317  	;                 					\
   318    VPSRLD  $(32-23), XTMP3, XTMP4;      \ // XTMP4 = XTMP3 >> (32-23)
   319  	RORXL	$23, b, b; 					\
   320  	RORXL	$13, f, f
   321  
   322  #define ROUND_AND_SCHED_16_63_2(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   323  	MOVL	e, y2;						\ // y2=E
   324  	RORXL	$20, a, y1;					\ // y1=A<<<12
   325    VPOR    XTMP2, XTMP4, XTMP4;         \ // XTMP4 = x <<< 23 (xxBA)
   326  	ADDL	2*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
   327  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   328  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   329  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   330    VPXOR   XTMP5, XTMP4, XTMP4;         \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (xxBA)
   331  	;									\
   332  	ADDL	(wj2 + 2*4)(SP)(SRND*1), d;						\
   333  	MOVL	a, T1; 						\
   334    ORL		c, T1; 						\ // a|c
   335  	ANDL	b, T1; 						\ //(a|c)&b
   336    VPXOR   XTMP4, XTMP1, XTMP2;         \ // XTMP2 = {. ,. , w1, w0}
   337  	MOVL	c, y2;						\
   338  	ANDL	a, y2;						\
   339  	ORL		y2, T1;						\ // (a|c)&b | a&c
   340    VPALIGNR $4, XDWORD3, XTMP2, XTMP3;  \ // XTMP3 = DCBA
   341  	ADDL	T1, d; 						\
   342  	ADDL	y1, d;						\ // d=TT1
   343  	;									\
   344  	ADDL	(wj + 2*4)(SP)(SRND*1), h;						\
   345    VPSLLD  $15, XTMP3, XTMP4;           \ // XTMP4 = W[-3] << 15
   346  	MOVL	e, y3;						\
   347    ANDL	f, y3;						\
   348  	ANDNL	g, e, y2;					\
   349    VPSRLD  $(32-15), XTMP3, XTMP5;      \ // XTMP5 = W[-3] >> (32-15)
   350  	ORL		y2, y3;						\
   351  	ADDL	y3, h;						\
   352  	ADDL	y0, h;						\ // h=TT2
   353  	;									\
   354    VPOR    XTMP4, XTMP5, XTMP5;         \ // XTMP5 = W[-3] <<< 15 {DCBA}
   355  	RORXL	$23, h, y2;					\
   356  	RORXL	$15, h, y3;					\
   357  	XORL	h, y2;    					\
   358  	;                 					\
   359    VPXOR   XTMP0, XTMP5, XTMP3;         \ // XTMP3 = x {DCBA}
   360  	MOVL	d, h;     					\
   361  	XORL	y2, y3;   					\
   362  	MOVL	y3, d;    					\
   363  	;                 					\
   364    VPSLLD  $15, XTMP3, XTMP4;           \ // XTMP4 = XTMP3 << 15
   365  	RORXL	$23, b, b; 					\
   366  	RORXL	$13, f, f
   367  
   368  #define ROUND_AND_SCHED_16_63_3(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   369  	MOVL	e, y2;						\ // y2=E
   370  	RORXL	$20, a, y1;					\ // y1=A<<<12
   371    VPSRLD  $(32-15), XTMP3, XTMP5;      \ // XTMP5 = XTMP3 >> (32-15)
   372  	ADDL	3*4(TBL)(SRND*1), y2;				\ // y2=E+Ti
   373  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   374  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   375  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   376    VPOR    XTMP5, XTMP4, XTMP4;         \ // XTMP4 = x <<< 15 (DCBA)
   377  	;									\
   378  	ADDL	(wj2 + 3*4)(SP)(SRND*1), d;						\
   379  	MOVL	a, T1; 						\
   380    ORL		c, T1; 						\ // a|c
   381  	ANDL	b, T1; 						\ //(a|c)&b
   382    VPXOR   XTMP3, XTMP4, XTMP4;         \ // XTMP4 = x ^ (x <<< 15) (DCBA)
   383  	MOVL	c, y2;						\
   384  	ANDL	a, y2;						\
   385  	ORL		y2, T1;						\ // (a|c)&b | a&c
   386  	ADDL	T1, d; 						\
   387    VPSLLD  $23, XTMP3, XTMP5;           \ // XTMP5 = XTMP3 << 23
   388  	ADDL	y1, d;						\ // d=TT1
   389  	;									\
   390  	ADDL	(wj + 3*4)(SP)(SRND*1), h;						\
   391  	MOVL	e, y3;						\
   392    ANDL	f, y3;						\
   393    VPSRLD  $(32-23), XTMP3, XTMP3;      \ // XTMP3 >>= (32-23); the unshifted value is no longer needed
   394  	ANDNL	g, e, y2;					\
   395  	ORL		y2, y3;						\
   396  	ADDL	y3, h;						\
   397    VPOR    XTMP3, XTMP5, XTMP5;         \ // XTMP5 = x <<< 23
   398  	ADDL	y0, h;						\ // h=TT2
   399  	;									\
   400  	RORXL	$23, h, y2;					\
   401  	RORXL	$15, h, y3;					\
   402    VPXOR   XTMP5, XTMP4, XTMP4;         \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (DCBA)
   403  	XORL	h, y2;    					\
   404  	;                 					\
   405  	MOVL	d, h;     					\
   406  	XORL	y2, y3;   					\
   407    VPXOR   XTMP4, XTMP1, XDWORD0;       \ // XDWORD0 = {W3, W2, W1, W0}
   408  	MOVL	y3, d;    					\
   409  	;                 					\
   410  	RORXL	$23, b, b; 					\
   411  	RORXL	$13, f, f
   412  
   413  
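        // The ROUND_* macros below (no _AND_SCHED suffix) run the same round with no
        // vector work; they are used once the schedule for both blocks already sits
        // on the stack. Callers pass wj/wj2 offsets with "+ 16" so W[j]/W'[j] are
        // read from the high half of each 32-byte row (block 2), and flag=1 adds 16
        // to the TBL offset to read the duplicated copy of T'[j] in the same constant
        // row, e.g. (from loop2_0 below):
        //
        //   ROUND_0_15_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, a, b, c, d, e, f, g, h)
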
   414  #define ROUND_0_15_0(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   415  	MOVL	e, y2;						  \ // y2=E
   416  	RORXL	$20, a, y1;					\ // y1=A<<<12
   417  	ADDL	(0*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   418  	ADDL	y1, y2;						  \ // y2=(A<<<12)+E+Ti
   419  	RORXL	$25, y2, y0;				\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   420  	XORL	y0, y1;						  \ // y1=SS1^(A<<<12)=SS2
   421  	;									        \
   422  	ADDL	(wj2 + 0*4)(SP)(SRND*1), d;						\
   423  	MOVL	a, T1; 						\
   424  	XORL	b, T1; 						\
   425  	XORL	c, T1; 						\
   426  	ADDL	T1, d; 						\
   427  	ADDL	y1, d;						\ // d=TT1
   428  	;									      \
   429  	ADDL	(wj + 0*4)(SP)(SRND*1), h;						\
   430  	MOVL	e, y3;						\
   431  	XORL	f, y3;						\
   432  	XORL	g, y3;						\
   433  	ADDL	y3, h;						\
   434  	ADDL	y0, h;						\ // h=TT2
   435  	;									      \
   436  	RORXL	$23, h, y2;					\
   437  	RORXL	$15, h, y3;					\
   438  	XORL	h, y2;    					\
   439  	;                 				\
   440  	MOVL	d, h;     					\
   441  	XORL	y2, y3;   					\
   442  	MOVL	y3, d;    					\
   443  	;                 				\
   444  	RORXL	$23, b, b; 					\
   445  	RORXL	$13, f, f
   446  
   447  #define ROUND_0_15_1(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   448  	MOVL	e, y2;						\ // y2=E
   449  	RORXL	$20, a, y1;					\ // y1=A<<<12
   450  	ADDL	(1*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   451  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   452  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   453  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   454  	;									\
   455  	ADDL	(wj2 + 1*4)(SP)(SRND*1), d;						\
   456  	MOVL	a, T1; 						\
   457  	XORL	b, T1; 						\
   458  	XORL	c, T1; 						\
   459  	ADDL	T1, d; 						\
   460  	ADDL	y1, d;						\ // d=TT1
   461  	;									\
   462  	ADDL	(wj + 1*4)(SP)(SRND*1), h;						\
   463  	MOVL	e, y3;						\
   464  	XORL	f, y3;						\
   465  	XORL	g, y3;						\
   466  	ADDL	y3, h;						\
   467  	ADDL	y0, h;						\ // h=TT2
   468  	;									\
   469  	RORXL	$23, h, y2;					\
   470  	RORXL	$15, h, y3;					\
   471  	XORL	h, y2;    					\
   472  	;                 					\
   473  	MOVL	d, h;     					\
   474  	XORL	y2, y3;   					\
   475  	MOVL	y3, d;    					\
   476  	;                 					\
   477  	RORXL	$23, b, b; 					\
   478  	RORXL	$13, f, f
   479  
   480  #define ROUND_0_15_2(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   481  	MOVL	e, y2;						\ // y2=E
   482  	RORXL	$20, a, y1;					\ // y1=A<<<12
   483  	ADDL	(2*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   484  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   485  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   486  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   487  	;									\
   488  	ADDL	(wj2 + 2*4)(SP)(SRND*1), d;						\
   489  	MOVL	a, T1; 						\
   490  	XORL	b, T1; 						\
   491  	XORL	c, T1; 						\
   492  	ADDL	T1, d; 						\
   493  	ADDL	y1, d;						\ // d=TT1
   494  	;									\
   495  	ADDL	(wj + 2*4)(SP)(SRND*1), h;						\
   496  	MOVL	e, y3;						\
   497  	XORL	f, y3;						\
   498  	XORL	g, y3;						\
   499  	ADDL	y3, h;						\
   500  	ADDL	y0, h;						\ // h=TT2
   501  	;									\
   502  	RORXL	$23, h, y2;					\
   503  	RORXL	$15, h, y3;					\
   504  	XORL	h, y2;    					\
   505  	;                 					\
   506  	MOVL	d, h;     					\
   507  	XORL	y2, y3;   					\
   508  	MOVL	y3, d;    					\
   509  	;                 					\
   510  	RORXL	$23, b, b; 					\
   511  	RORXL	$13, f, f
   512  
   513  #define ROUND_0_15_3(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   514  	MOVL	e, y2;						\ // y2=E
   515  	RORXL	$20, a, y1;					\ // y1=A<<<12
   516  	ADDL	(3*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   517  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   518  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   519  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   520  	;									\
   521  	ADDL	(wj2 + 3*4)(SP)(SRND*1), d;						\
   522  	MOVL	a, T1; 						\
   523  	XORL	b, T1; 						\
   524  	XORL	c, T1; 						\
   525  	ADDL	T1, d; 						\
   526  	ADDL	y1, d;						\ // d=TT1
   527  	;									\
   528  	ADDL	(wj + 3*4)(SP)(SRND*1), h;						\
   529  	MOVL	e, y3;						\
   530  	XORL	f, y3;						\
   531  	XORL	g, y3;						\
   532  	ADDL	y3, h;						\
   533  	ADDL	y0, h;						\ // h=TT2
   534  	;									\
   535  	RORXL	$23, h, y2;					\
   536  	RORXL	$15, h, y3;					\
   537  	XORL	h, y2;    					\
   538  	;                 					\
   539  	MOVL	d, h;     					\
   540  	XORL	y2, y3;   					\
   541  	MOVL	y3, d;    					\
   542  	;                 					\
   543  	RORXL	$23, b, b; 					\
   544  	RORXL	$13, f, f
   545  
   546  
   547  #define ROUND_16_63_0(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   548  	MOVL	e, y2;						  \ // y2=E
   549  	RORXL	$20, a, y1;					\ // y1=A<<<12
   550  	ADDL	(0*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   551  	ADDL	y1, y2;						  \ // y2=(A<<<12)+E+Ti
   552  	RORXL	$25, y2, y0;				\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   553  	XORL	y0, y1;						  \ // y1=SS1^(A<<<12)=SS2
   554  	;									        \
   555  	ADDL	(wj2 + 0*4)(SP)(SRND*1), d;						\
   556  	MOVL	a, T1; 						\
   557    ORL		c, T1; 						\ // a|c
   558  	ANDL	b, T1; 						\ //(a|c)&b
   559  	MOVL	c, y2;						\
   560  	ANDL	a, y2;						\
   561  	ORL		y2, T1;						\ // (a|c)&b | a&c
   562  	ADDL	T1, d; 						\
   563  	ADDL	y1, d;						\ // d=TT1
   564  	;									      \
   565  	ADDL	(wj + 0*4)(SP)(SRND*1), h;						\
   566  	MOVL	e, y3;						\
   567    ANDL	f, y3;						\
   568  	ANDNL	g, e, y2;					\
   569  	ORL		y2, y3;						\
   570  	ADDL	y3, h;						\
   571  	ADDL	y0, h;						\ // h=TT2
   572  	;									      \
   573  	RORXL	$23, h, y2;					\
   574  	RORXL	$15, h, y3;					\
   575  	XORL	h, y2;    					\
   576  	;                 				\
   577  	MOVL	d, h;     					\
   578  	XORL	y2, y3;   					\
   579  	MOVL	y3, d;    					\
   580  	;                 				\
   581  	RORXL	$23, b, b; 					\
   582  	RORXL	$13, f, f
   583  
   584  #define ROUND_16_63_1(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   585  	MOVL	e, y2;						\ // y2=E
   586  	RORXL	$20, a, y1;					\ // y1=A<<<12
   587  	ADDL	(1*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   588  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   589  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   590  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   591  	;									\
   592  	ADDL	(wj2 + 1*4)(SP)(SRND*1), d;						\
   593  	MOVL	a, T1; 						\
   594    ORL		c, T1; 						\ // a|c
   595  	ANDL	b, T1; 						\ //(a|c)&b
   596  	MOVL	c, y2;						\
   597  	ANDL	a, y2;						\
   598  	ORL		y2, T1;						\ // (a|c)&b | a&c
   599  	ADDL	T1, d; 						\
   600  	ADDL	y1, d;						\ // d=TT1
   601  	;									\
   602  	ADDL	(wj + 1*4)(SP)(SRND*1), h;						\
   603  	MOVL	e, y3;						\
   604    ANDL	f, y3;						\
   605  	ANDNL	g, e, y2;					\
   606  	ORL		y2, y3;						\
   607  	ADDL	y3, h;						\
   608  	ADDL	y0, h;						\ // h=TT2
   609  	;									\
   610  	RORXL	$23, h, y2;					\
   611  	RORXL	$15, h, y3;					\
   612  	XORL	h, y2;    					\
   613  	;                 					\
   614  	MOVL	d, h;     					\
   615  	XORL	y2, y3;   					\
   616  	MOVL	y3, d;    					\
   617  	;                 					\
   618  	RORXL	$23, b, b; 					\
   619  	RORXL	$13, f, f
   620  
   621  #define ROUND_16_63_2(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   622  	MOVL	e, y2;						\ // y2=E
   623  	RORXL	$20, a, y1;					\ // y1=A<<<12
   624  	ADDL	(2*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   625  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   626  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   627  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   628  	;									\
   629  	ADDL	(wj2 + 2*4)(SP)(SRND*1), d;						\
   630  	MOVL	a, T1; 						\
   631    ORL		c, T1; 						\ // a|c
   632  	ANDL	b, T1; 						\ //(a|c)&b
   633  	MOVL	c, y2;						\
   634  	ANDL	a, y2;						\
   635  	ORL		y2, T1;						\ // (a|c)&b | a&c
   636  	ADDL	T1, d; 						\
   637  	ADDL	y1, d;						\ // d=TT1
   638  	;									\
   639  	ADDL	(wj + 2*4)(SP)(SRND*1), h;						\
   640  	MOVL	e, y3;						\
   641    ANDL	f, y3;						\
   642  	ANDNL	g, e, y2;					\
   643  	ORL		y2, y3;						\
   644  	ADDL	y3, h;						\
   645  	ADDL	y0, h;						\ // h=TT2
   646  	;									\
   647  	RORXL	$23, h, y2;					\
   648  	RORXL	$15, h, y3;					\
   649  	XORL	h, y2;    					\
   650  	;                 					\
   651  	MOVL	d, h;     					\
   652  	XORL	y2, y3;   					\
   653  	MOVL	y3, d;    					\
   654  	;                 					\
   655  	RORXL	$23, b, b; 					\
   656  	RORXL	$13, f, f
   657  
   658  #define ROUND_16_63_3(wj, wj2, flag, a, b, c, d, e, f, g, h) \
   659  	MOVL	e, y2;						\ // y2=E
   660  	RORXL	$20, a, y1;					\ // y1=A<<<12
   661  	ADDL	(3*4+flag*16)(TBL)(SRND*1), y2;				\ // y2=E+Ti
   662  	ADDL	y1, y2;						\ // y2=(A<<<12)+E+Ti
   663  	RORXL	$25, y2, y0;					\ // y0=((A<<<12)+E+Ti)<<<7=SS1
   664  	XORL	y0, y1;						\ // y1=SS1^(A<<<12)=SS2
   665  	;									\
   666  	ADDL	(wj2 + 3*4)(SP)(SRND*1), d;						\
   667  	MOVL	a, T1; 						\
   668    ORL		c, T1; 						\ // a|c
   669  	ANDL	b, T1; 						\ //(a|c)&b
   670  	MOVL	c, y2;						\
   671  	ANDL	a, y2;						\
   672  	ORL		y2, T1;						\ // (a|c)&b | a&c
   673  	ADDL	T1, d; 						\
   674  	ADDL	y1, d;						\ // d=TT1
   675  	;									\
   676  	ADDL	(wj + 3*4)(SP)(SRND*1), h;						\
   677  	MOVL	e, y3;						\
   678    ANDL	f, y3;						\
   679  	ANDNL	g, e, y2;					\
   680  	ORL		y2, y3;						\
   681  	ADDL	y3, h;						\
   682  	ADDL	y0, h;						\ // h=TT2
   683  	;									\
   684  	RORXL	$23, h, y2;					\
   685  	RORXL	$15, h, y3;					\
   686  	XORL	h, y2;    					\
   687  	;                 					\
   688  	MOVL	d, h;     					\
   689  	XORL	y2, y3;   					\
   690  	MOVL	y3, d;    					\
   691  	;                 					\
   692  	RORXL	$23, b, b; 					\
   693  	RORXL	$13, f, f
   694  
   695  // Frame size: (68+64)*4*2 = 1056 bytes of W/W' storage for two interleaved blocks, plus 8+8+8 for _INP_END, _INP and padding = 1080
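        // Stack layout sketch (both blocks of a 64-byte pair are expanded together;
        // the low 16 bytes of every 32-byte row belong to block 1, the high 16 bytes
        // to block 2):
        //
        //   W  row for rounds 4r..4r+3 : _XFER + r*32            (SRND advances by 32 per group)
        //   W' row for rounds 4r..4r+3 : _XFER + 17*32 + r*32
        //   block-1 word k of a row    : row + k*4
        //   block-2 word k of a row    : row + 16 + k*4          (the "+ 16" used in loop2_*)
        //
        // _INP_END and _INP sit between the two regions; STACK_SIZE above only covers
        // the W region plus those two slots, and the W' region plus padding make up
        // the rest of the 1080-byte frame.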
   696  TEXT ·blockasm(SB), 0, $1080-48
   697  	CMPB ·useAVX2(SB), $1
   698  	JE   avx2
   699  
   700  avx2:
   701    MOVQ dig+0(FP), CTX        //dig.h
   702    MOVQ p_base+8(FP), INP          //Input
   703    MOVQ p_len+16(FP), NUM_BYTES    //INP_LEN
   704  
   705    LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   706    MOVQ NUM_BYTES, _INP_END(SP)
   707  
   708    CMPQ NUM_BYTES, INP
   709    JE   avx2_only_one_block
   710  
   711    MOVL 0(CTX), a  // a = H0
   712    MOVL 4(CTX), b  // b = H1
   713    MOVL 8(CTX), c  // c = H2
   714    MOVL 12(CTX), d // d = H3
   715    MOVL 16(CTX), e // e = H4
   716    MOVL 20(CTX), f // f = H5
   717    MOVL 24(CTX), g // g = H6
   718    MOVL 28(CTX), h // h = H7
   719  
   720  loop0: //load input
   721  
   722    VMOVDQU (0*32)(INP), XTMP0
   723    VMOVDQU (1*32)(INP), XTMP1
   724    VMOVDQU (2*32)(INP), XTMP2
   725    VMOVDQU (3*32)(INP), XTMP3
   726  
   727    VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   728  
   729    // Apply Byte Flip Mask: LE -> BE
   730  	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   731  	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   732  	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   733  	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   734  
   735  	// Transpose data into high/low parts
   736    VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
   737  	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
   738  	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
   739  	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
   740  
   741    MOVQ $TSHF<>(SB), TBL
   742  
   743  avx2_last_block_enter:
   744    ADDQ $64, INP
   745    MOVQ INP, _INP(SP)
   746    XORQ SRND, SRND
   747  
   748  loop1_1: // schedule w16-w31 and run the first 16 rounds; SRND runs up to 4*32
   749  
   750    VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)     //wj
   751    VPXOR   XDWORD1, XDWORD0, XDWORD4               //wj2
   752    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   753    ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   754    ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   755    ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   756    ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   757  
   758  	ADDQ $32, SRND
   759  
   760    VMOVDQU XDWORD1, (_XFER + 0*32)(SP)(SRND*1)
   761  	VPXOR   XDWORD2, XDWORD1, XDWORD4
   762  	VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   763  	ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   764  	ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   765  	ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   766  	ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   767  
   768  	ADDQ $32, SRND
   769  
   770    VMOVDQU XDWORD2, (_XFER + 0*32)(SP)(SRND*1)
   771    VPXOR   XDWORD3, XDWORD2, XDWORD4
   772    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   773    ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32,  a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   774    ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32,  h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   775    ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32,  g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   776    ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32,  f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   777  
   778    ADDQ $32, SRND
   779  
   780    VMOVDQU XDWORD3, (_XFER + 0*32)(SP)(SRND*1)
   781    VPXOR   XDWORD0, XDWORD3, XDWORD4
   782    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   783    ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32,  e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   784    ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32,  d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   785    ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32,  c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   786    ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32,  b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   787    ADDQ $32, SRND
   788  
   789  loop1_2: // schedule w32-w63; SRND runs up to 3*4*32. TODO: pass the tshift offset as a macro argument to remove the SRND dependence and rewrite round_and_sched, saving 3 ADDQ instructions
   790  
   791    VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)     //wj
   792    VPXOR   XDWORD1, XDWORD0, XDWORD4               //wj2
   793    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   794    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   795    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   796    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   797    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   798  
   799    ADDQ $32, SRND
   800  
   801    VMOVDQU XDWORD1, (_XFER + 0*32)(SP)(SRND*1)
   802    VPXOR   XDWORD2, XDWORD1, XDWORD4
   803    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   804    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   805    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   806    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   807    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   808  
   809    ADDQ $32, SRND
   810  
   811    VMOVDQU XDWORD2, (_XFER + 0*32)(SP)(SRND*1)
   812    VPXOR   XDWORD3, XDWORD2, XDWORD4
   813    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   814    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32,  a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   815    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32,  h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   816    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32,  g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   817    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32,  f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   818  
   819    ADDQ $32, SRND
   820  
   821    VMOVDQU XDWORD3, (_XFER + 0*32)(SP)(SRND*1)
   822    VPXOR   XDWORD0, XDWORD3, XDWORD4
   823    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   824    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32,  e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   825    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32,  d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   826    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32,  c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   827    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32,  b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   828  
   829    ADDQ $32, SRND
   830    CMPQ SRND, $3*4*32
   831  	JB   loop1_2
   832  
   833  loop1_3: // w64-w67, last 16 rounds and 4 message-schedule steps
   834  
   835    VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)     //wj
   836    VPXOR   XDWORD1, XDWORD0, XDWORD4               //wj2
   837    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   838    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   839    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   840    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   841    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   842    ADDQ $32, SRND
   843  
   844    VMOVDQU XDWORD1, (_XFER + 0*32)(SP)(SRND*1)     //wj
   845    VPXOR   XDWORD2, XDWORD1, XDWORD4               //wj2
   846    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   847    ROUND_16_63_0(_XFER + 0*32, _XFER + 17*32, 0, e, f, g, h, a, b, c, d)
   848    ROUND_16_63_1(_XFER + 0*32, _XFER + 17*32, 0, d, e, f, g, h, a, b, c)
   849    ROUND_16_63_2(_XFER + 0*32, _XFER + 17*32, 0, c, d, e, f, g, h, a, b)
   850    ROUND_16_63_3(_XFER + 0*32, _XFER + 17*32, 0, b, c, d, e, f, g, h, a)
   851    ADDQ $32, SRND
   852  
   853    VMOVDQU XDWORD2, (_XFER + 0*32)(SP)(SRND*1)     //wj
   854    VPXOR   XDWORD3, XDWORD2, XDWORD4               //wj2
   855    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   856  	ROUND_16_63_0(_XFER + 0*32, _XFER + 17*32, 0, a, b, c, d, e, f, g, h)
   857  	ROUND_16_63_1(_XFER + 0*32, _XFER + 17*32, 0, h, a, b, c, d, e, f, g)
   858  	ROUND_16_63_2(_XFER + 0*32, _XFER + 17*32, 0, g, h, a, b, c, d, e, f)
   859  	ROUND_16_63_3(_XFER + 0*32, _XFER + 17*32, 0, f, g, h, a, b, c, d, e)
   860    ADDQ $32, SRND
   861  
   862    VMOVDQU XDWORD3, (_XFER + 0*32)(SP)(SRND*1)     //wj
   863    VPXOR   XDWORD0, XDWORD3, XDWORD4               //wj2
   864    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
   865    ROUND_16_63_0(_XFER + 0*32, _XFER + 17*32, 0, e, f, g, h, a, b, c, d)
   866    ROUND_16_63_1(_XFER + 0*32, _XFER + 17*32, 0, d, e, f, g, h, a, b, c)
   867    ROUND_16_63_2(_XFER + 0*32, _XFER + 17*32, 0, c, d, e, f, g, h, a, b)
   868    ROUND_16_63_3(_XFER + 0*32, _XFER + 17*32, 0, b, c, d, e, f, g, h, a)
   869    ADDQ $32, SRND
   870  
   871    MOVQ dig+0(FP), CTX      //dig.h
   872  	MOVQ _INP(SP), INP
   873  
   874    xorm(  0(CTX), a)
   875    xorm(  4(CTX), b)
   876    xorm(  8(CTX), c)
   877    xorm( 12(CTX), d)
   878    xorm( 16(CTX), e)
   879    xorm( 20(CTX), f)
   880    xorm( 24(CTX), g)
   881    xorm( 28(CTX), h)
   882  
   883    CMPQ _INP_END(SP), INP
   884    JB   done_hash
   885  
   886    XORQ SRND, SRND
   887  
   888  loop2_0: // process the second block with the previously scheduled wj/wj2 results
   889  
   890    ROUND_0_15_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, a, b, c, d, e, f, g, h)
   891    ROUND_0_15_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, h, a, b, c, d, e, f, g)
   892    ROUND_0_15_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, g, h, a, b, c, d, e, f)
   893    ROUND_0_15_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, f, g, h, a, b, c, d, e)
   894    ADDQ $32, SRND
   895  
   896    ROUND_0_15_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, e, f, g, h, a, b, c, d)
   897    ROUND_0_15_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, d, e, f, g, h, a, b, c)
   898    ROUND_0_15_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, c, d, e, f, g, h, a, b)
   899    ROUND_0_15_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, b, c, d, e, f, g, h, a)
   900    ADDQ $32, SRND
   901  
   902    CMPQ SRND, $4*32
   903    JB   loop2_0
   904  
   905  loop2_1:
   906    ROUND_16_63_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, a, b, c, d, e, f, g, h)
   907    ROUND_16_63_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, h, a, b, c, d, e, f, g)
   908    ROUND_16_63_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, g, h, a, b, c, d, e, f)
   909    ROUND_16_63_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, f, g, h, a, b, c, d, e)
   910    ADDQ $32, SRND
   911  
   912    ROUND_16_63_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, e, f, g, h, a, b, c, d)
   913    ROUND_16_63_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, d, e, f, g, h, a, b, c)
   914    ROUND_16_63_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, c, d, e, f, g, h, a, b)
   915    ROUND_16_63_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, b, c, d, e, f, g, h, a)
   916    ADDQ $32, SRND
   917  
   918    CMPQ SRND, $4*4*32
   919    JB   loop2_1
   920  
   921    MOVQ dig+0(FP), CTX      //Output
   922    MOVQ _INP(SP), INP
   923    ADDQ $64, INP
   924  
   925    xorm(  0(CTX), a)
   926    xorm(  4(CTX), b)
   927    xorm(  8(CTX), c)
   928    xorm( 12(CTX), d)
   929    xorm( 16(CTX), e)
   930    xorm( 20(CTX), f)
   931    xorm( 24(CTX), g)
   932    xorm( 28(CTX), h)
   933  
   934    CMPQ _INP_END(SP), INP
   935    JA   loop0
   936    JB   done_hash
   937  
   938  
   939  avx2_do_last_block:
   940  
   941  	VMOVDQU 0(INP), XWORD0
   942  	VMOVDQU 16(INP), XWORD1
   943  	VMOVDQU 32(INP), XWORD2
   944  	VMOVDQU 48(INP), XWORD3
   945  
   946  	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   947  
   948  	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   949  	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   950  	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   951  	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   952  
   953  	MOVQ $TSHF<>(SB), TBL
   954  
   955  	JMP avx2_last_block_enter
   956  
   957  avx2_only_one_block:
   958  	// Load initial digest
   959  	MOVL 0(CTX), a  // a = H0
   960  	MOVL 4(CTX), b  // b = H1
   961  	MOVL 8(CTX), c  // c = H2
   962  	MOVL 12(CTX), d // d = H3
   963  	MOVL 16(CTX), e // e = H4
   964  	MOVL 20(CTX), f // f = H5
   965  	MOVL 24(CTX), g // g = H6
   966  	MOVL 28(CTX), h // h = H7
   967  
   968  	JMP avx2_do_last_block
   969  
   970  done_hash:
   971    VZEROUPPER
   972  	RET
   973  
   974  // shuffle byte order from LE to BE
   975  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   976  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   977  DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
   978  DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
   979  GLOBL flip_mask<>(SB), 8, $32
   980  
   981  // shuffle BxAx -> 00BA
   982  DATA shuff_00BA<>+0x00(SB)/8, $0x0f0e0d0c07060504
   983  DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
   984  DATA shuff_00BA<>+0x10(SB)/8, $0x0f0e0d0c07060504
   985  DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   986  GLOBL shuff_00BA<>(SB), 8, $32
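        // How the two masks are used (a sketch):
        //
        //   flip_mask:  VPSHUFB picks source bytes {3,2,1,0, 7,6,5,4, ...}, i.e. it
        //               byte-swaps every 32-bit word so the big-endian message words
        //               become native values.
        //   shuff_00BA: after VPSLLQ $15 on the {BBAA} vector the rotated results sit
        //               in dwords 1 and 3 of each lane; this mask moves them down to
        //               dwords 0 and 1 and zeroes the rest (0xFF indices select zero),
        //               giving {0, 0, B <<< 15, A <<< 15}.
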
   987  
   988  // Pre-rotated round constants T'[j] = T[j] <<< j ("tshift"); each 32-byte row holds four consecutive constants, duplicated across both 16-byte halves, for the two-block schedule
   989  DATA TSHF<>+0x0(SB)/4, $0x79cc4519
   990  DATA TSHF<>+0x4(SB)/4, $0xf3988a32
   991  DATA TSHF<>+0x8(SB)/4, $0xe7311465
   992  DATA TSHF<>+0xc(SB)/4, $0xce6228cb
   993  DATA TSHF<>+0x10(SB)/4, $0x79cc4519
   994  DATA TSHF<>+0x14(SB)/4, $0xf3988a32
   995  DATA TSHF<>+0x18(SB)/4, $0xe7311465
   996  DATA TSHF<>+0x1c(SB)/4, $0xce6228cb
   997  
   998  DATA TSHF<>+0x20(SB)/4, $0x9cc45197
   999  DATA TSHF<>+0x24(SB)/4, $0x3988a32f
  1000  DATA TSHF<>+0x28(SB)/4, $0x7311465e
  1001  DATA TSHF<>+0x2c(SB)/4, $0xe6228cbc
  1002  DATA TSHF<>+0x30(SB)/4, $0x9cc45197
  1003  DATA TSHF<>+0x34(SB)/4, $0x3988a32f
  1004  DATA TSHF<>+0x38(SB)/4, $0x7311465e
  1005  DATA TSHF<>+0x3c(SB)/4, $0xe6228cbc
  1006  
  1007  DATA TSHF<>+0x40(SB)/4, $0xcc451979
  1008  DATA TSHF<>+0x44(SB)/4, $0x988a32f3
  1009  DATA TSHF<>+0x48(SB)/4, $0x311465e7
  1010  DATA TSHF<>+0x4c(SB)/4, $0x6228cbce
  1011  DATA TSHF<>+0x50(SB)/4, $0xcc451979
  1012  DATA TSHF<>+0x54(SB)/4, $0x988a32f3
  1013  DATA TSHF<>+0x58(SB)/4, $0x311465e7
  1014  DATA TSHF<>+0x5c(SB)/4, $0x6228cbce
  1015  
  1016  DATA TSHF<>+0x60(SB)/4, $0xc451979c
  1017  DATA TSHF<>+0x64(SB)/4, $0x88a32f39
  1018  DATA TSHF<>+0x68(SB)/4, $0x11465e73
  1019  DATA TSHF<>+0x6c(SB)/4, $0x228cbce6
  1020  DATA TSHF<>+0x70(SB)/4, $0xc451979c
  1021  DATA TSHF<>+0x74(SB)/4, $0x88a32f39
  1022  DATA TSHF<>+0x78(SB)/4, $0x11465e73
  1023  DATA TSHF<>+0x7c(SB)/4, $0x228cbce6
  1024  
  1025  DATA TSHF<>+0x80(SB)/4, $0x9d8a7a87
  1026  DATA TSHF<>+0x84(SB)/4, $0x3b14f50f
  1027  DATA TSHF<>+0x88(SB)/4, $0x7629ea1e
  1028  DATA TSHF<>+0x8c(SB)/4, $0xec53d43c
  1029  DATA TSHF<>+0x90(SB)/4, $0x9d8a7a87
  1030  DATA TSHF<>+0x94(SB)/4, $0x3b14f50f
  1031  DATA TSHF<>+0x98(SB)/4, $0x7629ea1e
  1032  DATA TSHF<>+0x9c(SB)/4, $0xec53d43c
  1033  
  1034  DATA TSHF<>+0xa0(SB)/4, $0xd8a7a879
  1035  DATA TSHF<>+0xa4(SB)/4, $0xb14f50f3
  1036  DATA TSHF<>+0xa8(SB)/4, $0x629ea1e7
  1037  DATA TSHF<>+0xac(SB)/4, $0xc53d43ce
  1038  DATA TSHF<>+0xb0(SB)/4, $0xd8a7a879
  1039  DATA TSHF<>+0xb4(SB)/4, $0xb14f50f3
  1040  DATA TSHF<>+0xb8(SB)/4, $0x629ea1e7
  1041  DATA TSHF<>+0xbc(SB)/4, $0xc53d43ce
  1042  
  1043  DATA TSHF<>+0xc0(SB)/4, $0x8a7a879d
  1044  DATA TSHF<>+0xc4(SB)/4, $0x14f50f3b
  1045  DATA TSHF<>+0xc8(SB)/4, $0x29ea1e76
  1046  DATA TSHF<>+0xcc(SB)/4, $0x53d43cec
  1047  DATA TSHF<>+0xd0(SB)/4, $0x8a7a879d
  1048  DATA TSHF<>+0xd4(SB)/4, $0x14f50f3b
  1049  DATA TSHF<>+0xd8(SB)/4, $0x29ea1e76
  1050  DATA TSHF<>+0xdc(SB)/4, $0x53d43cec
  1051  
  1052  DATA TSHF<>+0xe0(SB)/4, $0xa7a879d8
  1053  DATA TSHF<>+0xe4(SB)/4, $0x4f50f3b1
  1054  DATA TSHF<>+0xe8(SB)/4, $0x9ea1e762
  1055  DATA TSHF<>+0xec(SB)/4, $0x3d43cec5
  1056  DATA TSHF<>+0xf0(SB)/4, $0xa7a879d8
  1057  DATA TSHF<>+0xf4(SB)/4, $0x4f50f3b1
  1058  DATA TSHF<>+0xf8(SB)/4, $0x9ea1e762
  1059  DATA TSHF<>+0xfc(SB)/4, $0x3d43cec5
  1060  
  1061  DATA TSHF<>+0x100(SB)/4, $0x7a879d8a
  1062  DATA TSHF<>+0x104(SB)/4, $0xf50f3b14
  1063  DATA TSHF<>+0x108(SB)/4, $0xea1e7629
  1064  DATA TSHF<>+0x10c(SB)/4, $0xd43cec53
  1065  DATA TSHF<>+0x110(SB)/4, $0x7a879d8a
  1066  DATA TSHF<>+0x114(SB)/4, $0xf50f3b14
  1067  DATA TSHF<>+0x118(SB)/4, $0xea1e7629
  1068  DATA TSHF<>+0x11c(SB)/4, $0xd43cec53
  1069  
  1070  DATA TSHF<>+0x120(SB)/4, $0xa879d8a7
  1071  DATA TSHF<>+0x124(SB)/4, $0x50f3b14f
  1072  DATA TSHF<>+0x128(SB)/4, $0xa1e7629e
  1073  DATA TSHF<>+0x12c(SB)/4, $0x43cec53d
  1074  DATA TSHF<>+0x130(SB)/4, $0xa879d8a7
  1075  DATA TSHF<>+0x134(SB)/4, $0x50f3b14f
  1076  DATA TSHF<>+0x138(SB)/4, $0xa1e7629e
  1077  DATA TSHF<>+0x13c(SB)/4, $0x43cec53d
  1078  
  1079  DATA TSHF<>+0x140(SB)/4, $0x879d8a7a
  1080  DATA TSHF<>+0x144(SB)/4, $0xf3b14f5
  1081  DATA TSHF<>+0x148(SB)/4, $0x1e7629ea
  1082  DATA TSHF<>+0x14c(SB)/4, $0x3cec53d4
  1083  DATA TSHF<>+0x150(SB)/4, $0x879d8a7a
  1084  DATA TSHF<>+0x154(SB)/4, $0xf3b14f5
  1085  DATA TSHF<>+0x158(SB)/4, $0x1e7629ea
  1086  DATA TSHF<>+0x15c(SB)/4, $0x3cec53d4
  1087  
  1088  DATA TSHF<>+0x160(SB)/4, $0x79d8a7a8
  1089  DATA TSHF<>+0x164(SB)/4, $0xf3b14f50
  1090  DATA TSHF<>+0x168(SB)/4, $0xe7629ea1
  1091  DATA TSHF<>+0x16c(SB)/4, $0xcec53d43
  1092  DATA TSHF<>+0x170(SB)/4, $0x79d8a7a8
  1093  DATA TSHF<>+0x174(SB)/4, $0xf3b14f50
  1094  DATA TSHF<>+0x178(SB)/4, $0xe7629ea1
  1095  DATA TSHF<>+0x17c(SB)/4, $0xcec53d43
  1096  
  1097  DATA TSHF<>+0x180(SB)/4, $0x9d8a7a87
  1098  DATA TSHF<>+0x184(SB)/4, $0x3b14f50f
  1099  DATA TSHF<>+0x188(SB)/4, $0x7629ea1e
  1100  DATA TSHF<>+0x18c(SB)/4, $0xec53d43c
  1101  DATA TSHF<>+0x190(SB)/4, $0x9d8a7a87
  1102  DATA TSHF<>+0x194(SB)/4, $0x3b14f50f
  1103  DATA TSHF<>+0x198(SB)/4, $0x7629ea1e
  1104  DATA TSHF<>+0x19c(SB)/4, $0xec53d43c
  1105  
  1106  DATA TSHF<>+0x1a0(SB)/4, $0xd8a7a879
  1107  DATA TSHF<>+0x1a4(SB)/4, $0xb14f50f3
  1108  DATA TSHF<>+0x1a8(SB)/4, $0x629ea1e7
  1109  DATA TSHF<>+0x1ac(SB)/4, $0xc53d43ce
  1110  DATA TSHF<>+0x1b0(SB)/4, $0xd8a7a879
  1111  DATA TSHF<>+0x1b4(SB)/4, $0xb14f50f3
  1112  DATA TSHF<>+0x1b8(SB)/4, $0x629ea1e7
  1113  DATA TSHF<>+0x1bc(SB)/4, $0xc53d43ce
  1114  
  1115  DATA TSHF<>+0x1c0(SB)/4, $0x8a7a879d
  1116  DATA TSHF<>+0x1c4(SB)/4, $0x14f50f3b
  1117  DATA TSHF<>+0x1c8(SB)/4, $0x29ea1e76
  1118  DATA TSHF<>+0x1cc(SB)/4, $0x53d43cec
  1119  DATA TSHF<>+0x1d0(SB)/4, $0x8a7a879d
  1120  DATA TSHF<>+0x1d4(SB)/4, $0x14f50f3b
  1121  DATA TSHF<>+0x1d8(SB)/4, $0x29ea1e76
  1122  DATA TSHF<>+0x1dc(SB)/4, $0x53d43cec
  1123  
  1124  DATA TSHF<>+0x1e0(SB)/4, $0xa7a879d8
  1125  DATA TSHF<>+0x1e4(SB)/4, $0x4f50f3b1
  1126  DATA TSHF<>+0x1e8(SB)/4, $0x9ea1e762
  1127  DATA TSHF<>+0x1ec(SB)/4, $0x3d43cec5
  1128  DATA TSHF<>+0x1f0(SB)/4, $0xa7a879d8
  1129  DATA TSHF<>+0x1f4(SB)/4, $0x4f50f3b1
  1130  DATA TSHF<>+0x1f8(SB)/4, $0x9ea1e762
  1131  DATA TSHF<>+0x1fc(SB)/4, $0x3d43cec5
  1132  GLOBL TSHF<>(SB), (NOPTR + RODATA), $512
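
        // The table above is T'[j] = T[j] <<< (j mod 32), with T[j] = 0x79cc4519 for
        // j < 16 and 0x7a879d8a for j >= 16, and each group of four constants
        // duplicated across both 16-byte halves of its row. A hypothetical generator
        // in Go (not part of this package) that reproduces the values:
        //
        //   package main
        //
        //   import (
        //       "fmt"
        //       "math/bits"
        //   )
        //
        //   func main() {
        //       for j := 0; j < 64; j += 4 {
        //           row := make([]uint32, 0, 8)
        //           for k := 0; k < 4; k++ {
        //               t := uint32(0x79cc4519)
        //               if j+k >= 16 {
        //                   t = 0x7a879d8a
        //               }
        //               row = append(row, bits.RotateLeft32(t, j+k))
        //           }
        //           row = append(row, row[:4]...) // duplicate for the second 16-byte half
        //           fmt.Printf("%#x\n", row)      // one 32-byte TSHF row per line
        //       }
        //   }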