github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_simd_amd64.s

     1  // Copyright 2024 Sun Yimin. All rights reserved.
     2  // Use of this source code is governed by a MIT-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  
     9  // shuffle byte order from LE to BE
    10  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
    11  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
    12  GLOBL flip_mask<>(SB), RODATA, $16
    13  
    14  // left rotations of 32-bit words by 8-bit increments
    15  DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
    16  DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
    17  GLOBL r08_mask<>(SB), RODATA, $16
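        // A PSHUFB with r08_mask rotates each 32-bit lane left by 8 bits (a pure byte
        // permutation). The macros below pair it with an explicit <<< 9 or <<< 15 to
        // obtain <<< 17 and <<< 23 without extra shift/or sequences.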
    18  
    19  // Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
    20  // input: from high to low
    21  // r0 = [w3, w2, w1, w0]
    22  // r1 = [w7, w6, w5, w4]
    23  // r2 = [w11, w10, w9, w8]
    24  // r3 = [w15, w14, w13, w12]
    25  // r: 32/64 temp register
    26  // tmp1: 128 bits temp register
    27  // tmp2: 128 bits temp register
    28  //
    29  // output: from high to low
    30  // r0 = [w12, w8, w4, w0]
    31  // r1 = [w13, w9, w5, w1]
    32  // r2 = [w14, w10, w6, w2]
    33  // r3 = [w15, w11, w7, w3]
    34  //
    35  // SSE2/MMX instructions:
    36  //	MOVOU r0, tmp2;
    37  //	PUNPCKHDQ r1, tmp2;
    38  //	PUNPCKLDQ	r1, r0; 
    39  //	MOVOU r2, tmp1; 
    40  //	PUNPCKLDQ r3, tmp1; 
    41  //	PUNPCKHDQ r3, r2; 
    42  //	MOVOU r0, r1; 
    43  //	PUNPCKHQDQ tmp1, r1; 
    44  //	PUNPCKLQDQ tmp1, r0; 
    45  //	MOVOU tmp2, r3; 
    46  //	PUNPCKHQDQ r2, r3; 
    47  //	PUNPCKLQDQ r2, tmp2; 
    48  //	MOVOU tmp2, r2
    49  #define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
    50  	MOVOU r0, tmp2;      \
    51  	PUNPCKHLQ r1, tmp2;  \
    52  	PUNPCKLLQ	r1, r0;  \
    53  	MOVOU r2, tmp1;      \
    54  	PUNPCKLLQ r3, tmp1;  \
    55  	PUNPCKHLQ r3, r2;    \
    56  	MOVOU r0, r1;        \
    57  	PUNPCKHQDQ tmp1, r1; \
    58  	PUNPCKLQDQ tmp1, r0; \
    59  	MOVOU tmp2, r3;      \
    60  	PUNPCKHQDQ r2, r3;   \
    61  	PUNPCKLQDQ r2, tmp2; \
    62  	MOVOU tmp2, r2
    63  
    64  #define a X0
    65  #define b X1
    66  #define c X2
    67  #define d X3
    68  #define e X4
    69  #define f X5
    70  #define g X6
    71  #define h X7
    72  
    73  #define tmp1 X8
    74  #define tmp2 X9
    75  
    76  #define storeState(R) \
    77  	MOVOU a, (R) \
    78  	MOVOU b, 16(R) \
    79  	MOVOU c, 32(R) \
    80  	MOVOU d, 48(R) \
    81  	MOVOU e, 64(R) \
    82  	MOVOU f, 80(R) \
    83  	MOVOU g, 96(R) \
    84  	MOVOU h, 112(R)
    85  
    86  #define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
    87  #define loadWord(W, i) MOVOU (128+(i)*16)(BX), W
    88  
    89  #define SSE_REV32(a, b, c, d) \
    90  	PSHUFB flip_mask<>(SB), a; \
    91  	PSHUFB flip_mask<>(SB), b; \
    92  	PSHUFB flip_mask<>(SB), c; \
    93  	PSHUFB flip_mask<>(SB), d
    94  
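        // prepare4Words(i) gathers the i-th 16-byte chunk from each of the four block
        // pointers (R8..R11), transposes the 4x4 dword matrix so every XMM register
        // holds the same word index for all four lanes, byte-swaps to big-endian, and
        // stores message words W[4i] .. W[4i+3] into the word area of the buffer.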
    95  #define prepare4Words(i) \
    96  	MOVOU (i*16)(R8), X10; \
    97  	MOVOU (i*16)(R9), X11; \
    98  	MOVOU (i*16)(R10), X12; \
    99  	MOVOU (i*16)(R11), X13; \
   100  	; \
   101  	SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
   102  	SSE_REV32(X10, X11, X12, X13); \
   103  	; \
   104  	storeWord(X10, 4*i+0); \
   105  	storeWord(X11, 4*i+1); \
   106  	storeWord(X12, 4*i+2); \
   107  	storeWord(X13, 4*i+3)
   108  
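        // Broadcast the 32-bit round constant ·_K[index] to all four lanes. The ·_K
        // table is assumed to hold the pre-rotated constants T_j <<< (j mod 32), since
        // SS1SS2 below adds it without any further rotation.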
   109  #define LOAD_T(index, T) \
   110  	MOVL (index*4)(AX), T;    \
   111  	PSHUFD $0, T, T
   112  
   113  // r <<< n, SSE version
   114  #define PROLD(r, n) \
   115  	MOVOU r, tmp1; \
   116  	PSLLL $n, r; \
   117  	PSRLL $(32-n), tmp1; \
   118  	POR tmp1, r
   119  
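        // Per-round intermediates as in the SM3 specification, four lanes at a time:
        //   SS1 = ((a <<< 12) + e + T'_j) <<< 7
        //   SS2 = SS1 XOR (a <<< 12)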
   120  #define SSE_SS1SS2(index, a, e, TMP, SS1, SS2) \
   121  	MOVOU a, SS1; \
   122  	PROLD(SS1, 12); \
   123  	MOVOU SS1, SS2; \ // a <<< 12
   124  	LOAD_T(index, TMP); \
   125  	PADDL TMP, SS1; \
   126  	PADDL e, SS1; \
   127  	PROLD(SS1, 7); \ // SS1
   128  	PXOR SS1, SS2; \ // SS2
   129  
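        // DST = X XOR Y XOR Z (FF_j for rounds 0..15)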
   130  #define SSE_FF0(X, Y, Z, DST) \
   131  	MOVOU X, DST; \
   132  	PXOR Y, DST; \
   133  	PXOR Z, DST
   134  
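        // DST = (X AND Y) OR (X AND Z) OR (Y AND Z) (FF_j for rounds 16..63)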
   135  #define SSE_FF1(X, Y, Z, TMP, DST) \
   136  	MOVOU X, DST; \
   137  	POR Y, DST; \
   138  	MOVOU X, TMP; \
   139  	PAND Y, TMP; \
   140  	PAND Z, DST; \
   141  	POR TMP, DST; \ // (a AND b) OR (a AND c) OR (b AND c)
   142  
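        // DST = X XOR Y XOR Z (GG_j for rounds 0..15)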
   143  #define SSE_GG0(X, Y, Z, DST) \
   144  	SSE_FF0(X, Y, Z, DST)
   145  
   146  // DST = ((Y XOR Z) AND X) XOR Z
   147  #define SSE_GG1(X, Y, Z, DST) \
   148  	MOVOU Y, DST; \
   149  	PXOR Z, DST; \
   150  	PAND X, DST; \
   151  	PXOR Z, DST
   152  
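        // Final per-round state update. Register roles are renamed between rounds, so
        // only the in-place changes are needed here:
        //   b = b <<< 9, f = f <<< 19, h = TT1, d = P0(TT2)
        // where P0(x) = x XOR (x <<< 9) XOR (x <<< 17).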
   153  #define SSE_COPY_RESULT(b, d, f, h, TT1, TT2) \
   154  	PROLD(b, 9); \
   155  	MOVOU TT1, h; \
   156  	PROLD(f, 19); \
   157  	MOVOU TT2, TT1; \
   158  	PROLD(TT1, 9); \
   159  	PXOR TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
   160  	PSHUFB r08_mask<>(SB), TT1; \ // ROTL(17, tt2)
   161  	PXOR TT2, TT1; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
   162  	MOVOU TT1, d
   163  
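        // One SM3 round for j in 0..11 (FF0/GG0 variant), computed on four lanes:
        //   TT1 = FF0(a, b, c) + d + SS2 + (W_j XOR W_{j+4})
        //   TT2 = GG0(e, f, g) + h + SS1 + W_j
        // followed by SSE_COPY_RESULT for the state rotation.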
   164  #define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
   165  	SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
   166  	SSE_FF0(a, b, c, X14); \
   167  	PADDL d, X14; \ // (a XOR b XOR c) + d 
   168  	loadWord(X10, index); \
   169  	loadWord(X11, index+4); \
   170  	PXOR X10, X11; \ //Wt XOR Wt+4
   171  	PADDL X11, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
   172  	PADDL X14, X13; \ // TT1
   173  	PADDL h, X10; \ // Wt + h
   174  	PADDL X12, X10; \ // Wt + h + SS1
   175  	SSE_GG0(e, f, g, X11); \
   176  	PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
   177  	; \ // copy result
   178  	SSE_COPY_RESULT(b, d, f, h, X13, X10)
   179  
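        // Message expansion per the SM3 specification; with j the current round index,
        // this produces W[j+4]:
        //   W[j+4] = P1(W[j-12] XOR W[j-5] XOR (W[j+1] <<< 15)) XOR (W[j-9] <<< 7) XOR W[j-2]
        // where P1(x) = x XOR (x <<< 15) XOR (x <<< 23); the <<< 23 reuses r08_mask on x <<< 15.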
   180  #define MESSAGE_SCHEDULE(index) \
   181  	loadWord(X10, index+1); \ // Wj-3
   182  	PROLD(X10, 15); \
   183  	loadWord(X11, index-12); \ // Wj-16
   184  	PXOR X11, X10; \
   185  	loadWord(X11, index-5); \ // Wj-9
   186  	PXOR X11, X10; \
   187  	MOVOU X10, X11; \
   188  	PROLD(X11, 15); \
   189  	PXOR X11, X10; \
   190  	PSHUFB r08_mask<>(SB), X11; \
   191  	PXOR X11, X10; \ // P1
   192  	loadWord(X11, index-9); \ // Wj-13
   193  	PROLD(X11, 7); \
   194  	PXOR X11, X10; \
   195  	loadWord(X11, index-2); \ // Wj-6
   196  	PXOR X10, X11; \
   197  	storeWord(X11, index+4)
   198  
   199  #define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
   200  	MESSAGE_SCHEDULE(index); \
   201  	ROUND_00_11(index, a, b, c, d, e, f, g, h)
   202  
   203  #define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
   204  	MESSAGE_SCHEDULE(index); \ // X11 now holds Wt+4; do not use it as scratch
   205  	SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
   206  	; \
   207  	SSE_FF1(a, b, c, X10, X14); \
   208  	PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
   209  	loadWord(X10, index); \
   210  	PXOR X10, X11; \ //Wt XOR Wt+4
   211  	PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
   212  	PADDL X14, X13; \ // TT1
   213  	; \
   214  	PADDL h, X10; \ // Wt + h
   215  	PADDL X12, X10; \ // Wt + h + SS1
   216  	SSE_GG1(e, f, g, X11); \
   217  	PADDL X11, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
   218  	; \ // copy result
   219  	SSE_COPY_RESULT(b, d, f, h, X13, X10)
   220  
   221  // transpose matrix function, AVX version
   222  // parameters:
   223  // - r0: 128 bits register as input/output data
   224  // - r1: 128 bits register as input/output data
   225  // - r2: 128 bits register as input/output data
   226  // - r3: 128 bits register as input/output data
   227  // - tmp1: 128 bits temp register
   228  // - tmp2: 128 bits temp register
   229  #define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
   230  	VPUNPCKHDQ r1, r0, tmp2;                 \ // tmp2 = [w07, w03, w06, w02]
   231  	VPUNPCKLDQ r1, r0, r0;                   \ // r0 = [w05, w01, w04, w00]
   232  	VPUNPCKLDQ r3, r2, tmp1;                 \ // tmp1 = [w13, w09, w12, w08]
   233  	VPUNPCKHDQ r3, r2, r2;                   \ // r2 = [w15, w11, w14, w10]
   234  	VPUNPCKHQDQ tmp1, r0, r1;                \ // r1 = [w13, w09, w05, w01]
   235  	VPUNPCKLQDQ tmp1, r0, r0;                \ // r0 = [w12, w08, w04, w00]
   236  	VPUNPCKHQDQ r2, tmp2, r3;                \ // r3 = [w15, w11, w07, w03]
   237  	VPUNPCKLQDQ r2, tmp2, r2                   // r2 = [w14, w10, w06, w02]
   238  
   239  #define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
   240  #define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W
   241  
   242  #define avxStoreState(R) \
   243  	VMOVDQU a, (0*16)(R) \
   244  	VMOVDQU b, (1*16)(R) \
   245  	VMOVDQU c, (2*16)(R) \
   246  	VMOVDQU d, (3*16)(R) \
   247  	VMOVDQU e, (4*16)(R) \
   248  	VMOVDQU f, (5*16)(R) \
   249  	VMOVDQU g, (6*16)(R) \
   250  	VMOVDQU h, (7*16)(R)
   251  
   252  #define AVX_REV32(a, b, c, d) \
   253  	VPSHUFB flip_mask<>(SB), a, a; \
   254  	VPSHUFB flip_mask<>(SB), b, b; \
   255  	VPSHUFB flip_mask<>(SB), c, c; \
   256  	VPSHUFB flip_mask<>(SB), d, d
   257  
   258  #define avxPrepare4Words(i) \
   259  	VMOVDQU (i*16)(R8), X10; \
   260  	VMOVDQU (i*16)(R9), X11; \
   261  	VMOVDQU (i*16)(R10), X12; \
   262  	VMOVDQU (i*16)(R11), X13; \
   263  	; \
   264  	TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
   265  	AVX_REV32(X10, X11, X12, X13); \
   266  	; \
   267  	avxStoreWord(X10, 4*i+0); \
   268  	avxStoreWord(X11, 4*i+1); \
   269  	avxStoreWord(X12, 4*i+2); \
   270  	avxStoreWord(X13, 4*i+3)
   271  
   272  #define AVX_LOAD_T(index, T) \
   273  	MOVL (index*4)(AX), T;    \
   274  	VPSHUFD $0, T, T
   275  
   276  // r <<< n
   277  #define VPROLD(r, n) \
   278  	VPSLLD $(n), r, tmp1; \
   279  	VPSRLD $(32-n), r, r; \
   280  	VPOR tmp1, r, r
   281  
   282  // d = r <<< n
   283  #define VPROLD2(r, d, n) \
   284  	VPSLLD $(n), r, tmp1; \
   285  	VPSRLD $(32-n), r, d; \
   286  	VPOR tmp1, d, d
   287  
   288  #define AVX_SS1SS2(index, a, e, SS1, SS2) \
   289  	VPROLD2(a, SS2, 12); \ // a <<< 12
   290  	AVX_LOAD_T(index, SS1); \
   291  	VPADDD SS1, SS2, SS1; \
   292  	VPADDD e, SS1, SS1; \
   293  	VPROLD(SS1, 7); \ // SS1
   294  	VPXOR SS1, SS2, SS2
   295  
   296  // DST = X XOR Y XOR Z
   297  #define AVX_FF0(X, Y, Z, DST) \
   298  	VPXOR X, Y, DST; \
   299  	VPXOR Z, DST, DST
   300  
   301  // DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
   302  #define AVX_FF1(X, Y, Z, TMP, DST) \
   303  	VPOR X, Y, DST; \
   304  	VPAND X, Y, TMP; \
   305  	VPAND Z, DST, DST; \
   306  	VPOR TMP, DST, DST
   307  
   308  // DST = X XOR Y XOR Z
   309  #define AVX_GG0(X, Y, Z, DST) \
   310  	AVX_FF0(X, Y, Z, DST)
   311  
   312  // DST = ((Y XOR Z) AND X) XOR Z
   313  #define AVX_GG1(X, Y, Z, DST) \
   314  	VPXOR Y, Z, DST; \
   315  	VPAND X, DST, DST; \ 
   316  	VPXOR Z, DST, DST
   317  
   318  #define AVX_COPY_RESULT(b, d, f, h, TT1, TT2) \
   319  	VPROLD(b, 9); \
   320  	VMOVDQU TT1, h; \
   321  	VPROLD(f, 19); \
   322  	VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
   323  	VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
   324  	VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
   325  	VPXOR TT2, TT1, d
   326  
   327  #define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
   328  	AVX_SS1SS2(index, a, e, X12, X13); \
   329  	; \
   330  	AVX_FF0(a, b, c, X14); \
   331  	VPADDD d, X14, X14; \ // (a XOR b XOR c) + d 
   332  	avxLoadWord(X10, index); \
   333  	avxLoadWord(X11, index+4); \
   334  	VPXOR X10, X11, X11; \ //Wt XOR Wt+4
   335  	VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
   336  	VPADDD X14, X13, X13; \ // TT1
   337  	VPADDD h, X10, X10; \ // Wt + h
   338  	VPADDD X12, X10, X10; \ // Wt + h + SS1
   339  	AVX_GG0(e, f, g, X11); \
   340  	VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
   341  	; \ // copy result
   342  	AVX_COPY_RESULT(b, d, f, h, X13, X10)
   343  
   344  #define AVX_MESSAGE_SCHEDULE(index) \
   345  	avxLoadWord(X10, index+1); \ // Wj-3
   346  	VPROLD(X10, 15); \
   347  	VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16
   348  	VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9
   349  	; \ // P1
   350  	VPROLD2(X10, X11, 15); \
   351  	VPXOR X11, X10, X10; \
   352  	VPSHUFB r08_mask<>(SB), X11, X11; \
   353  	VPXOR X11, X10, X10; \ // P1
   354  	avxLoadWord(X11, index-9); \ // Wj-13
   355  	VPROLD(X11, 7); \
   356  	VPXOR X11, X10, X10; \
   357  	VPXOR (128+(index-2)*16)(BX), X10, X11; \
   358  	avxStoreWord(X11, index+4)
   359  
   360  #define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \
   361  	AVX_MESSAGE_SCHEDULE(index); \
   362  	AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h)
   363  
   364  #define AVX_ROUND_16_63(index, a, b, c, d, e, f, g, h) \
   365  	AVX_MESSAGE_SCHEDULE(index); \ // X11 now holds Wt+4; do not use it as scratch
   366  	AVX_SS1SS2(index, a, e, X12, X13); \
   367  	; \
   368  	AVX_FF1(a, b, c, X10, X14); \
   369  	VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
   370  	avxLoadWord(X10, index); \
   371  	VPXOR X10, X11, X11; \ //Wt XOR Wt+4
   372  	VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
   373  	VPADDD X14, X13, X13; \ // TT1
   374  	; \
   375  	VPADDD h, X10, X10; \ // Wt + h
   376  	VPADDD X12, X10, X10; \ // Wt + h + SS1
   377  	AVX_GG1(e, f, g, X11); \
   378  	VPADDD X11, X10, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
   379  	; \ // copy result
   380  	AVX_COPY_RESULT(b, d, f, h, X13, X10)
   381  
   382  // blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
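        // Processes the given number of 64-byte blocks from each of four independent
        // messages in parallel, one message per 32-bit SIMD lane: dig points to the four
        // state pointers and p to the four message pointers. buffer is scratch space:
        // bytes 0..127 hold the transposed state (a..h, 16 bytes each) and bytes 128..
        // hold the expanded message words W[0..67] (16 bytes per word), so it presumably
        // needs at least 128 + 68*16 = 1216 bytes.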
   383  TEXT ·blockMultBy4(SB),NOSPLIT,$0
   384  	MOVQ	dig+0(FP), DI
   385  	MOVQ	p+8(FP), SI
   386  	MOVQ	buffer+16(FP), BX
   387  	MOVQ	blocks+24(FP), DX
   388  
   389  	CMPB ·useAVX(SB), $1
   390  	JE   avx
   391  
   392  	// load state
   393  	MOVQ (DI), R8
   394  	MOVOU (0*16)(R8), a
   395  	MOVOU (1*16)(R8), e
   396  	MOVQ 8(DI), R8
   397  	MOVOU (0*16)(R8), b
   398  	MOVOU (1*16)(R8), f
   399  	MOVQ 16(DI), R8
   400  	MOVOU (0*16)(R8), c
   401  	MOVOU (1*16)(R8), g
   402  	MOVQ 24(DI), R8
   403  	MOVOU (0*16)(R8), d
   404  	MOVOU (1*16)(R8), h
   405  
   406  	// transpose state
   407  	SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
   408  	SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
   409  
   410  	// store state to temporary buffer
   411  	storeState(BX)
   412  
   413  	MOVQ $·_K+0(SB), AX
   414  	MOVQ (SI), R8
   415  	MOVQ 8(SI), R9
   416  	MOVQ 16(SI), R10
   417  	MOVQ 24(SI), R11
   418  
   419  loop:	
   420  	// load message block
   421  	prepare4Words(0)
   422  	prepare4Words(1)
   423  	prepare4Words(2)
   424  	prepare4Words(3)
   425  
   426  	ROUND_00_11(0, a, b, c, d, e, f, g, h)
   427  	ROUND_00_11(1, h, a, b, c, d, e, f, g)
   428  	ROUND_00_11(2, g, h, a, b, c, d, e, f)
   429  	ROUND_00_11(3, f, g, h, a, b, c, d, e)
   430  	ROUND_00_11(4, e, f, g, h, a, b, c, d)
   431  	ROUND_00_11(5, d, e, f, g, h, a, b, c)
   432  	ROUND_00_11(6, c, d, e, f, g, h, a, b)
   433  	ROUND_00_11(7, b, c, d, e, f, g, h, a)
   434  	ROUND_00_11(8, a, b, c, d, e, f, g, h)
   435  	ROUND_00_11(9, h, a, b, c, d, e, f, g)
   436  	ROUND_00_11(10, g, h, a, b, c, d, e, f)
   437  	ROUND_00_11(11, f, g, h, a, b, c, d, e)
   438  
   439  	ROUND_12_15(12, e, f, g, h, a, b, c, d)
   440  	ROUND_12_15(13, d, e, f, g, h, a, b, c)
   441  	ROUND_12_15(14, c, d, e, f, g, h, a, b)
   442  	ROUND_12_15(15, b, c, d, e, f, g, h, a)
   443  
   444  	ROUND_16_63(16, a, b, c, d, e, f, g, h)
   445  	ROUND_16_63(17, h, a, b, c, d, e, f, g)
   446  	ROUND_16_63(18, g, h, a, b, c, d, e, f)
   447  	ROUND_16_63(19, f, g, h, a, b, c, d, e)
   448  	ROUND_16_63(20, e, f, g, h, a, b, c, d)
   449  	ROUND_16_63(21, d, e, f, g, h, a, b, c)
   450  	ROUND_16_63(22, c, d, e, f, g, h, a, b)
   451  	ROUND_16_63(23, b, c, d, e, f, g, h, a)
   452  	ROUND_16_63(24, a, b, c, d, e, f, g, h)
   453  	ROUND_16_63(25, h, a, b, c, d, e, f, g)
   454  	ROUND_16_63(26, g, h, a, b, c, d, e, f)
   455  	ROUND_16_63(27, f, g, h, a, b, c, d, e)
   456  	ROUND_16_63(28, e, f, g, h, a, b, c, d)
   457  	ROUND_16_63(29, d, e, f, g, h, a, b, c)
   458  	ROUND_16_63(30, c, d, e, f, g, h, a, b)
   459  	ROUND_16_63(31, b, c, d, e, f, g, h, a)
   460  	ROUND_16_63(32, a, b, c, d, e, f, g, h)
   461  	ROUND_16_63(33, h, a, b, c, d, e, f, g)
   462  	ROUND_16_63(34, g, h, a, b, c, d, e, f)
   463  	ROUND_16_63(35, f, g, h, a, b, c, d, e)
   464  	ROUND_16_63(36, e, f, g, h, a, b, c, d)
   465  	ROUND_16_63(37, d, e, f, g, h, a, b, c)
   466  	ROUND_16_63(38, c, d, e, f, g, h, a, b)
   467  	ROUND_16_63(39, b, c, d, e, f, g, h, a)
   468  	ROUND_16_63(40, a, b, c, d, e, f, g, h)
   469  	ROUND_16_63(41, h, a, b, c, d, e, f, g)
   470  	ROUND_16_63(42, g, h, a, b, c, d, e, f)
   471  	ROUND_16_63(43, f, g, h, a, b, c, d, e)
   472  	ROUND_16_63(44, e, f, g, h, a, b, c, d)
   473  	ROUND_16_63(45, d, e, f, g, h, a, b, c)
   474  	ROUND_16_63(46, c, d, e, f, g, h, a, b)
   475  	ROUND_16_63(47, b, c, d, e, f, g, h, a)
   476  	ROUND_16_63(48, a, b, c, d, e, f, g, h)
   477  	ROUND_16_63(49, h, a, b, c, d, e, f, g)
   478  	ROUND_16_63(50, g, h, a, b, c, d, e, f)
   479  	ROUND_16_63(51, f, g, h, a, b, c, d, e)
   480  	ROUND_16_63(52, e, f, g, h, a, b, c, d)
   481  	ROUND_16_63(53, d, e, f, g, h, a, b, c)
   482  	ROUND_16_63(54, c, d, e, f, g, h, a, b)
   483  	ROUND_16_63(55, b, c, d, e, f, g, h, a)
   484  	ROUND_16_63(56, a, b, c, d, e, f, g, h)
   485  	ROUND_16_63(57, h, a, b, c, d, e, f, g)
   486  	ROUND_16_63(58, g, h, a, b, c, d, e, f)
   487  	ROUND_16_63(59, f, g, h, a, b, c, d, e)
   488  	ROUND_16_63(60, e, f, g, h, a, b, c, d)
   489  	ROUND_16_63(61, d, e, f, g, h, a, b, c)
   490  	ROUND_16_63(62, c, d, e, f, g, h, a, b)
   491  	ROUND_16_63(63, b, c, d, e, f, g, h, a)
   492  
   493  	MOVOU (0*16)(BX), tmp1
   494  	PXOR tmp1, a
   495  	MOVOU (1*16)(BX), tmp1
   496  	PXOR tmp1, b
   497  	MOVOU (2*16)(BX), tmp1
   498  	PXOR tmp1, c
   499  	MOVOU (3*16)(BX), tmp1
   500  	PXOR tmp1, d
   501  	MOVOU (4*16)(BX), tmp1
   502  	PXOR tmp1, e
   503  	MOVOU (5*16)(BX), tmp1
   504  	PXOR tmp1, f
   505  	MOVOU (6*16)(BX), tmp1
   506  	PXOR tmp1, g
   507  	MOVOU (7*16)(BX), tmp1
   508  	PXOR tmp1, h
   509  
   510  	DECQ DX
   511  	JZ end
   512  	
   513  	storeState(BX)
   514  	LEAQ 64(R8), R8
   515  	LEAQ 64(R9), R9
   516  	LEAQ 64(R10), R10
   517  	LEAQ 64(R11), R11
   518  	JMP loop
   519  
   520  end:
   521  	// transpose state
   522  	SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
   523  	SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
   524  
   525  	MOVQ (DI), R8
   526  	MOVOU a, (0*16)(R8)
   527  	MOVOU e, (1*16)(R8)
   528  	MOVQ 8(DI), R8
   529  	MOVOU b, (0*16)(R8)
   530  	MOVOU f, (1*16)(R8)
   531  	MOVQ 16(DI), R8
   532  	MOVOU c, (0*16)(R8)
   533  	MOVOU g, (1*16)(R8)
   534  	MOVQ 24(DI), R8
   535  	MOVOU d, (0*16)(R8)
   536  	MOVOU h, (1*16)(R8)
   537  
   538  	RET
   539  
   540  avx:
   541  	// load state
   542  	MOVQ (DI), R8
   543  	VMOVDQU (0*16)(R8), a
   544  	VMOVDQU (1*16)(R8), e
   545  	MOVQ 8(DI), R8
   546  	VMOVDQU (0*16)(R8), b
   547  	VMOVDQU (1*16)(R8), f
   548  	MOVQ 16(DI), R8
   549  	VMOVDQU (0*16)(R8), c
   550  	VMOVDQU (1*16)(R8), g
   551  	MOVQ 24(DI), R8
   552  	VMOVDQU (0*16)(R8), d
   553  	VMOVDQU (1*16)(R8), h
   554  
   555  	// transpose state
   556  	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
   557  	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
   558  
   559  	avxStoreState(BX)
   560  
   561  	MOVQ $·_K+0(SB), AX
   562  	MOVQ (SI), R8
   563  	MOVQ 8(SI), R9
   564  	MOVQ 16(SI), R10
   565  	MOVQ 24(SI), R11
   566  
   567  avxLoop:
   568  	// load message block
   569  	avxPrepare4Words(0)
   570  	avxPrepare4Words(1)
   571  	avxPrepare4Words(2)
   572  	avxPrepare4Words(3)
   573  
   574  	AVX_ROUND_00_11(0, a, b, c, d, e, f, g, h)
   575  	AVX_ROUND_00_11(1, h, a, b, c, d, e, f, g)
   576  	AVX_ROUND_00_11(2, g, h, a, b, c, d, e, f)
   577  	AVX_ROUND_00_11(3, f, g, h, a, b, c, d, e)
   578  	AVX_ROUND_00_11(4, e, f, g, h, a, b, c, d)
   579  	AVX_ROUND_00_11(5, d, e, f, g, h, a, b, c)
   580  	AVX_ROUND_00_11(6, c, d, e, f, g, h, a, b)
   581  	AVX_ROUND_00_11(7, b, c, d, e, f, g, h, a)
   582  	AVX_ROUND_00_11(8, a, b, c, d, e, f, g, h)
   583  	AVX_ROUND_00_11(9, h, a, b, c, d, e, f, g)
   584  	AVX_ROUND_00_11(10, g, h, a, b, c, d, e, f)
   585  	AVX_ROUND_00_11(11, f, g, h, a, b, c, d, e)
   586  
   587  	AVX_ROUND_12_15(12, e, f, g, h, a, b, c, d)
   588  	AVX_ROUND_12_15(13, d, e, f, g, h, a, b, c)
   589  	AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b)
   590  	AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a)
   591  
   592  	AVX_ROUND_16_63(16, a, b, c, d, e, f, g, h)
   593  	AVX_ROUND_16_63(17, h, a, b, c, d, e, f, g)
   594  	AVX_ROUND_16_63(18, g, h, a, b, c, d, e, f)
   595  	AVX_ROUND_16_63(19, f, g, h, a, b, c, d, e)
   596  	AVX_ROUND_16_63(20, e, f, g, h, a, b, c, d)
   597  	AVX_ROUND_16_63(21, d, e, f, g, h, a, b, c)
   598  	AVX_ROUND_16_63(22, c, d, e, f, g, h, a, b)
   599  	AVX_ROUND_16_63(23, b, c, d, e, f, g, h, a)
   600  	AVX_ROUND_16_63(24, a, b, c, d, e, f, g, h)
   601  	AVX_ROUND_16_63(25, h, a, b, c, d, e, f, g)
   602  	AVX_ROUND_16_63(26, g, h, a, b, c, d, e, f)
   603  	AVX_ROUND_16_63(27, f, g, h, a, b, c, d, e)
   604  	AVX_ROUND_16_63(28, e, f, g, h, a, b, c, d)
   605  	AVX_ROUND_16_63(29, d, e, f, g, h, a, b, c)
   606  	AVX_ROUND_16_63(30, c, d, e, f, g, h, a, b)
   607  	AVX_ROUND_16_63(31, b, c, d, e, f, g, h, a)
   608  	AVX_ROUND_16_63(32, a, b, c, d, e, f, g, h)
   609  	AVX_ROUND_16_63(33, h, a, b, c, d, e, f, g)
   610  	AVX_ROUND_16_63(34, g, h, a, b, c, d, e, f)
   611  	AVX_ROUND_16_63(35, f, g, h, a, b, c, d, e)
   612  	AVX_ROUND_16_63(36, e, f, g, h, a, b, c, d)
   613  	AVX_ROUND_16_63(37, d, e, f, g, h, a, b, c)
   614  	AVX_ROUND_16_63(38, c, d, e, f, g, h, a, b)
   615  	AVX_ROUND_16_63(39, b, c, d, e, f, g, h, a)
   616  	AVX_ROUND_16_63(40, a, b, c, d, e, f, g, h)
   617  	AVX_ROUND_16_63(41, h, a, b, c, d, e, f, g)
   618  	AVX_ROUND_16_63(42, g, h, a, b, c, d, e, f)
   619  	AVX_ROUND_16_63(43, f, g, h, a, b, c, d, e)
   620  	AVX_ROUND_16_63(44, e, f, g, h, a, b, c, d)
   621  	AVX_ROUND_16_63(45, d, e, f, g, h, a, b, c)
   622  	AVX_ROUND_16_63(46, c, d, e, f, g, h, a, b)
   623  	AVX_ROUND_16_63(47, b, c, d, e, f, g, h, a)
   624  	AVX_ROUND_16_63(48, a, b, c, d, e, f, g, h)
   625  	AVX_ROUND_16_63(49, h, a, b, c, d, e, f, g)
   626  	AVX_ROUND_16_63(50, g, h, a, b, c, d, e, f)
   627  	AVX_ROUND_16_63(51, f, g, h, a, b, c, d, e)
   628  	AVX_ROUND_16_63(52, e, f, g, h, a, b, c, d)
   629  	AVX_ROUND_16_63(53, d, e, f, g, h, a, b, c)
   630  	AVX_ROUND_16_63(54, c, d, e, f, g, h, a, b)
   631  	AVX_ROUND_16_63(55, b, c, d, e, f, g, h, a)
   632  	AVX_ROUND_16_63(56, a, b, c, d, e, f, g, h)
   633  	AVX_ROUND_16_63(57, h, a, b, c, d, e, f, g)
   634  	AVX_ROUND_16_63(58, g, h, a, b, c, d, e, f)
   635  	AVX_ROUND_16_63(59, f, g, h, a, b, c, d, e)
   636  	AVX_ROUND_16_63(60, e, f, g, h, a, b, c, d)
   637  	AVX_ROUND_16_63(61, d, e, f, g, h, a, b, c)
   638  	AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b)
   639  	AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a)
   640  
   641  	VPXOR (0*16)(BX), a, a
   642  	VPXOR (1*16)(BX), b, b
   643  	VPXOR (2*16)(BX), c, c
   644  	VPXOR (3*16)(BX), d, d
   645  	VPXOR (4*16)(BX), e, e
   646  	VPXOR (5*16)(BX), f, f
   647  	VPXOR (6*16)(BX), g, g
   648  	VPXOR (7*16)(BX), h, h
   649  
   650  	DECQ DX
   651  	JZ avxEnd
   652  
   653  	// store current state
   654  	avxStoreState(BX)
   655  
   656  	LEAQ 64(R8), R8
   657  	LEAQ 64(R9), R9
   658  	LEAQ 64(R10), R10
   659  	LEAQ 64(R11), R11
   660  	JMP avxLoop
   661  
   662  avxEnd:
   663  	// transpose state
   664  	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
   665  	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
   666  
   667  	MOVQ (DI), R8
   668  	VMOVDQU a, (0*16)(R8)
   669  	VMOVDQU e, (1*16)(R8)
   670  	MOVQ 8(DI), R8
   671  	VMOVDQU b, (0*16)(R8)
   672  	VMOVDQU f, (1*16)(R8)
   673  	MOVQ 16(DI), R8
   674  	VMOVDQU c, (0*16)(R8)
   675  	VMOVDQU g, (1*16)(R8)
   676  	MOVQ 24(DI), R8
   677  	VMOVDQU d, (0*16)(R8)
   678  	VMOVDQU h, (1*16)(R8)
   679  
   680  	RET
   681  
   682  // func copyResultsBy4(dig *uint32, dst *byte)
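        // Byte-swaps the 32 consecutive little-endian state words at dig (presumably the
        // four 8-word digests) to big-endian and writes the resulting 128 bytes to dst.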
   683  TEXT ·copyResultsBy4(SB),NOSPLIT,$0
   684  	MOVQ	dig+0(FP), DI
   685  	MOVQ	dst+8(FP), SI
   686  
   687  	CMPB ·useAVX(SB), $1
   688  	JE   avx
   689  
   690  	// load state
   691  	MOVOU (0*16)(DI), a
   692  	MOVOU (1*16)(DI), b
   693  	MOVOU (2*16)(DI), c
   694  	MOVOU (3*16)(DI), d
   695  	MOVOU (4*16)(DI), e
   696  	MOVOU (5*16)(DI), f
   697  	MOVOU (6*16)(DI), g
   698  	MOVOU (7*16)(DI), h
   699  	
   700  	SSE_REV32(a, b, c, d)
   701  	SSE_REV32(e, f, g, h)
   702  	storeState(SI)
   703  
   704  	RET
   705  
   706  avx:
   707  	// load state
   708  	VMOVDQU (0*16)(DI), a
   709  	VMOVDQU (1*16)(DI), b
   710  	VMOVDQU (2*16)(DI), c
   711  	VMOVDQU (3*16)(DI), d
   712  	VMOVDQU (4*16)(DI), e
   713  	VMOVDQU (5*16)(DI), f
   714  	VMOVDQU (6*16)(DI), g
   715  	VMOVDQU (7*16)(DI), h
   716  
   717  	AVX_REV32(a, b, c, d)
   718  	AVX_REV32(e, f, g, h)
   719  
   720  	avxStoreState(SI)
   721  
   722  	RET