github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_avx2_amd64.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $32

// shuffle mask that rotates each 32-bit word left by 8 bits
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $32
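
// Why the mask works: for one little-endian word with bytes b0..b3
// (value b0 | b1<<8 | b2<<16 | b3<<24), rotating left by 8 yields the
// bytes [b3, b0, b1, b2], which is exactly the byte selection 3,0,1,2
// encoded in each dword of r08_mask. Illustrative Go sketch (helper name
// hypothetical, not part of this package):
//
//	func rotl8ByShuffle(w [4]byte) [4]byte {
//		return [4]byte{w[3], w[0], w[1], w[2]} // mask bytes 3,0,1,2
//	}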

#define a Y0
#define b Y1
#define c Y2
#define d Y3
#define e Y4
#define f Y5
#define g Y6
#define h Y7
#define TMP1 Y8
#define TMP2 Y9
#define TMP3 Y10
#define TMP4 Y11

#define srcPtr1 CX
#define srcPtr2 R8
#define srcPtr3 R9
#define srcPtr4 R10
#define srcPtr5 R11
#define srcPtr6 R12
#define srcPtr7 R13
#define srcPtr8 R14

// transpose matrix function, AVX2 version
// parameters:
// - r0..r7: eight 256-bit registers used as input/output rows
// - tmp1..tmp4: four 256-bit temporary registers
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, r4, r5, r6, r7, tmp1, tmp2, tmp3, tmp4) \
	; \ // [r0, r1, r2, r3] => [tmp1, tmp2, tmp4, tmp3]
	VPUNPCKHDQ r1, r0, tmp4;    \ // tmp4 = [w15, w7, w14, w6, w11, w3, w10, w2]
	VPUNPCKLDQ r1, r0, r0;      \ // r0 =   [w13, w5, w12, w4, w9, w1, w8, w0]
	VPUNPCKLDQ r3, r2, tmp3;    \ // tmp3 = [w29, w21, w28, w20, w25, w17, w24, w16]
	VPUNPCKHDQ r3, r2, r2;      \ // r2 =   [w31, w23, w30, w22, w27, w19, w26, w18]
	VPUNPCKHQDQ tmp3, r0, tmp2; \ // tmp2 = [w29, w21, w13, w5, w25, w17, w9, w1]
	VPUNPCKLQDQ tmp3, r0, tmp1; \ // tmp1 = [w28, w20, w12, w4, w24, w16, w8, w0]
	VPUNPCKHQDQ r2, tmp4, tmp3; \ // tmp3 = [w31, w23, w15, w7, w27, w19, w11, w3]
	VPUNPCKLQDQ r2, tmp4, tmp4; \ // tmp4 = [w30, w22, w14, w6, w26, w18, w10, w2]
	; \ // [r4, r5, r6, r7] => [r4, r5, r6, r7]
	VPUNPCKHDQ r5, r4, r1;      \ // r1 = [w47, w39, w46, w38, w43, w35, w42, w34]
	VPUNPCKLDQ r5, r4, r4;      \ // r4 = [w45, w37, w44, w36, w41, w33, w40, w32]
	VPUNPCKLDQ r7, r6, r0;      \ // r0 = [w61, w53, w60, w52, w57, w49, w56, w48]
	VPUNPCKHDQ r7, r6, r6;      \ // r6 = [w63, w55, w62, w54, w59, w51, w58, w50]
	VPUNPCKHQDQ r0, r4, r5;     \ // r5 = [w61, w53, w45, w37, w57, w49, w41, w33]
	VPUNPCKLQDQ r0, r4, r4;     \ // r4 = [w60, w52, w44, w36, w56, w48, w40, w32]
	VPUNPCKHQDQ r6, r1, r7;     \ // r7 = [w63, w55, w47, w39, w59, w51, w43, w35]
	VPUNPCKLQDQ r6, r1, r6;     \ // r6 = [w62, w54, w46, w38, w58, w50, w42, w34]
	; \ // [tmp1, tmp2, tmp4, tmp3], [r4, r5, r6, r7] => [r0, r1, r2, r3, r4, r5, r6, r7]
	VPERM2I128 $0x20, r4, tmp1, r0; \ // r0 = [w56, w48, w40, w32, w24, w16, w8, w0]
	VPERM2I128 $0x20, r5, tmp2, r1; \ // r1 = [w57, w49, w41, w33, w25, w17, w9, w1]
	VPERM2I128 $0x20, r6, tmp4, r2; \ // r2 = [w58, w50, w42, w34, w26, w18, w10, w2]
	VPERM2I128 $0x20, r7, tmp3, r3; \ // r3 = [w59, w51, w43, w35, w27, w19, w11, w3]
	VPERM2I128 $0x31, r4, tmp1, r4; \ // r4 = [w60, w52, w44, w36, w28, w20, w12, w4]
	VPERM2I128 $0x31, r5, tmp2, r5; \ // r5 = [w61, w53, w45, w37, w29, w21, w13, w5]
	VPERM2I128 $0x31, r6, tmp4, r6; \ // r6 = [w62, w54, w46, w38, w30, w22, w14, w6]
	VPERM2I128 $0x31, r7, tmp3, r7; \ // r7 = [w63, w55, w47, w39, w31, w23, w15, w7]
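
// TRANSPOSE_MATRIX treats r0..r7 as an 8x8 matrix of uint32 and leaves its
// transpose in r0..r7: afterwards register i holds word i of all eight
// message blocks. A minimal Go sketch of the same data movement (an
// illustrative helper, not part of this package):
//
//	func transpose8x8(m *[8][8]uint32) {
//		for i := 0; i < 8; i++ {
//			for j := i + 1; j < 8; j++ {
//				m[i][j], m[j][i] = m[j][i], m[i][j]
//			}
//		}
//	}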

// store 256 bits into the message schedule area of the buffer
#define storeWord(W, j) VMOVDQU W, (256+(j)*32)(BX)
// load 256 bits from the message schedule area of the buffer
#define loadWord(W, i) VMOVDQU (256+(i)*32)(BX), W
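
// Buffer layout (BX): bytes 0..255 hold the eight transposed state
// registers (written by saveState, read by loadState); bytes 256 onwards
// hold the transposed message schedule, one 32-byte slot per word index,
// written by storeWord and read back by loadWord.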

#define REV32(a, b, c, d, e, f, g, h) \
	VPSHUFB flip_mask<>(SB), a, a; \
	VPSHUFB flip_mask<>(SB), b, b; \
	VPSHUFB flip_mask<>(SB), c, c; \
	VPSHUFB flip_mask<>(SB), d, d; \
	VPSHUFB flip_mask<>(SB), e, e; \
	VPSHUFB flip_mask<>(SB), f, f; \
	VPSHUFB flip_mask<>(SB), g, g; \
	VPSHUFB flip_mask<>(SB), h, h

#define prepare8Words(i) \
	VMOVDQU (i*32)(srcPtr1), a; \
	VMOVDQU (i*32)(srcPtr2), b; \
	VMOVDQU (i*32)(srcPtr3), c; \
	VMOVDQU (i*32)(srcPtr4), d; \
	VMOVDQU (i*32)(srcPtr5), e; \
	VMOVDQU (i*32)(srcPtr6), f; \
	VMOVDQU (i*32)(srcPtr7), g; \
	VMOVDQU (i*32)(srcPtr8), h; \
	; \
	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4); \
	REV32(a, b, c, d, e, f, g, h); \
	; \
	storeWord(a, 8*i+0); \
	storeWord(b, 8*i+1); \
	storeWord(c, 8*i+2); \
	storeWord(d, 8*i+3); \
	storeWord(e, 8*i+4); \
	storeWord(f, 8*i+5); \
	storeWord(g, 8*i+6); \
	storeWord(h, 8*i+7)

#define saveState(R) \
	VMOVDQU a, (0*32)(R); \
	VMOVDQU b, (1*32)(R); \
	VMOVDQU c, (2*32)(R); \
	VMOVDQU d, (3*32)(R); \
	VMOVDQU e, (4*32)(R); \
	VMOVDQU f, (5*32)(R); \
	VMOVDQU g, (6*32)(R); \
	VMOVDQU h, (7*32)(R)

#define loadState(R) \
	VMOVDQU (0*32)(R), a; \
	VMOVDQU (1*32)(R), b; \
	VMOVDQU (2*32)(R), c; \
	VMOVDQU (3*32)(R), d; \
	VMOVDQU (4*32)(R), e; \
	VMOVDQU (5*32)(R), f; \
	VMOVDQU (6*32)(R), g; \
	VMOVDQU (7*32)(R), h

// r <<< n
#define VPROLD(r, n) \
	VPSLLD $(n), r, TMP1; \
	VPSRLD $(32-n), r, r; \
	VPOR TMP1, r, r

// d = r <<< n
#define VPROLD2(r, d, n) \
	VPSLLD $(n), r, TMP1; \
	VPSRLD $(32-n), r, d; \
	VPOR TMP1, d, d
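
// VPROLD/VPROLD2 emulate a per-lane 32-bit rotate-left; AVX2 has no rotate
// instruction (a native VPROLD only appears in AVX-512). Per 32-bit word
// this is the usual shift-or idiom, e.g. in Go:
//
//	import "math/bits"
//
//	func rotl(x uint32, n int) uint32 {
//		return bits.RotateLeft32(x, n) // (x << n) | (x >> (32 - n))
//	}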

// broadcast the round constant at _K[index] to all eight lanes; judging by
// how SS1SS2 consumes it, the table holds the pre-rotated values Tj <<< j
#define LOAD_T(index, T) \
	VPBROADCASTD (index*4)(AX), T

// DST = X XOR Y XOR Z
#define FF0(X, Y, Z, DST) \
	VPXOR X, Y, DST; \
	VPXOR Z, DST, DST

// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define FF1(X, Y, Z, TMP, DST) \
	VPOR X, Y, DST; \
	VPAND X, Y, TMP; \
	VPAND Z, DST, DST; \
	VPOR TMP, DST, DST

// DST = X XOR Y XOR Z
#define GG0(X, Y, Z, DST) \
	FF0(X, Y, Z, DST)

// DST = ((Y XOR Z) AND X) XOR Z, i.e. (X AND Y) OR (NOT X AND Z)
#define GG1(X, Y, Z, DST) \
	VPXOR Y, Z, DST; \
	VPAND X, DST, DST; \
	VPXOR Z, DST, DST
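
// Reference semantics of the SM3 boolean functions in Go form (helper
// names are illustrative). FF1 is majority, computed above in four
// instructions as (X AND Y) OR (Z AND (X OR Y)); GG1 uses the select
// identity ((y^z)&x)^z == (x&y)|(^x&z):
//
//	func ff1(x, y, z uint32) uint32 { return (x & y) | (x & z) | (y & z) }
//	func gg1(x, y, z uint32) uint32 { return ((y ^ z) & x) ^ z }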

#define SS1SS2(index, a, e, SS1, SS2) \
	VPROLD2(a, SS2, 12); \ // SS2 = a <<< 12
	LOAD_T(index, SS1);   \ // SS1 = T
	VPADDD SS1, SS2, SS1; \
	VPADDD e, SS1, SS1; \
	VPROLD(SS1, 7); \ // SS1 = ((a <<< 12) + e + T) <<< 7
	VPXOR SS1, SS2, SS2 // SS2 = SS1 XOR (a <<< 12)
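
// In Go form, with T[j] the pre-rotated constant loaded by LOAD_T and
// rotl as sketched above:
//
//	ss1 := rotl(rotl(a, 12)+e+T[j], 7)
//	ss2 := ss1 ^ rotl(a, 12)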

#define COPY_RESULT(b, d, f, h, TT1, TT2) \
	VPROLD(b, 9); \ // b = b <<< 9
	VMOVDQU TT1, h; \ // h = TT1
	VPROLD(f, 19); \ // f = f <<< 19
	VPROLD2(TT2, TT1, 9); \ // TT1 = TT2 <<< 9
	VPXOR TT2, TT1, TT2; \ // TT2 = TT2 XOR (TT2 <<< 9)
	VPSHUFB r08_mask<>(SB), TT1, TT1; \ // TT1 = TT2 <<< 17
	VPXOR TT1, TT2, d // d = P0(TT2)
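
// COPY_RESULT performs the end-of-round state update. Instead of shifting
// data between registers, the round macros below rotate the register
// roles from round to round, so only four values change here. In Go form
// (p0 as in the SM3 spec):
//
//	func p0(x uint32) uint32 { return x ^ rotl(x, 9) ^ rotl(x, 17) }
//
//	b = rotl(b, 9)  // becomes next round's c
//	f = rotl(f, 19) // becomes next round's g
//	h = tt1         // becomes next round's a
//	d = p0(tt2)     // becomes next round's e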

#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
	SS1SS2(index, a, e, Y12, Y13); \
	; \
	FF0(a, b, c, Y14); \
	VPADDD d, Y14, Y14; \ // (a XOR b XOR c) + d
	loadWord(Y10, index); \
	loadWord(Y11, index+4); \
	VPXOR Y10, Y11, Y11; \ // Wt XOR Wt+4
	VPADDD Y11, Y14, Y14; \ // (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VPADDD Y14, Y13, Y13; \ // TT1
	VPADDD h, Y10, Y10; \ // Wt + h
	VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
	GG0(e, f, g, Y11); \
	VPADDD Y11, Y10, Y10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	; \ // copy result
	COPY_RESULT(b, d, f, h, Y13, Y10)
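
// One of rounds 0..15 in Go form (w is one lane's message schedule; the
// eight lanes run in parallel, one per 32-bit element of each YMM
// register):
//
//	tt1 := (a ^ b ^ c) + d + ss2 + (w[j] ^ w[j+4])
//	tt2 := (e ^ f ^ g) + h + ss1 + w[j]
//	// then the COPY_RESULT update above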

#define MESSAGE_SCHEDULE(index) \
	loadWord(Y10, index+1); \ // W[j-3], with j = index+4
	VPROLD(Y10, 15); \
	VPXOR (256+(index-12)*32)(BX), Y10, Y10; \ // XOR W[j-16]
	VPXOR (256+(index-5)*32)(BX), Y10, Y10; \ // XOR W[j-9]
	; \ // P1
	VPROLD2(Y10, Y11, 15); \
	VPXOR Y11, Y10, Y10; \
	VPSHUFB r08_mask<>(SB), Y11, Y11; \ // <<< 23 in total
	VPXOR Y11, Y10, Y10; \ // P1
	loadWord(Y11, index-9); \ // W[j-13]
	VPROLD(Y11, 7); \
	VPXOR Y11, Y10, Y10; \
	VPXOR (256+(index-2)*32)(BX), Y10, Y11; \ // XOR W[j-6]
	storeWord(Y11, index+4) // W[j]
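
// The expansion above matches the SM3 message schedule; in Go form, with
// j = index+4 and p1 as in the spec:
//
//	func p1(x uint32) uint32 { return x ^ rotl(x, 15) ^ rotl(x, 23) }
//
//	w[j] = p1(w[j-16]^w[j-9]^rotl(w[j-3], 15)) ^ rotl(w[j-13], 7) ^ w[j-6]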

#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \
	ROUND_00_11(index, a, b, c, d, e, f, g, h)

#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \ // Y11 now holds Wt+4; do not clobber it
	SS1SS2(index, a, e, Y12, Y13); \
	; \
	FF1(a, b, c, Y10, Y14); \ // (a AND b) OR (a AND c) OR (b AND c)
	VPADDD d, Y14, Y14; \ // FF1(a, b, c) + d
	loadWord(Y10, index); \
	VPXOR Y10, Y11, Y11; \ // Wt XOR Wt+4
	VPADDD Y11, Y14, Y14; \ // FF1(a, b, c) + d + (Wt XOR Wt+4)
	VPADDD Y14, Y13, Y13; \ // TT1
	; \
	VPADDD h, Y10, Y10; \ // Wt + h
	VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
	GG1(e, f, g, Y11); \
	VPADDD Y11, Y10, Y10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
	; \ // copy result
	COPY_RESULT(b, d, f, h, Y13, Y10)
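
// Rounds 16..63 differ from rounds 0..15 only in the boolean functions
// (FF1/GG1 instead of plain XOR) and in extending the message schedule
// first so that Wt+4 exists. In Go form:
//
//	tt1 := ff1(a, b, c) + d + ss2 + (w[j] ^ w[j+4])
//	tt2 := gg1(e, f, g) + h + ss1 + w[j]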

// func transposeMatrix8x8(dig **[8]uint32)
TEXT ·transposeMatrix8x8(SB),NOSPLIT,$0
	MOVQ	dig+0(FP), DI

	// load state
	MOVQ (DI), R8
	VMOVDQU (R8), a
	MOVQ 8(DI), R8
	VMOVDQU (R8), b
	MOVQ 16(DI), R8
	VMOVDQU (R8), c
	MOVQ 24(DI), R8
	VMOVDQU (R8), d
	MOVQ 32(DI), R8
	VMOVDQU (R8), e
	MOVQ 40(DI), R8
	VMOVDQU (R8), f
	MOVQ 48(DI), R8
	VMOVDQU (R8), g
	MOVQ 56(DI), R8
	VMOVDQU (R8), h

	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)

	// save state
	MOVQ (DI), R8
	VMOVDQU a, (R8)
	MOVQ 8(DI), R8
	VMOVDQU b, (R8)
	MOVQ 16(DI), R8
	VMOVDQU c, (R8)
	MOVQ 24(DI), R8
	VMOVDQU d, (R8)
	MOVQ 32(DI), R8
	VMOVDQU e, (R8)
	MOVQ 40(DI), R8
	VMOVDQU f, (R8)
	MOVQ 48(DI), R8
	VMOVDQU g, (R8)
	MOVQ 56(DI), R8
	VMOVDQU h, (R8)

	VZEROUPPER

	RET

// func blockMultBy8(dig **[8]uint32, p **byte, buffer *byte, blocks int)
TEXT ·blockMultBy8(SB),NOSPLIT,$0
	MOVQ	dig+0(FP), DI
	MOVQ	p+8(FP), SI
	MOVQ	buffer+16(FP), BX
	MOVQ	blocks+24(FP), DX

	// load state
	MOVQ (DI), R8
	VMOVDQU (R8), a
	MOVQ 8(DI), R8
	VMOVDQU (R8), b
	MOVQ 16(DI), R8
	VMOVDQU (R8), c
	MOVQ 24(DI), R8
	VMOVDQU (R8), d
	MOVQ 32(DI), R8
	VMOVDQU (R8), e
	MOVQ 40(DI), R8
	VMOVDQU (R8), f
	MOVQ 48(DI), R8
	VMOVDQU (R8), g
	MOVQ 56(DI), R8
	VMOVDQU (R8), h

	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)

	saveState(BX)

	MOVQ $·_K+0(SB), AX
	MOVQ (0*8)(SI), srcPtr1
	MOVQ (1*8)(SI), srcPtr2
	MOVQ (2*8)(SI), srcPtr3
	MOVQ (3*8)(SI), srcPtr4
	MOVQ (4*8)(SI), srcPtr5
	MOVQ (5*8)(SI), srcPtr6
	MOVQ (6*8)(SI), srcPtr7
	MOVQ (7*8)(SI), srcPtr8
loop:
	prepare8Words(0)
	prepare8Words(1)

	// reload the state: prepare8Words clobbers the YMM state registers
	loadState(BX)

	ROUND_00_11(0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, b, c, d, e, f, g, h, a)
	ROUND_16_63(48, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, b, c, d, e, f, g, h, a)

	// feed the compressed block back into the state: V(i+1) = V(i) XOR ABCDEFGH
	VPXOR (0*32)(BX), a, a
	VPXOR (1*32)(BX), b, b
	VPXOR (2*32)(BX), c, c
	VPXOR (3*32)(BX), d, d
	VPXOR (4*32)(BX), e, e
	VPXOR (5*32)(BX), f, f
	VPXOR (6*32)(BX), g, g
	VPXOR (7*32)(BX), h, h

	DECQ DX
	JZ end

	saveState(BX)
	LEAQ 64(srcPtr1), srcPtr1
	LEAQ 64(srcPtr2), srcPtr2
	LEAQ 64(srcPtr3), srcPtr3
	LEAQ 64(srcPtr4), srcPtr4
	LEAQ 64(srcPtr5), srcPtr5
	LEAQ 64(srcPtr6), srcPtr6
	LEAQ 64(srcPtr7), srcPtr7
	LEAQ 64(srcPtr8), srcPtr8

	JMP loop

end:
	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)

	// save state
	MOVQ (DI), R8
	VMOVDQU a, (R8)
	MOVQ 8(DI), R8
	VMOVDQU b, (R8)
	MOVQ 16(DI), R8
	VMOVDQU c, (R8)
	MOVQ 24(DI), R8
	VMOVDQU d, (R8)
	MOVQ 32(DI), R8
	VMOVDQU e, (R8)
	MOVQ 40(DI), R8
	VMOVDQU f, (R8)
	MOVQ 48(DI), R8
	VMOVDQU g, (R8)
	MOVQ 56(DI), R8
	VMOVDQU h, (R8)

	VZEROUPPER
	RET

// func copyResultsBy8(dig *uint32, dst *byte)
TEXT ·copyResultsBy8(SB),NOSPLIT,$0
	MOVQ	dig+0(FP), DI
	MOVQ	dst+8(FP), SI

	loadState(DI)
	REV32(a, b, c, d, e, f, g, h)
	saveState(SI)

	VZEROUPPER
	RET