// Source: github.com/devops-filetransfer/sshego@v7.0.4+incompatible/_vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.7,amd64,!gccgo,!appengine

#include "textflag.h"

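// This file implements the BLAKE2b compression function for amd64, with an
// AVX2 path (hashBlocksAVX2) and an AVX path (hashBlocksAVX). For reference,
// a minimal pure-Go sketch of the BLAKE2b G mixing function that both paths
// evaluate four at a time (illustrative only, not part of this package;
// uses math/bits):
//
//	func g(v *[16]uint64, a, b, c, d int, x, y uint64) {
//		v[a] += v[b] + x
//		v[d] = bits.RotateLeft64(v[d]^v[a], -32)
//		v[c] += v[d]
//		v[b] = bits.RotateLeft64(v[b]^v[c], -24)
//		v[a] += v[b] + y
//		v[d] = bits.RotateLeft64(v[d]^v[a], -16)
//		v[c] += v[d]
//		v[b] = bits.RotateLeft64(v[b]^v[c], -63)
//	}
//
// The rotation amounts 32, 24, 16 and 63 correspond to the VPSHUFD,
// VPSHUFB (c40/c48) and VPADDQ/VPSRLQ/VPXOR tricks used below.
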
// BLAKE2b IV (the SHA-512 initialization constants), split across two
// 256-bit vectors: iv0 holds h0..h3, iv1 holds h4..h7.
DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32

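// ·AVX2_c40 and ·AVX2_c48 are VPSHUFB byte-permutation masks. Shuffling the
// bytes of each 64-bit lane by (3,4,5,6,7,0,1,2) rotates the lane right by
// 24 bits, and by (2,3,4,5,6,7,0,1) right by 16 bits, so those two BLAKE2b
// rotations cost a single instruction each. The rotation by 32 is done with
// VPSHUFD and the rotation by 63 with VPADDQ/VPSRLQ/VPXOR.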
DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32

DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16

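// The Go assembler of the go1.7/go1.8 era did not accept the AVX2 VPERMQ
// mnemonic, so the macros below hand-encode it as raw VEX bytes. The imm8
// selects a 64-bit-lane permutation of a Y register: 0x39 rotates the four
// lanes right by one, 0x4E swaps the two 128-bit halves, and 0x93 rotates
// the lanes left by one.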
#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39

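// ROUND_AVX2 performs one full BLAKE2b round on the 4x4 state of 64-bit
// words held row-wise in Y0..Y3 (rows a, b, c, d): four G functions on the
// columns, then four on the diagonals. The VPERMQ shuffles rotate the b, c
// and d rows to bring the diagonals into column position and back. The
// rotation right by 63 is computed as a rotation left by 1:
// t = x+x (= x<<1), then x = (x>>63) ^ t.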
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
	VPADDQ  m0, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPSHUFB c40, Y1, Y1;  \
	VPADDQ  m1, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFB c48, Y3, Y3;  \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPADDQ  Y1, Y1, t;    \
	VPSRLQ  $63, Y1, Y1;  \
	VPXOR   t, Y1, Y1;    \
	VPERMQ_0x39_Y1_Y1;    \
	VPERMQ_0x4E_Y2_Y2;    \
	VPERMQ_0x93_Y3_Y3;    \
	VPADDQ  m2, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPSHUFB c40, Y1, Y1;  \
	VPADDQ  m3, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFB c48, Y3, Y3;  \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPADDQ  Y1, Y1, t;    \
	VPSRLQ  $63, Y1, Y1;  \
	VPXOR   t, Y1, Y1;    \
	VPERMQ_0x39_Y3_Y3;    \
	VPERMQ_0x4E_Y2_Y2;    \
	VPERMQ_0x93_Y1_Y1

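// The load macros below are also hand-encoded VEX instructions:
// VMOVQ_SI_Xk(n) encodes VMOVQ n(SI), Xk, and VPINSRQ_1_SI_Xk(n) encodes
// VPINSRQ $1, n(SI), Xk, Xk, both taking the offset from the message
// pointer SI as an 8-bit displacement. The *_0 variants encode the
// displacement-free addressing form for offset 0.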
#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E

#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n

#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01

#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01

#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01

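// Each LOAD_MSG_AVX2_Y* macro gathers four 64-bit message words from the
// block at SI into one Y register: each 128-bit half is built with
// VMOVQ + VPINSRQ, and the halves are joined with VINSERTI128.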
// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0; index 0 is handled by inlining the
// dedicated *_0 load macros instead (see the permutation macros below).
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
	VMOVQ_SI_X12(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X12(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y12, Y12

// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
	VMOVQ_SI_X13(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X13(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y13, Y13

// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
	VMOVQ_SI_X14(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X14(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y14, Y14

// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
	VMOVQ_SI_X15(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X15(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y15, Y15

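// The ten LOAD_MSG_AVX2_<i0>_<i1>_..._<i15> macros below hard-code the
// BLAKE2b message schedule (the sigma permutations): each round consumes
// the sixteen message words in the order given by the macro name. Loads of
// word 0 are inlined with the *_0 macros or done via VPSHUFD/VMOVDQU.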
#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
	VMOVQ_SI_X12_0;                   \
	VMOVQ_SI_X11(4*8);                \
	VPINSRQ_1_SI_X12(2*8);            \
	VPINSRQ_1_SI_X11(6*8);            \
	VINSERTI128 $1, X11, Y12, Y12;    \
	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)

#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
	VMOVQ_SI_X11(11*8);              \
	VPSHUFD     $0x4E, 0*8(SI), X14; \
	VPINSRQ_1_SI_X11(5*8);           \
	VINSERTI128 $1, X11, Y14, Y14;   \
	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)

#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
	VMOVQ_SI_X11(5*8);              \
	VMOVDQU     11*8(SI), X12;      \
	VPINSRQ_1_SI_X11(15*8);         \
	VINSERTI128 $1, X11, Y12, Y12;  \
	VMOVQ_SI_X13(8*8);              \
	VMOVQ_SI_X11(2*8);              \
	VPINSRQ_1_SI_X13_0;             \
	VPINSRQ_1_SI_X11(13*8);         \
	VINSERTI128 $1, X11, Y13, Y13;  \
	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)

#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
	VMOVQ_SI_X15(6*8);               \
	VMOVQ_SI_X11_0;                  \
	VPINSRQ_1_SI_X15(10*8);          \
	VPINSRQ_1_SI_X11(8*8);           \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
	VMOVQ_SI_X13_0;                  \
	VMOVQ_SI_X11(4*8);               \
	VPINSRQ_1_SI_X13(7*8);           \
	VPINSRQ_1_SI_X11(15*8);          \
	VINSERTI128 $1, X11, Y13, Y13;   \
	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)

#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
	VMOVQ_SI_X12(2*8);                \
	VMOVQ_SI_X11_0;                   \
	VPINSRQ_1_SI_X12(6*8);            \
	VPINSRQ_1_SI_X11(8*8);            \
	VINSERTI128 $1, X11, Y12, Y12;    \
	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)

#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
	VMOVQ_SI_X14_0;                   \
	VPSHUFD     $0x4E, 8*8(SI), X11;  \
	VPINSRQ_1_SI_X14(6*8);            \
	VINSERTI128 $1, X11, Y14, Y14;    \
	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)

#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
	VMOVQ_SI_X15_0;                  \
	VMOVQ_SI_X11(6*8);               \
	VPINSRQ_1_SI_X15(4*8);           \
	VPINSRQ_1_SI_X11(10*8);          \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
	VMOVQ_SI_X12(6*8);              \
	VMOVQ_SI_X11(11*8);             \
	VPINSRQ_1_SI_X12(14*8);         \
	VPINSRQ_1_SI_X11_0;             \
	VINSERTI128 $1, X11, Y12, Y12;  \
	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
	VMOVQ_SI_X11(1*8);              \
	VMOVDQU     12*8(SI), X14;      \
	VPINSRQ_1_SI_X11(10*8);         \
	VINSERTI128 $1, X11, Y14, Y14;  \
	VMOVQ_SI_X15(2*8);              \
	VMOVDQU     4*8(SI), X11;       \
	VPINSRQ_1_SI_X15(7*8);          \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
	VMOVQ_SI_X13(2*8);               \
	VPSHUFD     $0x4E, 5*8(SI), X11; \
	VPINSRQ_1_SI_X13(4*8);           \
	VINSERTI128 $1, X11, Y13, Y13;   \
	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
	VMOVQ_SI_X15(11*8);              \
	VMOVQ_SI_X11(12*8);              \
	VPINSRQ_1_SI_X15(14*8);          \
	VPINSRQ_1_SI_X11_0;              \
	VINSERTI128 $1, X11, Y15, Y15

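// Stack layout of the 320-byte frame (after aligning SP to 32 bytes):
//   0(SP)  128-bit block counter t0 || t1
//   16(SP) final-block flag f0; 24(SP) zero (f1)
//   32(SP)..287(SP) the message vectors of the first two rounds, spilled so
//   that rounds 11 and 12 (which reuse sigma permutations 0 and 1) can
//   replay them from memory instead of regathering.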
// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // 4 = NOSPLIT; frame size = 288 + 32 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	// Save SP in DX and round SP up to a 32-byte boundary so the
	// VMOVDQA spills below are legal.
	MOVQ SP, DX
	MOVQ SP, R9
	ADDQ $31, R9
	ANDQ $~31, R9
	MOVQ R9, SP

	MOVQ CX, 16(SP)
	XORQ CX, CX
	MOVQ CX, 24(SP)

	VMOVDQU ·AVX2_c40<>(SB), Y4
	VMOVDQU ·AVX2_c48<>(SB), Y5

	VMOVDQU 0(AX), Y8
	VMOVDQU 32(AX), Y9
	VMOVDQU ·AVX2_iv0<>(SB), Y6
	VMOVDQU ·AVX2_iv1<>(SB), Y7

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9
	MOVQ R9, 8(SP)

loop:
	// Add the block size to the 128-bit counter (R9:R8), carrying into
	// the high word if the low word wrapped.
	ADDQ $128, R8
	MOVQ R8, 0(SP)
	CMPQ R8, $128
	JGE  noinc
	INCQ R9
	MOVQ R9, 8(SP)

noinc:
	// Initialize the working state: rows a and b from h, rows c and d
	// from the IV, with row d xored with the counter and flag words.
	VMOVDQA Y8, Y0
	VMOVDQA Y9, Y1
	VMOVDQA Y6, Y2
	VPXOR   0(SP), Y7, Y3

	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
	VMOVDQA Y12, 32(SP)
	VMOVDQA Y13, 64(SP)
	VMOVDQA Y14, 96(SP)
	VMOVDQA Y15, 128(SP)
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
	VMOVDQA Y12, 160(SP)
	VMOVDQA Y13, 192(SP)
	VMOVDQA Y14, 224(SP)
	VMOVDQA Y15, 256(SP)

	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	// Rounds 11 and 12 replay the spilled message vectors of rounds 1 and 2.
	ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
	ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)

	// Feed forward: h[i] ^= v[i] ^ v[i+8].
	VPXOR Y0, Y8, Y8
	VPXOR Y1, Y9, Y9
	VPXOR Y2, Y8, Y8
	VPXOR Y3, Y9, Y9

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	VMOVDQU Y8, 0(AX)
	VMOVDQU Y9, 32(AX)
	VZEROUPPER

	MOVQ DX, SP
	RET


#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE

#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF

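// In the 128-bit AVX code path each 4-word state row is split across two X
// registers (X0/X1 = row a, X2/X3 = b, X4/X5 = c, X6/X7 = d). SHUFFLE_AVX
// rotates the b, c and d rows with VPUNPCK{L,H}QDQ pairs to line the
// diagonals up as columns (the XMM counterpart of the VPERMQ shuffles
// above), and SHUFFLE_AVX_INV undoes it.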
#define SHUFFLE_AVX() \
	VMOVDQA X6, X13;         \
	VMOVDQA X2, X14;         \
	VMOVDQA X4, X6;          \
	VPUNPCKLQDQ_X13_X13_X15; \
	VMOVDQA X5, X4;          \
	VMOVDQA X6, X5;          \
	VPUNPCKHQDQ_X15_X7_X6;   \
	VPUNPCKLQDQ_X7_X7_X15;   \
	VPUNPCKHQDQ_X15_X13_X7;  \
	VPUNPCKLQDQ_X3_X3_X15;   \
	VPUNPCKHQDQ_X15_X2_X2;   \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X3_X3;   \

#define SHUFFLE_AVX_INV() \
	VMOVDQA X2, X13;         \
	VMOVDQA X4, X14;         \
	VPUNPCKLQDQ_X2_X2_X15;   \
	VMOVDQA X5, X4;          \
	VPUNPCKHQDQ_X15_X3_X2;   \
	VMOVDQA X14, X5;         \
	VPUNPCKLQDQ_X3_X3_X15;   \
	VMOVDQA X6, X14;         \
	VPUNPCKHQDQ_X15_X13_X3;  \
	VPUNPCKLQDQ_X7_X7_X15;   \
	VPUNPCKHQDQ_X15_X6_X6;   \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X7_X7;   \

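// HALF_ROUND_AVX runs four G functions in parallel on the state spread over
// X0..X7: one round of the reference implementation is HALF_ROUND +
// SHUFFLE_AVX (columns, then diagonalize) followed by HALF_ROUND +
// SHUFFLE_AVX_INV. t0 is a scratch register; c40 and c48 are the rotation
// masks, kept in X8 and X9 by the callers below.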
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	VPADDQ  m0, v0, v0;   \
	VPADDQ  v2, v0, v0;   \
	VPADDQ  m1, v1, v1;   \
	VPADDQ  v3, v1, v1;   \
	VPXOR   v0, v6, v6;   \
	VPXOR   v1, v7, v7;   \
	VPSHUFD $-79, v6, v6; \
	VPSHUFD $-79, v7, v7; \
	VPADDQ  v6, v4, v4;   \
	VPADDQ  v7, v5, v5;   \
	VPXOR   v4, v2, v2;   \
	VPXOR   v5, v3, v3;   \
	VPSHUFB c40, v2, v2;  \
	VPSHUFB c40, v3, v3;  \
	VPADDQ  m2, v0, v0;   \
	VPADDQ  v2, v0, v0;   \
	VPADDQ  m3, v1, v1;   \
	VPADDQ  v3, v1, v1;   \
	VPXOR   v0, v6, v6;   \
	VPXOR   v1, v7, v7;   \
	VPSHUFB c48, v6, v6;  \
	VPSHUFB c48, v7, v7;  \
	VPADDQ  v6, v4, v4;   \
	VPADDQ  v7, v5, v5;   \
	VPXOR   v4, v2, v2;   \
	VPXOR   v5, v3, v3;   \
	VPADDQ  v2, v2, t0;   \
	VPSRLQ  $63, v2, v2;  \
	VPXOR   t0, v2, v2;   \
	VPADDQ  v3, v3, t0;   \
	VPSRLQ  $63, v3, v3;  \
	VPXOR   t0, v3, v3

// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0..i7 must not be 0; the sigma permutations that contain index 0 get
// dedicated macros below using the *_0 forms or plain MOVQ/VMOVDQU loads.
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
	VMOVQ_SI_X12(i0*8);     \
	VMOVQ_SI_X13(i2*8);     \
	VMOVQ_SI_X14(i4*8);     \
	VMOVQ_SI_X15(i6*8);     \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X13(i3*8); \
	VPINSRQ_1_SI_X14(i5*8); \
	VPINSRQ_1_SI_X15(i7*8)

// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
	VMOVQ_SI_X12_0;        \
	VMOVQ_SI_X13(4*8);     \
	VMOVQ_SI_X14(1*8);     \
	VMOVQ_SI_X15(5*8);     \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X13(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(7*8)

// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
	VPSHUFD $0x4E, 0*8(SI), X12; \
	VMOVQ_SI_X13(11*8);          \
	VMOVQ_SI_X14(12*8);          \
	VMOVQ_SI_X15(7*8);           \
	VPINSRQ_1_SI_X13(5*8);       \
	VPINSRQ_1_SI_X14(2*8);       \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
	VMOVDQU 11*8(SI), X12;  \
	VMOVQ_SI_X13(5*8);      \
	VMOVQ_SI_X14(8*8);      \
	VMOVQ_SI_X15(2*8);      \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14_0;     \
	VPINSRQ_1_SI_X15(13*8)

// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
	VMOVQ_SI_X12(2*8);      \
	VMOVQ_SI_X13(4*8);      \
	VMOVQ_SI_X14(6*8);      \
	VMOVQ_SI_X15_0;         \
	VPINSRQ_1_SI_X12(5*8);  \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
	VMOVQ_SI_X12(9*8);      \
	VMOVQ_SI_X13(2*8);      \
	VMOVQ_SI_X14_0;         \
	VMOVQ_SI_X15(4*8);      \
	VPINSRQ_1_SI_X12(5*8);  \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8);  \
	VPINSRQ_1_SI_X15(15*8)

// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
	VMOVQ_SI_X12(2*8);      \
	VMOVQ_SI_X13_0;         \
	VMOVQ_SI_X14(12*8);     \
	VMOVQ_SI_X15(11*8);     \
	VPINSRQ_1_SI_X12(6*8);  \
	VPINSRQ_1_SI_X13(8*8);  \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
	MOVQ    0*8(SI), X12;        \
	VPSHUFD $0x4E, 8*8(SI), X13; \
	MOVQ    7*8(SI), X14;        \
	MOVQ    2*8(SI), X15;        \
	VPINSRQ_1_SI_X12(6*8);       \
	VPINSRQ_1_SI_X14(3*8);       \
	VPINSRQ_1_SI_X15(11*8)

// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
	MOVQ 6*8(SI), X12;      \
	MOVQ 11*8(SI), X13;     \
	MOVQ 15*8(SI), X14;     \
	MOVQ 3*8(SI), X15;      \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X13_0;     \
	VPINSRQ_1_SI_X14(9*8);  \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
	MOVQ 5*8(SI), X12;      \
	MOVQ 8*8(SI), X13;      \
	MOVQ 0*8(SI), X14;      \
	MOVQ 6*8(SI), X15;      \
	VPINSRQ_1_SI_X12(15*8); \
	VPINSRQ_1_SI_X13(2*8);  \
	VPINSRQ_1_SI_X14(4*8);  \
	VPINSRQ_1_SI_X15(10*8)

// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
	VMOVDQU 12*8(SI), X12;  \
	MOVQ    1*8(SI), X13;   \
	MOVQ    2*8(SI), X14;   \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8);  \
	VMOVDQU 4*8(SI), X15

// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
	MOVQ 15*8(SI), X12;     \
	MOVQ 3*8(SI), X13;      \
	MOVQ 11*8(SI), X14;     \
	MOVQ 12*8(SI), X15;     \
	VPINSRQ_1_SI_X12(9*8);  \
	VPINSRQ_1_SI_X13(13*8); \
	VPINSRQ_1_SI_X14(14*8); \
	VPINSRQ_1_SI_X15_0

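// Stack layout of the 288-byte frame (after aligning SP to 16 bytes):
//   0(SP)  ·AVX_iv3 ^ (flag || 0), the precomputed tail of the d row
//   16(SP)..271(SP) the message vectors of the first two rounds, spilled
//   for replay by rounds 11 and 12 (sigma permutations 0 and 1 again).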
// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX(SB), 4, $288-48 // 4 = NOSPLIT; frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	// Save SP in BP and round SP up to a 16-byte boundary for the
	// VMOVDQA spills below.
	MOVQ SP, BP
	MOVQ SP, R9
	ADDQ $15, R9
	ANDQ $~15, R9
	MOVQ R9, SP

	VMOVDQU ·AVX_c40<>(SB), X0
	VMOVDQU ·AVX_c48<>(SB), X1
	VMOVDQA X0, X8
	VMOVDQA X1, X9

	VMOVDQU ·AVX_iv3<>(SB), X0
	VMOVDQA X0, 0(SP)
	XORQ    CX, 0(SP)          // 0(SP) = ·AVX_iv3 ^ (CX || 0)

	VMOVDQU 0(AX), X10
	VMOVDQU 16(AX), X11
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

loop:
	// Add the block size to the 128-bit counter (R9:R8), carrying into
	// the high word if the low word wrapped.
	ADDQ $128, R8
	CMPQ R8, $128
	JGE  noinc
	INCQ R9

noinc:
	// X15 = (t0, t1), the updated counter.
	VMOVQ_R8_X15
	VPINSRQ_1_R9_X15

	VMOVDQA X10, X0
	VMOVDQA X11, X1
	VMOVDQU ·AVX_iv0<>(SB), X4
	VMOVDQU ·AVX_iv1<>(SB), X5
	VMOVDQU ·AVX_iv2<>(SB), X6

	VPXOR   X15, X6, X6
	VMOVDQA 0(SP), X7

	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
	VMOVDQA X12, 16(SP)
	VMOVDQA X13, 32(SP)
	VMOVDQA X14, 48(SP)
	VMOVDQA X15, 64(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
	VMOVDQA X12, 80(SP)
	VMOVDQA X13, 96(SP)
	VMOVDQA X14, 112(SP)
	VMOVDQA X15, 128(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
	VMOVDQA X12, 144(SP)
	VMOVDQA X13, 160(SP)
	VMOVDQA X14, 176(SP)
	VMOVDQA X15, 192(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
	VMOVDQA X12, 208(SP)
	VMOVDQA X13, 224(SP)
	VMOVDQA X14, 240(SP)
	VMOVDQA X15, 256(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	// Rounds 11 and 12 replay the spilled message vectors of rounds 1 and 2.
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X15, X8, X9)
	SHUFFLE_AVX_INV()

	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X15, X8, X9)
	SHUFFLE_AVX_INV()

	// Feed forward: h[i] ^= v[i] ^ v[i+8]. h0..h3 stay cached in X10/X11
	// across iterations; h4..h7 are reloaded into X14/X15 and stored back.
	VMOVDQU 32(AX), X14
	VMOVDQU 48(AX), X15
	VPXOR   X0, X10, X10
	VPXOR   X1, X11, X11
	VPXOR   X2, X14, X14
	VPXOR   X3, X15, X15
	VPXOR   X4, X10, X10
	VPXOR   X5, X11, X11
	VPXOR   X6, X14, X2
	VPXOR   X7, X15, X3
	VMOVDQU X2, 32(AX)
	VMOVDQU X3, 48(AX)

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	VMOVDQU X10, 0(AX)
	VMOVDQU X11, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)
	VZEROUPPER

	MOVQ BP, SP
	RET

// func supportsAVX2() bool
TEXT ·supportsAVX2(SB), 4, $0-1
	MOVQ runtime·support_avx2(SB), AX
	MOVB AX, ret+0(FP)
	RET

// func supportsAVX() bool
TEXT ·supportsAVX(SB), 4, $0-1
	MOVQ runtime·support_avx(SB), AX
	MOVB AX, ret+0(FP)
	RET
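
// supportsAVX2 and supportsAVX read CPU-feature flags that the go1.7/go1.8
// runtime exported as runtime·support_avx2 and runtime·support_avx (later
// toolchains replaced this with golang.org/x/sys/cpu). A hedged sketch of
// the Go-side declarations and dispatch this vendored package presumably
// pairs with these routines; the names useAVX2, useAVX and
// hashBlocksGeneric are assumptions based on the upstream x/crypto sources,
// not taken from this file:
//
//	//go:noescape
//	func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//
//	//go:noescape
//	func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//
//	var (
//		useAVX2 = supportsAVX2()
//		useAVX  = supportsAVX()
//	)
//
//	func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
//		switch {
//		case useAVX2:
//			hashBlocksAVX2(h, c, flag, blocks)
//		case useAVX:
//			hashBlocksAVX(h, c, flag, blocks)
//		default:
//			hashBlocksGeneric(h, c, flag, blocks)
//		}
//	}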