gitlab.com/yawning/chacha20.git@v0.0.0-20230427033715-7877545b1b37/internal/hardware/impl_amd64.s (about)

     1  // Copyright (C) 2019 Yawning Angel
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15  
    16  // +build !noasm
    17  
    18  #include "textflag.h"
    19  
    20  DATA ·chacha_constants<>+0x00(SB)/4, $0x61707865 // Bytes 0x00-0x0f: the ASCII string "expand 32-byte k" as four little-endian words; this one is "expa".
    21  DATA ·chacha_constants<>+0x04(SB)/4, $0x3320646E // "nd 3"
    22  DATA ·chacha_constants<>+0x08(SB)/4, $0x79622D32 // "2-by"
    23  DATA ·chacha_constants<>+0x0c(SB)/4, $0x6B206574 // "te k"
    24  DATA ·chacha_constants<>+0x10(SB)/8, $0x0504070601000302 // Bytes 0x10-0x1f: VPSHUFB byte-index mask (2,3,0,1 per dword), i.e. rotate every 32-bit lane by 16 bits.
    25  DATA ·chacha_constants<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
    26  DATA ·chacha_constants<>+0x20(SB)/8, $0x0605040702010003 // Bytes 0x20-0x2f: VPSHUFB byte-index mask (3,0,1,2 per dword), i.e. rotate every 32-bit lane left by 8 bits.
    27  DATA ·chacha_constants<>+0x28(SB)/8, $0x0E0D0C0F0A09080B
    28  GLOBL ·chacha_constants<>(SB), (NOPTR+RODATA), $48 // File-local symbol: 48 bytes, read-only, contains no pointers.
    29  
    30  // func blocksAVX2(s *[api.StateSize]uint32, in, out []byte)
        //
        // Produces ChaCha keystream (always 20 rounds) for len(in) bytes, starting
        // at the 64-bit block counter stored in s, and writes the result to out.
        // If the data pointer of in is non-zero the keystream is XORed with in;
        // if it is zero the raw keystream is written. len(out) is never read.
        // NOTE(review): presumably callers guarantee len(out) is at least len(in) - confirm.
        // Before returning, the last state row (including the advanced counter) is
        // written back to s[12:16]. The first state row is always taken from
        // chacha_constants, not from s[0:4].
        //
        // Frame: 576 bytes of locals, 56 bytes of arguments. BP is repurposed as a
        // 64-byte aligned base pointer into the locals:
        //     0(BP)- 63(BP)  input state rows 0-3; reused as the bounce buffer for
        //                    a trailing partial block
        //    96(BP)          spill slot for one vector register during the rounds
        //   128(BP), 160(BP) low / high 32-bit counter words, one per parallel block
        //   192(BP)-447(BP)  working words 8-15 parked while words 0-7 are output
        //   448(BP)-479(BP)  rotate-by-16 shuffle mask, stored twice (32 bytes)
        //   480(BP)-511(BP)  rotate-by-8 shuffle mask, stored twice (32 bytes)
    31  TEXT ·blocksAVX2(SB), NOSPLIT, $576-56
    32  	// This is Andrew Moon's AVX2 ChaCha implementation taken from
    33  	// supercop-20171218, with some minor changes, primarily calling
    34  	// convention and assembly dialect related.
    35  
    36  	// Align the stack on a 64 byte boundary.
    37  	MOVQ SP, BP
    38  	ADDQ $64, BP
    39  	ANDQ $-64, BP // BP = (SP + 64) rounded down to a multiple of 64, so BP lies in (SP, SP+64] and 511(BP) stays inside the 576-byte frame.
    40  
    41  	// Go calling convention -> SYSV AMD64 (and a fixup).
    42  	MOVQ s+0(FP), DI       // s (pointer to the state words) -> DI
    43  	ADDQ $16, DI           // Skip the ChaCha constants in the chachaState.
    44  	MOVQ in+8(FP), SI      // &in[0] -> SI
    45  	MOVQ out+32(FP), DX    // &out[0] -> DX
    46  	MOVQ in_len+16(FP), CX // len(in) -> CX
    47  
    48  	// Begin the main body of `chacha_blocks_avx2`.
    49  	//
    50  	// Mostly a direct translation except:
    51  	//  * The number of rounds is always 20.
    52  	//  * %rbp is used instead of %rsp.
    53  	LEAQ    ·chacha_constants<>(SB), AX
    54  	VMOVDQU 0(AX), X8 // X8 = state row 0 (the 16 constant bytes).
    55  	VMOVDQU 16(AX), X6 // X6 = VPSHUFB mask used for the 16-bit rotation.
    56  	VMOVDQU 32(AX), X7 // X7 = VPSHUFB mask used for the 8-bit rotation.
    57  	VMOVDQU 0(DI), X9 // X9 = state row 1, s[4:8].
    58  	VMOVDQU 16(DI), X10 // X10 = state row 2, s[8:12].
    59  	VMOVDQU 32(DI), X11 // X11 = state row 3, s[12:16]; its low 64 bits are treated as the block counter.
    60  
    61  	// MOVQ 48(DI), AX
    62  	MOVQ    $1, R9 // R9 = 1, the counter increment consumed by the one-block tail loop.
    63  	VMOVDQA X8, 0(BP) // Copy the four state rows to 0(BP)-63(BP).
    64  	VMOVDQA X9, 16(BP)
    65  	VMOVDQA X10, 32(BP)
    66  	VMOVDQA X11, 48(BP)
    67  
    68  	// MOVQ AX, 64(BP)
    69  	VMOVDQA X6, 448(BP) // Store each shuffle mask twice so it can serve as a 32-byte YMM memory operand.
    70  	VMOVDQA X6, 464(BP)
    71  	VMOVDQA X7, 480(BP)
    72  	VMOVDQA X7, 496(BP)
    73  	CMPQ    CX, $512 // Dispatch on the byte count: 8 blocks per pass, then 4, then 1.
    74  	JAE     chacha_blocks_avx2_atleast512
    75  	CMPQ    CX, $256
    76  	JAE     chacha_blocks_avx2_atleast256
    77  	JMP     chacha_blocks_avx2_below256
    78  
    79  chacha_blocks_avx2_atleast512: // Setup for one 8-block (512-byte) batch; entered while CX is at least 512.
    80  	MOVQ 48(BP), AX // AX = 64-bit block counter from the stack copy of state row 3.
    81  	LEAQ 1(AX), R8 // R8, R9, R10, BX, R11, R12, R13 = counter+1 .. counter+7 (this clobbers the R9 = 1 set in the prologue).
    82  	LEAQ 2(AX), R9
    83  	LEAQ 3(AX), R10
    84  	LEAQ 4(AX), BX
    85  	LEAQ 5(AX), R11
    86  	LEAQ 6(AX), R12
    87  	LEAQ 7(AX), R13
    88  	LEAQ 8(AX), R14 // R14 = counter+8, the counter of the first block after this batch.
    89  	MOVL AX, 128(BP) // 128(BP)-159(BP): low 32 bits of the eight per-block counters.
    90  	MOVL R8, 4+128(BP)
    91  	MOVL R9, 8+128(BP)
    92  	MOVL R10, 12+128(BP)
    93  	MOVL BX, 16+128(BP)
    94  	MOVL R11, 20+128(BP)
    95  	MOVL R12, 24+128(BP)
    96  	MOVL R13, 28+128(BP)
    97  	SHRQ $32, AX // Bring the high 32 bits of each counter down.
    98  	SHRQ $32, R8
    99  	SHRQ $32, R9
   100  	SHRQ $32, R10
   101  	SHRQ $32, BX
   102  	SHRQ $32, R11
   103  	SHRQ $32, R12
   104  	SHRQ $32, R13
   105  	MOVL AX, 160(BP) // 160(BP)-191(BP): high 32 bits of the eight per-block counters.
   106  	MOVL R8, 4+160(BP)
   107  	MOVL R9, 8+160(BP)
   108  	MOVL R10, 12+160(BP)
   109  	MOVL BX, 16+160(BP)
   110  	MOVL R11, 20+160(BP)
   111  	MOVL R12, 24+160(BP)
   112  	MOVL R13, 28+160(BP)
   113  	MOVQ R14, 48(BP) // Advance the stored counter by 8 blocks.
   114  
   115  	// MOVQ 64(BP), AX
   116  	MOVQ         $20, AX // AX = rounds remaining; mainloop1 subtracts 2 per pass.
   117  	VPBROADCASTD 0(BP), Y0 // Y0-Y3 = state words 0-3, each replicated into all eight 32-bit lanes.
   118  	VPBROADCASTD 4+0(BP), Y1
   119  	VPBROADCASTD 8+0(BP), Y2
   120  	VPBROADCASTD 12+0(BP), Y3
   121  	VPBROADCASTD 16(BP), Y4 // Y4-Y7 = state words 4-7.
   122  	VPBROADCASTD 4+16(BP), Y5
   123  	VPBROADCASTD 8+16(BP), Y6
   124  	VPBROADCASTD 12+16(BP), Y7
   125  	VPBROADCASTD 32(BP), Y8 // Y8-Y11 = state words 8-11.
   126  	VPBROADCASTD 4+32(BP), Y9
   127  	VPBROADCASTD 8+32(BP), Y10
   128  	VPBROADCASTD 12+32(BP), Y11
   129  	VPBROADCASTD 8+48(BP), Y14 // Y14, Y15 = state words 14 and 15.
   130  	VPBROADCASTD 12+48(BP), Y15
   131  	VMOVDQA      128(BP), Y12 // Y12 = word 12 (counter low half), a different value in each lane.
   132  	VMOVDQA      160(BP), Y13 // Y13 = word 13 (counter high half), a different value in each lane.
   133  
   134  chacha_blocks_avx2_mainloop1: // One pass = two ChaCha rounds (columns, then diagonals) on 8 blocks; Yn holds state word n, one block per 32-bit lane.
   135  	VPADDD       Y0, Y4, Y0 // Column round on (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15): a += b.
   136  	VPADDD       Y1, Y5, Y1
   137  	VPXOR        Y12, Y0, Y12 // d ^= a
   138  	VPXOR        Y13, Y1, Y13
   139  	VPADDD       Y2, Y6, Y2
   140  	VPADDD       Y3, Y7, Y3
   141  	VPXOR        Y14, Y2, Y14
   142  	VPXOR        Y15, Y3, Y15
   143  	VPSHUFB      448(BP), Y12, Y12 // d = rotl(d, 16) via the byte-shuffle mask at 448(BP).
   144  	VPSHUFB      448(BP), Y13, Y13
   145  	VPADDD       Y8, Y12, Y8 // c += d
   146  	VPADDD       Y9, Y13, Y9
   147  	VPSHUFB      448(BP), Y14, Y14
   148  	VPSHUFB      448(BP), Y15, Y15
   149  	VPADDD       Y10, Y14, Y10
   150  	VPADDD       Y11, Y15, Y11
   151  	VMOVDQA      Y12, 96(BP) // Spill Y12: it is reused as the shift temporary until it is reloaded below.
   152  	VPXOR        Y4, Y8, Y4 // b ^= c
   153  	VPXOR        Y5, Y9, Y5
   154  	VPSLLD       $ 12, Y4, Y12 // b = rotl(b, 12), built as (b shl 12) xor (b shr 20).
   155  	VPSRLD       $20, Y4, Y4
   156  	VPXOR        Y4, Y12, Y4
   157  	VPSLLD       $ 12, Y5, Y12
   158  	VPSRLD       $20, Y5, Y5
   159  	VPXOR        Y5, Y12, Y5
   160  	VPXOR        Y6, Y10, Y6
   161  	VPXOR        Y7, Y11, Y7
   162  	VPSLLD       $ 12, Y6, Y12
   163  	VPSRLD       $20, Y6, Y6
   164  	VPXOR        Y6, Y12, Y6
   165  	VPSLLD       $ 12, Y7, Y12
   166  	VPSRLD       $20, Y7, Y7
   167  	VPXOR        Y7, Y12, Y7
   168  	VPADDD       Y0, Y4, Y0 // Second half of the column quarter rounds: a += b.
   169  	VPADDD       Y1, Y5, Y1
   170  	VPXOR        96(BP), Y0, Y12 // d ^= a, reading the spilled word 12 back from 96(BP).
   171  	VPXOR        Y13, Y1, Y13
   172  	VPADDD       Y2, Y6, Y2
   173  	VPADDD       Y3, Y7, Y3
   174  	VPXOR        Y14, Y2, Y14
   175  	VPXOR        Y15, Y3, Y15
   176  	VPSHUFB      480(BP), Y12, Y12 // d = rotl(d, 8) via the byte-shuffle mask at 480(BP).
   177  	VPSHUFB      480(BP), Y13, Y13
   178  	VPADDD       Y8, Y12, Y8
   179  	VPADDD       Y9, Y13, Y9
   180  	VPSHUFB      480(BP), Y14, Y14
   181  	VPSHUFB      480(BP), Y15, Y15
   182  	VPADDD       Y10, Y14, Y10
   183  	VPADDD       Y11, Y15, Y11
   184  	VMOVDQA      Y12, 96(BP) // Spill Y12 again for the rotate-by-7 step.
   185  	VPXOR        Y4, Y8, Y4
   186  	VPXOR        Y5, Y9, Y5
   187  	VPSLLD       $ 7, Y4, Y12 // b = rotl(b, 7), built as (b shl 7) xor (b shr 25).
   188  	VPSRLD       $25, Y4, Y4
   189  	VPXOR        Y4, Y12, Y4
   190  	VPSLLD       $ 7, Y5, Y12
   191  	VPSRLD       $25, Y5, Y5
   192  	VPXOR        Y5, Y12, Y5
   193  	VPXOR        Y6, Y10, Y6
   194  	VPXOR        Y7, Y11, Y7
   195  	VPSLLD       $ 7, Y6, Y12
   196  	VPSRLD       $25, Y6, Y6
   197  	VPXOR        Y6, Y12, Y6
   198  	VPSLLD       $ 7, Y7, Y12
   199  	VPSRLD       $25, Y7, Y7
   200  	VPXOR        Y7, Y12, Y7
   201  	VPADDD       Y0, Y5, Y0 // Diagonal round on (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14): a += b.
   202  	VPADDD       Y1, Y6, Y1
   203  	VPXOR        Y15, Y0, Y15
   204  	VPXOR        96(BP), Y1, Y12 // Word 12 is still in the spill slot; reload it while applying d ^= a.
   205  	VPADDD       Y2, Y7, Y2
   206  	VPADDD       Y3, Y4, Y3
   207  	VPXOR        Y13, Y2, Y13
   208  	VPXOR        Y14, Y3, Y14
   209  	VPSHUFB      448(BP), Y15, Y15
   210  	VPSHUFB      448(BP), Y12, Y12
   211  	VPADDD       Y10, Y15, Y10
   212  	VPADDD       Y11, Y12, Y11
   213  	VPSHUFB      448(BP), Y13, Y13
   214  	VPSHUFB      448(BP), Y14, Y14
   215  	VPADDD       Y8, Y13, Y8
   216  	VPADDD       Y9, Y14, Y9
   217  	VMOVDQA      Y15, 96(BP) // Spill Y15: it is the shift temporary for the diagonal round.
   218  	VPXOR        Y5, Y10, Y5
   219  	VPXOR        Y6, Y11, Y6
   220  	VPSLLD       $ 12, Y5, Y15
   221  	VPSRLD       $20, Y5, Y5
   222  	VPXOR        Y5, Y15, Y5
   223  	VPSLLD       $ 12, Y6, Y15
   224  	VPSRLD       $20, Y6, Y6
   225  	VPXOR        Y6, Y15, Y6
   226  	VPXOR        Y7, Y8, Y7
   227  	VPXOR        Y4, Y9, Y4
   228  	VPSLLD       $ 12, Y7, Y15
   229  	VPSRLD       $20, Y7, Y7
   230  	VPXOR        Y7, Y15, Y7
   231  	VPSLLD       $ 12, Y4, Y15
   232  	VPSRLD       $20, Y4, Y4
   233  	VPXOR        Y4, Y15, Y4
   234  	VPADDD       Y0, Y5, Y0
   235  	VPADDD       Y1, Y6, Y1
   236  	VPXOR        96(BP), Y0, Y15 // d ^= a, reading the spilled word 15 back from 96(BP).
   237  	VPXOR        Y12, Y1, Y12
   238  	VPADDD       Y2, Y7, Y2
   239  	VPADDD       Y3, Y4, Y3
   240  	VPXOR        Y13, Y2, Y13
   241  	VPXOR        Y14, Y3, Y14
   242  	VPSHUFB      480(BP), Y15, Y15
   243  	VPSHUFB      480(BP), Y12, Y12
   244  	VPADDD       Y10, Y15, Y10
   245  	VPADDD       Y11, Y12, Y11
   246  	VPSHUFB      480(BP), Y13, Y13
   247  	VPSHUFB      480(BP), Y14, Y14
   248  	VPADDD       Y8, Y13, Y8
   249  	VPADDD       Y9, Y14, Y9
   250  	VMOVDQA      Y15, 96(BP)
   251  	VPXOR        Y5, Y10, Y5
   252  	VPXOR        Y6, Y11, Y6
   253  	VPSLLD       $ 7, Y5, Y15
   254  	VPSRLD       $25, Y5, Y5
   255  	VPXOR        Y5, Y15, Y5
   256  	VPSLLD       $ 7, Y6, Y15
   257  	VPSRLD       $25, Y6, Y6
   258  	VPXOR        Y6, Y15, Y6
   259  	VPXOR        Y7, Y8, Y7
   260  	VPXOR        Y4, Y9, Y4
   261  	VPSLLD       $ 7, Y7, Y15
   262  	VPSRLD       $25, Y7, Y7
   263  	VPXOR        Y7, Y15, Y7
   264  	VPSLLD       $ 7, Y4, Y15
   265  	VPSRLD       $25, Y4, Y4
   266  	VPXOR        Y4, Y15, Y4
   267  	VMOVDQA      96(BP), Y15 // Restore word 15 so all sixteen words are in registers at the loop edge.
   268  	SUBQ         $2, AX // Two rounds done.
   269  	JNZ          chacha_blocks_avx2_mainloop1
   270  	VMOVDQA      Y8, 192(BP) // Park working words 8-15 in 192(BP)-447(BP) to free Y8-Y15.
   271  	VMOVDQA      Y9, 224(BP)
   272  	VMOVDQA      Y10, 256(BP)
   273  	VMOVDQA      Y11, 288(BP)
   274  	VMOVDQA      Y12, 320(BP)
   275  	VMOVDQA      Y13, 352(BP)
   276  	VMOVDQA      Y14, 384(BP)
   277  	VMOVDQA      Y15, 416(BP)
   278  	VPBROADCASTD 0(BP), Y8 // Re-broadcast input words 0-7 into Y8-Y15.
   279  	VPBROADCASTD 4+0(BP), Y9
   280  	VPBROADCASTD 8+0(BP), Y10
   281  	VPBROADCASTD 12+0(BP), Y11
   282  	VPBROADCASTD 16(BP), Y12
   283  	VPBROADCASTD 4+16(BP), Y13
   284  	VPBROADCASTD 8+16(BP), Y14
   285  	VPBROADCASTD 12+16(BP), Y15
   286  	VPADDD       Y8, Y0, Y0 // Feed-forward: working words 0-7 += input words 0-7.
   287  	VPADDD       Y9, Y1, Y1
   288  	VPADDD       Y10, Y2, Y2
   289  	VPADDD       Y11, Y3, Y3
   290  	VPADDD       Y12, Y4, Y4
   291  	VPADDD       Y13, Y5, Y5
   292  	VPADDD       Y14, Y6, Y6
   293  	VPADDD       Y15, Y7, Y7
   294  	VPUNPCKLDQ   Y1, Y0, Y8 // Transpose from one-word-per-register to one-block-per-register: first interleave 32-bit elements,
   295  	VPUNPCKLDQ   Y3, Y2, Y9
   296  	VPUNPCKHDQ   Y1, Y0, Y12
   297  	VPUNPCKHDQ   Y3, Y2, Y13
   298  	VPUNPCKLDQ   Y5, Y4, Y10
   299  	VPUNPCKLDQ   Y7, Y6, Y11
   300  	VPUNPCKHDQ   Y5, Y4, Y14
   301  	VPUNPCKHDQ   Y7, Y6, Y15
   302  	VPUNPCKLQDQ  Y9, Y8, Y0 // then interleave 64-bit elements,
   303  	VPUNPCKLQDQ  Y11, Y10, Y1
   304  	VPUNPCKHQDQ  Y9, Y8, Y2
   305  	VPUNPCKHQDQ  Y11, Y10, Y3
   306  	VPUNPCKLQDQ  Y13, Y12, Y4
   307  	VPUNPCKLQDQ  Y15, Y14, Y5
   308  	VPUNPCKHQDQ  Y13, Y12, Y6
   309  	VPUNPCKHQDQ  Y15, Y14, Y7
   310  	VPERM2I128   $0x20, Y1, Y0, Y8 // then combine 128-bit halves. Y8-Y15 now each hold words 0-7 (bytes 0-31) of one block.
   311  	VPERM2I128   $0x20, Y3, Y2, Y9
   312  	VPERM2I128   $0x31, Y1, Y0, Y12
   313  	VPERM2I128   $0x31, Y3, Y2, Y13
   314  	VPERM2I128   $0x20, Y5, Y4, Y10
   315  	VPERM2I128   $0x20, Y7, Y6, Y11
   316  	VPERM2I128   $0x31, Y5, Y4, Y14
   317  	VPERM2I128   $0x31, Y7, Y6, Y15
   318  	ANDQ         SI, SI // Zero input pointer: emit raw keystream via the noinput1 path.
   319  	JZ           chacha_blocks_avx2_noinput1
   320  	VPXOR        0(SI), Y8, Y8 // XOR bytes 0-31 of each of the 8 blocks (64-byte stride) with the input.
   321  	VPXOR        64(SI), Y9, Y9
   322  	VPXOR        128(SI), Y10, Y10
   323  	VPXOR        192(SI), Y11, Y11
   324  	VPXOR        256(SI), Y12, Y12
   325  	VPXOR        320(SI), Y13, Y13
   326  	VPXOR        384(SI), Y14, Y14
   327  	VPXOR        448(SI), Y15, Y15
   328  	VMOVDQU      Y8, 0(DX) // Store bytes 0-31 of each output block.
   329  	VMOVDQU      Y9, 64(DX)
   330  	VMOVDQU      Y10, 128(DX)
   331  	VMOVDQU      Y11, 192(DX)
   332  	VMOVDQU      Y12, 256(DX)
   333  	VMOVDQU      Y13, 320(DX)
   334  	VMOVDQU      Y14, 384(DX)
   335  	VMOVDQU      Y15, 448(DX)
   336  	VMOVDQA      192(BP), Y0 // Reload the parked working words 8-15 into Y0-Y7.
   337  	VMOVDQA      224(BP), Y1
   338  	VMOVDQA      256(BP), Y2
   339  	VMOVDQA      288(BP), Y3
   340  	VMOVDQA      320(BP), Y4
   341  	VMOVDQA      352(BP), Y5
   342  	VMOVDQA      384(BP), Y6
   343  	VMOVDQA      416(BP), Y7
   344  	VPBROADCASTD 32(BP), Y8 // Input words 8-11,
   345  	VPBROADCASTD 4+32(BP), Y9
   346  	VPBROADCASTD 8+32(BP), Y10
   347  	VPBROADCASTD 12+32(BP), Y11
   348  	VMOVDQA      128(BP), Y12 // the per-block counter words 12 and 13,
   349  	VMOVDQA      160(BP), Y13
   350  	VPBROADCASTD 8+48(BP), Y14 // and input words 14 and 15.
   351  	VPBROADCASTD 12+48(BP), Y15
   352  	VPADDD       Y8, Y0, Y0 // Feed-forward: working words 8-15 += input words 8-15.
   353  	VPADDD       Y9, Y1, Y1
   354  	VPADDD       Y10, Y2, Y2
   355  	VPADDD       Y11, Y3, Y3
   356  	VPADDD       Y12, Y4, Y4
   357  	VPADDD       Y13, Y5, Y5
   358  	VPADDD       Y14, Y6, Y6
   359  	VPADDD       Y15, Y7, Y7
   360  	VPUNPCKLDQ   Y1, Y0, Y8 // Same three-step transpose as above, now for words 8-15.
   361  	VPUNPCKLDQ   Y3, Y2, Y9
   362  	VPUNPCKHDQ   Y1, Y0, Y12
   363  	VPUNPCKHDQ   Y3, Y2, Y13
   364  	VPUNPCKLDQ   Y5, Y4, Y10
   365  	VPUNPCKLDQ   Y7, Y6, Y11
   366  	VPUNPCKHDQ   Y5, Y4, Y14
   367  	VPUNPCKHDQ   Y7, Y6, Y15
   368  	VPUNPCKLQDQ  Y9, Y8, Y0
   369  	VPUNPCKLQDQ  Y11, Y10, Y1
   370  	VPUNPCKHQDQ  Y9, Y8, Y2
   371  	VPUNPCKHQDQ  Y11, Y10, Y3
   372  	VPUNPCKLQDQ  Y13, Y12, Y4
   373  	VPUNPCKLQDQ  Y15, Y14, Y5
   374  	VPUNPCKHQDQ  Y13, Y12, Y6
   375  	VPUNPCKHQDQ  Y15, Y14, Y7
   376  	VPERM2I128   $0x20, Y1, Y0, Y8
   377  	VPERM2I128   $0x20, Y3, Y2, Y9
   378  	VPERM2I128   $0x31, Y1, Y0, Y12
   379  	VPERM2I128   $0x31, Y3, Y2, Y13
   380  	VPERM2I128   $0x20, Y5, Y4, Y10
   381  	VPERM2I128   $0x20, Y7, Y6, Y11
   382  	VPERM2I128   $0x31, Y5, Y4, Y14
   383  	VPERM2I128   $0x31, Y7, Y6, Y15
   384  	VPXOR        32(SI), Y8, Y8 // XOR bytes 32-63 of each of the 8 blocks with the input.
   385  	VPXOR        96(SI), Y9, Y9
   386  	VPXOR        160(SI), Y10, Y10
   387  	VPXOR        224(SI), Y11, Y11
   388  	VPXOR        288(SI), Y12, Y12
   389  	VPXOR        352(SI), Y13, Y13
   390  	VPXOR        416(SI), Y14, Y14
   391  	VPXOR        480(SI), Y15, Y15
   392  	VMOVDQU      Y8, 32(DX) // Store bytes 32-63 of each output block.
   393  	VMOVDQU      Y9, 96(DX)
   394  	VMOVDQU      Y10, 160(DX)
   395  	VMOVDQU      Y11, 224(DX)
   396  	VMOVDQU      Y12, 288(DX)
   397  	VMOVDQU      Y13, 352(DX)
   398  	VMOVDQU      Y14, 416(DX)
   399  	VMOVDQU      Y15, 480(DX)
   400  	ADDQ         $512, SI // Advance the input pointer past the 8 consumed blocks.
   401  	JMP          chacha_blocks_avx2_mainloop1_cont
   402  
   403  chacha_blocks_avx2_noinput1: // 8-block output path when the input pointer is zero: same as the XOR path, minus the VPXOR with (SI).
   404  	VMOVDQU      Y8, 0(DX) // Store bytes 0-31 of each of the 8 keystream blocks (64-byte stride).
   405  	VMOVDQU      Y9, 64(DX)
   406  	VMOVDQU      Y10, 128(DX)
   407  	VMOVDQU      Y11, 192(DX)
   408  	VMOVDQU      Y12, 256(DX)
   409  	VMOVDQU      Y13, 320(DX)
   410  	VMOVDQU      Y14, 384(DX)
   411  	VMOVDQU      Y15, 448(DX)
   412  	VMOVDQA      192(BP), Y0 // Reload the parked working words 8-15 into Y0-Y7.
   413  	VMOVDQA      224(BP), Y1
   414  	VMOVDQA      256(BP), Y2
   415  	VMOVDQA      288(BP), Y3
   416  	VMOVDQA      320(BP), Y4
   417  	VMOVDQA      352(BP), Y5
   418  	VMOVDQA      384(BP), Y6
   419  	VMOVDQA      416(BP), Y7
   420  	VPBROADCASTD 32(BP), Y8 // Input words 8-11,
   421  	VPBROADCASTD 4+32(BP), Y9
   422  	VPBROADCASTD 8+32(BP), Y10
   423  	VPBROADCASTD 12+32(BP), Y11
   424  	VMOVDQA      128(BP), Y12 // the per-block counter words 12 and 13,
   425  	VMOVDQA      160(BP), Y13
   426  	VPBROADCASTD 8+48(BP), Y14 // and input words 14 and 15.
   427  	VPBROADCASTD 12+48(BP), Y15
   428  	VPADDD       Y8, Y0, Y0 // Feed-forward: working words 8-15 += input words 8-15.
   429  	VPADDD       Y9, Y1, Y1
   430  	VPADDD       Y10, Y2, Y2
   431  	VPADDD       Y11, Y3, Y3
   432  	VPADDD       Y12, Y4, Y4
   433  	VPADDD       Y13, Y5, Y5
   434  	VPADDD       Y14, Y6, Y6
   435  	VPADDD       Y15, Y7, Y7
   436  	VPUNPCKLDQ   Y1, Y0, Y8 // Transpose to one block per register: 32-bit interleave,
   437  	VPUNPCKLDQ   Y3, Y2, Y9
   438  	VPUNPCKHDQ   Y1, Y0, Y12
   439  	VPUNPCKHDQ   Y3, Y2, Y13
   440  	VPUNPCKLDQ   Y5, Y4, Y10
   441  	VPUNPCKLDQ   Y7, Y6, Y11
   442  	VPUNPCKHDQ   Y5, Y4, Y14
   443  	VPUNPCKHDQ   Y7, Y6, Y15
   444  	VPUNPCKLQDQ  Y9, Y8, Y0 // 64-bit interleave,
   445  	VPUNPCKLQDQ  Y11, Y10, Y1
   446  	VPUNPCKHQDQ  Y9, Y8, Y2
   447  	VPUNPCKHQDQ  Y11, Y10, Y3
   448  	VPUNPCKLQDQ  Y13, Y12, Y4
   449  	VPUNPCKLQDQ  Y15, Y14, Y5
   450  	VPUNPCKHQDQ  Y13, Y12, Y6
   451  	VPUNPCKHQDQ  Y15, Y14, Y7
   452  	VPERM2I128   $0x20, Y1, Y0, Y8 // 128-bit half selection.
   453  	VPERM2I128   $0x20, Y3, Y2, Y9
   454  	VPERM2I128   $0x31, Y1, Y0, Y12
   455  	VPERM2I128   $0x31, Y3, Y2, Y13
   456  	VPERM2I128   $0x20, Y5, Y4, Y10
   457  	VPERM2I128   $0x20, Y7, Y6, Y11
   458  	VPERM2I128   $0x31, Y5, Y4, Y14
   459  	VPERM2I128   $0x31, Y7, Y6, Y15
   460  	VMOVDQU      Y8, 32(DX) // Store bytes 32-63 of each keystream block; SI stays zero, so it is not advanced.
   461  	VMOVDQU      Y9, 96(DX)
   462  	VMOVDQU      Y10, 160(DX)
   463  	VMOVDQU      Y11, 224(DX)
   464  	VMOVDQU      Y12, 288(DX)
   465  	VMOVDQU      Y13, 352(DX)
   466  	VMOVDQU      Y14, 416(DX)
   467  	VMOVDQU      Y15, 480(DX)
   468  
   469  chacha_blocks_avx2_mainloop1_cont: // Bookkeeping after an 8-block batch.
   470  	ADDQ $512, DX // Advance the output pointer by 8 blocks.
   471  	SUBQ $512, CX // 512 fewer bytes remain.
   472  	CMPQ CX, $512
   473  	JAE  chacha_blocks_avx2_atleast512 // Another full 8-block batch is available.
   474  	CMPQ CX, $256
   475  	JB   chacha_blocks_avx2_below256_fixup // Fewer than 4 blocks left; otherwise fall through into the 4-block path.
   476  
   477  chacha_blocks_avx2_atleast256: // Setup for one 4-block (256-byte) batch using the 128-bit X registers.
   478  	MOVQ 48(BP), AX // AX = 64-bit block counter from the stack copy of state row 3.
   479  	LEAQ 1(AX), R8 // R8, R9, R10 = counter+1 .. counter+3 (R9 is clobbered here).
   480  	LEAQ 2(AX), R9
   481  	LEAQ 3(AX), R10
   482  	LEAQ 4(AX), BX // BX = counter+4, the counter of the first block after this batch.
   483  	MOVL AX, 128(BP) // 128(BP)-143(BP): low 32 bits of the four per-block counters.
   484  	MOVL R8, 4+128(BP)
   485  	MOVL R9, 8+128(BP)
   486  	MOVL R10, 12+128(BP)
   487  	SHRQ $32, AX // Bring the high 32 bits of each counter down.
   488  	SHRQ $32, R8
   489  	SHRQ $32, R9
   490  	SHRQ $32, R10
   491  	MOVL AX, 160(BP) // 160(BP)-175(BP): high 32 bits of the four per-block counters.
   492  	MOVL R8, 4+160(BP)
   493  	MOVL R9, 8+160(BP)
   494  	MOVL R10, 12+160(BP)
   495  	MOVQ BX, 48(BP) // Advance the stored counter by 4 blocks.
   496  
   497  	// MOVQ 64(BP), AX
   498  	MOVQ         $20, AX // AX = rounds remaining; mainloop2 subtracts 2 per pass.
   499  	VPBROADCASTD 0(BP), X0 // X0-X3 = state words 0-3, each replicated into all four 32-bit lanes.
   500  	VPBROADCASTD 4+0(BP), X1
   501  	VPBROADCASTD 8+0(BP), X2
   502  	VPBROADCASTD 12+0(BP), X3
   503  	VPBROADCASTD 16(BP), X4 // X4-X7 = state words 4-7.
   504  	VPBROADCASTD 4+16(BP), X5
   505  	VPBROADCASTD 8+16(BP), X6
   506  	VPBROADCASTD 12+16(BP), X7
   507  	VPBROADCASTD 32(BP), X8 // X8-X11 = state words 8-11.
   508  	VPBROADCASTD 4+32(BP), X9
   509  	VPBROADCASTD 8+32(BP), X10
   510  	VPBROADCASTD 12+32(BP), X11
   511  	VMOVDQA      128(BP), X12 // X12 = word 12 (counter low half), a different value in each lane.
   512  	VMOVDQA      160(BP), X13 // X13 = word 13 (counter high half), a different value in each lane.
   513  	VPBROADCASTD 8+48(BP), X14 // X14, X15 = state words 14 and 15.
   514  	VPBROADCASTD 12+48(BP), X15
   515  
   516  chacha_blocks_avx2_mainloop2: // One pass = two ChaCha rounds (columns, then diagonals) on 4 blocks; Xn holds state word n, one block per 32-bit lane.
   517  	VPADDD       X0, X4, X0 // Column round on (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15): a += b.
   518  	VPADDD       X1, X5, X1
   519  	VPXOR        X12, X0, X12 // d ^= a
   520  	VPXOR        X13, X1, X13
   521  	VPADDD       X2, X6, X2
   522  	VPADDD       X3, X7, X3
   523  	VPXOR        X14, X2, X14
   524  	VPXOR        X15, X3, X15
   525  	VPSHUFB      448(BP), X12, X12 // d = rotl(d, 16) via the byte-shuffle mask at 448(BP).
   526  	VPSHUFB      448(BP), X13, X13
   527  	VPADDD       X8, X12, X8 // c += d
   528  	VPADDD       X9, X13, X9
   529  	VPSHUFB      448(BP), X14, X14
   530  	VPSHUFB      448(BP), X15, X15
   531  	VPADDD       X10, X14, X10
   532  	VPADDD       X11, X15, X11
   533  	VMOVDQA      X12, 96(BP) // Spill X12: it is reused as the shift temporary until it is reloaded below.
   534  	VPXOR        X4, X8, X4 // b ^= c
   535  	VPXOR        X5, X9, X5
   536  	VPSLLD       $ 12, X4, X12 // b = rotl(b, 12), built as (b shl 12) xor (b shr 20).
   537  	VPSRLD       $20, X4, X4
   538  	VPXOR        X4, X12, X4
   539  	VPSLLD       $ 12, X5, X12
   540  	VPSRLD       $20, X5, X5
   541  	VPXOR        X5, X12, X5
   542  	VPXOR        X6, X10, X6
   543  	VPXOR        X7, X11, X7
   544  	VPSLLD       $ 12, X6, X12
   545  	VPSRLD       $20, X6, X6
   546  	VPXOR        X6, X12, X6
   547  	VPSLLD       $ 12, X7, X12
   548  	VPSRLD       $20, X7, X7
   549  	VPXOR        X7, X12, X7
   550  	VPADDD       X0, X4, X0 // Second half of the column quarter rounds: a += b.
   551  	VPADDD       X1, X5, X1
   552  	VPXOR        96(BP), X0, X12 // d ^= a, reading the spilled word 12 back from 96(BP).
   553  	VPXOR        X13, X1, X13
   554  	VPADDD       X2, X6, X2
   555  	VPADDD       X3, X7, X3
   556  	VPXOR        X14, X2, X14
   557  	VPXOR        X15, X3, X15
   558  	VPSHUFB      480(BP), X12, X12 // d = rotl(d, 8) via the byte-shuffle mask at 480(BP).
   559  	VPSHUFB      480(BP), X13, X13
   560  	VPADDD       X8, X12, X8
   561  	VPADDD       X9, X13, X9
   562  	VPSHUFB      480(BP), X14, X14
   563  	VPSHUFB      480(BP), X15, X15
   564  	VPADDD       X10, X14, X10
   565  	VPADDD       X11, X15, X11
   566  	VMOVDQA      X12, 96(BP) // Spill X12 again for the rotate-by-7 step.
   567  	VPXOR        X4, X8, X4
   568  	VPXOR        X5, X9, X5
   569  	VPSLLD       $ 7, X4, X12 // b = rotl(b, 7), built as (b shl 7) xor (b shr 25).
   570  	VPSRLD       $25, X4, X4
   571  	VPXOR        X4, X12, X4
   572  	VPSLLD       $ 7, X5, X12
   573  	VPSRLD       $25, X5, X5
   574  	VPXOR        X5, X12, X5
   575  	VPXOR        X6, X10, X6
   576  	VPXOR        X7, X11, X7
   577  	VPSLLD       $ 7, X6, X12
   578  	VPSRLD       $25, X6, X6
   579  	VPXOR        X6, X12, X6
   580  	VPSLLD       $ 7, X7, X12
   581  	VPSRLD       $25, X7, X7
   582  	VPXOR        X7, X12, X7
   583  	VPADDD       X0, X5, X0 // Diagonal round on (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14): a += b.
   584  	VPADDD       X1, X6, X1
   585  	VPXOR        X15, X0, X15
   586  	VPXOR        96(BP), X1, X12 // Word 12 is still in the spill slot; reload it while applying d ^= a.
   587  	VPADDD       X2, X7, X2
   588  	VPADDD       X3, X4, X3
   589  	VPXOR        X13, X2, X13
   590  	VPXOR        X14, X3, X14
   591  	VPSHUFB      448(BP), X15, X15
   592  	VPSHUFB      448(BP), X12, X12
   593  	VPADDD       X10, X15, X10
   594  	VPADDD       X11, X12, X11
   595  	VPSHUFB      448(BP), X13, X13
   596  	VPSHUFB      448(BP), X14, X14
   597  	VPADDD       X8, X13, X8
   598  	VPADDD       X9, X14, X9
   599  	VMOVDQA      X15, 96(BP) // Spill X15: it is the shift temporary for the diagonal round.
   600  	VPXOR        X5, X10, X5
   601  	VPXOR        X6, X11, X6
   602  	VPSLLD       $ 12, X5, X15
   603  	VPSRLD       $20, X5, X5
   604  	VPXOR        X5, X15, X5
   605  	VPSLLD       $ 12, X6, X15
   606  	VPSRLD       $20, X6, X6
   607  	VPXOR        X6, X15, X6
   608  	VPXOR        X7, X8, X7
   609  	VPXOR        X4, X9, X4
   610  	VPSLLD       $ 12, X7, X15
   611  	VPSRLD       $20, X7, X7
   612  	VPXOR        X7, X15, X7
   613  	VPSLLD       $ 12, X4, X15
   614  	VPSRLD       $20, X4, X4
   615  	VPXOR        X4, X15, X4
   616  	VPADDD       X0, X5, X0
   617  	VPADDD       X1, X6, X1
   618  	VPXOR        96(BP), X0, X15 // d ^= a, reading the spilled word 15 back from 96(BP).
   619  	VPXOR        X12, X1, X12
   620  	VPADDD       X2, X7, X2
   621  	VPADDD       X3, X4, X3
   622  	VPXOR        X13, X2, X13
   623  	VPXOR        X14, X3, X14
   624  	VPSHUFB      480(BP), X15, X15
   625  	VPSHUFB      480(BP), X12, X12
   626  	VPADDD       X10, X15, X10
   627  	VPADDD       X11, X12, X11
   628  	VPSHUFB      480(BP), X13, X13
   629  	VPSHUFB      480(BP), X14, X14
   630  	VPADDD       X8, X13, X8
   631  	VPADDD       X9, X14, X9
   632  	VMOVDQA      X15, 96(BP)
   633  	VPXOR        X5, X10, X5
   634  	VPXOR        X6, X11, X6
   635  	VPSLLD       $ 7, X5, X15
   636  	VPSRLD       $25, X5, X5
   637  	VPXOR        X5, X15, X5
   638  	VPSLLD       $ 7, X6, X15
   639  	VPSRLD       $25, X6, X6
   640  	VPXOR        X6, X15, X6
   641  	VPXOR        X7, X8, X7
   642  	VPXOR        X4, X9, X4
   643  	VPSLLD       $ 7, X7, X15
   644  	VPSRLD       $25, X7, X7
   645  	VPXOR        X7, X15, X7
   646  	VPSLLD       $ 7, X4, X15
   647  	VPSRLD       $25, X4, X4
   648  	VPXOR        X4, X15, X4
   649  	VMOVDQA      96(BP), X15 // Restore word 15 so all sixteen words are in registers at the loop edge.
   650  	SUBQ         $2, AX // Two rounds done.
   651  	JNZ          chacha_blocks_avx2_mainloop2
   652  	VMOVDQA      X8, 192(BP) // Park working words 8-15 in 192(BP)-319(BP) to free X8-X15.
   653  	VMOVDQA      X9, 208(BP)
   654  	VMOVDQA      X10, 224(BP)
   655  	VMOVDQA      X11, 240(BP)
   656  	VMOVDQA      X12, 256(BP)
   657  	VMOVDQA      X13, 272(BP)
   658  	VMOVDQA      X14, 288(BP)
   659  	VMOVDQA      X15, 304(BP)
   660  	VPBROADCASTD 0(BP), X8 // Re-broadcast input words 0-7 into X8-X15.
   661  	VPBROADCASTD 4+0(BP), X9
   662  	VPBROADCASTD 8+0(BP), X10
   663  	VPBROADCASTD 12+0(BP), X11
   664  	VPBROADCASTD 16(BP), X12
   665  	VPBROADCASTD 4+16(BP), X13
   666  	VPBROADCASTD 8+16(BP), X14
   667  	VPBROADCASTD 12+16(BP), X15
   668  	VPADDD       X8, X0, X0 // Feed-forward: working words 0-7 += input words 0-7.
   669  	VPADDD       X9, X1, X1
   670  	VPADDD       X10, X2, X2
   671  	VPADDD       X11, X3, X3
   672  	VPADDD       X12, X4, X4
   673  	VPADDD       X13, X5, X5
   674  	VPADDD       X14, X6, X6
   675  	VPADDD       X15, X7, X7
   676  	VPUNPCKLDQ   X1, X0, X8 // Transpose to block order: first interleave 32-bit elements,
   677  	VPUNPCKLDQ   X3, X2, X9
   678  	VPUNPCKHDQ   X1, X0, X12
   679  	VPUNPCKHDQ   X3, X2, X13
   680  	VPUNPCKLDQ   X5, X4, X10
   681  	VPUNPCKLDQ   X7, X6, X11
   682  	VPUNPCKHDQ   X5, X4, X14
   683  	VPUNPCKHDQ   X7, X6, X15
   684  	VPUNPCKLQDQ  X9, X8, X0 // then 64-bit elements. Each register pair (X0,X1), (X2,X3), (X4,X5), (X6,X7) now holds words 0-3 and 4-7 of one block.
   685  	VPUNPCKLQDQ  X11, X10, X1
   686  	VPUNPCKHQDQ  X9, X8, X2
   687  	VPUNPCKHQDQ  X11, X10, X3
   688  	VPUNPCKLQDQ  X13, X12, X4
   689  	VPUNPCKLQDQ  X15, X14, X5
   690  	VPUNPCKHQDQ  X13, X12, X6
   691  	VPUNPCKHQDQ  X15, X14, X7
   692  	ANDQ         SI, SI // Zero input pointer: emit raw keystream via the noinput2 path.
   693  	JZ           chacha_blocks_avx2_noinput2
   694  	VPXOR        0(SI), X0, X0 // XOR bytes 0-31 of each of the 4 blocks (64-byte stride) with the input.
   695  	VPXOR        16(SI), X1, X1
   696  	VPXOR        64(SI), X2, X2
   697  	VPXOR        80(SI), X3, X3
   698  	VPXOR        128(SI), X4, X4
   699  	VPXOR        144(SI), X5, X5
   700  	VPXOR        192(SI), X6, X6
   701  	VPXOR        208(SI), X7, X7
   702  	VMOVDQU      X0, 0(DX) // Store bytes 0-31 of each output block.
   703  	VMOVDQU      X1, 16(DX)
   704  	VMOVDQU      X2, 64(DX)
   705  	VMOVDQU      X3, 80(DX)
   706  	VMOVDQU      X4, 128(DX)
   707  	VMOVDQU      X5, 144(DX)
   708  	VMOVDQU      X6, 192(DX)
   709  	VMOVDQU      X7, 208(DX)
   710  	VMOVDQA      192(BP), X0 // Reload the parked working words 8-15 into X0-X7.
   711  	VMOVDQA      208(BP), X1
   712  	VMOVDQA      224(BP), X2
   713  	VMOVDQA      240(BP), X3
   714  	VMOVDQA      256(BP), X4
   715  	VMOVDQA      272(BP), X5
   716  	VMOVDQA      288(BP), X6
   717  	VMOVDQA      304(BP), X7
   718  	VPBROADCASTD 32(BP), X8 // Input words 8-11,
   719  	VPBROADCASTD 4+32(BP), X9
   720  	VPBROADCASTD 8+32(BP), X10
   721  	VPBROADCASTD 12+32(BP), X11
   722  	VMOVDQA      128(BP), X12 // the per-block counter words 12 and 13,
   723  	VMOVDQA      160(BP), X13
   724  	VPBROADCASTD 8+48(BP), X14 // and input words 14 and 15.
   725  	VPBROADCASTD 12+48(BP), X15
   726  	VPADDD       X8, X0, X0 // Feed-forward: working words 8-15 += input words 8-15.
   727  	VPADDD       X9, X1, X1
   728  	VPADDD       X10, X2, X2
   729  	VPADDD       X11, X3, X3
   730  	VPADDD       X12, X4, X4
   731  	VPADDD       X13, X5, X5
   732  	VPADDD       X14, X6, X6
   733  	VPADDD       X15, X7, X7
   734  	VPUNPCKLDQ   X1, X0, X8 // Same two-step transpose as above, now for words 8-15.
   735  	VPUNPCKLDQ   X3, X2, X9
   736  	VPUNPCKHDQ   X1, X0, X12
   737  	VPUNPCKHDQ   X3, X2, X13
   738  	VPUNPCKLDQ   X5, X4, X10
   739  	VPUNPCKLDQ   X7, X6, X11
   740  	VPUNPCKHDQ   X5, X4, X14
   741  	VPUNPCKHDQ   X7, X6, X15
   742  	VPUNPCKLQDQ  X9, X8, X0
   743  	VPUNPCKLQDQ  X11, X10, X1
   744  	VPUNPCKHQDQ  X9, X8, X2
   745  	VPUNPCKHQDQ  X11, X10, X3
   746  	VPUNPCKLQDQ  X13, X12, X4
   747  	VPUNPCKLQDQ  X15, X14, X5
   748  	VPUNPCKHQDQ  X13, X12, X6
   749  	VPUNPCKHQDQ  X15, X14, X7
   750  	VPXOR        32(SI), X0, X0 // XOR bytes 32-63 of each of the 4 blocks with the input.
   751  	VPXOR        48(SI), X1, X1
   752  	VPXOR        96(SI), X2, X2
   753  	VPXOR        112(SI), X3, X3
   754  	VPXOR        160(SI), X4, X4
   755  	VPXOR        176(SI), X5, X5
   756  	VPXOR        224(SI), X6, X6
   757  	VPXOR        240(SI), X7, X7
   758  	VMOVDQU      X0, 32(DX) // Store bytes 32-63 of each output block.
   759  	VMOVDQU      X1, 48(DX)
   760  	VMOVDQU      X2, 96(DX)
   761  	VMOVDQU      X3, 112(DX)
   762  	VMOVDQU      X4, 160(DX)
   763  	VMOVDQU      X5, 176(DX)
   764  	VMOVDQU      X6, 224(DX)
   765  	VMOVDQU      X7, 240(DX)
   766  	ADDQ         $256, SI // Advance the input pointer past the 4 consumed blocks.
   767  	JMP          chacha_blocks_avx2_mainloop2_cont
   768  
   769  chacha_blocks_avx2_noinput2: // 4-block output path when the input pointer is zero: same as the XOR path, minus the VPXOR with (SI).
   770  	VMOVDQU      X0, 0(DX) // Store bytes 0-31 of each of the 4 keystream blocks (64-byte stride).
   771  	VMOVDQU      X1, 16(DX)
   772  	VMOVDQU      X2, 64(DX)
   773  	VMOVDQU      X3, 80(DX)
   774  	VMOVDQU      X4, 128(DX)
   775  	VMOVDQU      X5, 144(DX)
   776  	VMOVDQU      X6, 192(DX)
   777  	VMOVDQU      X7, 208(DX)
   778  	VMOVDQA      192(BP), X0 // Reload the parked working words 8-15 into X0-X7.
   779  	VMOVDQA      208(BP), X1
   780  	VMOVDQA      224(BP), X2
   781  	VMOVDQA      240(BP), X3
   782  	VMOVDQA      256(BP), X4
   783  	VMOVDQA      272(BP), X5
   784  	VMOVDQA      288(BP), X6
   785  	VMOVDQA      304(BP), X7
   786  	VPBROADCASTD 32(BP), X8 // Input words 8-11,
   787  	VPBROADCASTD 4+32(BP), X9
   788  	VPBROADCASTD 8+32(BP), X10
   789  	VPBROADCASTD 12+32(BP), X11
   790  	VMOVDQA      128(BP), X12 // the per-block counter words 12 and 13,
   791  	VMOVDQA      160(BP), X13
   792  	VPBROADCASTD 8+48(BP), X14 // and input words 14 and 15.
   793  	VPBROADCASTD 12+48(BP), X15
   794  	VPADDD       X8, X0, X0 // Feed-forward: working words 8-15 += input words 8-15.
   795  	VPADDD       X9, X1, X1
   796  	VPADDD       X10, X2, X2
   797  	VPADDD       X11, X3, X3
   798  	VPADDD       X12, X4, X4
   799  	VPADDD       X13, X5, X5
   800  	VPADDD       X14, X6, X6
   801  	VPADDD       X15, X7, X7
   802  	VPUNPCKLDQ   X1, X0, X8 // Transpose to block order: 32-bit interleave,
   803  	VPUNPCKLDQ   X3, X2, X9
   804  	VPUNPCKHDQ   X1, X0, X12
   805  	VPUNPCKHDQ   X3, X2, X13
   806  	VPUNPCKLDQ   X5, X4, X10
   807  	VPUNPCKLDQ   X7, X6, X11
   808  	VPUNPCKHDQ   X5, X4, X14
   809  	VPUNPCKHDQ   X7, X6, X15
   810  	VPUNPCKLQDQ  X9, X8, X0 // then 64-bit interleave.
   811  	VPUNPCKLQDQ  X11, X10, X1
   812  	VPUNPCKHQDQ  X9, X8, X2
   813  	VPUNPCKHQDQ  X11, X10, X3
   814  	VPUNPCKLQDQ  X13, X12, X4
   815  	VPUNPCKLQDQ  X15, X14, X5
   816  	VPUNPCKHQDQ  X13, X12, X6
   817  	VPUNPCKHQDQ  X15, X14, X7
   818  	VMOVDQU      X0, 32(DX) // Store bytes 32-63 of each keystream block; SI stays zero, so it is not advanced.
   819  	VMOVDQU      X1, 48(DX)
   820  	VMOVDQU      X2, 96(DX)
   821  	VMOVDQU      X3, 112(DX)
   822  	VMOVDQU      X4, 160(DX)
   823  	VMOVDQU      X5, 176(DX)
   824  	VMOVDQU      X6, 224(DX)
   825  	VMOVDQU      X7, 240(DX)
   826  
   827  chacha_blocks_avx2_mainloop2_cont: // Bookkeeping after a 4-block batch.
   828  	ADDQ $256, DX // Advance the output pointer by 4 blocks.
   829  	SUBQ $256, CX // 256 fewer bytes remain.
   830  	CMPQ CX, $256
   831  	JAE  chacha_blocks_avx2_atleast256 // Another 4-block batch is available; otherwise fall through to the fixup.
   832  
   833  chacha_blocks_avx2_below256_fixup: // Re-establish the register state the one-block loop expects after the multi-block paths overwrote it.
   834  	VMOVDQA 448(BP), X6 // X6 = rotate-by-16 shuffle mask.
   835  	VMOVDQA 480(BP), X7 // X7 = rotate-by-8 shuffle mask.
   836  	VMOVDQA 0(BP), X8 // X8-X11 = state rows 0-3 from the stack copy.
   837  	VMOVDQA 16(BP), X9
   838  	VMOVDQA 32(BP), X10
   839  	VMOVDQA 48(BP), X11 // Row 3 carries the counter already advanced past the batches done so far.
   840  	MOVQ    $1, R9 // R9 = 1 again (it was used as a counter scratch register above).
   841  
   842  chacha_blocks_avx2_below256: // One-block-at-a-time tail; expects X6/X7 = masks, X8-X11 = state rows, R9 = 1.
   843  	VMOVQ R9, X5 // X5 = counter increment taken from R9.
   844  	ANDQ  CX, CX
   845  	JZ    chacha_blocks_avx2_done // Nothing left to process.
   846  	CMPQ  CX, $64
   847  	JAE   chacha_blocks_avx2_above63 // A full block remains: work directly on the caller's buffers.
   848  	MOVQ  DX, R9 // Partial final block: remember the real output pointer in R9.
   849  	ANDQ  SI, SI
   850  	JZ    chacha_blocks_avx2_noinput3 // No input to stage when the input pointer is zero.
   851  	MOVQ  CX, R10
   852  	MOVQ  BP, DX
   853  	ADDQ  R10, SI // Point SI and DX one past the end of the CX-byte source and of the scratch area at 0(BP),
   854  	ADDQ  R10, DX
   855  	NEGQ  R10 // and index them with R10 running from -CX up to 0.
   856  
   857  chacha_blocks_avx2_copyinput: // Byte-wise copy of the CX remaining input bytes into the scratch area at 0(BP).
   858  	MOVB (SI)(R10*1), AX
   859  	MOVB AX, (DX)(R10*1)
   860  	INCQ R10
   861  	JNZ  chacha_blocks_avx2_copyinput
   862  	MOVQ BP, SI // Read the block input from the scratch area.
   863  
   864  chacha_blocks_avx2_noinput3:
   865  	MOVQ BP, DX // Write the full 64-byte block to the scratch area; only CX bytes are copied out afterwards.
   866  
   867  chacha_blocks_avx2_above63: // Start one block: X0-X3 become the working copy of state rows 0-3.
   868  	VMOVDQA X8, X0
   869  	VMOVDQA X9, X1
   870  	VMOVDQA X10, X2
   871  	VMOVDQA X11, X3
   872  
   873  	// MOVQ 64(BP), AX
   874  	MOVQ $20, AX // AX = rounds remaining; mainloop3 subtracts 2 per pass.
   875  
   876  chacha_blocks_avx2_mainloop3: // One pass = two ChaCha rounds on a single block; X0-X3 hold rows a, b, c, d and the four lanes are the four columns.
   877  	VPADDD  X0, X1, X0 // Column round: a += b
   878  	VPXOR   X3, X0, X3 // d ^= a
   879  	VPSHUFB X6, X3, X3 // d = rotl(d, 16) via the shuffle mask in X6.
   880  	VPADDD  X2, X3, X2 // c += d
   881  	VPXOR   X1, X2, X1 // b ^= c
   882  	VPSLLD  $12, X1, X4 // b = rotl(b, 12) using X4 as the temporary.
   883  	VPSRLD  $20, X1, X1
   884  	VPXOR   X1, X4, X1
   885  	VPADDD  X0, X1, X0 // a += b
   886  	VPXOR   X3, X0, X3 // d ^= a
   887  	VPSHUFB X7, X3, X3 // d = rotl(d, 8) via the shuffle mask in X7.
   888  	VPSHUFD $0x93, X0, X0 // Lane rotations of rows a, d and c (interleaved with the remaining steps) line the diagonals up as columns.
   889  	VPADDD  X2, X3, X2 // c += d
   890  	VPSHUFD $0x4e, X3, X3
   891  	VPXOR   X1, X2, X1 // b ^= c
   892  	VPSHUFD $0x39, X2, X2
   893  	VPSLLD  $7, X1, X4 // b = rotl(b, 7)
   894  	VPSRLD  $25, X1, X1
   895  	VPXOR   X1, X4, X1
   896  	VPADDD  X0, X1, X0 // Diagonal round: the same quarter-round sequence on the rotated rows.
   897  	VPXOR   X3, X0, X3
   898  	VPSHUFB X6, X3, X3
   899  	VPADDD  X2, X3, X2
   900  	VPXOR   X1, X2, X1
   901  	VPSLLD  $12, X1, X4
   902  	VPSRLD  $20, X1, X1
   903  	VPXOR   X1, X4, X1
   904  	VPADDD  X0, X1, X0
   905  	VPXOR   X3, X0, X3
   906  	VPSHUFB X7, X3, X3
   907  	VPSHUFD $0x39, X0, X0 // Inverse lane rotations restore the column layout.
   908  	VPADDD  X2, X3, X2
   909  	VPSHUFD $0x4e, X3, X3
   910  	VPXOR   X1, X2, X1
   911  	VPSHUFD $0x93, X2, X2
   912  	VPSLLD  $7, X1, X4
   913  	VPSRLD  $25, X1, X1
   914  	VPXOR   X1, X4, X1
   915  	SUBQ    $2, AX // Two rounds done.
   916  	JNZ     chacha_blocks_avx2_mainloop3
   917  	VPADDD  X0, X8, X0 // Feed-forward: add the input state rows to the working rows.
   918  	VPADDD  X1, X9, X1
   919  	VPADDD  X2, X10, X2
   920  	VPADDD  X3, X11, X3
   921  	ANDQ    SI, SI // Zero input pointer: skip the XOR and emit raw keystream.
   922  	JZ      chacha_blocks_avx2_noinput4
   923  	VPXOR   0(SI), X0, X0 // XOR the 64-byte keystream block with the input.
   924  	VPXOR   16(SI), X1, X1
   925  	VPXOR   32(SI), X2, X2
   926  	VPXOR   48(SI), X3, X3
   927  	ADDQ    $64, SI // Advance the input pointer by one block.
   928  
   929  chacha_blocks_avx2_noinput4: // Store one finished block and advance the counter.
   930  	VMOVDQU X0, 0(DX) // Write the 64-byte block to DX (the caller's buffer, or the scratch area for a partial block).
   931  	VMOVDQU X1, 16(DX)
   932  	VMOVDQU X2, 32(DX)
   933  	VMOVDQU X3, 48(DX)
   934  	VPADDQ  X11, X5, X11 // 64-bit add of X5 into the low quadword of state row 3: block counter += 1.
   935  	CMPQ    CX, $64
   936  	JBE     chacha_blocks_avx2_mainloop3_finishup // That was the last (full or partial) block.
   937  	ADDQ    $64, DX // Otherwise advance the output pointer, consume 64 bytes of length, and loop.
   938  	SUBQ    $64, CX
   939  	JMP     chacha_blocks_avx2_below256
   940  
   941  chacha_blocks_avx2_mainloop3_finishup:
   942  	CMPQ CX, $64
   943  	JE   chacha_blocks_avx2_done
   944  	ADDQ CX, R9
   945  	ADDQ CX, DX
   946  	NEGQ CX
   947  
   948  chacha_blocks_avx2_copyoutput:
   949  	MOVB (DX)(CX*1), AX
   950  	MOVB AX, (R9)(CX*1)
   951  	INCQ CX
   952  	JNZ  chacha_blocks_avx2_copyoutput
   953  
   954  chacha_blocks_avx2_done:
   955  	VMOVDQU X11, 32(DI)
   956  
   957  	VZEROUPPER
   958  	RET
   959  
   960  // func hChaChaAVX2(key, nonce []byte, dst *byte)
        //
        // Runs 20 ChaCha rounds over a single 4x4 state built from the shared
        // constants row, 32 bytes read from key and 16 bytes read from nonce,
        // then stores rows 0 and 3 of the result (32 bytes) at dst.  The
        // initial state is not added back in before the store.
        //
        // Register roles: X0..X3 = state rows a, b, c, d; X4 = rotate scratch;
        // X6 = PSHUFB mask rotating each 32-bit lane left by 16; X5 = PSHUFB
        // mask rotating each lane left by 8; CX = double-rounds remaining.
   961  TEXT ·hChaChaAVX2(SB), NOSPLIT|NOFRAME, $0-56
   962  	MOVQ dst+48(FP), DX
   963  	MOVQ nonce+24(FP), SI
   964  	MOVQ key+0(FP), DI
   965  
   966  	VMOVDQU 0(DI), X1
   967  	VMOVDQU 16(DI), X2
   968  	VMOVDQU 0(SI), X3
   969  
   970  	LEAQ    ·chacha_constants<>(SB), AX
   971  	VMOVDQA 0(AX), X0
   972  	VMOVDQA 16(AX), X6
   973  	VMOVDQA 32(AX), X5
   974  
   975  	MOVL $10, CX
   976  
   977  hchacha_avx2_doubleround:
        	// Quarter-round on the columns.
   978  	VPADDD  X0, X1, X0
   979  	VPXOR   X3, X0, X3
   980  	VPSHUFB X6, X3, X3
   981  	VPADDD  X2, X3, X2
   982  	VPXOR   X1, X2, X1
   983  	VPSRLD  $20, X1, X4
   984  	VPSLLD  $12, X1, X1
   985  	VPXOR   X1, X4, X1
   986  	VPADDD  X0, X1, X0
   987  	VPXOR   X3, X0, X3
   988  	VPSHUFB X5, X3, X3
   989  	VPADDD  X2, X3, X2
   990  	VPXOR   X1, X2, X1
   991  	VPSRLD  $25, X1, X4
   992  	VPSLLD  $7, X1, X1
   993  	VPXOR   X1, X4, X1
        
        	// Rotate the lanes of rows a, d and c so the next quarter-round
        	// operates on the diagonals.
   994  	VPSHUFD $0x93, X0, X0
   995  	VPSHUFD $0x4e, X3, X3
   996  	VPSHUFD $0x39, X2, X2
        
        	// Quarter-round on the diagonals.
   997  	VPADDD  X0, X1, X0
   998  	VPXOR   X3, X0, X3
   999  	VPSHUFB X6, X3, X3
  1000  	VPADDD  X2, X3, X2
  1001  	VPXOR   X1, X2, X1
  1002  	VPSRLD  $20, X1, X4
  1003  	VPSLLD  $12, X1, X1
  1004  	VPXOR   X1, X4, X1
  1005  	VPADDD  X0, X1, X0
  1006  	VPXOR   X3, X0, X3
  1007  	VPSHUFB X5, X3, X3
  1008  	VPADDD  X2, X3, X2
  1009  	VPXOR   X1, X2, X1
  1010  	VPSRLD  $25, X1, X4
  1011  	VPSLLD  $7, X1, X1
  1012  	VPXOR   X1, X4, X1
        
        	// Undo the lane rotation.
  1013  	VPSHUFD $0x39, X0, X0
  1014  	VPSHUFD $0x4e, X3, X3
  1015  	VPSHUFD $0x93, X2, X2
        
  1016  	DECL    CX
  1017  	JNZ     hchacha_avx2_doubleround
  1018  
  1019  	VMOVDQU X0, 0(DX)
  1020  	VMOVDQU X3, 16(DX)
  1021  
  1022  	VZEROUPPER
  1023  	RET
  1024  
  1025  // func blocksSSSE3(s *[api.StateSize]uint32, in, out []byte)
        //
        // Generates ChaCha keystream (20 rounds per 64-byte block) for len(in)
        // bytes and writes the result to out.  When the in pointer is non-nil
        // the keystream is XORed with the input first; when it is nil the raw
        // keystream is written.  The 16 bytes at s[12:16] are loaded as the
        // counter row on entry and the updated row is stored back on exit.
        //
        // Work is split three ways:
        //  * while at least 256 bytes remain, four blocks are computed at once
        //    with one state word per XMM register (mainloop1);
        //  * remaining full 64-byte blocks are computed one at a time with one
        //    state row per XMM register (mainloop2);
        //  * a trailing partial block is staged through the stack scratch area
        //    and copied out byte by byte.
        //
        // Scratch layout, relative to the 64-byte aligned BP:
        //     0.. 63  constants row + the three rows loaded from s; this area
        //             is also reused as the bounce buffer for a partial block
        //    80, 96   PSHUFB masks (rotate-by-16 and rotate-by-8)
        //   112       spill slot for the register borrowed as rotate temporary
        //   128..304  rows 0-11, each word broadcast to all four lanes
        //   320, 336  low / high counter words for the four parallel blocks
        //   352, 368  broadcast of the last two words of the counter row
        //   384..496  words 8-15 of the output while words 0-7 are transposed
  1026  TEXT ·blocksSSSE3(SB), NOSPLIT, $576-56
  1027  	// This is Andrew Moon's SSSE3 ChaCha implementation taken from
  1028  	// supercop-20190110, with some minor changes, primarily calling
  1029  	// convention and assembly dialect related.
  1030  
  1031  	// Align the stack on a 64 byte boundary.
  1032  	MOVQ SP, BP
  1033  	ADDQ $64, BP
  1034  	ANDQ $-64, BP
  1035  
  1036  	// Go calling convention -> SYSV AMD64 (and a fixup).
  1037  	MOVQ s+0(FP), DI       // &s -> DI
  1038  	ADDQ $16, DI           // Skip the ChaCha constants in the chachaState.
  1039  	MOVQ in+8(FP), SI      // &in[0] -> SI
  1040  	MOVQ out+32(FP), DX    // &out[0] -> DX
  1041  	MOVQ in_len+16(FP), CX // len(in) -> CX
  1042  
  1043  	// Begin the main body of `chacha_blocks_ssse3`.
  1044  	//
  1045  	// Mostly a direct translation except:
  1046  	//  * The number of rounds is always 20.
  1047  	//  * The aligned scratch area is addressed through BP (%rbp), not SP.
  1048  	LEAQ  ·chacha_constants<>(SB), AX
  1049  	MOVO  0(AX), X8
  1050  	MOVO  16(AX), X6
  1051  	MOVO  32(AX), X7
  1052  	MOVOU 0(DI), X9
  1053  	MOVOU 16(DI), X10
  1054  	MOVOU 32(DI), X11
  1055  
  1056  	// MOVQ 48(DI), AX
  1057  	MOVQ $1, R9 // per-block counter increment; moved into X5 at below256
  1058  	MOVO X8, 0(BP)
  1059  	MOVO X9, 16(BP)
  1060  	MOVO X10, 32(BP)
  1061  	MOVO X11, 48(BP)
  1062  
  1063  	MOVO X6, 80(BP)
  1064  	MOVO X7, 96(BP)
  1065  	// MOVQ AX, 64(BP)
  1066  	CMPQ   CX, $256
  1067  	JB     chacha_blocks_ssse3_below256
        	// At least 256 bytes: broadcast every word of rows 0-2 (and the last
        	// two words of the counter row) across a full register and park the
        	// results in the scratch area for the four-block loop.
  1068  	PSHUFD $0x00, X8, X0
  1069  	PSHUFD $0x55, X8, X1
  1070  	PSHUFD $0xaa, X8, X2
  1071  	PSHUFD $0xff, X8, X3
  1072  	MOVO   X0, 128(BP)
  1073  	MOVO   X1, 144(BP)
  1074  	MOVO   X2, 160(BP)
  1075  	MOVO   X3, 176(BP)
  1076  	PSHUFD $0x00, X9, X0
  1077  	PSHUFD $0x55, X9, X1
  1078  	PSHUFD $0xaa, X9, X2
  1079  	PSHUFD $0xff, X9, X3
  1080  	MOVO   X0, 192(BP)
  1081  	MOVO   X1, 208(BP)
  1082  	MOVO   X2, 224(BP)
  1083  	MOVO   X3, 240(BP)
  1084  	PSHUFD $0x00, X10, X0
  1085  	PSHUFD $0x55, X10, X1
  1086  	PSHUFD $0xaa, X10, X2
  1087  	PSHUFD $0xff, X10, X3
  1088  	MOVO   X0, 256(BP)
  1089  	MOVO   X1, 272(BP)
  1090  	MOVO   X2, 288(BP)
  1091  	MOVO   X3, 304(BP)
  1092  	PSHUFD $0xaa, X11, X0
  1093  	PSHUFD $0xff, X11, X1
  1094  	MOVO   X0, 352(BP)
  1095  	MOVO   X1, 368(BP)
  1096  	JMP    chacha_blocks_ssse3_atleast256
  1097  
  1098  // .p2align 6,,63
  1099  // # align to 4 mod 64
  1100  // nop;nop;nop;nop;
  1101  chacha_blocks_ssse3_atleast256:
        	// Read the 64-bit counter kept at 48(BP), write counter+0..+3 as
        	// separate low-word (320) and high-word (336) vectors for the four
        	// parallel blocks, and store counter+4 back for the next pass.
  1102  	MOVQ 48(BP), AX
  1103  	LEAQ 1(AX), R8
  1104  	LEAQ 2(AX), R9
  1105  	LEAQ 3(AX), R10
  1106  	LEAQ 4(AX), BX
  1107  	MOVL AX, 320(BP)
  1108  	MOVL R8, 4+320(BP)
  1109  	MOVL R9, 8+320(BP)
  1110  	MOVL R10, 12+320(BP)
  1111  	SHRQ $32, AX
  1112  	SHRQ $32, R8
  1113  	SHRQ $32, R9
  1114  	SHRQ $32, R10
  1115  	MOVL AX, 336(BP)
  1116  	MOVL R8, 4+336(BP)
  1117  	MOVL R9, 8+336(BP)
  1118  	MOVL R10, 12+336(BP)
  1119  	MOVQ BX, 48(BP)
  1120  
  1121  	// MOVQ 64(BP), AX
  1122  	MOVQ $20, AX // rounds remaining; the loop below retires two per pass
  1123  	MOVO 128(BP), X0
  1124  	MOVO 144(BP), X1
  1125  	MOVO 160(BP), X2
  1126  	MOVO 176(BP), X3
  1127  	MOVO 192(BP), X4
  1128  	MOVO 208(BP), X5
  1129  	MOVO 224(BP), X6
  1130  	MOVO 240(BP), X7
  1131  	MOVO 256(BP), X8
  1132  	MOVO 272(BP), X9
  1133  	MOVO 288(BP), X10
  1134  	MOVO 304(BP), X11
  1135  	MOVO 320(BP), X12
  1136  	MOVO 336(BP), X13
  1137  	MOVO 352(BP), X14
  1138  	MOVO 368(BP), X15
  1139  
        // Four-block double round.  X0-X3 hold words 0-3 (a), X4-X7 words 4-7
        // (b), X8-X11 words 8-11 (c) and X12-X15 words 12-15 (d); each register
        // carries the same word of four different blocks.
  1140  chacha_blocks_ssse3_mainloop1:
        	// Column round: a += b; d ^= a; d <<<= 16 (mask at 80(BP)); c += d.
  1141  	PADDD      X4, X0
  1142  	PADDD      X5, X1
  1143  	PXOR       X0, X12
  1144  	PXOR       X1, X13
  1145  	PADDD      X6, X2
  1146  	PADDD      X7, X3
  1147  	PXOR       X2, X14
  1148  	PXOR       X3, X15
  1149  	PSHUFB     80(BP), X12
  1150  	PSHUFB     80(BP), X13
  1151  	PADDD      X12, X8
  1152  	PADDD      X13, X9
  1153  	PSHUFB     80(BP), X14
  1154  	PSHUFB     80(BP), X15
  1155  	PADDD      X14, X10
  1156  	PADDD      X15, X11
  1157  	MOVO       X12, 112(BP) // spill X12; it is the rotate temporary below
        	// b ^= c; b <<<= 12 (shift-left / shift-right / xor).
  1158  	PXOR       X8, X4
  1159  	PXOR       X9, X5
  1160  	MOVO       X4, X12
  1161  	PSLLL      $ 12, X4
  1162  	PSRLL      $20, X12
  1163  	PXOR       X12, X4
  1164  	MOVO       X5, X12
  1165  	PSLLL      $ 12, X5
  1166  	PSRLL      $20, X12
  1167  	PXOR       X12, X5
  1168  	PXOR       X10, X6
  1169  	PXOR       X11, X7
  1170  	MOVO       X6, X12
  1171  	PSLLL      $ 12, X6
  1172  	PSRLL      $20, X12
  1173  	PXOR       X12, X6
  1174  	MOVO       X7, X12
  1175  	PSLLL      $ 12, X7
  1176  	PSRLL      $20, X12
  1177  	PXOR       X12, X7
  1178  	MOVO       112(BP), X12 // reload the spilled X12
        	// a += b; d ^= a; d <<<= 8 (mask at 96(BP)); c += d.
  1179  	PADDD      X4, X0
  1180  	PADDD      X5, X1
  1181  	PXOR       X0, X12
  1182  	PXOR       X1, X13
  1183  	PADDD      X6, X2
  1184  	PADDD      X7, X3
  1185  	PXOR       X2, X14
  1186  	PXOR       X3, X15
  1187  	PSHUFB     96(BP), X12
  1188  	PSHUFB     96(BP), X13
  1189  	PADDD      X12, X8
  1190  	PADDD      X13, X9
  1191  	PSHUFB     96(BP), X14
  1192  	PSHUFB     96(BP), X15
  1193  	PADDD      X14, X10
  1194  	PADDD      X15, X11
  1195  	MOVO       X12, 112(BP)
        	// b ^= c; b <<<= 7.
  1196  	PXOR       X8, X4
  1197  	PXOR       X9, X5
  1198  	MOVO       X4, X12
  1199  	PSLLL      $ 7, X4
  1200  	PSRLL      $25, X12
  1201  	PXOR       X12, X4
  1202  	MOVO       X5, X12
  1203  	PSLLL      $ 7, X5
  1204  	PSRLL      $25, X12
  1205  	PXOR       X12, X5
  1206  	PXOR       X10, X6
  1207  	PXOR       X11, X7
  1208  	MOVO       X6, X12
  1209  	PSLLL      $ 7, X6
  1210  	PSRLL      $25, X12
  1211  	PXOR       X12, X6
  1212  	MOVO       X7, X12
  1213  	PSLLL      $ 7, X7
  1214  	PSRLL      $25, X12
  1215  	PXOR       X12, X7
  1216  	MOVO       112(BP), X12
        	// Diagonal round: the same steps applied to the register groups
        	// (X0,X5,X10,X15), (X1,X6,X11,X12), (X2,X7,X8,X13), (X3,X4,X9,X14).
        	// X15 takes over as the rotate temporary and is spilled to 112(BP).
  1217  	PADDD      X5, X0
  1218  	PADDD      X6, X1
  1219  	PXOR       X0, X15
  1220  	PXOR       X1, X12
  1221  	PADDD      X7, X2
  1222  	PADDD      X4, X3
  1223  	PXOR       X2, X13
  1224  	PXOR       X3, X14
  1225  	PSHUFB     80(BP), X15
  1226  	PSHUFB     80(BP), X12
  1227  	PADDD      X15, X10
  1228  	PADDD      X12, X11
  1229  	PSHUFB     80(BP), X13
  1230  	PSHUFB     80(BP), X14
  1231  	PADDD      X13, X8
  1232  	PADDD      X14, X9
  1233  	MOVO       X15, 112(BP)
  1234  	PXOR       X10, X5
  1235  	PXOR       X11, X6
  1236  	MOVO       X5, X15
  1237  	PSLLL      $ 12, X5
  1238  	PSRLL      $20, X15
  1239  	PXOR       X15, X5
  1240  	MOVO       X6, X15
  1241  	PSLLL      $ 12, X6
  1242  	PSRLL      $20, X15
  1243  	PXOR       X15, X6
  1244  	PXOR       X8, X7
  1245  	PXOR       X9, X4
  1246  	MOVO       X7, X15
  1247  	PSLLL      $ 12, X7
  1248  	PSRLL      $20, X15
  1249  	PXOR       X15, X7
  1250  	MOVO       X4, X15
  1251  	PSLLL      $ 12, X4
  1252  	PSRLL      $20, X15
  1253  	PXOR       X15, X4
  1254  	MOVO       112(BP), X15
  1255  	PADDD      X5, X0
  1256  	PADDD      X6, X1
  1257  	PXOR       X0, X15
  1258  	PXOR       X1, X12
  1259  	PADDD      X7, X2
  1260  	PADDD      X4, X3
  1261  	PXOR       X2, X13
  1262  	PXOR       X3, X14
  1263  	PSHUFB     96(BP), X15
  1264  	PSHUFB     96(BP), X12
  1265  	PADDD      X15, X10
  1266  	PADDD      X12, X11
  1267  	PSHUFB     96(BP), X13
  1268  	PSHUFB     96(BP), X14
  1269  	PADDD      X13, X8
  1270  	PADDD      X14, X9
  1271  	MOVO       X15, 112(BP)
  1272  	PXOR       X10, X5
  1273  	PXOR       X11, X6
  1274  	MOVO       X5, X15
  1275  	PSLLL      $ 7, X5
  1276  	PSRLL      $25, X15
  1277  	PXOR       X15, X5
  1278  	MOVO       X6, X15
  1279  	PSLLL      $ 7, X6
  1280  	PSRLL      $25, X15
  1281  	PXOR       X15, X6
  1282  	PXOR       X8, X7
  1283  	PXOR       X9, X4
  1284  	MOVO       X7, X15
  1285  	PSLLL      $ 7, X7
  1286  	PSRLL      $25, X15
  1287  	PXOR       X15, X7
  1288  	MOVO       X4, X15
  1289  	PSLLL      $ 7, X4
  1290  	PSRLL      $25, X15
  1291  	PXOR       X15, X4
  1292  	SUBQ       $2, AX
  1293  	MOVO       112(BP), X15 // the JNZ below still tests the SUBQ result
  1294  	JNZ        chacha_blocks_ssse3_mainloop1
        	// Add the broadcast input state (and per-block counters) back in.
  1295  	PADDD      128(BP), X0
  1296  	PADDD      144(BP), X1
  1297  	PADDD      160(BP), X2
  1298  	PADDD      176(BP), X3
  1299  	PADDD      192(BP), X4
  1300  	PADDD      208(BP), X5
  1301  	PADDD      224(BP), X6
  1302  	PADDD      240(BP), X7
  1303  	PADDD      256(BP), X8
  1304  	PADDD      272(BP), X9
  1305  	PADDD      288(BP), X10
  1306  	PADDD      304(BP), X11
  1307  	PADDD      320(BP), X12
  1308  	PADDD      336(BP), X13
  1309  	PADDD      352(BP), X14
  1310  	PADDD      368(BP), X15
        	// Park words 8-15 so X8-X15 are free while words 0-7 are transposed.
  1311  	MOVO       X8, 384(BP)
  1312  	MOVO       X9, 400(BP)
  1313  	MOVO       X10, 416(BP)
  1314  	MOVO       X11, 432(BP)
  1315  	MOVO       X12, 448(BP)
  1316  	MOVO       X13, 464(BP)
  1317  	MOVO       X14, 480(BP)
  1318  	MOVO       X15, 496(BP)
        	// Transpose words 0-7 so each register holds 16 contiguous bytes
        	// of one block.
  1319  	MOVO       X0, X8
  1320  	MOVO       X2, X9
  1321  	MOVO       X4, X10
  1322  	MOVO       X6, X11
  1323  	PUNPCKHLQ  X1, X0
  1324  	PUNPCKHLQ  X3, X2
  1325  	PUNPCKHLQ  X5, X4
  1326  	PUNPCKHLQ  X7, X6
  1327  	PUNPCKLLQ  X1, X8
  1328  	PUNPCKLLQ  X3, X9
  1329  	PUNPCKLLQ  X5, X10
  1330  	PUNPCKLLQ  X7, X11
  1331  	MOVO       X0, X1
  1332  	MOVO       X4, X3
  1333  	MOVO       X8, X5
  1334  	MOVO       X10, X7
  1335  	PUNPCKHQDQ X2, X0
  1336  	PUNPCKHQDQ X6, X4
  1337  	PUNPCKHQDQ X9, X8
  1338  	PUNPCKHQDQ X11, X10
  1339  	PUNPCKLQDQ X2, X1
  1340  	PUNPCKLQDQ X6, X3
  1341  	PUNPCKLQDQ X9, X5
  1342  	PUNPCKLQDQ X11, X7
  1343  	ANDQ       SI, SI // nil input pointer -> emit raw keystream
  1344  	JZ         chacha_blocks_ssse3_noinput1
        	// XOR and store bytes 0-31 of each of the four blocks.
  1345  	MOVOU      0(SI), X2
  1346  	MOVOU      16(SI), X6
  1347  	MOVOU      64(SI), X9
  1348  	MOVOU      80(SI), X11
  1349  	MOVOU      128(SI), X12
  1350  	MOVOU      144(SI), X13
  1351  	MOVOU      192(SI), X14
  1352  	MOVOU      208(SI), X15
  1353  	PXOR       X2, X5
  1354  	PXOR       X6, X7
  1355  	PXOR       X9, X8
  1356  	PXOR       X11, X10
  1357  	PXOR       X12, X1
  1358  	PXOR       X13, X3
  1359  	PXOR       X14, X0
  1360  	PXOR       X15, X4
  1361  	MOVOU      X5, 0(DX)
  1362  	MOVOU      X7, 16(DX)
  1363  	MOVOU      X8, 64(DX)
  1364  	MOVOU      X10, 80(DX)
  1365  	MOVOU      X1, 128(DX)
  1366  	MOVOU      X3, 144(DX)
  1367  	MOVOU      X0, 192(DX)
  1368  	MOVOU      X4, 208(DX)
        	// Reload words 8-15 and transpose them the same way.
  1369  	MOVO       384(BP), X0
  1370  	MOVO       400(BP), X1
  1371  	MOVO       416(BP), X2
  1372  	MOVO       432(BP), X3
  1373  	MOVO       448(BP), X4
  1374  	MOVO       464(BP), X5
  1375  	MOVO       480(BP), X6
  1376  	MOVO       496(BP), X7
  1377  	MOVO       X0, X8
  1378  	MOVO       X2, X9
  1379  	MOVO       X4, X10
  1380  	MOVO       X6, X11
  1381  	PUNPCKLLQ  X1, X8
  1382  	PUNPCKLLQ  X3, X9
  1383  	PUNPCKHLQ  X1, X0
  1384  	PUNPCKHLQ  X3, X2
  1385  	PUNPCKLLQ  X5, X10
  1386  	PUNPCKLLQ  X7, X11
  1387  	PUNPCKHLQ  X5, X4
  1388  	PUNPCKHLQ  X7, X6
  1389  	MOVO       X8, X1
  1390  	MOVO       X0, X3
  1391  	MOVO       X10, X5
  1392  	MOVO       X4, X7
  1393  	PUNPCKLQDQ X9, X1
  1394  	PUNPCKLQDQ X11, X5
  1395  	PUNPCKHQDQ X9, X8
  1396  	PUNPCKHQDQ X11, X10
  1397  	PUNPCKLQDQ X2, X3
  1398  	PUNPCKLQDQ X6, X7
  1399  	PUNPCKHQDQ X2, X0
  1400  	PUNPCKHQDQ X6, X4
        	// XOR and store bytes 32-63 of each of the four blocks.
  1401  	MOVOU      32(SI), X2
  1402  	MOVOU      48(SI), X6
  1403  	MOVOU      96(SI), X9
  1404  	MOVOU      112(SI), X11
  1405  	MOVOU      160(SI), X12
  1406  	MOVOU      176(SI), X13
  1407  	MOVOU      224(SI), X14
  1408  	MOVOU      240(SI), X15
  1409  	PXOR       X2, X1
  1410  	PXOR       X6, X5
  1411  	PXOR       X9, X8
  1412  	PXOR       X11, X10
  1413  	PXOR       X12, X3
  1414  	PXOR       X13, X7
  1415  	PXOR       X14, X0
  1416  	PXOR       X15, X4
  1417  	MOVOU      X1, 32(DX)
  1418  	MOVOU      X5, 48(DX)
  1419  	MOVOU      X8, 96(DX)
  1420  	MOVOU      X10, 112(DX)
  1421  	MOVOU      X3, 160(DX)
  1422  	MOVOU      X7, 176(DX)
  1423  	MOVOU      X0, 224(DX)
  1424  	MOVOU      X4, 240(DX)
  1425  	ADDQ       $256, SI
  1426  	JMP        chacha_blocks_ssse3_mainloop_cont
  1427  
        // Same stores and transpose as above, without the input XOR.
  1428  chacha_blocks_ssse3_noinput1:
  1429  	MOVOU      X5, 0(DX)
  1430  	MOVOU      X7, 16(DX)
  1431  	MOVOU      X8, 64(DX)
  1432  	MOVOU      X10, 80(DX)
  1433  	MOVOU      X1, 128(DX)
  1434  	MOVOU      X3, 144(DX)
  1435  	MOVOU      X0, 192(DX)
  1436  	MOVOU      X4, 208(DX)
  1437  	MOVO       384(BP), X0
  1438  	MOVO       400(BP), X1
  1439  	MOVO       416(BP), X2
  1440  	MOVO       432(BP), X3
  1441  	MOVO       448(BP), X4
  1442  	MOVO       464(BP), X5
  1443  	MOVO       480(BP), X6
  1444  	MOVO       496(BP), X7
  1445  	MOVO       X0, X8
  1446  	MOVO       X2, X9
  1447  	MOVO       X4, X10
  1448  	MOVO       X6, X11
  1449  	PUNPCKLLQ  X1, X8
  1450  	PUNPCKLLQ  X3, X9
  1451  	PUNPCKHLQ  X1, X0
  1452  	PUNPCKHLQ  X3, X2
  1453  	PUNPCKLLQ  X5, X10
  1454  	PUNPCKLLQ  X7, X11
  1455  	PUNPCKHLQ  X5, X4
  1456  	PUNPCKHLQ  X7, X6
  1457  	MOVO       X8, X1
  1458  	MOVO       X0, X3
  1459  	MOVO       X10, X5
  1460  	MOVO       X4, X7
  1461  	PUNPCKLQDQ X9, X1
  1462  	PUNPCKLQDQ X11, X5
  1463  	PUNPCKHQDQ X9, X8
  1464  	PUNPCKHQDQ X11, X10
  1465  	PUNPCKLQDQ X2, X3
  1466  	PUNPCKLQDQ X6, X7
  1467  	PUNPCKHQDQ X2, X0
  1468  	PUNPCKHQDQ X6, X4
  1469  	MOVOU      X1, 32(DX)
  1470  	MOVOU      X5, 48(DX)
  1471  	MOVOU      X8, 96(DX)
  1472  	MOVOU      X10, 112(DX)
  1473  	MOVOU      X3, 160(DX)
  1474  	MOVOU      X7, 176(DX)
  1475  	MOVOU      X0, 224(DX)
  1476  	MOVOU      X4, 240(DX)
  1477  
  1478  chacha_blocks_ssse3_mainloop_cont:
  1479  	ADDQ $256, DX
  1480  	SUBQ $256, CX
  1481  	CMPQ CX, $256
  1482  	JAE  chacha_blocks_ssse3_atleast256
        	// Fewer than 256 bytes left: reload the masks and the row-form state
        	// (48(BP) now holds the advanced counter) for the one-block path.
  1483  	MOVO 80(BP), X6
  1484  	MOVO 96(BP), X7
  1485  	MOVO 0(BP), X8
  1486  	MOVO 16(BP), X9
  1487  	MOVO 32(BP), X10
  1488  	MOVO 48(BP), X11
  1489  	MOVQ $1, R9
  1490  
        // One block at a time.  X8-X11 hold the four state rows, X6/X7 the
        // rotate masks, CX the bytes remaining.
  1491  chacha_blocks_ssse3_below256:
  1492  	MOVQ R9, X5 // X5 = counter increment added to X11 after each block
  1493  	ANDQ CX, CX
  1494  	JZ   chacha_blocks_ssse3_done
  1495  	CMPQ CX, $64
  1496  	JAE  chacha_blocks_ssse3_above63
        	// Partial final block: remember the real destination in R9 and run
        	// the block against the scratch buffer at BP instead.
  1497  	MOVQ DX, R9
  1498  	ANDQ SI, SI
  1499  	JZ   chacha_blocks_ssse3_noinput2
  1500  	MOVQ CX, R10
  1501  	MOVQ BP, DX
  1502  	ADDQ R10, SI
  1503  	ADDQ R10, DX
  1504  	NEGQ R10
  1505  
        // Copy the CX remaining input bytes into the scratch buffer; R10 runs
        // from -CX up to 0 as an index off the end pointers.
  1506  chacha_blocks_ssse3_copyinput:
  1507  	MOVB (SI)(R10*1), AX
  1508  	MOVB AX, (DX)(R10*1)
  1509  	INCQ R10
  1510  	JNZ  chacha_blocks_ssse3_copyinput
  1511  	MOVQ BP, SI
  1512  
  1513  chacha_blocks_ssse3_noinput2:
  1514  	MOVQ BP, DX
  1515  
  1516  chacha_blocks_ssse3_above63:
  1517  	MOVO X8, X0
  1518  	MOVO X9, X1
  1519  	MOVO X10, X2
  1520  	MOVO X11, X3
  1521  
  1522  	// MOVQ 64(BP), AX
  1523  	MOVQ $20, AX
  1524  
        // Single-block double round on rows X0-X3, with X4 as rotate temporary.
        // The PSHUFD instructions rotate the lanes of rows 0, 2 and 3 between
        // the two halves and rotate them back at the end.
  1525  chacha_blocks_ssse3_mainloop2:
  1526  	PADDD  X1, X0
  1527  	PXOR   X0, X3
  1528  	PSHUFB X6, X3
  1529  	PADDD  X3, X2
  1530  	PXOR   X2, X1
  1531  	MOVO   X1, X4
  1532  	PSLLL  $12, X4
  1533  	PSRLL  $20, X1
  1534  	PXOR   X4, X1
  1535  	PADDD  X1, X0
  1536  	PXOR   X0, X3
  1537  	PSHUFB X7, X3
  1538  	PSHUFD $0x93, X0, X0
  1539  	PADDD  X3, X2
  1540  	PSHUFD $0x4e, X3, X3
  1541  	PXOR   X2, X1
  1542  	PSHUFD $0x39, X2, X2
  1543  	MOVO   X1, X4
  1544  	PSLLL  $7, X4
  1545  	PSRLL  $25, X1
  1546  	PXOR   X4, X1
  1547  	PADDD  X1, X0
  1548  	PXOR   X0, X3
  1549  	PSHUFB X6, X3
  1550  	PADDD  X3, X2
  1551  	PXOR   X2, X1
  1552  	MOVO   X1, X4
  1553  	PSLLL  $12, X4
  1554  	PSRLL  $20, X1
  1555  	PXOR   X4, X1
  1556  	PADDD  X1, X0
  1557  	PXOR   X0, X3
  1558  	PSHUFB X7, X3
  1559  	PSHUFD $0x39, X0, X0
  1560  	PADDD  X3, X2
  1561  	PSHUFD $0x4e, X3, X3
  1562  	PXOR   X2, X1
  1563  	PSHUFD $0x93, X2, X2
  1564  	MOVO   X1, X4
  1565  	PSLLL  $7, X4
  1566  	PSRLL  $25, X1
  1567  	PXOR   X4, X1
  1568  	SUBQ   $2, AX
  1569  	JNZ    chacha_blocks_ssse3_mainloop2
        	// Add the input rows back in.
  1570  	PADDD  X8, X0
  1571  	PADDD  X9, X1
  1572  	PADDD  X10, X2
  1573  	PADDD  X11, X3
  1574  	ANDQ   SI, SI
  1575  	JZ     chacha_blocks_ssse3_noinput3
  1576  	MOVOU  0(SI), X12
  1577  	MOVOU  16(SI), X13
  1578  	MOVOU  32(SI), X14
  1579  	MOVOU  48(SI), X15
  1580  	PXOR   X12, X0
  1581  	PXOR   X13, X1
  1582  	PXOR   X14, X2
  1583  	PXOR   X15, X3
  1584  	ADDQ   $64, SI
  1585  
  1586  chacha_blocks_ssse3_noinput3:
  1587  	MOVOU X0, 0(DX)
  1588  	MOVOU X1, 16(DX)
  1589  	MOVOU X2, 32(DX)
  1590  	MOVOU X3, 48(DX)
  1591  	PADDQ X5, X11 // advance the 64-bit counter in the low half of X11
  1592  	CMPQ  CX, $64
  1593  	JBE   chacha_blocks_ssse3_mainloop2_finishup
  1594  	ADDQ  $64, DX
  1595  	SUBQ  $64, CX
  1596  	JMP   chacha_blocks_ssse3_below256
  1597  
  1598  chacha_blocks_ssse3_mainloop2_finishup:
  1599  	CMPQ CX, $64
  1600  	JE   chacha_blocks_ssse3_done
        	// CX < 64: the block was written to the scratch buffer (DX == BP);
        	// copy only the CX requested bytes to the real destination in R9.
  1601  	ADDQ CX, R9
  1602  	ADDQ CX, DX
  1603  	NEGQ CX
  1604  
  1605  chacha_blocks_ssse3_copyoutput:
  1606  	MOVB (DX)(CX*1), AX
  1607  	MOVB AX, (R9)(CX*1)
  1608  	INCQ CX
  1609  	JNZ  chacha_blocks_ssse3_copyoutput
  1610  
  1611  chacha_blocks_ssse3_done:
  1612  	MOVOU X11, 32(DI) // write the updated counter row back to s[12:16]
  1613  
  1614  	RET
  1615  
  1616  // func hChaChaSSSE3(key, nonce []byte, dst *byte)
        //
        // Runs 20 ChaCha rounds over a single 4x4 state built from the shared
        // constants row, 32 bytes read from key and 16 bytes read from nonce,
        // then stores rows 0 and 3 of the result (32 bytes) at dst.  The
        // initial state is not added back in before the store.
        //
        // Register roles: X0..X3 = state rows a, b, c, d; X4 = rotate scratch;
        // X5 = PSHUFB mask rotating each 32-bit lane left by 16; X6 = PSHUFB
        // mask rotating each lane left by 8; CX = double-rounds remaining.
  1617  TEXT ·hChaChaSSSE3(SB), NOSPLIT|NOFRAME, $0-56
  1618  	MOVQ dst+48(FP), DX
  1619  	MOVQ nonce+24(FP), SI
  1620  	MOVQ key+0(FP), DI
  1621  
  1622  	MOVOU 0(DI), X1
  1623  	MOVOU 16(DI), X2
  1624  	MOVOU 0(SI), X3
  1625  
  1626  	LEAQ ·chacha_constants<>(SB), AX
  1627  	MOVO 0(AX), X0
  1628  	MOVO 16(AX), X5
  1629  	MOVO 32(AX), X6
  1630  
  1631  	MOVL $10, CX
  1632  
  1633  hchacha_ssse3_doubleround:
        	// Quarter-round on the columns.
  1634  	PADDD  X1, X0
  1635  	PXOR   X0, X3
  1636  	PSHUFB X5, X3
  1637  	PADDD  X3, X2
  1638  	PXOR   X2, X1
  1639  	MOVO   X1, X4
  1640  	PSRLL  $20, X1
  1641  	PSLLL  $12, X4
  1642  	PXOR   X4, X1
  1643  	PADDD  X1, X0
  1644  	PXOR   X0, X3
  1645  	PSHUFB X6, X3
  1646  	PADDD  X3, X2
  1647  	PXOR   X2, X1
  1648  	MOVO   X1, X4
  1649  	PSRLL  $25, X1
  1650  	PSLLL  $7, X4
  1651  	PXOR   X4, X1
        
        	// Rotate the lanes of rows a, d and c so the next quarter-round
        	// operates on the diagonals.
  1652  	PSHUFD $0x93, X0, X0
  1653  	PSHUFD $0x4e, X3, X3
  1654  	PSHUFD $0x39, X2, X2
        
        	// Quarter-round on the diagonals.
  1655  	PADDD  X1, X0
  1656  	PXOR   X0, X3
  1657  	PSHUFB X5, X3
  1658  	PADDD  X3, X2
  1659  	PXOR   X2, X1
  1660  	MOVO   X1, X4
  1661  	PSRLL  $20, X1
  1662  	PSLLL  $12, X4
  1663  	PXOR   X4, X1
  1664  	PADDD  X1, X0
  1665  	PXOR   X0, X3
  1666  	PSHUFB X6, X3
  1667  	PADDD  X3, X2
  1668  	PXOR   X2, X1
  1669  	MOVO   X1, X4
  1670  	PSRLL  $25, X1
  1671  	PSLLL  $7, X4
  1672  	PXOR   X4, X1
        
        	// Undo the lane rotation.
  1673  	PSHUFD $0x39, X0, X0
  1674  	PSHUFD $0x4e, X3, X3
  1675  	PSHUFD $0x93, X2, X2
        
  1676  	DECL   CX
  1677  	JNZ    hchacha_ssse3_doubleround
  1678  
  1679  	MOVOU X0, 0(DX)
  1680  	MOVOU X3, 16(DX)
  1681  
  1682  	RET