git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/chacha/chacha_amd64.s (about)

     1  // Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
     2  // Use of this source code is governed by a license that can be
     3  // found in the LICENSE file.
     4  
     5  // +build amd64,!gccgo,!appengine,!nacl
     6  
     7  #include "const.s"
     8  #include "macro.s"
     9  
    10  // FINALIZE xors len bytes of src with len bytes of block, one byte per
    11  // loop iteration, using the temp. registers t0 and t1, and writes the
    12  // result to dst.
        // The first byte is processed before len is tested, so len must be > 0 on entry.
        // On exit src, block and dst have each been advanced by the original len,
        // len is 0, and t0, t1 and the flags are clobbered.
        // Every expansion defines the label FINALIZE_LOOP.
    13  #define FINALIZE(dst, src, block, len, t0, t1) \
    14  	XORQ t0, t0;       \
    15  	XORQ t1, t1;       \
    16  	FINALIZE_LOOP:;    \
    17  	MOVB 0(src), t0;   \
    18  	MOVB 0(block), t1; \
    19  	XORQ t0, t1;       \
    20  	MOVB t1, 0(dst);   \
    21  	INCQ src;          \
    22  	INCQ block;        \
    23  	INCQ dst;          \
    24  	DECQ len;          \
    25  	JG   FINALIZE_LOOP \
    26  
    27  #define Dst DI
    28  #define Nonce AX
    29  #define Key BX
    30  #define Rounds DX
    31  
    32  // func initialize(state *[64]byte, key []byte, nonce *[16]byte)
        //
        // initialize fills the 64-byte state with four 16-byte rows:
        //   state[ 0:16] = the ·sigma<> constant (not defined in the visible source)
        //   state[16:48] = the first 32 bytes of key
        //   state[48:64] = the 16 bytes of nonce
        // Only the base pointer of key is loaded; its length is never checked here,
        // so the caller has to supply at least 32 readable bytes.
        // All four loads complete before the first store to state.
        // The Rounds alias defined above is not used by this function.
        // NOTE(review): TEXT flag 4 matches the value of NOSPLIT in Go's textflag.h.
    33  TEXT ·initialize(SB), 4, $0-40
    34  	MOVQ state+0(FP), Dst
    35  	MOVQ key+8(FP), Key // base pointer of the key slice
    36  	MOVQ nonce+32(FP), Nonce
    37  
    38  	MOVOU ·sigma<>(SB), X0 // row 0: constant
    39  	MOVOU 0*16(Key), X1 // row 1: key bytes 0-15
    40  	MOVOU 1*16(Key), X2 // row 2: key bytes 16-31
    41  	MOVOU 0*16(Nonce), X3 // row 3: the 16 nonce bytes
    42  
    43  	MOVOU X0, 0*16(Dst)
    44  	MOVOU X1, 1*16(Dst)
    45  	MOVOU X2, 2*16(Dst)
    46  	MOVOU X3, 3*16(Dst)
    47  	RET
    48  
    49  #undef Dst
    50  #undef Nonce
    51  #undef Key
    52  #undef Rounds
    53  
    54  #define Dst DI
    55  #define Src SI
    56  #define Len R12
    57  #define Rounds DX
    58  #define Buffer BX
    59  #define State AX
    60  #define Stack SP
    61  #define SavedSP R8
    62  #define Tmp0 R9
    63  #define Tmp1 R10
    64  #define Tmp2 R11
        // The aliases above stay in effect until their matching #undef lines.
        // Stack aliases SP itself, so code that realigns Stack moves the stack pointer.
    65  
    66  // func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
        //
        // xorKeyStreamSSE2 xors src_len bytes of src with generated keystream and
        // writes the result to dst. dst_len is never read, so dst has to be at least
        // as long as src.
        //
        // Keystream is generated from the four 16-byte rows of state in batches of one
        // to four 64-byte blocks. After every generated block the ·one<> constant is
        // added to row 3 with PADDQ (presumably a 64-bit block-counter increment; the
        // constant is not defined in the visible source). On return only row 3 is
        // written back to state[48:64].
        //
        // If src_len is not a multiple of 64, the last generated block is stored to
        // block in full and the trailing bytes are xored one at a time by FINALIZE.
        // The return value is the number of those trailing bytes (src_len mod 64).
        //
        // rounds is copied to a counter that drops by 2 per loop iteration; the loops
        // exit only when it reaches exactly zero, so rounds must be positive and even.
        //
        // Stack slots (16 bytes each, relative to the realigned Stack):
        //   0-3  input rows X0-X3    4  ·one<>    5  spill slot used by the 4-block loop
        // NOTE(review): TEXT flag 4 matches the value of NOSPLIT in Go's textflag.h.
        // NOTE(review): the CHACHA_* and XOR_SSE macros are not defined in the visible
        // source (presumably macro.s); their last argument appears to be a scratch register.
    67  TEXT ·xorKeyStreamSSE2(SB), 4, $112-80
    68  	MOVQ dst_base+0(FP), Dst
    69  	MOVQ src_base+24(FP), Src
    70  	MOVQ block+48(FP), Buffer
    71  	MOVQ state+56(FP), State
    72  	MOVQ rounds+64(FP), Rounds
    73  	MOVQ src_len+32(FP), Len
    74  
    75  	MOVOU 0*16(State), X0
    76  	MOVOU 1*16(State), X1
    77  	MOVOU 2*16(State), X2
    78  	MOVOU 3*16(State), X3
    79  
    80  	MOVQ Stack, SavedSP // remember the incoming SP; it is put back at DONE
    81  	ADDQ $16, Stack // these two instructions round SP up to the next
    82  	ANDQ $-16, Stack // 16-byte boundary so the slots can be accessed with MOVO
    83  
    84  	TESTQ Len, Len
    85  	JZ    DONE // empty src: nothing to xor
    86  
    87  	MOVOU ·one<>(SB), X4
    88  	MOVO  X0, 0*16(Stack)
    89  	MOVO  X1, 1*16(Stack)
    90  	MOVO  X2, 2*16(Stack)
    91  	MOVO  X3, 3*16(Stack)
    92  	MOVO  X4, 4*16(Stack) // slot 4: constant added to row 3 after each block
    93  
    94  	CMPQ Len, $64 // pick the smallest batch (1-4 blocks) that covers Len
    95  	JLE  GENERATE_KEYSTREAM_64
    96  	CMPQ Len, $128
    97  	JLE  GENERATE_KEYSTREAM_128
    98  	CMPQ Len, $192
    99  	JLE  GENERATE_KEYSTREAM_192
   100  
        // Four blocks per pass: X0-X3, X12-X15, X8-X11 and X4-X7. Their row 3 is the
        // current X3 with slot 4 added 0, 1, 2 and 3 times respectively.
   101  GENERATE_KEYSTREAM_256:
   102  	MOVO  X0, X12
   103  	MOVO  X1, X13
   104  	MOVO  X2, X14
   105  	MOVO  X3, X15
   106  	PADDQ 4*16(Stack), X15
   107  	MOVO  X0, X8
   108  	MOVO  X1, X9
   109  	MOVO  X2, X10
   110  	MOVO  X15, X11
   111  	PADDQ 4*16(Stack), X11
   112  	MOVO  X0, X4
   113  	MOVO  X1, X5
   114  	MOVO  X2, X6
   115  	MOVO  X11, X7
   116  	PADDQ 4*16(Stack), X7
   117  	MOVQ  Rounds, Tmp0
   118  
   119  	MOVO X3, 3*16(Stack) // Save X3
   120  
        // All sixteen XMM registers hold block data here, so the register passed as
        // the last macro argument (X4, then X0) is parked in slot 5 around each pair
        // of CHACHA_QROUND_SSE2 expansions.
   121  CHACHA_LOOP_256:
   122  	MOVO X4, 5*16(Stack)
   123  	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
   124  	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
   125  	MOVO 5*16(Stack), X4
   126  	MOVO X0, 5*16(Stack)
   127  	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
   128  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
   129  	MOVO 5*16(Stack), X0
   130  	CHACHA_SHUFFLE_SSE(X1, X2, X3)
   131  	CHACHA_SHUFFLE_SSE(X13, X14, X15)
   132  	CHACHA_SHUFFLE_SSE(X9, X10, X11)
   133  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   134  	MOVO X4, 5*16(Stack)
   135  	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
   136  	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
   137  	MOVO 5*16(Stack), X4
   138  	MOVO X0, 5*16(Stack)
   139  	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
   140  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
   141  	MOVO 5*16(Stack), X0
   142  	CHACHA_SHUFFLE_SSE(X3, X2, X1)
   143  	CHACHA_SHUFFLE_SSE(X15, X14, X13)
   144  	CHACHA_SHUFFLE_SSE(X11, X10, X9)
   145  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   146  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   147  	JNZ  CHACHA_LOOP_256
   148  
   149  	PADDL 0*16(Stack), X0 // add the saved input rows to the first block
   150  	PADDL 1*16(Stack), X1
   151  	PADDL 2*16(Stack), X2
   152  	PADDL 3*16(Stack), X3
   153  	MOVO  X4, 5*16(Stack) // Save X4
   154  	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
   155  	MOVO  5*16(Stack), X4 // Restore X4
   156  
   157  	MOVO  0*16(Stack), X0 // reload the input rows for the other three blocks
   158  	MOVO  1*16(Stack), X1
   159  	MOVO  2*16(Stack), X2
   160  	MOVO  3*16(Stack), X3
   161  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the second block
   162  
   163  	PADDL X0, X12
   164  	PADDL X1, X13
   165  	PADDL X2, X14
   166  	PADDL X3, X15
   167  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the third block
   168  	PADDL X0, X8
   169  	PADDL X1, X9
   170  	PADDL X2, X10
   171  	PADDL X3, X11
   172  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the fourth block
   173  	PADDL X0, X4
   174  	PADDL X1, X5
   175  	PADDL X2, X6
   176  	PADDL X3, X7
   177  	PADDQ 4*16(Stack), X3 // X3 = row 3 for whatever is generated next
   178  
   179  	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
   180  	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
   181  	MOVO 0*16(Stack), X0 // Restore X0
   182  	ADDQ $192, Dst // the first three blocks have been consumed
   183  	ADDQ $192, Src
   184  	SUBQ $192, Len
   185  
   186  	CMPQ Len, $64 // fourth block (X4-X7): use it whole or buffer it for the tail
   187  	JL   BUFFER_KEYSTREAM
   188  
   189  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   190  	ADDQ $64, Dst
   191  	ADDQ $64, Src
   192  	SUBQ $64, Len
   193  	JZ   DONE
   194  	CMPQ Len, $64               // If Len <= 64 -> gen. only 64 byte keystream.
   195  	JLE  GENERATE_KEYSTREAM_64
   196  	CMPQ Len, $128              // If 64 < Len <= 128 -> gen. only 128 byte keystream.
   197  	JLE  GENERATE_KEYSTREAM_128
   198  	CMPQ Len, $192              // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
   199  	JG   GENERATE_KEYSTREAM_256
   200  
        // Three blocks: X12-X15 (row 3 = X3), X8-X11 (slot 4 added once) and
        // X4-X7 (slot 4 added twice). X0 is passed as the last macro argument and
        // is reloaded from slot 0 afterwards.
   201  GENERATE_KEYSTREAM_192:
   202  	MOVO  X0, X12
   203  	MOVO  X1, X13
   204  	MOVO  X2, X14
   205  	MOVO  X3, X15
   206  	MOVO  X0, X8
   207  	MOVO  X1, X9
   208  	MOVO  X2, X10
   209  	MOVO  X3, X11
   210  	PADDQ 4*16(Stack), X11
   211  	MOVO  X0, X4
   212  	MOVO  X1, X5
   213  	MOVO  X2, X6
   214  	MOVO  X11, X7
   215  	PADDQ 4*16(Stack), X7
   216  	MOVQ  Rounds, Tmp0
   217  
   218  CHACHA_LOOP_192:
   219  	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
   220  	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
   221  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
   222  	CHACHA_SHUFFLE_SSE(X13, X14, X15)
   223  	CHACHA_SHUFFLE_SSE(X9, X10, X11)
   224  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   225  	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
   226  	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
   227  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
   228  	CHACHA_SHUFFLE_SSE(X15, X14, X13)
   229  	CHACHA_SHUFFLE_SSE(X11, X10, X9)
   230  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   231  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   232  	JNZ  CHACHA_LOOP_192
   233  
   234  	MOVO  0*16(Stack), X0 // Restore X0
   235  	PADDL X0, X12
   236  	PADDL X1, X13
   237  	PADDL X2, X14
   238  	PADDL X3, X15
   239  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the second block
   240  	PADDL X0, X8
   241  	PADDL X1, X9
   242  	PADDL X2, X10
   243  	PADDL X3, X11
   244  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the third block
   245  	PADDL X0, X4
   246  	PADDL X1, X5
   247  	PADDL X2, X6
   248  	PADDL X3, X7
   249  	PADDQ 4*16(Stack), X3 // X3 = row 3 for whatever is generated next
   250  
   251  	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
   252  	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
   253  	MOVO 0*16(Stack), X0 // Restore X0
   254  	ADDQ $128, Dst // the first two blocks have been consumed
   255  	ADDQ $128, Src
   256  	SUBQ $128, Len
   257  
   258  	CMPQ Len, $64 // third block (X4-X7): use it whole or buffer it for the tail
   259  	JL   BUFFER_KEYSTREAM
   260  
   261  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   262  	ADDQ $64, Dst
   263  	ADDQ $64, Src
   264  	SUBQ $64, Len
   265  	JZ   DONE
   266  	CMPQ Len, $64              // If Len <= 64 -> gen. only 64 byte keystream.
   267  	JLE  GENERATE_KEYSTREAM_64
   268  
        // Two blocks: X8-X11 (row 3 = X3) and X4-X7 (slot 4 added once).
        // X12 is free here and is passed as the last macro argument.
   269  GENERATE_KEYSTREAM_128:
   270  	MOVO  X0, X8
   271  	MOVO  X1, X9
   272  	MOVO  X2, X10
   273  	MOVO  X3, X11
   274  	MOVO  X0, X4
   275  	MOVO  X1, X5
   276  	MOVO  X2, X6
   277  	MOVO  X3, X7
   278  	PADDQ 4*16(Stack), X7
   279  	MOVQ  Rounds, Tmp0
   280  
   281  CHACHA_LOOP_128:
   282  	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
   283  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
   284  	CHACHA_SHUFFLE_SSE(X9, X10, X11)
   285  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   286  	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
   287  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
   288  	CHACHA_SHUFFLE_SSE(X11, X10, X9)
   289  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   290  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   291  	JNZ  CHACHA_LOOP_128
   292  
   293  	PADDL X0, X8
   294  	PADDL X1, X9
   295  	PADDL X2, X10
   296  	PADDL X3, X11
   297  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the second block
   298  	PADDL X0, X4
   299  	PADDL X1, X5
   300  	PADDL X2, X6
   301  	PADDL X3, X7
   302  	PADDQ 4*16(Stack), X3 // X3 = row 3 for whatever is generated next
   303  
   304  	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
   305  	ADDQ $64, Dst
   306  	ADDQ $64, Src
   307  	SUBQ $64, Len
   308  
   309  	CMPQ Len, $64 // second block (X4-X7): use it whole or buffer it for the tail
   310  	JL   BUFFER_KEYSTREAM
   311  
   312  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   313  	ADDQ $64, Dst
   314  	ADDQ $64, Src
   315  	SUBQ $64, Len
   316  	JZ   DONE     // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream
   317  
        // One block in X4-X7; X8 is passed as the last macro argument.
   318  GENERATE_KEYSTREAM_64:
   319  	MOVO X0, X4
   320  	MOVO X1, X5
   321  	MOVO X2, X6
   322  	MOVO X3, X7
   323  	MOVQ Rounds, Tmp0
   324  
   325  CHACHA_LOOP_64:
   326  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
   327  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   328  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
   329  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   330  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   331  	JNZ  CHACHA_LOOP_64
   332  
   333  	PADDL X0, X4
   334  	PADDL X1, X5
   335  	PADDL X2, X6
   336  	PADDL X3, X7
   337  	PADDQ 4*16(Stack), X3 // X3 = row 3 for the next call (written back at DONE)
   338  
   339  	CMPQ Len, $64
   340  	JL   BUFFER_KEYSTREAM
   341  
   342  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   343  	ADDQ $64, Src
   344  	ADDQ $64, Dst
   345  	SUBQ $64, Len
   346  	JMP  DONE     // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.
   347  
        // Fewer than 64 bytes remain, and Len is non-zero on every path that jumps
        // here. The last generated block, which every path leaves in X4-X7, is
        // stored to Buffer in full before the trailing bytes are xored from it.
   348  BUFFER_KEYSTREAM:
   349  	MOVOU X4, 0*16(Buffer)
   350  	MOVOU X5, 1*16(Buffer)
   351  	MOVOU X6, 2*16(Buffer)
   352  	MOVOU X7, 3*16(Buffer)
   353  	MOVQ  Len, Tmp0 // count down a copy so Len survives as the return value
   354  	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)
   355  
   356  DONE:
   357  	MOVQ  SavedSP, Stack  // Restore stack pointer
   358  	MOVOU X3, 3*16(State) // only row 3 is written back to state
   359  	MOVQ  Len, ret+72(FP) // 0, or the number of bytes taken from the buffered block
   360  	RET
   361  
   362  // func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
        //
        // xorKeyStreamSSSE3 xors src_len bytes of src with generated keystream and
        // writes the result to dst. dst_len is never read, so dst has to be at least
        // as long as src.
        //
        // Keystream is generated from the four 16-byte rows of state in batches of one
        // to four 64-byte blocks. After every generated block the ·one<> constant is
        // added to row 3 with PADDQ (presumably a 64-bit block-counter increment; the
        // constant is not defined in the visible source). On return only row 3 is
        // written back to state[48:64].
        //
        // If src_len is not a multiple of 64, the last generated block is stored to
        // block in full and the trailing bytes are xored one at a time by FINALIZE.
        // The return value is the number of those trailing bytes (src_len mod 64).
        //
        // rounds is copied to a counter that drops by 2 per loop iteration; the loops
        // exit only when it reaches exactly zero, so rounds must be positive and even.
        //
        // Stack slots (16 bytes each, relative to the realigned Stack, which aliases SP):
        //   0-3  input rows X0-X3    4  ·one<>    5  spill slot used by the 4-block loop
        //   6    ·rol16<>            7  ·rol8<>
        // NOTE(review): TEXT flag 4 matches the value of NOSPLIT in Go's textflag.h.
        // NOTE(review): the CHACHA_* and XOR_SSE macros are not defined in the visible
        // source (presumably macro.s); the fifth CHACHA_QROUND_SSSE3 argument appears
        // to be a scratch register and the last two the rotate constants.
   363  TEXT ·xorKeyStreamSSSE3(SB), 4, $144-80
   364  	MOVQ dst_base+0(FP), Dst
   365  	MOVQ src_base+24(FP), Src
   366  	MOVQ block+48(FP), Buffer
   367  	MOVQ state+56(FP), State
   368  	MOVQ rounds+64(FP), Rounds
   369  	MOVQ src_len+32(FP), Len
   370  
   371  	MOVOU 0*16(State), X0
   372  	MOVOU 1*16(State), X1
   373  	MOVOU 2*16(State), X2
   374  	MOVOU 3*16(State), X3
   375  
   376  	MOVQ Stack, SavedSP // remember the incoming SP; it is put back at DONE
   377  	ADDQ $16, Stack // these two instructions round SP up to the next
   378  	ANDQ $-16, Stack // 16-byte boundary so the slots can be accessed with MOVO
   379  
   380  	TESTQ Len, Len
   381  	JZ    DONE // empty src: nothing to xor
   382  
   383  	MOVOU ·one<>(SB), X4
   384  	MOVOU ·rol16<>(SB), X5
   385  	MOVOU ·rol8<>(SB), X6
   386  	MOVO  X0, 0*16(Stack)
   387  	MOVO  X1, 1*16(Stack)
   388  	MOVO  X2, 2*16(Stack)
   389  	MOVO  X3, 3*16(Stack)
   390  	MOVO  X4, 4*16(Stack) // slot 4: constant added to row 3 after each block
   391  	MOVO  X5, 6*16(Stack) // slot 6: ·rol16<> constant
   392  	MOVO  X6, 7*16(Stack) // slot 7: ·rol8<> constant
   393  
   394  	CMPQ Len, $64 // pick the smallest batch (1-4 blocks) that covers Len
   395  	JLE  GENERATE_KEYSTREAM_64
   396  	CMPQ Len, $128
   397  	JLE  GENERATE_KEYSTREAM_128
   398  	CMPQ Len, $192
   399  	JLE  GENERATE_KEYSTREAM_192
   400  
        // Four blocks per pass: X0-X3, X12-X15, X8-X11 and X4-X7. Their row 3 is the
        // current X3 with slot 4 added 0, 1, 2 and 3 times respectively.
   401  GENERATE_KEYSTREAM_256:
   402  	MOVO  X0, X12
   403  	MOVO  X1, X13
   404  	MOVO  X2, X14
   405  	MOVO  X3, X15
   406  	PADDQ 4*16(Stack), X15
   407  	MOVO  X0, X8
   408  	MOVO  X1, X9
   409  	MOVO  X2, X10
   410  	MOVO  X15, X11
   411  	PADDQ 4*16(Stack), X11
   412  	MOVO  X0, X4
   413  	MOVO  X1, X5
   414  	MOVO  X2, X6
   415  	MOVO  X11, X7
   416  	PADDQ 4*16(Stack), X7
   417  	MOVQ  Rounds, Tmp0
   418  
   419  	MOVO X3, 3*16(Stack) // Save X3
   420  
        // All sixteen XMM registers hold block data here, so the register passed as
        // the fifth macro argument (X4, then X0) is parked in slot 5 around each pair
        // of expansions, and the rotate constants are passed as memory operands
        // (slots 6 and 7) instead of registers.
   421  CHACHA_LOOP_256:
   422  	MOVO X4, 5*16(Stack)
   423  	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
   424  	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
   425  	MOVO 5*16(Stack), X4
   426  	MOVO X0, 5*16(Stack)
   427  	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
   428  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
   429  	MOVO 5*16(Stack), X0
   430  	CHACHA_SHUFFLE_SSE(X1, X2, X3)
   431  	CHACHA_SHUFFLE_SSE(X13, X14, X15)
   432  	CHACHA_SHUFFLE_SSE(X9, X10, X11)
   433  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   434  	MOVO X4, 5*16(Stack)
   435  	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
   436  	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
   437  	MOVO 5*16(Stack), X4
   438  	MOVO X0, 5*16(Stack)
   439  	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
   440  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
   441  	MOVO 5*16(Stack), X0
   442  	CHACHA_SHUFFLE_SSE(X3, X2, X1)
   443  	CHACHA_SHUFFLE_SSE(X15, X14, X13)
   444  	CHACHA_SHUFFLE_SSE(X11, X10, X9)
   445  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   446  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   447  	JNZ  CHACHA_LOOP_256
   448  
   449  	PADDL 0*16(Stack), X0 // add the saved input rows to the first block
   450  	PADDL 1*16(Stack), X1
   451  	PADDL 2*16(Stack), X2
   452  	PADDL 3*16(Stack), X3
   453  	MOVO  X4, 5*16(Stack) // Save X4
   454  	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
   455  	MOVO  5*16(Stack), X4 // Restore X4
   456  
   457  	MOVO  0*16(Stack), X0 // reload the input rows for the other three blocks
   458  	MOVO  1*16(Stack), X1
   459  	MOVO  2*16(Stack), X2
   460  	MOVO  3*16(Stack), X3
   461  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the second block
   462  
   463  	PADDL X0, X12
   464  	PADDL X1, X13
   465  	PADDL X2, X14
   466  	PADDL X3, X15
   467  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the third block
   468  	PADDL X0, X8
   469  	PADDL X1, X9
   470  	PADDL X2, X10
   471  	PADDL X3, X11
   472  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the fourth block
   473  	PADDL X0, X4
   474  	PADDL X1, X5
   475  	PADDL X2, X6
   476  	PADDL X3, X7
   477  	PADDQ 4*16(Stack), X3 // X3 = row 3 for whatever is generated next
   478  
   479  	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
   480  	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
   481  	MOVO 0*16(Stack), X0 // Restore X0
   482  	ADDQ $192, Dst // the first three blocks have been consumed
   483  	ADDQ $192, Src
   484  	SUBQ $192, Len
   485  
   486  	CMPQ Len, $64 // fourth block (X4-X7): use it whole or buffer it for the tail
   487  	JL   BUFFER_KEYSTREAM
   488  
   489  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   490  	ADDQ $64, Dst
   491  	ADDQ $64, Src
   492  	SUBQ $64, Len
   493  	JZ   DONE
   494  	CMPQ Len, $64               // If Len <= 64 -> gen. only 64 byte keystream.
   495  	JLE  GENERATE_KEYSTREAM_64
   496  	CMPQ Len, $128              // If 64 < Len <= 128 -> gen. only 128 byte keystream.
   497  	JLE  GENERATE_KEYSTREAM_128
   498  	CMPQ Len, $192              // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
   499  	JG   GENERATE_KEYSTREAM_256
   500  
        // Three blocks: X12-X15 (row 3 = X3), X8-X11 (slot 4 added once) and
        // X4-X7 (slot 4 added twice). X0 is the fifth macro argument and X1/X2 are
        // borrowed for the rotate constants; all three are reloaded from slots 0-2
        // once the loop has finished.
   501  GENERATE_KEYSTREAM_192:
   502  	MOVO  X0, X12
   503  	MOVO  X1, X13
   504  	MOVO  X2, X14
   505  	MOVO  X3, X15
   506  	MOVO  X0, X8
   507  	MOVO  X1, X9
   508  	MOVO  X2, X10
   509  	MOVO  X3, X11
   510  	PADDQ 4*16(Stack), X11
   511  	MOVO  X0, X4
   512  	MOVO  X1, X5
   513  	MOVO  X2, X6
   514  	MOVO  X11, X7
   515  	PADDQ 4*16(Stack), X7
   516  	MOVQ  Rounds, Tmp0
   517  
   518  	MOVO 6*16(Stack), X1 // Load 16 bit rotate-left constant
   519  	MOVO 7*16(Stack), X2 // Load 8 bit rotate-left constant
   520  
   521  CHACHA_LOOP_192:
   522  	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
   523  	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
   524  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
   525  	CHACHA_SHUFFLE_SSE(X13, X14, X15)
   526  	CHACHA_SHUFFLE_SSE(X9, X10, X11)
   527  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   528  	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
   529  	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
   530  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
   531  	CHACHA_SHUFFLE_SSE(X15, X14, X13)
   532  	CHACHA_SHUFFLE_SSE(X11, X10, X9)
   533  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   534  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   535  	JNZ  CHACHA_LOOP_192
   536  
   537  	MOVO  0*16(Stack), X0 // Restore X0
   538  	MOVO  1*16(Stack), X1 // Restore X1
   539  	MOVO  2*16(Stack), X2 // Restore X2
   540  	PADDL X0, X12
   541  	PADDL X1, X13
   542  	PADDL X2, X14
   543  	PADDL X3, X15
   544  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the second block
   545  	PADDL X0, X8
   546  	PADDL X1, X9
   547  	PADDL X2, X10
   548  	PADDL X3, X11
   549  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the third block
   550  	PADDL X0, X4
   551  	PADDL X1, X5
   552  	PADDL X2, X6
   553  	PADDL X3, X7
   554  	PADDQ 4*16(Stack), X3 // X3 = row 3 for whatever is generated next
   555  
   556  	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
   557  	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
   558  	MOVO 0*16(Stack), X0 // Restore X0
   559  	ADDQ $128, Dst // the first two blocks have been consumed
   560  	ADDQ $128, Src
   561  	SUBQ $128, Len
   562  
   563  	CMPQ Len, $64 // third block (X4-X7): use it whole or buffer it for the tail
   564  	JL   BUFFER_KEYSTREAM
   565  
   566  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   567  	ADDQ $64, Dst
   568  	ADDQ $64, Src
   569  	SUBQ $64, Len
   570  	JZ   DONE
   571  	CMPQ Len, $64              // If Len <= 64 -> gen. only 64 byte keystream.
   572  	JLE  GENERATE_KEYSTREAM_64
   573  
        // Two blocks: X8-X11 (row 3 = X3) and X4-X7 (slot 4 added once).
        // X12 is the fifth macro argument; X13/X14 carry the rotate constants.
   574  GENERATE_KEYSTREAM_128:
   575  	MOVO  X0, X8
   576  	MOVO  X1, X9
   577  	MOVO  X2, X10
   578  	MOVO  X3, X11
   579  	MOVO  X0, X4
   580  	MOVO  X1, X5
   581  	MOVO  X2, X6
   582  	MOVO  X3, X7
   583  	PADDQ 4*16(Stack), X7
   584  	MOVQ  Rounds, Tmp0
   585  
   586  	MOVO 6*16(Stack), X13 // Load 16 bit rotate-left constant
   587  	MOVO 7*16(Stack), X14 // Load 8 bit rotate-left constant
   588  
   589  CHACHA_LOOP_128:
   590  	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
   591  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
   592  	CHACHA_SHUFFLE_SSE(X9, X10, X11)
   593  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   594  	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
   595  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
   596  	CHACHA_SHUFFLE_SSE(X11, X10, X9)
   597  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   598  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   599  	JNZ  CHACHA_LOOP_128
   600  
   601  	PADDL X0, X8
   602  	PADDL X1, X9
   603  	PADDL X2, X10
   604  	PADDL X3, X11
   605  	PADDQ 4*16(Stack), X3 // X3 = row 3 used by the second block
   606  	PADDL X0, X4
   607  	PADDL X1, X5
   608  	PADDL X2, X6
   609  	PADDL X3, X7
   610  	PADDQ 4*16(Stack), X3 // X3 = row 3 for whatever is generated next
   611  
   612  	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
   613  	ADDQ $64, Dst
   614  	ADDQ $64, Src
   615  	SUBQ $64, Len
   616  
   617  	CMPQ Len, $64 // second block (X4-X7): use it whole or buffer it for the tail
   618  	JL   BUFFER_KEYSTREAM
   619  
   620  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   621  	ADDQ $64, Dst
   622  	ADDQ $64, Src
   623  	SUBQ $64, Len
   624  	JZ   DONE     // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream
   625  
        // One block in X4-X7; X8 is the fifth macro argument and X9/X10 carry the
        // rotate constants.
   626  GENERATE_KEYSTREAM_64:
   627  	MOVO X0, X4
   628  	MOVO X1, X5
   629  	MOVO X2, X6
   630  	MOVO X3, X7
   631  	MOVQ Rounds, Tmp0
   632  
   633  	MOVO 6*16(Stack), X9  // Load 16 bit rotate-left constant
   634  	MOVO 7*16(Stack), X10 // Load 8 bit rotate-left constant
   635  
   636  CHACHA_LOOP_64:
   637  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
   638  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   639  	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
   640  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   641  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   642  	JNZ  CHACHA_LOOP_64
   643  
   644  	PADDL X0, X4
   645  	PADDL X1, X5
   646  	PADDL X2, X6
   647  	PADDL X3, X7
   648  	PADDQ 4*16(Stack), X3 // X3 = row 3 for the next call (written back at DONE)
   649  
   650  	CMPQ Len, $64
   651  	JL   BUFFER_KEYSTREAM
   652  
   653  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
   654  	ADDQ $64, Src
   655  	ADDQ $64, Dst
   656  	SUBQ $64, Len
   657  	JMP  DONE     // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.
   658  
        // Fewer than 64 bytes remain, and Len is non-zero on every path that jumps
        // here. The last generated block, which every path leaves in X4-X7, is
        // stored to Buffer in full before the trailing bytes are xored from it.
   659  BUFFER_KEYSTREAM:
   660  	MOVOU X4, 0*16(Buffer)
   661  	MOVOU X5, 1*16(Buffer)
   662  	MOVOU X6, 2*16(Buffer)
   663  	MOVOU X7, 3*16(Buffer)
   664  	MOVQ  Len, Tmp0 // count down a copy so Len survives as the return value
   665  	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)
   666  
   667  DONE:
   668  	MOVQ  SavedSP, Stack  // Restore stack pointer
   669  	MOVOU X3, 3*16(State) // only row 3 is written back to state
   670  	MOVQ  Len, ret+72(FP) // 0, or the number of bytes taken from the buffered block
   671  	RET
   672  
   673  // func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
        //
        // xorKeyStreamAVX xors src_len bytes of src with generated keystream and
        // writes the result to dst, using VEX-encoded instructions on the 128-bit
        // X registers. dst_len is never read, so dst has to be at least as long as src.
        //
        // Keystream is generated from the four 16-byte rows of state in batches of one
        // to four 64-byte blocks. After every generated block the ·one<> constant is
        // added to row 3 with VPADDQ (presumably a 64-bit block-counter increment; the
        // constant is not defined in the visible source). On return only row 3 is
        // written back to state[48:64].
        //
        // If src_len is not a multiple of 64, the last generated block is stored to
        // block in full and the trailing bytes are xored one at a time by FINALIZE.
        // The return value is the number of those trailing bytes (src_len mod 64).
        //
        // rounds is copied to a counter that drops by 2 per loop iteration; the loops
        // exit only when it reaches exactly zero, so rounds must be positive and even.
        //
        // Stack slots (16 bytes each, relative to the realigned Stack, which aliases SP):
        //   0-3  input rows X0-X3    4  ·one<>    5  spill slot used by the 4-block loop
        //   6    ·rol16<>            7  ·rol8<>
        // NOTE(review): TEXT flag 4 matches the value of NOSPLIT in Go's textflag.h.
        // NOTE(review): the CHACHA_* and XOR_AVX macros are not defined in the visible
        // source (presumably macro.s); the fifth CHACHA_QROUND_AVX argument appears
        // to be a scratch register and the last two the rotate constants.
   674  TEXT ·xorKeyStreamAVX(SB), 4, $144-80
   675  	MOVQ dst_base+0(FP), Dst
   676  	MOVQ src_base+24(FP), Src
   677  	MOVQ block+48(FP), Buffer
   678  	MOVQ state+56(FP), State
   679  	MOVQ rounds+64(FP), Rounds
   680  	MOVQ src_len+32(FP), Len
   681  
   682  	VMOVDQU 0*16(State), X0
   683  	VMOVDQU 1*16(State), X1
   684  	VMOVDQU 2*16(State), X2
   685  	VMOVDQU 3*16(State), X3
   686  
   687  	MOVQ Stack, SavedSP // remember the incoming SP; it is put back at DONE
   688  	ADDQ $16, Stack // these two instructions round SP up to the next
   689  	ANDQ $-16, Stack // 16-byte boundary so the slots can be accessed with VMOVDQA
   690  
   691  	TESTQ Len, Len
   692  	JZ    DONE // empty src: nothing to xor
   693  
   694  	VMOVDQU ·one<>(SB), X4
   695  	VMOVDQU ·rol16<>(SB), X5
   696  	VMOVDQU ·rol8<>(SB), X6
   697  	VMOVDQA X0, 0*16(Stack)
   698  	VMOVDQA X1, 1*16(Stack)
   699  	VMOVDQA X2, 2*16(Stack)
   700  	VMOVDQA X3, 3*16(Stack)
   701  	VMOVDQA X4, 4*16(Stack) // slot 4: constant added to row 3 after each block
   702  	VMOVDQA X5, 6*16(Stack) // slot 6: ·rol16<> constant
   703  	VMOVDQA X6, 7*16(Stack) // slot 7: ·rol8<> constant
   704  
   705  	CMPQ Len, $64 // pick the smallest batch (1-4 blocks) that covers Len
   706  	JLE  GENERATE_KEYSTREAM_64
   707  	CMPQ Len, $128
   708  	JLE  GENERATE_KEYSTREAM_128
   709  	CMPQ Len, $192
   710  	JLE  GENERATE_KEYSTREAM_192
   711  
        // Four blocks per pass: X0-X3, X12-X15, X8-X11 and X4-X7. Their row 3 is the
        // current X3 with slot 4 added 0, 1, 2 and 3 times respectively.
   712  GENERATE_KEYSTREAM_256:
   713  	VMOVDQA X0, X12
   714  	VMOVDQA X1, X13
   715  	VMOVDQA X2, X14
   716  	VMOVDQA X3, X15
   717  	VPADDQ  4*16(Stack), X15, X15
   718  	VMOVDQA X0, X8
   719  	VMOVDQA X1, X9
   720  	VMOVDQA X2, X10
   721  	VMOVDQA X15, X11
   722  	VPADDQ  4*16(Stack), X11, X11
   723  	VMOVDQA X0, X4
   724  	VMOVDQA X1, X5
   725  	VMOVDQA X2, X6
   726  	VMOVDQA X11, X7
   727  	VPADDQ  4*16(Stack), X7, X7
   728  	MOVQ    Rounds, Tmp0
   729  
   730  	VMOVDQA X3, 3*16(Stack) // Save X3
   731  
        // All sixteen X registers hold block data here, so the register passed as
        // the fifth macro argument (X4, then X0) is parked in slot 5 around each pair
        // of expansions, and the rotate constants are passed as memory operands
        // (slots 6 and 7) instead of registers.
   732  CHACHA_LOOP_256:
   733  	VMOVDQA X4, 5*16(Stack)
   734  	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
   735  	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
   736  	VMOVDQA 5*16(Stack), X4
   737  	VMOVDQA X0, 5*16(Stack)
   738  	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
   739  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
   740  	VMOVDQA 5*16(Stack), X0
   741  	CHACHA_SHUFFLE_AVX(X1, X2, X3)
   742  	CHACHA_SHUFFLE_AVX(X13, X14, X15)
   743  	CHACHA_SHUFFLE_AVX(X9, X10, X11)
   744  	CHACHA_SHUFFLE_AVX(X5, X6, X7)
   745  	VMOVDQA X4, 5*16(Stack)
   746  	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
   747  	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
   748  	VMOVDQA 5*16(Stack), X4
   749  	VMOVDQA X0, 5*16(Stack)
   750  	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
   751  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
   752  	VMOVDQA 5*16(Stack), X0
   753  	CHACHA_SHUFFLE_AVX(X3, X2, X1)
   754  	CHACHA_SHUFFLE_AVX(X15, X14, X13)
   755  	CHACHA_SHUFFLE_AVX(X11, X10, X9)
   756  	CHACHA_SHUFFLE_AVX(X7, X6, X5)
   757  	SUBQ    $2, Tmp0 // each iteration accounts for two rounds
   758  	JNZ     CHACHA_LOOP_256
   759  
   760  	VPADDD  0*16(Stack), X0, X0 // add the saved input rows to the first block
   761  	VPADDD  1*16(Stack), X1, X1
   762  	VPADDD  2*16(Stack), X2, X2
   763  	VPADDD  3*16(Stack), X3, X3
   764  	VMOVDQA X4, 5*16(Stack)     // Save X4
   765  	XOR_AVX(Dst, Src, 0, X0, X1, X2, X3, X4)
   766  	VMOVDQA 5*16(Stack), X4     // Restore X4
   767  
   768  	VMOVDQA 0*16(Stack), X0 // reload the input rows for the other three blocks
   769  	VMOVDQA 1*16(Stack), X1
   770  	VMOVDQA 2*16(Stack), X2
   771  	VMOVDQA 3*16(Stack), X3
   772  	VPADDQ  4*16(Stack), X3, X3 // X3 = row 3 used by the second block
   773  
   774  	VPADDD X0, X12, X12
   775  	VPADDD X1, X13, X13
   776  	VPADDD X2, X14, X14
   777  	VPADDD X3, X15, X15
   778  	VPADDQ 4*16(Stack), X3, X3 // X3 = row 3 used by the third block
   779  	VPADDD X0, X8, X8
   780  	VPADDD X1, X9, X9
   781  	VPADDD X2, X10, X10
   782  	VPADDD X3, X11, X11
   783  	VPADDQ 4*16(Stack), X3, X3 // X3 = row 3 used by the fourth block
   784  	VPADDD X0, X4, X4
   785  	VPADDD X1, X5, X5
   786  	VPADDD X2, X6, X6
   787  	VPADDD X3, X7, X7
   788  	VPADDQ 4*16(Stack), X3, X3 // X3 = row 3 for whatever is generated next
   789  
   790  	XOR_AVX(Dst, Src, 64, X12, X13, X14, X15, X0)
   791  	XOR_AVX(Dst, Src, 128, X8, X9, X10, X11, X0)
   792  	VMOVDQA 0*16(Stack), X0 // Restore X0
   793  	ADDQ    $192, Dst // the first three blocks have been consumed
   794  	ADDQ    $192, Src
   795  	SUBQ    $192, Len
   796  
   797  	CMPQ Len, $64 // fourth block (X4-X7): use it whole or buffer it for the tail
   798  	JL   BUFFER_KEYSTREAM
   799  
   800  	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
   801  	ADDQ $64, Dst
   802  	ADDQ $64, Src
   803  	SUBQ $64, Len
   804  	JZ   DONE
   805  	CMPQ Len, $64               // If Len <= 64 -> gen. only 64 byte keystream.
   806  	JLE  GENERATE_KEYSTREAM_64
   807  	CMPQ Len, $128              // If 64 < Len <= 128 -> gen. only 128 byte keystream.
   808  	JLE  GENERATE_KEYSTREAM_128
   809  	CMPQ Len, $192              // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
   810  	JG   GENERATE_KEYSTREAM_256
   811  
        // Three blocks: X12-X15 (row 3 = X3), X8-X11 (slot 4 added once) and
        // X4-X7 (slot 4 added twice). X0 is the fifth macro argument and X1/X2 are
        // borrowed for the rotate constants; all three are reloaded from slots 0-2
        // once the loop has finished.
   812  GENERATE_KEYSTREAM_192:
   813  	VMOVDQA X0, X12
   814  	VMOVDQA X1, X13
   815  	VMOVDQA X2, X14
   816  	VMOVDQA X3, X15
   817  	VMOVDQA X0, X8
   818  	VMOVDQA X1, X9
   819  	VMOVDQA X2, X10
   820  	VMOVDQA X3, X11
   821  	VPADDQ  4*16(Stack), X11, X11
   822  	VMOVDQA X0, X4
   823  	VMOVDQA X1, X5
   824  	VMOVDQA X2, X6
   825  	VMOVDQA X11, X7
   826  	VPADDQ  4*16(Stack), X7, X7
   827  	MOVQ    Rounds, Tmp0
   828  
   829  	VMOVDQA 6*16(Stack), X1 // Load 16 bit rotate-left constant
   830  	VMOVDQA 7*16(Stack), X2 // Load 8 bit rotate-left constant
   831  
   832  CHACHA_LOOP_192:
   833  	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
   834  	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
   835  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
   836  	CHACHA_SHUFFLE_AVX(X13, X14, X15)
   837  	CHACHA_SHUFFLE_AVX(X9, X10, X11)
   838  	CHACHA_SHUFFLE_AVX(X5, X6, X7)
   839  	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
   840  	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
   841  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
   842  	CHACHA_SHUFFLE_AVX(X15, X14, X13)
   843  	CHACHA_SHUFFLE_AVX(X11, X10, X9)
   844  	CHACHA_SHUFFLE_AVX(X7, X6, X5)
   845  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   846  	JNZ  CHACHA_LOOP_192
   847  
   848  	VMOVDQA 0*16(Stack), X0     // Restore X0
   849  	VMOVDQA 1*16(Stack), X1     // Restore X1
   850  	VMOVDQA 2*16(Stack), X2     // Restore X2
   851  	VPADDD  X0, X12, X12
   852  	VPADDD  X1, X13, X13
   853  	VPADDD  X2, X14, X14
   854  	VPADDD  X3, X15, X15
   855  	VPADDQ  4*16(Stack), X3, X3 // X3 = row 3 used by the second block
   856  	VPADDD  X0, X8, X8
   857  	VPADDD  X1, X9, X9
   858  	VPADDD  X2, X10, X10
   859  	VPADDD  X3, X11, X11
   860  	VPADDQ  4*16(Stack), X3, X3 // X3 = row 3 used by the third block
   861  	VPADDD  X0, X4, X4
   862  	VPADDD  X1, X5, X5
   863  	VPADDD  X2, X6, X6
   864  	VPADDD  X3, X7, X7
   865  	VPADDQ  4*16(Stack), X3, X3 // X3 = row 3 for whatever is generated next
   866  
   867  	XOR_AVX(Dst, Src, 0, X12, X13, X14, X15, X0)
   868  	XOR_AVX(Dst, Src, 64, X8, X9, X10, X11, X0)
   869  	VMOVDQA 0*16(Stack), X0 // Restore X0
   870  	ADDQ    $128, Dst // the first two blocks have been consumed
   871  	ADDQ    $128, Src
   872  	SUBQ    $128, Len
   873  
   874  	CMPQ Len, $64 // third block (X4-X7): use it whole or buffer it for the tail
   875  	JL   BUFFER_KEYSTREAM
   876  
   877  	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
   878  	ADDQ $64, Dst
   879  	ADDQ $64, Src
   880  	SUBQ $64, Len
   881  	JZ   DONE
   882  	CMPQ Len, $64              // If Len <= 64 -> gen. only 64 byte keystream.
   883  	JLE  GENERATE_KEYSTREAM_64
   884  
        // Two blocks: X8-X11 (row 3 = X3) and X4-X7 (slot 4 added once).
        // X12 is the fifth macro argument; X13/X14 carry the rotate constants.
   885  GENERATE_KEYSTREAM_128:
   886  	VMOVDQA X0, X8
   887  	VMOVDQA X1, X9
   888  	VMOVDQA X2, X10
   889  	VMOVDQA X3, X11
   890  	VMOVDQA X0, X4
   891  	VMOVDQA X1, X5
   892  	VMOVDQA X2, X6
   893  	VMOVDQA X3, X7
   894  	VPADDQ  4*16(Stack), X7, X7
   895  	MOVQ    Rounds, Tmp0
   896  
   897  	VMOVDQA 6*16(Stack), X13 // Load 16 bit rotate-left constant
   898  	VMOVDQA 7*16(Stack), X14 // Load 8 bit rotate-left constant
   899  
   900  CHACHA_LOOP_128:
   901  	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
   902  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
   903  	CHACHA_SHUFFLE_AVX(X9, X10, X11)
   904  	CHACHA_SHUFFLE_AVX(X5, X6, X7)
   905  	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
   906  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
   907  	CHACHA_SHUFFLE_AVX(X11, X10, X9)
   908  	CHACHA_SHUFFLE_AVX(X7, X6, X5)
   909  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   910  	JNZ  CHACHA_LOOP_128
   911  
   912  	VPADDD X0, X8, X8
   913  	VPADDD X1, X9, X9
   914  	VPADDD X2, X10, X10
   915  	VPADDD X3, X11, X11
   916  	VPADDQ 4*16(Stack), X3, X3 // X3 = row 3 used by the second block
   917  	VPADDD X0, X4, X4
   918  	VPADDD X1, X5, X5
   919  	VPADDD X2, X6, X6
   920  	VPADDD X3, X7, X7
   921  	VPADDQ 4*16(Stack), X3, X3 // X3 = row 3 for whatever is generated next
   922  
   923  	XOR_AVX(Dst, Src, 0, X8, X9, X10, X11, X12)
   924  	ADDQ $64, Dst
   925  	ADDQ $64, Src
   926  	SUBQ $64, Len
   927  
   928  	CMPQ Len, $64 // second block (X4-X7): use it whole or buffer it for the tail
   929  	JL   BUFFER_KEYSTREAM
   930  
   931  	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
   932  	ADDQ $64, Dst
   933  	ADDQ $64, Src
   934  	SUBQ $64, Len
   935  	JZ   DONE     // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream
   936  
        // One block in X4-X7; X8 is the fifth macro argument and X9/X10 carry the
        // rotate constants.
   937  GENERATE_KEYSTREAM_64:
   938  	VMOVDQA X0, X4
   939  	VMOVDQA X1, X5
   940  	VMOVDQA X2, X6
   941  	VMOVDQA X3, X7
   942  	MOVQ    Rounds, Tmp0
   943  
   944  	VMOVDQA 6*16(Stack), X9  // Load 16 bit rotate-left constant
   945  	VMOVDQA 7*16(Stack), X10 // Load 8 bit rotate-left constant
   946  
   947  CHACHA_LOOP_64:
   948  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
   949  	CHACHA_SHUFFLE_AVX(X5, X6, X7)
   950  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
   951  	CHACHA_SHUFFLE_AVX(X7, X6, X5)
   952  	SUBQ $2, Tmp0 // each iteration accounts for two rounds
   953  	JNZ  CHACHA_LOOP_64
   954  
   955  	VPADDD X0, X4, X4
   956  	VPADDD X1, X5, X5
   957  	VPADDD X2, X6, X6
   958  	VPADDD X3, X7, X7
   959  	VPADDQ 4*16(Stack), X3, X3 // X3 = row 3 for the next call (written back at DONE)
   960  
   961  	CMPQ Len, $64
   962  	JL   BUFFER_KEYSTREAM
   963  
   964  	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
   965  	ADDQ $64, Src
   966  	ADDQ $64, Dst
   967  	SUBQ $64, Len
   968  	JMP  DONE     // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.
   969  
        // Fewer than 64 bytes remain, and Len is non-zero on every path that jumps
        // here. The last generated block, which every path leaves in X4-X7, is
        // stored to Buffer in full before the trailing bytes are xored from it.
   970  BUFFER_KEYSTREAM:
   971  	VMOVDQU X4, 0*16(Buffer)
   972  	VMOVDQU X5, 1*16(Buffer)
   973  	VMOVDQU X6, 2*16(Buffer)
   974  	VMOVDQU X7, 3*16(Buffer)
   975  	MOVQ    Len, Tmp0 // count down a copy so Len survives as the return value
   976  	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)
   977  
   978  DONE:
   979  	MOVQ    SavedSP, Stack  // Restore stack pointer
   980  	VMOVDQU X3, 3*16(State) // only row 3 is written back to state
   981  	VZEROUPPER
   982  	MOVQ    Len, ret+72(FP) // 0, or the number of bytes taken from the buffered block
   983  	RET
   984  
        // End of the register aliases introduced before xorKeyStreamSSE2.
   985  #undef Dst
   986  #undef Src
   987  #undef Len
   988  #undef Rounds
   989  #undef Buffer
   990  #undef State
   991  #undef Stack
   992  #undef SavedSP
   993  #undef Tmp0
   994  #undef Tmp1
   995  #undef Tmp2