git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/chacha/chachaAVX2_amd64.s (about)

     1  // Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
     2  // Use of this source code is governed by a license that can be
     3  // found in the LICENSE file.
     4  
     5  // +build amd64,!gccgo,!appengine,!nacl
     6  
     7  #include "const.s"
     8  #include "macro.s"
     9  
    10  #define TWO 0(SP)
    11  #define C16 32(SP)
    12  #define C8 64(SP)
    13  #define STATE_0 96(SP)
    14  #define STATE_1 128(SP)
    15  #define STATE_2 160(SP)
    16  #define STATE_3 192(SP)
    17  #define TMP_0 224(SP)
    18  #define TMP_1 256(SP)
	// ^ 32-byte stack slots, addressed from the SP that the function below
	// rounds up to a 32-byte boundary (so aligned VMOVDQA on them is legal):
	//   TWO        copy of the ·two_AVX2 constant (counter addend; value
	//              defined in const.s — presumably advances 2 blocks)
	//   C16 / C8   copies of the ·rol16_AVX2 / ·rol8_AVX2 rotate masks
	//   STATE_0-3  the four initial state rows, saved for the feed-forward
	//   TMP_0/1    register spill slots used by the 512-byte main loop
    19  
    20  // func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int
    21  TEXT ·xorKeyStreamAVX2(SB), 4, $320-80
	// XORs src into dst with a ChaCha keystream using AVX2; each YMM
	// register holds one 16-byte state row doubled across both 128-bit
	// lanes, so one register set computes two blocks at a time.
	// Register roles throughout:
	//   DI = dst, SI = src, BX = 64-byte partial-block buffer,
	//   AX = state pointer, DX = round count, CX = bytes remaining,
	//   R8 = original SP (restored at done), R9 = round-loop counter.
	// Textflag 4 is NOSPLIT (see runtime/textflag.h); the $320 frame holds
	// the 32-byte scratch slots named by the #defines above.
	// CHACHA_QROUND_AVX / CHACHA_SHUFFLE_AVX / XOR_AVX2 / XOR_UPPER_AVX2 /
	// EXTRACT_LOWER are macros from macro.s — semantics assumed from their
	// use here; confirm against macro.s.
    22  	MOVQ dst_base+0(FP), DI
    23  	MOVQ src_base+24(FP), SI
    24  	MOVQ block+48(FP), BX
    25  	MOVQ state+56(FP), AX
    26  	MOVQ rounds+64(FP), DX
    27  	MOVQ src_len+32(FP), CX
    28  
	// Save SP and round it up to a 32-byte boundary so the VMOVDQA stores
	// to the stack slots below are aligned.
    29  	MOVQ SP, R8
    30  	ADDQ $32, SP
    31  	ANDQ $-32, SP
    32  
    33  	VMOVDQU    0(AX), Y2         // state rows 0-1 (64-byte state at AX)
    34  	VMOVDQU    32(AX), Y3        // state rows 2-3
	// Broadcast each 16-byte row into both lanes: imm nibble 2 = src2.lo,
	// 3 = src2.hi, applied to both destination lanes.
    35  	VPERM2I128 $0x22, Y2, Y0, Y0 // Y0 = row0 | row0
    36  	VPERM2I128 $0x33, Y2, Y1, Y1 // Y1 = row1 | row1
    37  	VPERM2I128 $0x22, Y3, Y2, Y2 // Y2 = row2 | row2
    38  	VPERM2I128 $0x33, Y3, Y3, Y3 // Y3 = row3 | row3 (counter/nonce row)
    39  
    40  	TESTQ CX, CX
    41  	JZ    done                   // zero-length input: nothing to do
    42  
	// Give the two lanes distinct block counters; ·one_AVX2 presumably
	// adds 1 only to the upper lane's counter — confirm in const.s.
    43  	VMOVDQU ·one_AVX2<>(SB), Y4
    44  	VPADDD  Y4, Y3, Y3
    45  
	// Save the initial state rows for the per-block feed-forward addition.
    46  	VMOVDQA Y0, STATE_0
    47  	VMOVDQA Y1, STATE_1
    48  	VMOVDQA Y2, STATE_2
    49  	VMOVDQA Y3, STATE_3
    50  
    51  	VMOVDQU ·rol16_AVX2<>(SB), Y4
    52  	VMOVDQU ·rol8_AVX2<>(SB), Y5
    53  	VMOVDQU ·two_AVX2<>(SB), Y6
    54  	VMOVDQA Y4, Y14              // rotate masks kept in Y14/Y15 for the
    55  	VMOVDQA Y5, Y15              // register-only paths below ...
    56  	VMOVDQA Y4, C16              // ... and spilled to the stack for the
    57  	VMOVDQA Y5, C8               // 512-byte path, which needs all 16 YMMs
    58  	VMOVDQA Y6, TWO
    59  
	// Dispatch on remaining length: <=64 -> 1 block (XMM), <=192 -> 2
	// blocks, <=320 -> 4 blocks, <=448 -> 6 blocks, else 8-block loop.
    60  	CMPQ CX, $64
    61  	JBE  between_0_and_64
    62  	CMPQ CX, $192
    63  	JBE  between_64_and_192
    64  	CMPQ CX, $320
    65  	JBE  between_192_and_320
    66  	CMPQ CX, $448
    67  	JBE  between_320_and_448
    68  
    69  at_least_512:
	// Eight blocks in flight: four row-sets Y0-3, Y4-7, Y8-11, Y12-15,
	// counters staggered by TWO (two blocks) per set.
    70  	VMOVDQA Y0, Y4
    71  	VMOVDQA Y1, Y5
    72  	VMOVDQA Y2, Y6
    73  	VPADDQ  TWO, Y3, Y7
    74  	VMOVDQA Y0, Y8
    75  	VMOVDQA Y1, Y9
    76  	VMOVDQA Y2, Y10
    77  	VPADDQ  TWO, Y7, Y11
    78  	VMOVDQA Y0, Y12
    79  	VMOVDQA Y1, Y13
    80  	VMOVDQA Y2, Y14
    81  	VPADDQ  TWO, Y11, Y15
    82  
    83  	MOVQ DX, R9                  // R9 = rounds; loop does 2 per pass
    84  
    85  chacha_loop_512:
	// One double round over all four row-sets. Every YMM register is
	// live, so Y8 and Y0 take turns being spilled to TMP_0 so they can
	// serve as the quarter-round macro's scratch register.
    86  	VMOVDQA Y8, TMP_0
    87  	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
    88  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
    89  	VMOVDQA TMP_0, Y8
    90  	VMOVDQA Y0, TMP_0
    91  	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
    92  	CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
    93  	CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
    94  	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
    95  	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
    96  	CHACHA_SHUFFLE_AVX(Y13, Y14, Y15)
	// Second (diagonal) half of the double round; shuffle back after.
    97  
    98  	CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
    99  	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
   100  	VMOVDQA TMP_0, Y0
   101  	VMOVDQA Y8, TMP_0
   102  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
   103  	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
   104  	VMOVDQA TMP_0, Y8
   105  	CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
   106  	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
   107  	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
   108  	CHACHA_SHUFFLE_AVX(Y15, Y14, Y13)
   109  	SUBQ    $2, R9
   110  	JA      chacha_loop_512
   111  
	// Feed-forward (add saved initial state) and XOR 128 bytes per
	// row-set; Y12/Y13 are freed into TMP_0/1 to act as XOR scratch.
   112  	VMOVDQA Y12, TMP_0
   113  	VMOVDQA Y13, TMP_1
   114  	VPADDD  STATE_0, Y0, Y0
   115  	VPADDD  STATE_1, Y1, Y1
   116  	VPADDD  STATE_2, Y2, Y2
   117  	VPADDD  STATE_3, Y3, Y3
   118  	XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
   119  	VMOVDQA STATE_0, Y0
   120  	VMOVDQA STATE_1, Y1
   121  	VMOVDQA STATE_2, Y2
   122  	VMOVDQA STATE_3, Y3
   123  	VPADDQ  TWO, Y3, Y3          // advance counter past blocks 0-1
   124  
   125  	VPADDD Y0, Y4, Y4
   126  	VPADDD Y1, Y5, Y5
   127  	VPADDD Y2, Y6, Y6
   128  	VPADDD Y3, Y7, Y7
   129  	XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
   130  	VPADDQ TWO, Y3, Y3           // past blocks 2-3
   131  
   132  	VPADDD Y0, Y8, Y8
   133  	VPADDD Y1, Y9, Y9
   134  	VPADDD Y2, Y10, Y10
   135  	VPADDD Y3, Y11, Y11
   136  	XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
   137  	VPADDQ TWO, Y3, Y3           // past blocks 4-5
   138  
	// Last row-set: restore its rows 0-1 from the spill slots.
   139  	VPADDD TMP_0, Y0, Y12
   140  	VPADDD TMP_1, Y1, Y13
   141  	VPADDD Y2, Y14, Y14
   142  	VPADDD Y3, Y15, Y15
   143  	VPADDQ TWO, Y3, Y3           // past blocks 6-7
   144  
   145  	CMPQ CX, $512
   146  	JB   less_than_512           // 448 < CX < 512: partial final pair
   147  
   148  	XOR_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
   149  	VMOVDQA Y3, STATE_3          // persist advanced counter for next batch
   150  	ADDQ    $512, SI
   151  	ADDQ    $512, DI
   152  	SUBQ    $512, CX
   153  	CMPQ    CX, $448
   154  	JA      at_least_512
   155  
   156  	TESTQ CX, CX
   157  	JZ    done
   158  
	// Reload the rotate masks that the 512-byte loop clobbered (Y14/Y15
	// were used as state registers above).
   159  	VMOVDQA C16, Y14
   160  	VMOVDQA C8, Y15
   161  
   162  	CMPQ CX, $64
   163  	JBE  between_0_and_64
   164  	CMPQ CX, $192
   165  	JBE  between_64_and_192
   166  	CMPQ CX, $320
   167  	JBE  between_192_and_320
   168  	JMP  between_320_and_448
   169  
   170  less_than_512:
	// XOR_UPPER presumably writes the leading full 64 bytes of the last
	// pair and EXTRACT_LOWER saves the remaining keystream into the
	// block buffer at BX for the byte loop — confirm in macro.s.
   171  	XOR_UPPER_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
   172  	EXTRACT_LOWER(BX, Y12, Y13, Y14, Y15, Y4)
   173  	ADDQ $448, SI
   174  	ADDQ $448, DI
   175  	SUBQ $448, CX                // CX is now the <64-byte tail length
   176  	JMP  finalize
   177  
   178  between_320_and_448:
	// Six blocks: row-sets Y0-3, Y4-7, Y8-11 with staggered counters.
   179  	VMOVDQA Y0, Y4
   180  	VMOVDQA Y1, Y5
   181  	VMOVDQA Y2, Y6
   182  	VPADDQ  TWO, Y3, Y7
   183  	VMOVDQA Y0, Y8
   184  	VMOVDQA Y1, Y9
   185  	VMOVDQA Y2, Y10
   186  	VPADDQ  TWO, Y7, Y11
   187  
   188  	MOVQ DX, R9
   189  
   190  chacha_loop_384:
	// Double round; Y13 is scratch, Y14/Y15 hold the rotate masks.
   191  	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
   192  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
   193  	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
   194  	CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
   195  	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
   196  	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
   197  	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
   198  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
   199  	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
   200  	CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
   201  	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
   202  	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
   203  	SUBQ $2, R9
   204  	JA   chacha_loop_384
   205  
	// Feed-forward + XOR, 128 bytes per row-set, as in the 512 path.
   206  	VPADDD  STATE_0, Y0, Y0
   207  	VPADDD  STATE_1, Y1, Y1
   208  	VPADDD  STATE_2, Y2, Y2
   209  	VPADDD  STATE_3, Y3, Y3
   210  	XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
   211  	VMOVDQA STATE_0, Y0
   212  	VMOVDQA STATE_1, Y1
   213  	VMOVDQA STATE_2, Y2
   214  	VMOVDQA STATE_3, Y3
   215  	VPADDQ  TWO, Y3, Y3
   216  
   217  	VPADDD Y0, Y4, Y4
   218  	VPADDD Y1, Y5, Y5
   219  	VPADDD Y2, Y6, Y6
   220  	VPADDD Y3, Y7, Y7
   221  	XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
   222  	VPADDQ TWO, Y3, Y3
   223  
   224  	VPADDD Y0, Y8, Y8
   225  	VPADDD Y1, Y9, Y9
   226  	VPADDD Y2, Y10, Y10
   227  	VPADDD Y3, Y11, Y11
   228  	VPADDQ TWO, Y3, Y3
   229  
   230  	CMPQ CX, $384
   231  	JB   less_than_384           // 320 < CX < 384: partial last pair
   232  
   233  	XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
   234  	SUBQ  $384, CX
   235  	TESTQ CX, CX
   236  	JE    done
   237  
   238  	ADDQ $384, SI
   239  	ADDQ $384, DI
   240  	JMP  between_0_and_64        // at most 64 bytes can remain here
   241  
   242  less_than_384:
   243  	XOR_UPPER_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
   244  	EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
   245  	ADDQ $320, SI
   246  	ADDQ $320, DI
   247  	SUBQ $320, CX                // CX = <64-byte tail length
   248  	JMP  finalize
   249  
   250  between_192_and_320:
	// Four blocks: row-sets Y4-7 and Y8-11; Y0-3 keep the saved state.
   251  	VMOVDQA Y0, Y4
   252  	VMOVDQA Y1, Y5
   253  	VMOVDQA Y2, Y6
   254  	VMOVDQA Y3, Y7
   255  	VMOVDQA Y0, Y8
   256  	VMOVDQA Y1, Y9
   257  	VMOVDQA Y2, Y10
   258  	VPADDQ  TWO, Y3, Y11
   259  
   260  	MOVQ DX, R9
   261  
   262  chacha_loop_256:
   263  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
   264  	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
   265  	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
   266  	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
   267  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
   268  	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
   269  	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
   270  	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
   271  	SUBQ $2, R9
   272  	JA   chacha_loop_256
   273  
	// Feed-forward directly from Y0-3 (still the initial state here).
   274  	VPADDD Y0, Y4, Y4
   275  	VPADDD Y1, Y5, Y5
   276  	VPADDD Y2, Y6, Y6
   277  	VPADDD Y3, Y7, Y7
   278  	VPADDQ TWO, Y3, Y3
   279  	XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
   280  	VPADDD Y0, Y8, Y8
   281  	VPADDD Y1, Y9, Y9
   282  	VPADDD Y2, Y10, Y10
   283  	VPADDD Y3, Y11, Y11
   284  	VPADDQ TWO, Y3, Y3
   285  
   286  	CMPQ CX, $256
   287  	JB   less_than_256           // 192 < CX < 256: partial last pair
   288  
   289  	XOR_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
   290  	SUBQ  $256, CX
   291  	TESTQ CX, CX
   292  	JE    done
   293  
   294  	ADDQ $256, SI
   295  	ADDQ $256, DI
   296  	JMP  between_0_and_64
   297  
   298  less_than_256:
   299  	XOR_UPPER_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
   300  	EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
   301  	ADDQ $192, SI
   302  	ADDQ $192, DI
   303  	SUBQ $192, CX                // CX = <64-byte tail length
   304  	JMP  finalize
   305  
   306  between_64_and_192:
	// Two blocks in one row-set (Y4-7).
   307  	VMOVDQA Y0, Y4
   308  	VMOVDQA Y1, Y5
   309  	VMOVDQA Y2, Y6
   310  	VMOVDQA Y3, Y7
   311  
   312  	MOVQ DX, R9
   313  
   314  chacha_loop_128:
   315  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
   316  	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
   317  	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
   318  	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
   319  	SUBQ $2, R9
   320  	JA   chacha_loop_128
   321  
   322  	VPADDD Y0, Y4, Y4
   323  	VPADDD Y1, Y5, Y5
   324  	VPADDD Y2, Y6, Y6
   325  	VPADDD Y3, Y7, Y7
   326  	VPADDQ TWO, Y3, Y3
   327  
   328  	CMPQ CX, $128
   329  	JB   less_than_128           // 64 < CX < 128: partial pair
   330  
   331  	XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
   332  	SUBQ  $128, CX
   333  	TESTQ CX, CX
   334  	JE    done
   335  
   336  	ADDQ $128, SI
   337  	ADDQ $128, DI
   338  	JMP  between_0_and_64
   339  
   340  less_than_128:
   341  	XOR_UPPER_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
   342  	EXTRACT_LOWER(BX, Y4, Y5, Y6, Y7, Y13)
   343  	ADDQ $64, SI
   344  	ADDQ $64, DI
   345  	SUBQ $64, CX                 // CX = <64-byte tail length
   346  	JMP  finalize
   347  
   348  between_0_and_64:
	// Single block using XMM registers (one 128-bit lane); X14/X15 are
	// the low halves of the rotate masks kept in Y14/Y15.
   349  	VMOVDQA X0, X4
   350  	VMOVDQA X1, X5
   351  	VMOVDQA X2, X6
   352  	VMOVDQA X3, X7
   353  
   354  	MOVQ DX, R9
   355  
   356  chacha_loop_64:
   357  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
   358  	CHACHA_SHUFFLE_AVX(X5, X6, X7)
   359  	CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
   360  	CHACHA_SHUFFLE_AVX(X7, X6, X5)
   361  	SUBQ $2, R9
   362  	JA   chacha_loop_64
   363  
   364  	VPADDD  X0, X4, X4
   365  	VPADDD  X1, X5, X5
   366  	VPADDD  X2, X6, X6
   367  	VPADDD  X3, X7, X7
   368  	VMOVDQU ·one<>(SB), X0       // advance counter by one block (X0 is dead here)
   369  	VPADDQ  X0, X3, X3
   370  
   371  	CMPQ CX, $64
   372  	JB   less_than_64
   373  
   374  	XOR_AVX(DI, SI, 0, X4, X5, X6, X7, X13)
   375  	SUBQ $64, CX                 // exactly 64 bytes consumed -> CX = 0
   376  	JMP  done
   377  
   378  less_than_64:
	// Store the whole keystream block into the buffer; the byte loop
	// below XORs only the CX bytes actually needed.
   379  	VMOVDQU X4, 0(BX)
   380  	VMOVDQU X5, 16(BX)
   381  	VMOVDQU X6, 32(BX)
   382  	VMOVDQU X7, 48(BX)
   383  
   384  finalize:
	// Byte-by-byte XOR of the CX-byte tail: dst[i] = src[i] ^ block[i].
	// CX itself is preserved — it is the function's return value.
   385  	XORQ R11, R11
   386  	XORQ R12, R12
   387  	MOVQ CX, BP
   388  
   389  xor_loop:
   390  	MOVB 0(SI), R11
   391  	MOVB 0(BX), R12
   392  	XORQ R11, R12                // XORQ clears CF, so the JA below acts as JNZ
   393  	MOVB R12, 0(DI)
   394  	INCQ SI
   395  	INCQ BX
   396  	INCQ DI
   397  	DECQ BP                      // DECQ sets ZF but leaves CF untouched
   398  	JA   xor_loop
   399  
   400  done:
   401  	VMOVDQU X3, 48(AX)           // write the advanced counter row back to state
   402  	VZEROUPPER                   // required before returning to SSE/Go code
   403  	MOVQ    R8, SP               // restore the caller's (unaligned) SP
   404  	MOVQ    CX, ret+72(FP)       // returns the tail length handled via the
   405  	RET                          // block buffer (0 when len was a multiple of 64)
   406