git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/chacha/chachaAVX2_amd64.s

// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build amd64,!gccgo,!appengine,!nacl

#include "const.s"
#include "macro.s"

// Scratch slots in the 320-byte frame; SP is re-aligned to 32 bytes
// below so VMOVDQA on these slots is legal.
#define TWO 0(SP)
#define C16 32(SP)
#define C8 64(SP)
#define STATE_0 96(SP)
#define STATE_1 128(SP)
#define STATE_2 160(SP)
#define STATE_3 192(SP)
#define TMP_0 224(SP)
#define TMP_1 256(SP)

// func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamAVX2(SB), 4, $320-80 // 4 = NOSPLIT
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ block+48(FP), BX
	MOVQ state+56(FP), AX
	MOVQ rounds+64(FP), DX
	MOVQ src_len+32(FP), CX

	// Save the caller's SP and align SP to 32 bytes.
	MOVQ SP, R8
	ADDQ $32, SP
	ANDQ $-32, SP

	// Expand the 64-byte state into double-block form: each of
	// Y0..Y3 holds one 16-byte state row, broadcast into both
	// 128-bit lanes (one lane per block).
	VMOVDQU 0(AX), Y2
	VMOVDQU 32(AX), Y3
	VPERM2I128 $0x22, Y2, Y0, Y0
	VPERM2I128 $0x33, Y2, Y1, Y1
	VPERM2I128 $0x22, Y3, Y2, Y2
	VPERM2I128 $0x33, Y3, Y3, Y3

	TESTQ CX, CX
	JZ done

	// Offset the counter in one lane by 1 so Y3 carries the
	// counters of two consecutive blocks.
	VMOVDQU ·one_AVX2<>(SB), Y4
	VPADDD Y4, Y3, Y3

	VMOVDQA Y0, STATE_0
	VMOVDQA Y1, STATE_1
	VMOVDQA Y2, STATE_2
	VMOVDQA Y3, STATE_3

	// Keep the rotation shuffle masks and the counter increment
	// both in registers and in stack slots: the widest path below
	// needs all sixteen Y registers for state.
	VMOVDQU ·rol16_AVX2<>(SB), Y4
	VMOVDQU ·rol8_AVX2<>(SB), Y5
	VMOVDQU ·two_AVX2<>(SB), Y6
	VMOVDQA Y4, Y14
	VMOVDQA Y5, Y15
	VMOVDQA Y4, C16
	VMOVDQA Y5, C8
	VMOVDQA Y6, TWO

	// Dispatch on the input length.
	CMPQ CX, $64
	JBE between_0_and_64
	CMPQ CX, $192
	JBE between_64_and_192
	CMPQ CX, $320
	JBE between_192_and_320
	CMPQ CX, $448
	JBE between_320_and_448
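
// at_least_512 produces eight 64-byte blocks (512 bytes of keystream)
// per iteration by running four double-block states in parallel:
// (Y0..Y3), (Y4..Y7), (Y8..Y11) and (Y12..Y15). Each register holds one
// state row for two consecutive blocks, and VPADDQ with TWO steps the
// block counter by 2 (a 64-bit add), so the four groups cover counters
// n..n+7. With all sixteen Y registers carrying state, the rol16/rol8
// shuffle masks are read from the C16/C8 stack slots and the
// quarter-round temporary is juggled through TMP_0.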
at_least_512:
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VPADDQ TWO, Y3, Y7
	VMOVDQA Y0, Y8
	VMOVDQA Y1, Y9
	VMOVDQA Y2, Y10
	VPADDQ TWO, Y7, Y11
	VMOVDQA Y0, Y12
	VMOVDQA Y1, Y13
	VMOVDQA Y2, Y14
	VPADDQ TWO, Y11, Y15

	MOVQ DX, R9

chacha_loop_512:
	// Y8 and Y0 take turns as the quarter-round scratch register,
	// spilling through TMP_0.
	VMOVDQA Y8, TMP_0
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
	VMOVDQA TMP_0, Y8
	VMOVDQA Y0, TMP_0
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
	CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
	CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
	CHACHA_SHUFFLE_AVX(Y13, Y14, Y15)

	CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
	VMOVDQA TMP_0, Y0
	VMOVDQA Y8, TMP_0
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
	VMOVDQA TMP_0, Y8
	CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
	CHACHA_SHUFFLE_AVX(Y15, Y14, Y13)
	SUBQ $2, R9
	JA chacha_loop_512

	// Add the saved input state back into each group and XOR the
	// keystream into dst, two blocks (128 bytes) at a time.
	VMOVDQA Y12, TMP_0
	VMOVDQA Y13, TMP_1
	VPADDD STATE_0, Y0, Y0
	VPADDD STATE_1, Y1, Y1
	VPADDD STATE_2, Y2, Y2
	VPADDD STATE_3, Y3, Y3
	XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
	VMOVDQA STATE_0, Y0
	VMOVDQA STATE_1, Y1
	VMOVDQA STATE_2, Y2
	VMOVDQA STATE_3, Y3
	VPADDQ TWO, Y3, Y3

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
	VPADDQ TWO, Y3, Y3

	VPADDD Y0, Y8, Y8
	VPADDD Y1, Y9, Y9
	VPADDD Y2, Y10, Y10
	VPADDD Y3, Y11, Y11
	XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
	VPADDQ TWO, Y3, Y3

	VPADDD TMP_0, Y0, Y12
	VPADDD TMP_1, Y1, Y13
	VPADDD Y2, Y14, Y14
	VPADDD Y3, Y15, Y15
	VPADDQ TWO, Y3, Y3

	CMPQ CX, $512
	JB less_than_512

	XOR_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
	VMOVDQA Y3, STATE_3
	ADDQ $512, SI
	ADDQ $512, DI
	SUBQ $512, CX
	CMPQ CX, $448
	JA at_least_512

	TESTQ CX, CX
	JZ done

	// Restore the shuffle masks (clobbered above) and re-dispatch
	// on the remaining length.
	VMOVDQA C16, Y14
	VMOVDQA C8, Y15

	CMPQ CX, $64
	JBE between_0_and_64
	CMPQ CX, $192
	JBE between_64_and_192
	CMPQ CX, $320
	JBE between_192_and_320
	JMP between_320_and_448

less_than_512:
	// 449..511 bytes remain: XOR one more full block at offset 384,
	// stash the keystream of the final partial block in the block
	// buffer and let finalize fold it in byte-wise.
	XOR_UPPER_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
	EXTRACT_LOWER(BX, Y12, Y13, Y14, Y15, Y4)
	ADDQ $448, SI
	ADDQ $448, DI
	SUBQ $448, CX
	JMP finalize

between_320_and_448:
	// Six blocks in flight: three double-block groups with counters
	// n..n+5. Y13 serves as the quarter-round temporary and Y14/Y15
	// hold the rol16/rol8 masks.
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VPADDQ TWO, Y3, Y7
	VMOVDQA Y0, Y8
	VMOVDQA Y1, Y9
	VMOVDQA Y2, Y10
	VPADDQ TWO, Y7, Y11

	MOVQ DX, R9

chacha_loop_384:
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
	SUBQ $2, R9
	JA chacha_loop_384

	VPADDD STATE_0, Y0, Y0
	VPADDD STATE_1, Y1, Y1
	VPADDD STATE_2, Y2, Y2
	VPADDD STATE_3, Y3, Y3
	XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
	VMOVDQA STATE_0, Y0
	VMOVDQA STATE_1, Y1
	VMOVDQA STATE_2, Y2
	VMOVDQA STATE_3, Y3
	VPADDQ TWO, Y3, Y3

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
	VPADDQ TWO, Y3, Y3

	VPADDD Y0, Y8, Y8
	VPADDD Y1, Y9, Y9
	VPADDD Y2, Y10, Y10
	VPADDD Y3, Y11, Y11
	VPADDQ TWO, Y3, Y3

	CMPQ CX, $384
	JB less_than_384

	XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
	SUBQ $384, CX
	TESTQ CX, CX
	JE done

	ADDQ $384, SI
	ADDQ $384, DI
	JMP between_0_and_64

less_than_384:
	XOR_UPPER_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
	EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
	ADDQ $320, SI
	ADDQ $320, DI
	SUBQ $320, CX
	JMP finalize

between_192_and_320:
	// Four blocks in flight: two double-block groups with counters
	// n..n+3. Y0..Y3 stay untouched as the base state.
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VMOVDQA Y3, Y7
	VMOVDQA Y0, Y8
	VMOVDQA Y1, Y9
	VMOVDQA Y2, Y10
	VPADDQ TWO, Y3, Y11

	MOVQ DX, R9

chacha_loop_256:
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
	SUBQ $2, R9
	JA chacha_loop_256

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	VPADDQ TWO, Y3, Y3
	XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
	VPADDD Y0, Y8, Y8
	VPADDD Y1, Y9, Y9
	VPADDD Y2, Y10, Y10
	VPADDD Y3, Y11, Y11
	VPADDQ TWO, Y3, Y3

	CMPQ CX, $256
	JB less_than_256

	XOR_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
	SUBQ $256, CX
	TESTQ CX, CX
	JE done

	ADDQ $256, SI
	ADDQ $256, DI
	JMP between_0_and_64

less_than_256:
	XOR_UPPER_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
	EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
	ADDQ $192, SI
	ADDQ $192, DI
	SUBQ $192, CX
	JMP finalize

between_64_and_192:
	// Two blocks in flight: a single double-block group with
	// counters n and n+1.
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VMOVDQA Y3, Y7

	MOVQ DX, R9

chacha_loop_128:
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	SUBQ $2, R9
	JA chacha_loop_128

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	VPADDQ TWO, Y3, Y3

	CMPQ CX, $128
	JB less_than_128

	XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
	SUBQ $128, CX
	TESTQ CX, CX
	JE done

	ADDQ $128, SI
	ADDQ $128, DI
	JMP between_0_and_64

less_than_128:
	XOR_UPPER_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
	EXTRACT_LOWER(BX, Y4, Y5, Y6, Y7, Y13)
	ADDQ $64, SI
	ADDQ $64, DI
	SUBQ $64, CX
	JMP finalize
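
// between_0_and_64 handles the final (at most) 64 bytes with a single
// block state in 128-bit registers: X4..X7 are a working copy of
// X0..X3, and the quarter-round uses the rol16/rol8 masks in X14/X15.
// ·one<> then advances the block counter by 1. A full last block is
// XORed directly; a partial one has its keystream written to the block
// buffer and is folded in byte-wise at finalize.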
between_0_and_64:
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X3, X7

	MOVQ DX, R9

chacha_loop_64:
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, R9
	JA chacha_loop_64

	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VMOVDQU ·one<>(SB), X0
	VPADDQ X0, X3, X3

	CMPQ CX, $64
	JB less_than_64

	XOR_AVX(DI, SI, 0, X4, X5, X6, X7, X13)
	SUBQ $64, CX
	JMP done

less_than_64:
	// Partial block: write the whole keystream block out and let
	// finalize XOR only the bytes that are needed.
	VMOVDQU X4, 0(BX)
	VMOVDQU X5, 16(BX)
	VMOVDQU X6, 32(BX)
	VMOVDQU X7, 48(BX)

finalize:
	// XOR the remaining CX bytes of src against the keystream in
	// the block buffer, one byte at a time.
	XORQ R11, R11
	XORQ R12, R12
	MOVQ CX, BP

xor_loop:
	MOVB 0(SI), R11
	MOVB 0(BX), R12
	XORQ R11, R12
	MOVB R12, 0(DI)
	INCQ SI
	INCQ BX
	INCQ DI
	DECQ BP
	JA xor_loop

done:
	// Store the updated counter/nonce row back into state, restore
	// the caller's stack pointer and return the number of bytes
	// that went through the partial-block path (0 otherwise).
	VMOVDQU X3, 48(AX)
	VZEROUPPER
	MOVQ R8, SP
	MOVQ CX, ret+72(FP)
	RET
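
// A minimal sketch of how a Go-side wrapper might drive this routine
// (hypothetical; the real caller lives in the package's Go sources).
// The declaration mirrors the signature documented above; the reading
// of the return value is inferred from the finalize/done code paths:
//
//	//go:noescape
//	func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int
//
//	var state, block [64]byte
//	// ... constants, key, counter and nonce packed into state ...
//	rem := xorKeyStreamAVX2(dst, src, &block, &state, 20) // 20 rounds = ChaCha20
//	// rem is the number of trailing bytes that were XORed from the
//	// partial keystream block left in block (0 when len(src) is a
//	// multiple of 64); a caller could consume block[rem:] before
//	// generating fresh blocks.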