// Copyright (C) 2019 Yawning Angel
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

// +build !noasm

#include "textflag.h"

// chacha_constants layout (48 bytes, read-only):
//   0x00: the four 32-bit words that spell "expand 32-byte k" in
//         little-endian ASCII (state words 0..3).
//   0x10: VPSHUFB/PSHUFB mask that rotates every 32-bit lane left by 16.
//   0x20: VPSHUFB/PSHUFB mask that rotates every 32-bit lane left by 8.
DATA ·chacha_constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha_constants<>+0x04(SB)/4, $0x3320646E
DATA ·chacha_constants<>+0x08(SB)/4, $0x79622D32
DATA ·chacha_constants<>+0x0c(SB)/4, $0x6B206574
DATA ·chacha_constants<>+0x10(SB)/8, $0x0504070601000302
DATA ·chacha_constants<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
DATA ·chacha_constants<>+0x20(SB)/8, $0x0605040702010003
DATA ·chacha_constants<>+0x28(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·chacha_constants<>(SB), (NOPTR+RODATA), $48

// func blocksAVX2(s *[api.StateSize]uint32, in, out []byte)
//
// Generates len(in) bytes of ChaCha (20 round) output into out, XORing
// with in when the in data pointer is non-nil and emitting raw keystream
// when it is nil.  Only len(in) is consulted; len(out) is never read, so
// the caller is presumably responsible for len(out) >= len(in).
//
// Arguments (56 bytes): s+0, in_base+8, in_len+16, in_cap+24, out_base+32,
// out_len+40, out_cap+48.
//
// Register roles after the prologue:
//   DI = &s[4]   (state words 4..15; words 0..3 come from chacha_constants)
//   SI = input pointer (0 => keystream only), advanced as data is consumed
//   DX = output pointer, advanced as data is produced
//   CX = bytes remaining
//   BP = 64-byte aligned scratch base inside the 576 byte frame
//   AX = round counter inside the round loops, scratch elsewhere
// Also written: BX, R8-R14, X0-X15/Y0-Y15, flags.
// NOTE(review): BP is overwritten here; this presumably relies on the
// assembler's own frame-pointer save/restore for non-zero frames - confirm
// for every toolchain this file is built with.
//
// Scratch layout (offsets from BP, highest byte touched is 511):
//     0: constants (state words 0..3)
//    16: state words 4..7
//    32: state words 8..11
//    48: state words 12..15; the 64-bit value at 48 is the block counter
//    96: 32-byte spill slot used inside the wide round loops
//   128: low 32 bits of the per-block counters (up to 8 lanes)
//   160: high 32 bits of the per-block counters (up to 8 lanes)
//   192: save area for rows 8..15 after the rounds (8 x 32 bytes)
//   448: rotate-left-16 shuffle mask, duplicated to 32 bytes
//   480: rotate-left-8 shuffle mask, duplicated to 32 bytes
// On the final partial block the first 64 bytes at BP are reused as a
// bounce buffer for the input and output.
// NOTE(review): nothing in this routine wipes the scratch area before
// returning.
TEXT ·blocksAVX2(SB), NOSPLIT, $576-56
	// This is Andrew Moon's AVX2 ChaCha implementation taken from
	// supercop-20171218, with some minor changes, primarily calling
	// convention and assembly dialect related.

	// Align the stack on a 64 byte boundary.
	MOVQ SP, BP
	ADDQ $64, BP
	ANDQ $-64, BP

	// Go calling convention -> SYSV AMD64 (and a fixup).
	MOVQ s+0(FP), DI       // &s -> DI
	ADDQ $16, DI           // Skip the ChaCha constants in the chachaState.
	MOVQ in+8(FP), SI      // &in[0] -> SI
	MOVQ out+32(FP), DX    // &out[0] -> DX
	MOVQ in_len+16(FP), CX // len(in) -> CX

	// Begin the main body of `chacha_blocks_avx2`.
	//
	// Mostly a direct translation except:
	//  * The number of rounds is always 20.
	//  * %rbp is used instead of %rsp.
	LEAQ ·chacha_constants<>(SB), AX
	VMOVDQU 0(AX), X8
	VMOVDQU 16(AX), X6
	VMOVDQU 32(AX), X7
	VMOVDQU 0(DI), X9
	VMOVDQU 16(DI), X10
	VMOVDQU 32(DI), X11

	// MOVQ 48(DI), AX
	MOVQ $1, R9 // Counter increment used by the one-block-at-a-time path.
	VMOVDQA X8, 0(BP)
	VMOVDQA X9, 16(BP)
	VMOVDQA X10, 32(BP)
	VMOVDQA X11, 48(BP)

	// MOVQ AX, 64(BP)
	// Each 16-byte mask is stored twice so it can be used as a 32-byte
	// VPSHUFB operand.
	VMOVDQA X6, 448(BP)
	VMOVDQA X6, 464(BP)
	VMOVDQA X7, 480(BP)
	VMOVDQA X7, 496(BP)
	CMPQ CX, $512
	JAE chacha_blocks_avx2_atleast512
	CMPQ CX, $256
	JAE chacha_blocks_avx2_atleast256
	JMP chacha_blocks_avx2_below256

	// ---- 8 blocks (512 bytes) per iteration, one state word per YMM ----
chacha_blocks_avx2_atleast512:
	// Build the 8 per-block counter values, split into low/high halves,
	// and advance the stored counter by 8.  Note that R9 is clobbered.
	MOVQ 48(BP), AX
	LEAQ 1(AX), R8
	LEAQ 2(AX), R9
	LEAQ 3(AX), R10
	LEAQ 4(AX), BX
	LEAQ 5(AX), R11
	LEAQ 6(AX), R12
	LEAQ 7(AX), R13
	LEAQ 8(AX), R14
	MOVL AX, 128(BP)
	MOVL R8, 4+128(BP)
	MOVL R9, 8+128(BP)
	MOVL R10, 12+128(BP)
	MOVL BX, 16+128(BP)
	MOVL R11, 20+128(BP)
	MOVL R12, 24+128(BP)
	MOVL R13, 28+128(BP)
	SHRQ $32, AX
	SHRQ $32, R8
	SHRQ $32, R9
	SHRQ $32, R10
	SHRQ $32, BX
	SHRQ $32, R11
	SHRQ $32, R12
	SHRQ $32, R13
	MOVL AX, 160(BP)
	MOVL R8, 4+160(BP)
	MOVL R9, 8+160(BP)
	MOVL R10, 12+160(BP)
	MOVL BX, 16+160(BP)
	MOVL R11, 20+160(BP)
	MOVL R12, 24+160(BP)
	MOVL R13, 28+160(BP)
	MOVQ R14, 48(BP)

	// MOVQ 64(BP), AX
	MOVQ $20, AX
	VPBROADCASTD 0(BP), Y0
	VPBROADCASTD 4+0(BP), Y1
	VPBROADCASTD 8+0(BP), Y2
	VPBROADCASTD 12+0(BP), Y3
	VPBROADCASTD 16(BP), Y4
	VPBROADCASTD 4+16(BP), Y5
	VPBROADCASTD 8+16(BP), Y6
	VPBROADCASTD 12+16(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13

	// Two rounds per iteration.  All 16 YMM registers hold state, so one
	// row is spilled to 96(BP) whenever a temporary is needed for the
	// shift-based rotates (12 and 7); rotates by 16 and 8 use VPSHUFB.
chacha_blocks_avx2_mainloop1:
	VPADDD Y0, Y4, Y0
	VPADDD Y1, Y5, Y1
	VPXOR Y12, Y0, Y12
	VPXOR Y13, Y1, Y13
	VPADDD Y2, Y6, Y2
	VPADDD Y3, Y7, Y3
	VPXOR Y14, Y2, Y14
	VPXOR Y15, Y3, Y15
	VPSHUFB 448(BP), Y12, Y12
	VPSHUFB 448(BP), Y13, Y13
	VPADDD Y8, Y12, Y8
	VPADDD Y9, Y13, Y9
	VPSHUFB 448(BP), Y14, Y14
	VPSHUFB 448(BP), Y15, Y15
	VPADDD Y10, Y14, Y10
	VPADDD Y11, Y15, Y11
	VMOVDQA Y12, 96(BP)
	VPXOR Y4, Y8, Y4
	VPXOR Y5, Y9, Y5
	VPSLLD $12, Y4, Y12
	VPSRLD $20, Y4, Y4
	VPXOR Y4, Y12, Y4
	VPSLLD $12, Y5, Y12
	VPSRLD $20, Y5, Y5
	VPXOR Y5, Y12, Y5
	VPXOR Y6, Y10, Y6
	VPXOR Y7, Y11, Y7
	VPSLLD $12, Y6, Y12
	VPSRLD $20, Y6, Y6
	VPXOR Y6, Y12, Y6
	VPSLLD $12, Y7, Y12
	VPSRLD $20, Y7, Y7
	VPXOR Y7, Y12, Y7
	VPADDD Y0, Y4, Y0
	VPADDD Y1, Y5, Y1
	VPXOR 96(BP), Y0, Y12
	VPXOR Y13, Y1, Y13
	VPADDD Y2, Y6, Y2
	VPADDD Y3, Y7, Y3
	VPXOR Y14, Y2, Y14
	VPXOR Y15, Y3, Y15
	VPSHUFB 480(BP), Y12, Y12
	VPSHUFB 480(BP), Y13, Y13
	VPADDD Y8, Y12, Y8
	VPADDD Y9, Y13, Y9
	VPSHUFB 480(BP), Y14, Y14
	VPSHUFB 480(BP), Y15, Y15
	VPADDD Y10, Y14, Y10
	VPADDD Y11, Y15, Y11
	VMOVDQA Y12, 96(BP)
	VPXOR Y4, Y8, Y4
	VPXOR Y5, Y9, Y5
	VPSLLD $7, Y4, Y12
	VPSRLD $25, Y4, Y4
	VPXOR Y4, Y12, Y4
	VPSLLD $7, Y5, Y12
	VPSRLD $25, Y5, Y5
	VPXOR Y5, Y12, Y5
	VPXOR Y6, Y10, Y6
	VPXOR Y7, Y11, Y7
	VPSLLD $7, Y6, Y12
	VPSRLD $25, Y6, Y6
	VPXOR Y6, Y12, Y6
	VPSLLD $7, Y7, Y12
	VPSRLD $25, Y7, Y7
	VPXOR Y7, Y12, Y7
	VPADDD Y0, Y5, Y0
	VPADDD Y1, Y6, Y1
	VPXOR Y15, Y0, Y15
	VPXOR 96(BP), Y1, Y12
	VPADDD Y2, Y7, Y2
	VPADDD Y3, Y4, Y3
	VPXOR Y13, Y2, Y13
	VPXOR Y14, Y3, Y14
	VPSHUFB 448(BP), Y15, Y15
	VPSHUFB 448(BP), Y12, Y12
	VPADDD Y10, Y15, Y10
	VPADDD Y11, Y12, Y11
	VPSHUFB 448(BP), Y13, Y13
	VPSHUFB 448(BP), Y14, Y14
	VPADDD Y8, Y13, Y8
	VPADDD Y9, Y14, Y9
	VMOVDQA Y15, 96(BP)
	VPXOR Y5, Y10, Y5
	VPXOR Y6, Y11, Y6
	VPSLLD $12, Y5, Y15
	VPSRLD $20, Y5, Y5
	VPXOR Y5, Y15, Y5
	VPSLLD $12, Y6, Y15
	VPSRLD $20, Y6, Y6
	VPXOR Y6, Y15, Y6
	VPXOR Y7, Y8, Y7
	VPXOR Y4, Y9, Y4
	VPSLLD $12, Y7, Y15
	VPSRLD $20, Y7, Y7
	VPXOR Y7, Y15, Y7
	VPSLLD $12, Y4, Y15
	VPSRLD $20, Y4, Y4
	VPXOR Y4, Y15, Y4
	VPADDD Y0, Y5, Y0
	VPADDD Y1, Y6, Y1
	VPXOR 96(BP), Y0, Y15
	VPXOR Y12, Y1, Y12
	VPADDD Y2, Y7, Y2
	VPADDD Y3, Y4, Y3
	VPXOR Y13, Y2, Y13
	VPXOR Y14, Y3, Y14
	VPSHUFB 480(BP), Y15, Y15
	VPSHUFB 480(BP), Y12, Y12
	VPADDD Y10, Y15, Y10
	VPADDD Y11, Y12, Y11
	VPSHUFB 480(BP), Y13, Y13
	VPSHUFB 480(BP), Y14, Y14
	VPADDD Y8, Y13, Y8
	VPADDD Y9, Y14, Y9
	VMOVDQA Y15, 96(BP)
	VPXOR Y5, Y10, Y5
	VPXOR Y6, Y11, Y6
	VPSLLD $7, Y5, Y15
	VPSRLD $25, Y5, Y5
	VPXOR Y5, Y15, Y5
	VPSLLD $7, Y6, Y15
	VPSRLD $25, Y6, Y6
	VPXOR Y6, Y15, Y6
	VPXOR Y7, Y8, Y7
	VPXOR Y4, Y9, Y4
	VPSLLD $7, Y7, Y15
	VPSRLD $25, Y7, Y7
	VPXOR Y7, Y15, Y7
	VPSLLD $7, Y4, Y15
	VPSRLD $25, Y4, Y4
	VPXOR Y4, Y15, Y4
	VMOVDQA 96(BP), Y15
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop1

	// Park rows 8..15 so their registers can be used as temporaries
	// while rows 0..7 are finalized.
	VMOVDQA Y8, 192(BP)
	VMOVDQA Y9, 224(BP)
	VMOVDQA Y10, 256(BP)
	VMOVDQA Y11, 288(BP)
	VMOVDQA Y12, 320(BP)
	VMOVDQA Y13, 352(BP)
	VMOVDQA Y14, 384(BP)
	VMOVDQA Y15, 416(BP)

	// Add the input state words 0..7 back in.
	VPBROADCASTD 0(BP), Y8
	VPBROADCASTD 4+0(BP), Y9
	VPBROADCASTD 8+0(BP), Y10
	VPBROADCASTD 12+0(BP), Y11
	VPBROADCASTD 16(BP), Y12
	VPBROADCASTD 4+16(BP), Y13
	VPBROADCASTD 8+16(BP), Y14
	VPBROADCASTD 12+16(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7

	// Interleave/permute from "one word across 8 blocks" into the first
	// 32 bytes of each of the 8 output blocks (Y8..Y15).
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	ANDQ SI, SI // Sets ZF when the input pointer is nil.
	JZ chacha_blocks_avx2_noinput1

	// XOR with the input and store bytes 0..31 of every block.
	VPXOR 0(SI), Y8, Y8
	VPXOR 64(SI), Y9, Y9
	VPXOR 128(SI), Y10, Y10
	VPXOR 192(SI), Y11, Y11
	VPXOR 256(SI), Y12, Y12
	VPXOR 320(SI), Y13, Y13
	VPXOR 384(SI), Y14, Y14
	VPXOR 448(SI), Y15, Y15
	VMOVDQU Y8, 0(DX)
	VMOVDQU Y9, 64(DX)
	VMOVDQU Y10, 128(DX)
	VMOVDQU Y11, 192(DX)
	VMOVDQU Y12, 256(DX)
	VMOVDQU Y13, 320(DX)
	VMOVDQU Y14, 384(DX)
	VMOVDQU Y15, 448(DX)

	// Reload rows 8..15, add the input state (including the per-block
	// counters) and produce bytes 32..63 of every block.
	VMOVDQA 192(BP), Y0
	VMOVDQA 224(BP), Y1
	VMOVDQA 256(BP), Y2
	VMOVDQA 288(BP), Y3
	VMOVDQA 320(BP), Y4
	VMOVDQA 352(BP), Y5
	VMOVDQA 384(BP), Y6
	VMOVDQA 416(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	VPXOR 32(SI), Y8, Y8
	VPXOR 96(SI), Y9, Y9
	VPXOR 160(SI), Y10, Y10
	VPXOR 224(SI), Y11, Y11
	VPXOR 288(SI), Y12, Y12
	VPXOR 352(SI), Y13, Y13
	VPXOR 416(SI), Y14, Y14
	VPXOR 480(SI), Y15, Y15
	VMOVDQU Y8, 32(DX)
	VMOVDQU Y9, 96(DX)
	VMOVDQU Y10, 160(DX)
	VMOVDQU Y11, 224(DX)
	VMOVDQU Y12, 288(DX)
	VMOVDQU Y13, 352(DX)
	VMOVDQU Y14, 416(DX)
	VMOVDQU Y15, 480(DX)
	ADDQ $512, SI
	JMP chacha_blocks_avx2_mainloop1_cont

	// Same as above, minus the XOR with the input: raw keystream.
chacha_blocks_avx2_noinput1:
	VMOVDQU Y8, 0(DX)
	VMOVDQU Y9, 64(DX)
	VMOVDQU Y10, 128(DX)
	VMOVDQU Y11, 192(DX)
	VMOVDQU Y12, 256(DX)
	VMOVDQU Y13, 320(DX)
	VMOVDQU Y14, 384(DX)
	VMOVDQU Y15, 448(DX)
	VMOVDQA 192(BP), Y0
	VMOVDQA 224(BP), Y1
	VMOVDQA 256(BP), Y2
	VMOVDQA 288(BP), Y3
	VMOVDQA 320(BP), Y4
	VMOVDQA 352(BP), Y5
	VMOVDQA 384(BP), Y6
	VMOVDQA 416(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	VMOVDQU Y8, 32(DX)
	VMOVDQU Y9, 96(DX)
	VMOVDQU Y10, 160(DX)
	VMOVDQU Y11, 224(DX)
	VMOVDQU Y12, 288(DX)
	VMOVDQU Y13, 352(DX)
	VMOVDQU Y14, 416(DX)
	VMOVDQU Y15, 480(DX)

chacha_blocks_avx2_mainloop1_cont:
	ADDQ $512, DX
	SUBQ $512, CX
	CMPQ CX, $512
	JAE chacha_blocks_avx2_atleast512
	CMPQ CX, $256
	JB chacha_blocks_avx2_below256_fixup

	// ---- 4 blocks (256 bytes) per iteration, one state word per XMM ----
chacha_blocks_avx2_atleast256:
	// Build the 4 per-block counter values and advance the stored
	// counter by 4.  Note that R9 is clobbered.
	MOVQ 48(BP), AX
	LEAQ 1(AX), R8
	LEAQ 2(AX), R9
	LEAQ 3(AX), R10
	LEAQ 4(AX), BX
	MOVL AX, 128(BP)
	MOVL R8, 4+128(BP)
	MOVL R9, 8+128(BP)
	MOVL R10, 12+128(BP)
	SHRQ $32, AX
	SHRQ $32, R8
	SHRQ $32, R9
	SHRQ $32, R10
	MOVL AX, 160(BP)
	MOVL R8, 4+160(BP)
	MOVL R9, 8+160(BP)
	MOVL R10, 12+160(BP)
	MOVQ BX, 48(BP)

	// MOVQ 64(BP), AX
	MOVQ $20, AX
	VPBROADCASTD 0(BP), X0
	VPBROADCASTD 4+0(BP), X1
	VPBROADCASTD 8+0(BP), X2
	VPBROADCASTD 12+0(BP), X3
	VPBROADCASTD 16(BP), X4
	VPBROADCASTD 4+16(BP), X5
	VPBROADCASTD 8+16(BP), X6
	VPBROADCASTD 12+16(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15

	// Same round structure as the 8-block loop, on 128-bit registers.
chacha_blocks_avx2_mainloop2:
	VPADDD X0, X4, X0
	VPADDD X1, X5, X1
	VPXOR X12, X0, X12
	VPXOR X13, X1, X13
	VPADDD X2, X6, X2
	VPADDD X3, X7, X3
	VPXOR X14, X2, X14
	VPXOR X15, X3, X15
	VPSHUFB 448(BP), X12, X12
	VPSHUFB 448(BP), X13, X13
	VPADDD X8, X12, X8
	VPADDD X9, X13, X9
	VPSHUFB 448(BP), X14, X14
	VPSHUFB 448(BP), X15, X15
	VPADDD X10, X14, X10
	VPADDD X11, X15, X11
	VMOVDQA X12, 96(BP)
	VPXOR X4, X8, X4
	VPXOR X5, X9, X5
	VPSLLD $12, X4, X12
	VPSRLD $20, X4, X4
	VPXOR X4, X12, X4
	VPSLLD $12, X5, X12
	VPSRLD $20, X5, X5
	VPXOR X5, X12, X5
	VPXOR X6, X10, X6
	VPXOR X7, X11, X7
	VPSLLD $12, X6, X12
	VPSRLD $20, X6, X6
	VPXOR X6, X12, X6
	VPSLLD $12, X7, X12
	VPSRLD $20, X7, X7
	VPXOR X7, X12, X7
	VPADDD X0, X4, X0
	VPADDD X1, X5, X1
	VPXOR 96(BP), X0, X12
	VPXOR X13, X1, X13
	VPADDD X2, X6, X2
	VPADDD X3, X7, X3
	VPXOR X14, X2, X14
	VPXOR X15, X3, X15
	VPSHUFB 480(BP), X12, X12
	VPSHUFB 480(BP), X13, X13
	VPADDD X8, X12, X8
	VPADDD X9, X13, X9
	VPSHUFB 480(BP), X14, X14
	VPSHUFB 480(BP), X15, X15
	VPADDD X10, X14, X10
	VPADDD X11, X15, X11
	VMOVDQA X12, 96(BP)
	VPXOR X4, X8, X4
	VPXOR X5, X9, X5
	VPSLLD $7, X4, X12
	VPSRLD $25, X4, X4
	VPXOR X4, X12, X4
	VPSLLD $7, X5, X12
	VPSRLD $25, X5, X5
	VPXOR X5, X12, X5
	VPXOR X6, X10, X6
	VPXOR X7, X11, X7
	VPSLLD $7, X6, X12
	VPSRLD $25, X6, X6
	VPXOR X6, X12, X6
	VPSLLD $7, X7, X12
	VPSRLD $25, X7, X7
	VPXOR X7, X12, X7
	VPADDD X0, X5, X0
	VPADDD X1, X6, X1
	VPXOR X15, X0, X15
	VPXOR 96(BP), X1, X12
	VPADDD X2, X7, X2
	VPADDD X3, X4, X3
	VPXOR X13, X2, X13
	VPXOR X14, X3, X14
	VPSHUFB 448(BP), X15, X15
	VPSHUFB 448(BP), X12, X12
	VPADDD X10, X15, X10
	VPADDD X11, X12, X11
	VPSHUFB 448(BP), X13, X13
	VPSHUFB 448(BP), X14, X14
	VPADDD X8, X13, X8
	VPADDD X9, X14, X9
	VMOVDQA X15, 96(BP)
	VPXOR X5, X10, X5
	VPXOR X6, X11, X6
	VPSLLD $12, X5, X15
	VPSRLD $20, X5, X5
	VPXOR X5, X15, X5
	VPSLLD $12, X6, X15
	VPSRLD $20, X6, X6
	VPXOR X6, X15, X6
	VPXOR X7, X8, X7
	VPXOR X4, X9, X4
	VPSLLD $12, X7, X15
	VPSRLD $20, X7, X7
	VPXOR X7, X15, X7
	VPSLLD $12, X4, X15
	VPSRLD $20, X4, X4
	VPXOR X4, X15, X4
	VPADDD X0, X5, X0
	VPADDD X1, X6, X1
	VPXOR 96(BP), X0, X15
	VPXOR X12, X1, X12
	VPADDD X2, X7, X2
	VPADDD X3, X4, X3
	VPXOR X13, X2, X13
	VPXOR X14, X3, X14
	VPSHUFB 480(BP), X15, X15
	VPSHUFB 480(BP), X12, X12
	VPADDD X10, X15, X10
	VPADDD X11, X12, X11
	VPSHUFB 480(BP), X13, X13
	VPSHUFB 480(BP), X14, X14
	VPADDD X8, X13, X8
	VPADDD X9, X14, X9
	VMOVDQA X15, 96(BP)
	VPXOR X5, X10, X5
	VPXOR X6, X11, X6
	VPSLLD $7, X5, X15
	VPSRLD $25, X5, X5
	VPXOR X5, X15, X5
	VPSLLD $7, X6, X15
	VPSRLD $25, X6, X6
	VPXOR X6, X15, X6
	VPXOR X7, X8, X7
	VPXOR X4, X9, X4
	VPSLLD $7, X7, X15
	VPSRLD $25, X7, X7
	VPXOR X7, X15, X7
	VPSLLD $7, X4, X15
	VPSRLD $25, X4, X4
	VPXOR X4, X15, X4
	VMOVDQA 96(BP), X15
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop2

	// Park rows 8..15, then finalize rows 0..7 (bytes 0..31 of each of
	// the 4 blocks).
	VMOVDQA X8, 192(BP)
	VMOVDQA X9, 208(BP)
	VMOVDQA X10, 224(BP)
	VMOVDQA X11, 240(BP)
	VMOVDQA X12, 256(BP)
	VMOVDQA X13, 272(BP)
	VMOVDQA X14, 288(BP)
	VMOVDQA X15, 304(BP)
	VPBROADCASTD 0(BP), X8
	VPBROADCASTD 4+0(BP), X9
	VPBROADCASTD 8+0(BP), X10
	VPBROADCASTD 12+0(BP), X11
	VPBROADCASTD 16(BP), X12
	VPBROADCASTD 4+16(BP), X13
	VPBROADCASTD 8+16(BP), X14
	VPBROADCASTD 12+16(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	ANDQ SI, SI // Sets ZF when the input pointer is nil.
	JZ chacha_blocks_avx2_noinput2
	VPXOR 0(SI), X0, X0
	VPXOR 16(SI), X1, X1
	VPXOR 64(SI), X2, X2
	VPXOR 80(SI), X3, X3
	VPXOR 128(SI), X4, X4
	VPXOR 144(SI), X5, X5
	VPXOR 192(SI), X6, X6
	VPXOR 208(SI), X7, X7
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 64(DX)
	VMOVDQU X3, 80(DX)
	VMOVDQU X4, 128(DX)
	VMOVDQU X5, 144(DX)
	VMOVDQU X6, 192(DX)
	VMOVDQU X7, 208(DX)

	// Rows 8..15 -> bytes 32..63 of each of the 4 blocks.
	VMOVDQA 192(BP), X0
	VMOVDQA 208(BP), X1
	VMOVDQA 224(BP), X2
	VMOVDQA 240(BP), X3
	VMOVDQA 256(BP), X4
	VMOVDQA 272(BP), X5
	VMOVDQA 288(BP), X6
	VMOVDQA 304(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	VPXOR 32(SI), X0, X0
	VPXOR 48(SI), X1, X1
	VPXOR 96(SI), X2, X2
	VPXOR 112(SI), X3, X3
	VPXOR 160(SI), X4, X4
	VPXOR 176(SI), X5, X5
	VPXOR 224(SI), X6, X6
	VPXOR 240(SI), X7, X7
	VMOVDQU X0, 32(DX)
	VMOVDQU X1, 48(DX)
	VMOVDQU X2, 96(DX)
	VMOVDQU X3, 112(DX)
	VMOVDQU X4, 160(DX)
	VMOVDQU X5, 176(DX)
	VMOVDQU X6, 224(DX)
	VMOVDQU X7, 240(DX)
	ADDQ $256, SI
	JMP chacha_blocks_avx2_mainloop2_cont

	// Same as above, minus the XOR with the input: raw keystream.
chacha_blocks_avx2_noinput2:
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 64(DX)
	VMOVDQU X3, 80(DX)
	VMOVDQU X4, 128(DX)
	VMOVDQU X5, 144(DX)
	VMOVDQU X6, 192(DX)
	VMOVDQU X7, 208(DX)
	VMOVDQA 192(BP), X0
	VMOVDQA 208(BP), X1
	VMOVDQA 224(BP), X2
	VMOVDQA 240(BP), X3
	VMOVDQA 256(BP), X4
	VMOVDQA 272(BP), X5
	VMOVDQA 288(BP), X6
	VMOVDQA 304(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	VMOVDQU X0, 32(DX)
	VMOVDQU X1, 48(DX)
	VMOVDQU X2, 96(DX)
	VMOVDQU X3, 112(DX)
	VMOVDQU X4, 160(DX)
	VMOVDQU X5, 176(DX)
	VMOVDQU X6, 224(DX)
	VMOVDQU X7, 240(DX)

chacha_blocks_avx2_mainloop2_cont:
	ADDQ $256, DX
	SUBQ $256, CX
	CMPQ CX, $256
	JAE chacha_blocks_avx2_atleast256

	// The wide paths clobbered X6..X11 and R9, and advanced the counter
	// in memory only; reload everything the one-block path expects.
chacha_blocks_avx2_below256_fixup:
	VMOVDQA 448(BP), X6
	VMOVDQA 480(BP), X7
	VMOVDQA 0(BP), X8
	VMOVDQA 16(BP), X9
	VMOVDQA 32(BP), X10
	VMOVDQA 48(BP), X11
	MOVQ $1, R9

	// ---- one block (64 bytes) per iteration; X8..X11 = state rows,
	// X6/X7 = rotate masks, X5 = counter increment ----
chacha_blocks_avx2_below256:
	VMOVQ R9, X5
	ANDQ CX, CX
	JZ chacha_blocks_avx2_done
	CMPQ CX, $64
	JAE chacha_blocks_avx2_above63

	// Final partial block: remember the real destination in R9, stage
	// the remaining input (if any) at BP, and process in place there.
	MOVQ DX, R9
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput3
	MOVQ CX, R10
	MOVQ BP, DX
	ADDQ R10, SI
	ADDQ R10, DX
	NEGQ R10

	// R10 counts up from -CX to 0, indexing back from the end pointers.
chacha_blocks_avx2_copyinput:
	MOVB (SI)(R10*1), AX
	MOVB AX, (DX)(R10*1)
	INCQ R10
	JNZ chacha_blocks_avx2_copyinput
	MOVQ BP, SI

chacha_blocks_avx2_noinput3:
	MOVQ BP, DX

chacha_blocks_avx2_above63:
	VMOVDQA X8, X0
	VMOVDQA X9, X1
	VMOVDQA X10, X2
	VMOVDQA X11, X3

	// MOVQ 64(BP), AX
	MOVQ $20, AX

	// Two rounds per iteration on a single block held as four rows; the
	// VPSHUFD lane rotations switch between the two round layouts.
chacha_blocks_avx2_mainloop3:
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X7, X3, X3
	VPSHUFD $0x93, X0, X0
	VPADDD X2, X3, X2
	VPSHUFD $0x4e, X3, X3
	VPXOR X1, X2, X1
	VPSHUFD $0x39, X2, X2
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X7, X3, X3
	VPSHUFD $0x39, X0, X0
	VPADDD X2, X3, X2
	VPSHUFD $0x4e, X3, X3
	VPXOR X1, X2, X1
	VPSHUFD $0x93, X2, X2
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPXOR X1, X4, X1
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop3

	// Add the input state back in, XOR with the input if present.
	VPADDD X0, X8, X0
	VPADDD X1, X9, X1
	VPADDD X2, X10, X2
	VPADDD X3, X11, X3
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput4
	VPXOR 0(SI), X0, X0
	VPXOR 16(SI), X1, X1
	VPXOR 32(SI), X2, X2
	VPXOR 48(SI), X3, X3
	ADDQ $64, SI

chacha_blocks_avx2_noinput4:
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 32(DX)
	VMOVDQU X3, 48(DX)
	VPADDQ X11, X5, X11 // Advance the 64-bit block counter by one.
	CMPQ CX, $64
	JBE chacha_blocks_avx2_mainloop3_finishup
	ADDQ $64, DX
	SUBQ $64, CX
	JMP chacha_blocks_avx2_below256

chacha_blocks_avx2_mainloop3_finishup:
	CMPQ CX, $64
	JE chacha_blocks_avx2_done

	// Partial block: copy the CX valid bytes from the bounce buffer to
	// the real destination saved in R9.
	ADDQ CX, R9
	ADDQ CX, DX
	NEGQ CX

chacha_blocks_avx2_copyoutput:
	MOVB (DX)(CX*1), AX
	MOVB AX, (R9)(CX*1)
	INCQ CX
	JNZ chacha_blocks_avx2_copyoutput

chacha_blocks_avx2_done:
	// Write state words 12..15 (with the advanced counter) back to s.
	VMOVDQU X11, 32(DI)

	VZEROUPPER
	RET

// func hChaChaAVX2(key, nonce []byte, dst *byte)
//
// Runs the 20 ChaCha rounds over {constants, key, nonce} and stores rows
// 0 and 3 of the result to dst, without the final addition of the input
// state.
//
// Reads 32 bytes at key's data pointer and 16 bytes at nonce's data
// pointer, and writes 32 bytes at dst.  The slice lengths are not checked
// here, so the caller must guarantee them.
//
// Registers: DI = key, SI = nonce, DX = dst, CX = remaining rounds,
// X0..X3 = state rows, X4 = rotate temporary, X6 = rotate-left-16 mask,
// X5 = rotate-left-8 mask.  AX and flags are also clobbered.
TEXT ·hChaChaAVX2(SB), NOSPLIT|NOFRAME, $0-56
	MOVQ key+0(FP), DI
	MOVQ nonce+24(FP), SI
	MOVQ dst+48(FP), DX

	MOVL $20, CX

	// Unaligned loads, matching blocksAVX2: nothing in this file requests
	// 16-byte alignment for chacha_constants, and an aligned load would
	// fault if the linker ever placed the table differently.
	LEAQ ·chacha_constants<>(SB), AX
	VMOVDQU 0(AX), X0
	VMOVDQU 16(AX), X6
	VMOVDQU 32(AX), X5

	VMOVDQU 0(DI), X1
	VMOVDQU 16(DI), X2
	VMOVDQU 0(SI), X3

	// Two rounds per iteration; the VPSHUFD lane rotations switch between
	// the two round layouts and back.
hchacha_mainloop_avx2:
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X5, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPSHUFD $0x93, X0, X0
	VPXOR X1, X4, X1
	VPSHUFD $0x4e, X3, X3
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPSHUFD $0x39, X2, X2
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X5, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSHUFD $0x39, X0, X0
	VPSLLD $7, X1, X4
	VPSHUFD $0x4e, X3, X3
	VPSRLD $25, X1, X1
	VPSHUFD $0x93, X2, X2
	VPXOR X1, X4, X1
	SUBL $2, CX
	JNE hchacha_mainloop_avx2

	VMOVDQU X0, (DX)
	VMOVDQU X3, 16(DX)

	VZEROUPPER
	RET

// func blocksSSSE3(s *[api.StateSize]uint32, in, out []byte)
TEXT ·blocksSSSE3(SB), NOSPLIT, $576-56
1027 // This is Andrew Moon's SSSE3 ChaCha implementation taken from 1028 // supercop-20190110, with some minor changes, primarily calling 1029 // convention and assembly dialect related. 1030 1031 // Align the stack on a 64 byte boundary. 1032 MOVQ SP, BP 1033 ADDQ $64, BP 1034 ANDQ $-64, BP 1035 1036 // Go calling convention -> SYSV AMD64 (and a fixup). 1037 MOVQ s+0(FP), DI // &s -> DI 1038 ADDQ $16, DI // Skip the ChaCha constants in the chachaState. 1039 MOVQ in+8(FP), SI // &in[0] -> SI 1040 MOVQ out+32(FP), DX // &out[0] -> DX 1041 MOVQ in_len+16(FP), CX // len(in) -> CX 1042 1043 // Begin the main body of `chacha_blocks_ssse3`. 1044 // 1045 // Mostly a direct translation except: 1046 // * The number of rounds is always 20. 1047 // * %rbp is used instead of BP. 1048 LEAQ ·chacha_constants<>(SB), AX 1049 MOVO 0(AX), X8 1050 MOVO 16(AX), X6 1051 MOVO 32(AX), X7 1052 MOVOU 0(DI), X9 1053 MOVOU 16(DI), X10 1054 MOVOU 32(DI), X11 1055 1056 // MOVQ 48(DI), AX 1057 MOVQ $1, R9 1058 MOVO X8, 0(BP) 1059 MOVO X9, 16(BP) 1060 MOVO X10, 32(BP) 1061 MOVO X11, 48(BP) 1062 1063 MOVO X6, 80(BP) 1064 MOVO X7, 96(BP) 1065 // MOVQ AX, 64(BP) 1066 CMPQ CX, $256 1067 JB chacha_blocks_ssse3_below256 1068 PSHUFD $0x00, X8, X0 1069 PSHUFD $0x55, X8, X1 1070 PSHUFD $0xaa, X8, X2 1071 PSHUFD $0xff, X8, X3 1072 MOVO X0, 128(BP) 1073 MOVO X1, 144(BP) 1074 MOVO X2, 160(BP) 1075 MOVO X3, 176(BP) 1076 PSHUFD $0x00, X9, X0 1077 PSHUFD $0x55, X9, X1 1078 PSHUFD $0xaa, X9, X2 1079 PSHUFD $0xff, X9, X3 1080 MOVO X0, 192(BP) 1081 MOVO X1, 208(BP) 1082 MOVO X2, 224(BP) 1083 MOVO X3, 240(BP) 1084 PSHUFD $0x00, X10, X0 1085 PSHUFD $0x55, X10, X1 1086 PSHUFD $0xaa, X10, X2 1087 PSHUFD $0xff, X10, X3 1088 MOVO X0, 256(BP) 1089 MOVO X1, 272(BP) 1090 MOVO X2, 288(BP) 1091 MOVO X3, 304(BP) 1092 PSHUFD $0xaa, X11, X0 1093 PSHUFD $0xff, X11, X1 1094 MOVO X0, 352(BP) 1095 MOVO X1, 368(BP) 1096 JMP chacha_blocks_ssse3_atleast256 1097 1098 // .p2align 6,,63 1099 // # align to 4 mod 64 1100 // 
nop;nop;nop;nop; 1101 chacha_blocks_ssse3_atleast256: 1102 MOVQ 48(BP), AX 1103 LEAQ 1(AX), R8 1104 LEAQ 2(AX), R9 1105 LEAQ 3(AX), R10 1106 LEAQ 4(AX), BX 1107 MOVL AX, 320(BP) 1108 MOVL R8, 4+320(BP) 1109 MOVL R9, 8+320(BP) 1110 MOVL R10, 12+320(BP) 1111 SHRQ $32, AX 1112 SHRQ $32, R8 1113 SHRQ $32, R9 1114 SHRQ $32, R10 1115 MOVL AX, 336(BP) 1116 MOVL R8, 4+336(BP) 1117 MOVL R9, 8+336(BP) 1118 MOVL R10, 12+336(BP) 1119 MOVQ BX, 48(BP) 1120 1121 // MOVQ 64(BP), AX 1122 MOVQ $20, AX 1123 MOVO 128(BP), X0 1124 MOVO 144(BP), X1 1125 MOVO 160(BP), X2 1126 MOVO 176(BP), X3 1127 MOVO 192(BP), X4 1128 MOVO 208(BP), X5 1129 MOVO 224(BP), X6 1130 MOVO 240(BP), X7 1131 MOVO 256(BP), X8 1132 MOVO 272(BP), X9 1133 MOVO 288(BP), X10 1134 MOVO 304(BP), X11 1135 MOVO 320(BP), X12 1136 MOVO 336(BP), X13 1137 MOVO 352(BP), X14 1138 MOVO 368(BP), X15 1139 1140 chacha_blocks_ssse3_mainloop1: 1141 PADDD X4, X0 1142 PADDD X5, X1 1143 PXOR X0, X12 1144 PXOR X1, X13 1145 PADDD X6, X2 1146 PADDD X7, X3 1147 PXOR X2, X14 1148 PXOR X3, X15 1149 PSHUFB 80(BP), X12 1150 PSHUFB 80(BP), X13 1151 PADDD X12, X8 1152 PADDD X13, X9 1153 PSHUFB 80(BP), X14 1154 PSHUFB 80(BP), X15 1155 PADDD X14, X10 1156 PADDD X15, X11 1157 MOVO X12, 112(BP) 1158 PXOR X8, X4 1159 PXOR X9, X5 1160 MOVO X4, X12 1161 PSLLL $ 12, X4 1162 PSRLL $20, X12 1163 PXOR X12, X4 1164 MOVO X5, X12 1165 PSLLL $ 12, X5 1166 PSRLL $20, X12 1167 PXOR X12, X5 1168 PXOR X10, X6 1169 PXOR X11, X7 1170 MOVO X6, X12 1171 PSLLL $ 12, X6 1172 PSRLL $20, X12 1173 PXOR X12, X6 1174 MOVO X7, X12 1175 PSLLL $ 12, X7 1176 PSRLL $20, X12 1177 PXOR X12, X7 1178 MOVO 112(BP), X12 1179 PADDD X4, X0 1180 PADDD X5, X1 1181 PXOR X0, X12 1182 PXOR X1, X13 1183 PADDD X6, X2 1184 PADDD X7, X3 1185 PXOR X2, X14 1186 PXOR X3, X15 1187 PSHUFB 96(BP), X12 1188 PSHUFB 96(BP), X13 1189 PADDD X12, X8 1190 PADDD X13, X9 1191 PSHUFB 96(BP), X14 1192 PSHUFB 96(BP), X15 1193 PADDD X14, X10 1194 PADDD X15, X11 1195 MOVO X12, 112(BP) 1196 PXOR X8, X4 1197 PXOR X9, 
X5 1198 MOVO X4, X12 1199 PSLLL $ 7, X4 1200 PSRLL $25, X12 1201 PXOR X12, X4 1202 MOVO X5, X12 1203 PSLLL $ 7, X5 1204 PSRLL $25, X12 1205 PXOR X12, X5 1206 PXOR X10, X6 1207 PXOR X11, X7 1208 MOVO X6, X12 1209 PSLLL $ 7, X6 1210 PSRLL $25, X12 1211 PXOR X12, X6 1212 MOVO X7, X12 1213 PSLLL $ 7, X7 1214 PSRLL $25, X12 1215 PXOR X12, X7 1216 MOVO 112(BP), X12 1217 PADDD X5, X0 1218 PADDD X6, X1 1219 PXOR X0, X15 1220 PXOR X1, X12 1221 PADDD X7, X2 1222 PADDD X4, X3 1223 PXOR X2, X13 1224 PXOR X3, X14 1225 PSHUFB 80(BP), X15 1226 PSHUFB 80(BP), X12 1227 PADDD X15, X10 1228 PADDD X12, X11 1229 PSHUFB 80(BP), X13 1230 PSHUFB 80(BP), X14 1231 PADDD X13, X8 1232 PADDD X14, X9 1233 MOVO X15, 112(BP) 1234 PXOR X10, X5 1235 PXOR X11, X6 1236 MOVO X5, X15 1237 PSLLL $ 12, X5 1238 PSRLL $20, X15 1239 PXOR X15, X5 1240 MOVO X6, X15 1241 PSLLL $ 12, X6 1242 PSRLL $20, X15 1243 PXOR X15, X6 1244 PXOR X8, X7 1245 PXOR X9, X4 1246 MOVO X7, X15 1247 PSLLL $ 12, X7 1248 PSRLL $20, X15 1249 PXOR X15, X7 1250 MOVO X4, X15 1251 PSLLL $ 12, X4 1252 PSRLL $20, X15 1253 PXOR X15, X4 1254 MOVO 112(BP), X15 1255 PADDD X5, X0 1256 PADDD X6, X1 1257 PXOR X0, X15 1258 PXOR X1, X12 1259 PADDD X7, X2 1260 PADDD X4, X3 1261 PXOR X2, X13 1262 PXOR X3, X14 1263 PSHUFB 96(BP), X15 1264 PSHUFB 96(BP), X12 1265 PADDD X15, X10 1266 PADDD X12, X11 1267 PSHUFB 96(BP), X13 1268 PSHUFB 96(BP), X14 1269 PADDD X13, X8 1270 PADDD X14, X9 1271 MOVO X15, 112(BP) 1272 PXOR X10, X5 1273 PXOR X11, X6 1274 MOVO X5, X15 1275 PSLLL $ 7, X5 1276 PSRLL $25, X15 1277 PXOR X15, X5 1278 MOVO X6, X15 1279 PSLLL $ 7, X6 1280 PSRLL $25, X15 1281 PXOR X15, X6 1282 PXOR X8, X7 1283 PXOR X9, X4 1284 MOVO X7, X15 1285 PSLLL $ 7, X7 1286 PSRLL $25, X15 1287 PXOR X15, X7 1288 MOVO X4, X15 1289 PSLLL $ 7, X4 1290 PSRLL $25, X15 1291 PXOR X15, X4 1292 SUBQ $2, AX 1293 MOVO 112(BP), X15 1294 JNZ chacha_blocks_ssse3_mainloop1 1295 PADDD 128(BP), X0 1296 PADDD 144(BP), X1 1297 PADDD 160(BP), X2 1298 PADDD 176(BP), X3 1299 PADDD 
192(BP), X4 1300 PADDD 208(BP), X5 1301 PADDD 224(BP), X6 1302 PADDD 240(BP), X7 1303 PADDD 256(BP), X8 1304 PADDD 272(BP), X9 1305 PADDD 288(BP), X10 1306 PADDD 304(BP), X11 1307 PADDD 320(BP), X12 1308 PADDD 336(BP), X13 1309 PADDD 352(BP), X14 1310 PADDD 368(BP), X15 1311 MOVO X8, 384(BP) 1312 MOVO X9, 400(BP) 1313 MOVO X10, 416(BP) 1314 MOVO X11, 432(BP) 1315 MOVO X12, 448(BP) 1316 MOVO X13, 464(BP) 1317 MOVO X14, 480(BP) 1318 MOVO X15, 496(BP) 1319 MOVO X0, X8 1320 MOVO X2, X9 1321 MOVO X4, X10 1322 MOVO X6, X11 1323 PUNPCKHLQ X1, X0 1324 PUNPCKHLQ X3, X2 1325 PUNPCKHLQ X5, X4 1326 PUNPCKHLQ X7, X6 1327 PUNPCKLLQ X1, X8 1328 PUNPCKLLQ X3, X9 1329 PUNPCKLLQ X5, X10 1330 PUNPCKLLQ X7, X11 1331 MOVO X0, X1 1332 MOVO X4, X3 1333 MOVO X8, X5 1334 MOVO X10, X7 1335 PUNPCKHQDQ X2, X0 1336 PUNPCKHQDQ X6, X4 1337 PUNPCKHQDQ X9, X8 1338 PUNPCKHQDQ X11, X10 1339 PUNPCKLQDQ X2, X1 1340 PUNPCKLQDQ X6, X3 1341 PUNPCKLQDQ X9, X5 1342 PUNPCKLQDQ X11, X7 1343 ANDQ SI, SI 1344 JZ chacha_blocks_ssse3_noinput1 1345 MOVOU 0(SI), X2 1346 MOVOU 16(SI), X6 1347 MOVOU 64(SI), X9 1348 MOVOU 80(SI), X11 1349 MOVOU 128(SI), X12 1350 MOVOU 144(SI), X13 1351 MOVOU 192(SI), X14 1352 MOVOU 208(SI), X15 1353 PXOR X2, X5 1354 PXOR X6, X7 1355 PXOR X9, X8 1356 PXOR X11, X10 1357 PXOR X12, X1 1358 PXOR X13, X3 1359 PXOR X14, X0 1360 PXOR X15, X4 1361 MOVOU X5, 0(DX) 1362 MOVOU X7, 16(DX) 1363 MOVOU X8, 64(DX) 1364 MOVOU X10, 80(DX) 1365 MOVOU X1, 128(DX) 1366 MOVOU X3, 144(DX) 1367 MOVOU X0, 192(DX) 1368 MOVOU X4, 208(DX) 1369 MOVO 384(BP), X0 1370 MOVO 400(BP), X1 1371 MOVO 416(BP), X2 1372 MOVO 432(BP), X3 1373 MOVO 448(BP), X4 1374 MOVO 464(BP), X5 1375 MOVO 480(BP), X6 1376 MOVO 496(BP), X7 1377 MOVO X0, X8 1378 MOVO X2, X9 1379 MOVO X4, X10 1380 MOVO X6, X11 1381 PUNPCKLLQ X1, X8 1382 PUNPCKLLQ X3, X9 1383 PUNPCKHLQ X1, X0 1384 PUNPCKHLQ X3, X2 1385 PUNPCKLLQ X5, X10 1386 PUNPCKLLQ X7, X11 1387 PUNPCKHLQ X5, X4 1388 PUNPCKHLQ X7, X6 1389 MOVO X8, X1 1390 MOVO X0, X3 1391 MOVO X10, X5 1392 
MOVO X4, X7 1393 PUNPCKLQDQ X9, X1 1394 PUNPCKLQDQ X11, X5 1395 PUNPCKHQDQ X9, X8 1396 PUNPCKHQDQ X11, X10 1397 PUNPCKLQDQ X2, X3 1398 PUNPCKLQDQ X6, X7 1399 PUNPCKHQDQ X2, X0 1400 PUNPCKHQDQ X6, X4 1401 MOVOU 32(SI), X2 1402 MOVOU 48(SI), X6 1403 MOVOU 96(SI), X9 1404 MOVOU 112(SI), X11 1405 MOVOU 160(SI), X12 1406 MOVOU 176(SI), X13 1407 MOVOU 224(SI), X14 1408 MOVOU 240(SI), X15 1409 PXOR X2, X1 1410 PXOR X6, X5 1411 PXOR X9, X8 1412 PXOR X11, X10 1413 PXOR X12, X3 1414 PXOR X13, X7 1415 PXOR X14, X0 1416 PXOR X15, X4 1417 MOVOU X1, 32(DX) 1418 MOVOU X5, 48(DX) 1419 MOVOU X8, 96(DX) 1420 MOVOU X10, 112(DX) 1421 MOVOU X3, 160(DX) 1422 MOVOU X7, 176(DX) 1423 MOVOU X0, 224(DX) 1424 MOVOU X4, 240(DX) 1425 ADDQ $256, SI 1426 JMP chacha_blocks_ssse3_mainloop_cont 1427 1428 chacha_blocks_ssse3_noinput1: 1429 MOVOU X5, 0(DX) 1430 MOVOU X7, 16(DX) 1431 MOVOU X8, 64(DX) 1432 MOVOU X10, 80(DX) 1433 MOVOU X1, 128(DX) 1434 MOVOU X3, 144(DX) 1435 MOVOU X0, 192(DX) 1436 MOVOU X4, 208(DX) 1437 MOVO 384(BP), X0 1438 MOVO 400(BP), X1 1439 MOVO 416(BP), X2 1440 MOVO 432(BP), X3 1441 MOVO 448(BP), X4 1442 MOVO 464(BP), X5 1443 MOVO 480(BP), X6 1444 MOVO 496(BP), X7 1445 MOVO X0, X8 1446 MOVO X2, X9 1447 MOVO X4, X10 1448 MOVO X6, X11 1449 PUNPCKLLQ X1, X8 1450 PUNPCKLLQ X3, X9 1451 PUNPCKHLQ X1, X0 1452 PUNPCKHLQ X3, X2 1453 PUNPCKLLQ X5, X10 1454 PUNPCKLLQ X7, X11 1455 PUNPCKHLQ X5, X4 1456 PUNPCKHLQ X7, X6 1457 MOVO X8, X1 1458 MOVO X0, X3 1459 MOVO X10, X5 1460 MOVO X4, X7 1461 PUNPCKLQDQ X9, X1 1462 PUNPCKLQDQ X11, X5 1463 PUNPCKHQDQ X9, X8 1464 PUNPCKHQDQ X11, X10 1465 PUNPCKLQDQ X2, X3 1466 PUNPCKLQDQ X6, X7 1467 PUNPCKHQDQ X2, X0 1468 PUNPCKHQDQ X6, X4 1469 MOVOU X1, 32(DX) 1470 MOVOU X5, 48(DX) 1471 MOVOU X8, 96(DX) 1472 MOVOU X10, 112(DX) 1473 MOVOU X3, 160(DX) 1474 MOVOU X7, 176(DX) 1475 MOVOU X0, 224(DX) 1476 MOVOU X4, 240(DX) 1477 1478 chacha_blocks_ssse3_mainloop_cont: 1479 ADDQ $256, DX 1480 SUBQ $256, CX 1481 CMPQ CX, $256 1482 JAE chacha_blocks_ssse3_atleast256 1483 
MOVO 80(BP), X6 1484 MOVO 96(BP), X7 1485 MOVO 0(BP), X8 1486 MOVO 16(BP), X9 1487 MOVO 32(BP), X10 1488 MOVO 48(BP), X11 1489 MOVQ $1, R9 1490 1491 chacha_blocks_ssse3_below256: 1492 MOVQ R9, X5 1493 ANDQ CX, CX 1494 JZ chacha_blocks_ssse3_done 1495 CMPQ CX, $64 1496 JAE chacha_blocks_ssse3_above63 1497 MOVQ DX, R9 1498 ANDQ SI, SI 1499 JZ chacha_blocks_ssse3_noinput2 1500 MOVQ CX, R10 1501 MOVQ BP, DX 1502 ADDQ R10, SI 1503 ADDQ R10, DX 1504 NEGQ R10 1505 1506 chacha_blocks_ssse3_copyinput: 1507 MOVB (SI)(R10*1), AX 1508 MOVB AX, (DX)(R10*1) 1509 INCQ R10 1510 JNZ chacha_blocks_ssse3_copyinput 1511 MOVQ BP, SI 1512 1513 chacha_blocks_ssse3_noinput2: 1514 MOVQ BP, DX 1515 1516 chacha_blocks_ssse3_above63: 1517 MOVO X8, X0 1518 MOVO X9, X1 1519 MOVO X10, X2 1520 MOVO X11, X3 1521 1522 // MOVQ 64(BP), AX 1523 MOVQ $20, AX 1524 1525 chacha_blocks_ssse3_mainloop2: 1526 PADDD X1, X0 1527 PXOR X0, X3 1528 PSHUFB X6, X3 1529 PADDD X3, X2 1530 PXOR X2, X1 1531 MOVO X1, X4 1532 PSLLL $12, X4 1533 PSRLL $20, X1 1534 PXOR X4, X1 1535 PADDD X1, X0 1536 PXOR X0, X3 1537 PSHUFB X7, X3 1538 PSHUFD $0x93, X0, X0 1539 PADDD X3, X2 1540 PSHUFD $0x4e, X3, X3 1541 PXOR X2, X1 1542 PSHUFD $0x39, X2, X2 1543 MOVO X1, X4 1544 PSLLL $7, X4 1545 PSRLL $25, X1 1546 PXOR X4, X1 1547 PADDD X1, X0 1548 PXOR X0, X3 1549 PSHUFB X6, X3 1550 PADDD X3, X2 1551 PXOR X2, X1 1552 MOVO X1, X4 1553 PSLLL $12, X4 1554 PSRLL $20, X1 1555 PXOR X4, X1 1556 PADDD X1, X0 1557 PXOR X0, X3 1558 PSHUFB X7, X3 1559 PSHUFD $0x39, X0, X0 1560 PADDD X3, X2 1561 PSHUFD $0x4e, X3, X3 1562 PXOR X2, X1 1563 PSHUFD $0x93, X2, X2 1564 MOVO X1, X4 1565 PSLLL $7, X4 1566 PSRLL $25, X1 1567 PXOR X4, X1 1568 SUBQ $2, AX 1569 JNZ chacha_blocks_ssse3_mainloop2 1570 PADDD X8, X0 1571 PADDD X9, X1 1572 PADDD X10, X2 1573 PADDD X11, X3 1574 ANDQ SI, SI 1575 JZ chacha_blocks_ssse3_noinput3 1576 MOVOU 0(SI), X12 1577 MOVOU 16(SI), X13 1578 MOVOU 32(SI), X14 1579 MOVOU 48(SI), X15 1580 PXOR X12, X0 1581 PXOR X13, X1 1582 PXOR X14, 
// HCHACHA_HALF_ROUND runs one ChaCha round over the state held as four
// 128-bit rows: X0 (a), X1 (b), X2 (c), X3 (d).  X4 is scratch.  X5 and X6
// must hold the PSHUFB masks loaded from offsets 16 and 32 of
// ·chacha_constants<> (the 16-bit and 8-bit per-word rotates).  The 12-bit
// and 7-bit rotates are built from a shift-left / shift-right / xor triple.
//
// rotA, rotD and rotC are the PSHUFD immediates applied to rows a, d and c
// once the row has received its last update for the round; they realign the
// lanes for the following round.
//
// No instruction in the macro modifies EFLAGS.
#define HCHACHA_HALF_ROUND(rotA, rotD, rotC) \
	PADDD  X1, X0;        \
	PXOR   X0, X3;        \
	PSHUFB X5, X3;        \
	PADDD  X3, X2;        \
	PXOR   X2, X1;        \
	MOVO   X1, X4;        \
	PSLLL  $12, X1;       \
	PSRLL  $20, X4;       \
	PXOR   X4, X1;        \
	PADDD  X1, X0;        \
	PXOR   X0, X3;        \
	PSHUFB X6, X3;        \
	PSHUFD $rotA, X0, X0; \
	PADDD  X3, X2;        \
	PSHUFD $rotD, X3, X3; \
	PXOR   X2, X1;        \
	PSHUFD $rotC, X2, X2; \
	MOVO   X1, X4;        \
	PSLLL  $7, X1;        \
	PSRLL  $25, X4;       \
	PXOR   X4, X1

// func hChaChaSSSE3(key, nonce []byte, dst *byte)
//
// Reads 32 bytes from key's backing array and 16 bytes from nonce's backing
// array, runs 20 rounds over the state { constants, key, nonce }, and writes
// rows a and d (32 bytes in total) to dst.  The slice lengths are not
// examined here, so the caller is responsible for providing buffers of
// sufficient size.
//
// Register use:
//   DI = &key[0]   SI = &nonce[0]   DX = dst   CX = rounds remaining
//   AX = &chacha_constants
//   X0..X3 = state rows a..d, X4 = scratch, X5/X6 = PSHUFB rotate masks
TEXT ·hChaChaSSSE3(SB), NOSPLIT|NOFRAME, $0-56
	// Fetch the three argument pointers.
	MOVQ dst+48(FP), DX
	MOVQ nonce+24(FP), SI
	MOVQ key+0(FP), DI

	// Row a and the two shuffle masks come from the constant table.
	LEAQ ·chacha_constants<>(SB), AX
	MOVO 0(AX), X0
	MOVO 16(AX), X5
	MOVO 32(AX), X6

	// Rows b and c are the key, row d is the nonce.
	MOVOU 0(DI), X1
	MOVOU 16(DI), X2
	MOVOU 0(SI), X3

	// 20 rounds, consumed two per loop iteration.
	MOVL $20, CX

hchacha_ssse3_doubleround:
	HCHACHA_HALF_ROUND(0x93, 0x4e, 0x39)
	HCHACHA_HALF_ROUND(0x39, 0x4e, 0x93)
	SUBQ $2, CX
	JA   hchacha_ssse3_doubleround

	// Only rows a and d are emitted.
	MOVOU X0, 0(DX)
	MOVOU X3, 16(DX)

	RET

#undef HCHACHA_HALF_ROUND