// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

//go:build amd64 && !gccgo && !appengine && !nacl
// +build amd64,!gccgo,!appengine,!nacl

// Syntax: Go (Plan 9 style) assembler, amd64. Operand order is "op src, dst".
// Arguments and results are addressed through the FP pseudo-register.
//
// The symbols ·sigma, ·one, ·rol16 and ·rol8 and the macros CHACHA_QROUND_*,
// CHACHA_SHUFFLE_*, XOR_SSE and XOR_AVX are not defined in this file; they are
// expected to come from the two includes below. Notes about them in this file
// describe only how they are used here.

#include "const.s"
#include "macro.s"

// FINALIZE xors len bytes from src and block using
// the temp. registers t0 and t1 and writes the result
// to dst.
//
// The loop tests its condition at the bottom, so one byte is always
// processed: callers must pass len >= 1. dst, src and block are advanced by
// the number of bytes processed and len is counted down, i.e. all four
// register arguments are clobbered (as are t0, t1 and the flags).
//
// The macro emits the fixed label FINALIZE_LOOP; every function in this file
// expands it exactly once.
//
// No comments are placed inside the definition because a line comment would
// swallow the line-continuation backslash.
#define FINALIZE(dst, src, block, len, t0, t1) \
	XORQ t0, t0;       \
	XORQ t1, t1;       \
	FINALIZE_LOOP:;    \
	MOVB 0(src), t0;   \
	MOVB 0(block), t1; \
	XORQ t0, t1;       \
	MOVB t1, 0(dst);   \
	INCQ src;          \
	INCQ block;        \
	INCQ dst;          \
	DECQ len;          \
	JG   FINALIZE_LOOP

// Register roles for initialize.
#define Dst DI
#define Nonce AX
#define Key BX

// func initialize(state *[64]byte, key []byte, nonce *[16]byte)
//
// initialize fills the 64-byte state with four 16-byte rows:
//   row 0 = ·sigma
//   row 1 = key[0:16]
//   row 2 = key[16:32]
//   row 3 = nonce[0:16]
//
// Only the base pointer of key is read; its length is not checked here, so
// the caller must pass at least 32 bytes of key material.
// The TEXT flag 4 is the numeric value of NOSPLIT in Go's textflag.h.
// Frame: no locals, 40 bytes of arguments (8 + 24 + 8).
TEXT ·initialize(SB), 4, $0-40
	MOVQ state+0(FP), Dst
	MOVQ key_base+8(FP), Key
	MOVQ nonce+32(FP), Nonce

	MOVOU ·sigma<>(SB), X0
	MOVOU 0*16(Key), X1
	MOVOU 1*16(Key), X2
	MOVOU 0*16(Nonce), X3

	MOVOU X0, 0*16(Dst)
	MOVOU X1, 1*16(Dst)
	MOVOU X2, 2*16(Dst)
	MOVOU X3, 3*16(Dst)
	RET

#undef Dst
#undef Nonce
#undef Key

// Register roles shared by the three xorKeyStream* functions below.
//   Dst, Src  current write / read position
//   Len       bytes of src still to process; also the return value
//   Rounds    round count argument (copied to Tmp0 for every loop)
//   Buffer    *block, receives the last keystream block if it is partly used
//   State     *state
//   Stack     16-byte aligned scratch area inside the local frame
//   SavedSP   original SP, restored at DONE
//   Tmp0-2    loop counter and FINALIZE scratch
#define Dst DI
#define Src SI
#define Len R12
#define Rounds DX
#define Buffer BX
#define State AX
#define Stack SP
#define SavedSP R8
#define Tmp0 R9
#define Tmp1 R10
#define Tmp2 R11

// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
//
// xorKeyStreamSSE2 XORs len(src) bytes of src with generated keystream and
// writes the result to dst. The length of dst is never read, so the caller
// must make sure dst can hold len(src) bytes.
//
// The four 16-byte rows of *state are loaded into X0-X3. Depending on how
// many bytes remain, one of four code paths generates 4, 3, 2 or 1 blocks of
// 64 bytes per pass. After every generated block row 3 (X3) is advanced by
// ·one with a 64-bit lane add.
//
// If fewer than 64 bytes remain for the last generated block, the complete
// block is stored to *block and the remaining bytes are XORed one at a time
// by FINALIZE.
//
// On return row 3 of *state (bytes 48..63) has been overwritten with X3. The
// return value is Len as it stands at DONE: 0 if src was empty or ended on a
// 64-byte boundary, otherwise the number of bytes taken from the block that
// was stored to *block.
//
// rounds is consumed two at a time and the loops exit only when the counter
// reaches exactly zero, so it must be positive and even.
//
// Stack slots (relative to the aligned Stack pointer):
//   0*16..3*16  copy of state rows X0..X3 (slot 3 is refreshed on every
//               pass through the 256 byte path)
//   4*16        ·one
//   5*16        spill slot
// Frame: 6 slots * 16 bytes + 16 bytes alignment slack = 112 bytes.
TEXT ·xorKeyStreamSSE2(SB), 4, $112-80
	MOVQ dst_base+0(FP), Dst
	MOVQ src_base+24(FP), Src
	MOVQ block+48(FP), Buffer
	MOVQ state+56(FP), State
	MOVQ rounds+64(FP), Rounds
	MOVQ src_len+32(FP), Len

	MOVOU 0*16(State), X0
	MOVOU 1*16(State), X1
	MOVOU 2*16(State), X2
	MOVOU 3*16(State), X3

	// Round SP up to the next 16-byte boundary so that the aligned MOVO
	// forms can be used on the scratch slots. All arguments were read above;
	// SP is restored at DONE before the result is stored through FP.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JZ DONE

	MOVOU ·one<>(SB), X4
	MOVO X0, 0*16(Stack)
	MOVO X1, 1*16(Stack)
	MOVO X2, 2*16(Stack)
	MOVO X3, 3*16(Stack)
	MOVO X4, 4*16(Stack)

	// Pick the smallest variant that covers the remaining input.
	CMPQ Len, $64
	JLE GENERATE_KEYSTREAM_64
	CMPQ Len, $128
	JLE GENERATE_KEYSTREAM_128
	CMPQ Len, $192
	JLE GENERATE_KEYSTREAM_192

	// Four blocks per pass. Working copies and their row 3 value:
	//   X0-X3    offset 0    (X3)
	//   X12-X15  offset 64   (X3 + 1*one)
	//   X8-X11   offset 128  (X3 + 2*one)
	//   X4-X7    offset 192  (X3 + 3*one)
GENERATE_KEYSTREAM_256:
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	PADDQ 4*16(Stack), X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X15, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO X3, 3*16(Stack) // Save X3

	// All 16 XMM registers hold block state here, so the register handed to
	// CHACHA_QROUND_SSE2 as its last argument is parked in slot 5 first and
	// reloaded afterwards.
CHACHA_LOOP_256:
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_256

	// Add the saved input state to the first block and use it at offset 0.
	PADDL 0*16(Stack), X0
	PADDL 1*16(Stack), X1
	PADDL 2*16(Stack), X2
	PADDL 3*16(Stack), X3
	MOVO X4, 5*16(Stack) // Save X4
	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
	MOVO 5*16(Stack), X4 // Restore X4

	// Reload the input state; X3 is stepped once per block from here on.
	MOVO 0*16(Stack), X0
	MOVO 1*16(Stack), X1
	MOVO 2*16(Stack), X2
	MOVO 3*16(Stack), X3
	PADDQ 4*16(Stack), X3

	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $192, Dst
	ADDQ $192, Src
	SUBQ $192, Len

	// The fourth block (X4-X7) is only partly needed if Len < 64.
	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JLE GENERATE_KEYSTREAM_64
	CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream.
	JLE GENERATE_KEYSTREAM_128
	CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
	JG GENERATE_KEYSTREAM_256

	// Three blocks: X12-X15 (X3), X8-X11 (X3 + one), X4-X7 (X3 + 2*one).
	// X0 is passed as the last CHACHA_QROUND_SSE2 argument and reloaded from
	// slot 0 afterwards.
GENERATE_KEYSTREAM_192:
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

CHACHA_LOOP_192:
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_192

	MOVO 0*16(Stack), X0 // Restore X0
	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $128, Dst
	ADDQ $128, Src
	SUBQ $128, Len

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JLE GENERATE_KEYSTREAM_64

	// Two blocks: X8-X11 (X3), X4-X7 (X3 + one). X12 is free and is passed
	// as the last CHACHA_QROUND_SSE2 argument.
GENERATE_KEYSTREAM_128:
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

CHACHA_LOOP_128:
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_128

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream

	// One block in X4-X7; X8 is passed as the last CHACHA_QROUND_SSE2
	// argument.
GENERATE_KEYSTREAM_64:
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVQ Rounds, Tmp0

CHACHA_LOOP_64:
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_64

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.

	// Reached only with 0 < Len < 64 and the pending block in X4-X7. The
	// whole block goes to *block; FINALIZE then consumes Len bytes of it.
	// Len is copied to Tmp0 so that the return value survives the countdown.
BUFFER_KEYSTREAM:
	MOVOU X4, 0*16(Buffer)
	MOVOU X5, 1*16(Buffer)
	MOVOU X6, 2*16(Buffer)
	MOVOU X7, 3*16(Buffer)
	MOVQ Len, Tmp0
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

DONE:
	MOVQ SavedSP, Stack // Restore stack pointer
	MOVOU X3, 3*16(State)
	MOVQ Len, ret+72(FP)
	RET

// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
//
// xorKeyStreamSSSE3 has the same contract and control flow as
// xorKeyStreamSSE2: it XORs len(src) bytes of src with generated keystream
// into dst (whose length is not read), stores a partly used last block to
// *block, writes the advanced row 3 back to *state and returns the number of
// bytes taken from that stored block (0 if none). rounds must be positive
// and even.
//
// The difference is the quarter round macro, CHACHA_QROUND_SSSE3, which takes
// two extra operands: the ·rol16 and ·rol8 constants. In the 256 byte path
// they are passed as memory operands (no register is free); in the smaller
// paths they are loaded into otherwise unused registers.
//
// Stack slots (relative to the aligned Stack pointer):
//   0*16..3*16  copy of state rows X0..X3 (slot 3 is refreshed on every
//               pass through the 256 byte path)
//   4*16        ·one
//   5*16        spill slot
//   6*16        ·rol16
//   7*16        ·rol8
// Frame: 8 slots * 16 bytes + 16 bytes alignment slack = 144 bytes.
TEXT ·xorKeyStreamSSSE3(SB), 4, $144-80
	MOVQ dst_base+0(FP), Dst
	MOVQ src_base+24(FP), Src
	MOVQ block+48(FP), Buffer
	MOVQ state+56(FP), State
	MOVQ rounds+64(FP), Rounds
	MOVQ src_len+32(FP), Len

	MOVOU 0*16(State), X0
	MOVOU 1*16(State), X1
	MOVOU 2*16(State), X2
	MOVOU 3*16(State), X3

	// Align the scratch area to 16 bytes; SP is restored at DONE before the
	// result is stored through FP.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JZ DONE

	MOVOU ·one<>(SB), X4
	MOVOU ·rol16<>(SB), X5
	MOVOU ·rol8<>(SB), X6
	MOVO X0, 0*16(Stack)
	MOVO X1, 1*16(Stack)
	MOVO X2, 2*16(Stack)
	MOVO X3, 3*16(Stack)
	MOVO X4, 4*16(Stack)
	MOVO X5, 6*16(Stack)
	MOVO X6, 7*16(Stack)

	// Pick the smallest variant that covers the remaining input.
	CMPQ Len, $64
	JLE GENERATE_KEYSTREAM_64
	CMPQ Len, $128
	JLE GENERATE_KEYSTREAM_128
	CMPQ Len, $192
	JLE GENERATE_KEYSTREAM_192

	// Four blocks per pass: X0-X3 (X3), X12-X15 (X3 + 1*one),
	// X8-X11 (X3 + 2*one), X4-X7 (X3 + 3*one).
GENERATE_KEYSTREAM_256:
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	PADDQ 4*16(Stack), X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X15, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO X3, 3*16(Stack) // Save X3

	// The register passed as fifth macro argument is parked in slot 5 and
	// reloaded afterwards, because all 16 XMM registers hold block state.
CHACHA_LOOP_256:
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_256

	// Add the saved input state to the first block and use it at offset 0.
	PADDL 0*16(Stack), X0
	PADDL 1*16(Stack), X1
	PADDL 2*16(Stack), X2
	PADDL 3*16(Stack), X3
	MOVO X4, 5*16(Stack) // Save X4
	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
	MOVO 5*16(Stack), X4 // Restore X4

	// Reload the input state; X3 is stepped once per block from here on.
	MOVO 0*16(Stack), X0
	MOVO 1*16(Stack), X1
	MOVO 2*16(Stack), X2
	MOVO 3*16(Stack), X3
	PADDQ 4*16(Stack), X3

	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $192, Dst
	ADDQ $192, Src
	SUBQ $192, Len

	// The fourth block (X4-X7) is only partly needed if Len < 64.
	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JLE GENERATE_KEYSTREAM_64
	CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream.
	JLE GENERATE_KEYSTREAM_128
	CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
	JG GENERATE_KEYSTREAM_256

	// Three blocks: X12-X15 (X3), X8-X11 (X3 + one), X4-X7 (X3 + 2*one).
	// X0, X1 and X2 are reused for the macro's extra operands once their
	// contents have been copied; all three are reloaded from slots 0-2 after
	// the loop.
GENERATE_KEYSTREAM_192:
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO 6*16(Stack), X1 // Load 16 bit rotate-left constant
	MOVO 7*16(Stack), X2 // Load 8 bit rotate-left constant

CHACHA_LOOP_192:
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_192

	MOVO 0*16(Stack), X0 // Restore X0
	MOVO 1*16(Stack), X1 // Restore X1
	MOVO 2*16(Stack), X2 // Restore X2
	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $128, Dst
	ADDQ $128, Src
	SUBQ $128, Len

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JLE GENERATE_KEYSTREAM_64

	// Two blocks: X8-X11 (X3), X4-X7 (X3 + one). X12-X14 are free and carry
	// the macro's extra operands.
GENERATE_KEYSTREAM_128:
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO 6*16(Stack), X13 // Load 16 bit rotate-left constant
	MOVO 7*16(Stack), X14 // Load 8 bit rotate-left constant

CHACHA_LOOP_128:
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_128

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream

	// One block in X4-X7; X8-X10 carry the macro's extra operands.
GENERATE_KEYSTREAM_64:
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVQ Rounds, Tmp0

	MOVO 6*16(Stack), X9 // Load 16 bit rotate-left constant
	MOVO 7*16(Stack), X10 // Load 8 bit rotate-left constant

CHACHA_LOOP_64:
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_64

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.

	// Reached only with 0 < Len < 64 and the pending block in X4-X7. The
	// whole block goes to *block; FINALIZE then consumes Len bytes of it.
	// Len is copied to Tmp0 so that the return value survives the countdown.
BUFFER_KEYSTREAM:
	MOVOU X4, 0*16(Buffer)
	MOVOU X5, 1*16(Buffer)
	MOVOU X6, 2*16(Buffer)
	MOVOU X7, 3*16(Buffer)
	MOVQ Len, Tmp0
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

DONE:
	MOVQ SavedSP, Stack // Restore stack pointer
	MOVOU X3, 3*16(State)
	MOVQ Len, ret+72(FP)
	RET

// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
//
// xorKeyStreamAVX has the same contract, control flow and stack layout as
// xorKeyStreamSSSE3: it XORs len(src) bytes of src with generated keystream
// into dst (whose length is not read), stores a partly used last block to
// *block, writes the advanced row 3 back to *state and returns the number of
// bytes taken from that stored block (0 if none). rounds must be positive
// and even.
//
// It uses the VEX-encoded three-operand instruction forms and the
// CHACHA_QROUND_AVX, CHACHA_SHUFFLE_AVX and XOR_AVX macros. Only the XMM
// registers X0-X15 are referenced in this function.
//
// Stack slots (relative to the aligned Stack pointer):
//   0*16..3*16  copy of state rows X0..X3 (slot 3 is refreshed on every
//               pass through the 256 byte path)
//   4*16        ·one
//   5*16        spill slot
//   6*16        ·rol16
//   7*16        ·rol8
// Frame: 8 slots * 16 bytes + 16 bytes alignment slack = 144 bytes.
TEXT ·xorKeyStreamAVX(SB), 4, $144-80
	MOVQ dst_base+0(FP), Dst
	MOVQ src_base+24(FP), Src
	MOVQ block+48(FP), Buffer
	MOVQ state+56(FP), State
	MOVQ rounds+64(FP), Rounds
	MOVQ src_len+32(FP), Len

	VMOVDQU 0*16(State), X0
	VMOVDQU 1*16(State), X1
	VMOVDQU 2*16(State), X2
	VMOVDQU 3*16(State), X3

	// Align the scratch area to 16 bytes for the VMOVDQA accesses below; SP
	// is restored at DONE before the result is stored through FP.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JZ DONE

	VMOVDQU ·one<>(SB), X4
	VMOVDQU ·rol16<>(SB), X5
	VMOVDQU ·rol8<>(SB), X6
	VMOVDQA X0, 0*16(Stack)
	VMOVDQA X1, 1*16(Stack)
	VMOVDQA X2, 2*16(Stack)
	VMOVDQA X3, 3*16(Stack)
	VMOVDQA X4, 4*16(Stack)
	VMOVDQA X5, 6*16(Stack)
	VMOVDQA X6, 7*16(Stack)

	// Pick the smallest variant that covers the remaining input.
	CMPQ Len, $64
	JLE GENERATE_KEYSTREAM_64
	CMPQ Len, $128
	JLE GENERATE_KEYSTREAM_128
	CMPQ Len, $192
	JLE GENERATE_KEYSTREAM_192

	// Four blocks per pass: X0-X3 (X3), X12-X15 (X3 + 1*one),
	// X8-X11 (X3 + 2*one), X4-X7 (X3 + 3*one).
GENERATE_KEYSTREAM_256:
	VMOVDQA X0, X12
	VMOVDQA X1, X13
	VMOVDQA X2, X14
	VMOVDQA X3, X15
	VPADDQ 4*16(Stack), X15, X15
	VMOVDQA X0, X8
	VMOVDQA X1, X9
	VMOVDQA X2, X10
	VMOVDQA X15, X11
	VPADDQ 4*16(Stack), X11, X11
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X11, X7
	VPADDQ 4*16(Stack), X7, X7
	MOVQ Rounds, Tmp0

	VMOVDQA X3, 3*16(Stack) // Save X3

	// The register passed as fifth macro argument is parked in slot 5 and
	// reloaded afterwards, because all 16 XMM registers hold block state.
CHACHA_LOOP_256:
	VMOVDQA X4, 5*16(Stack)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X4
	VMOVDQA X0, 5*16(Stack)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X0
	CHACHA_SHUFFLE_AVX(X1, X2, X3)
	CHACHA_SHUFFLE_AVX(X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	VMOVDQA X4, 5*16(Stack)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X4
	VMOVDQA X0, 5*16(Stack)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X0
	CHACHA_SHUFFLE_AVX(X3, X2, X1)
	CHACHA_SHUFFLE_AVX(X15, X14, X13)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_256

	// Add the saved input state to the first block and use it at offset 0.
	VPADDD 0*16(Stack), X0, X0
	VPADDD 1*16(Stack), X1, X1
	VPADDD 2*16(Stack), X2, X2
	VPADDD 3*16(Stack), X3, X3
	VMOVDQA X4, 5*16(Stack) // Save X4
	XOR_AVX(Dst, Src, 0, X0, X1, X2, X3, X4)
	VMOVDQA 5*16(Stack), X4 // Restore X4

	// Reload the input state; X3 is stepped once per block from here on.
	VMOVDQA 0*16(Stack), X0
	VMOVDQA 1*16(Stack), X1
	VMOVDQA 2*16(Stack), X2
	VMOVDQA 3*16(Stack), X3
	VPADDQ 4*16(Stack), X3, X3

	VPADDD X0, X12, X12
	VPADDD X1, X13, X13
	VPADDD X2, X14, X14
	VPADDD X3, X15, X15
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X8, X8
	VPADDD X1, X9, X9
	VPADDD X2, X10, X10
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_AVX(Dst, Src, 128, X8, X9, X10, X11, X0)
	VMOVDQA 0*16(Stack), X0 // Restore X0
	ADDQ $192, Dst
	ADDQ $192, Src
	SUBQ $192, Len

	// The fourth block (X4-X7) is only partly needed if Len < 64.
	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JLE GENERATE_KEYSTREAM_64
	CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream.
	JLE GENERATE_KEYSTREAM_128
	CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
	JG GENERATE_KEYSTREAM_256

	// Three blocks: X12-X15 (X3), X8-X11 (X3 + one), X4-X7 (X3 + 2*one).
	// X0, X1 and X2 are reused for the macro's extra operands once their
	// contents have been copied; all three are reloaded from slots 0-2 after
	// the loop.
GENERATE_KEYSTREAM_192:
	VMOVDQA X0, X12
	VMOVDQA X1, X13
	VMOVDQA X2, X14
	VMOVDQA X3, X15
	VMOVDQA X0, X8
	VMOVDQA X1, X9
	VMOVDQA X2, X10
	VMOVDQA X3, X11
	VPADDQ 4*16(Stack), X11, X11
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X11, X7
	VPADDQ 4*16(Stack), X7, X7
	MOVQ Rounds, Tmp0

	VMOVDQA 6*16(Stack), X1 // Load 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X2 // Load 8 bit rotate-left constant

CHACHA_LOOP_192:
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_AVX(X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_AVX(X15, X14, X13)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_192

	VMOVDQA 0*16(Stack), X0 // Restore X0
	VMOVDQA 1*16(Stack), X1 // Restore X1
	VMOVDQA 2*16(Stack), X2 // Restore X2
	VPADDD X0, X12, X12
	VPADDD X1, X13, X13
	VPADDD X2, X14, X14
	VPADDD X3, X15, X15
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X8, X8
	VPADDD X1, X9, X9
	VPADDD X2, X10, X10
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_AVX(Dst, Src, 64, X8, X9, X10, X11, X0)
	VMOVDQA 0*16(Stack), X0 // Restore X0
	ADDQ $128, Dst
	ADDQ $128, Src
	SUBQ $128, Len

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JLE GENERATE_KEYSTREAM_64

	// Two blocks: X8-X11 (X3), X4-X7 (X3 + one). X12-X14 are free and carry
	// the macro's extra operands.
GENERATE_KEYSTREAM_128:
	VMOVDQA X0, X8
	VMOVDQA X1, X9
	VMOVDQA X2, X10
	VMOVDQA X3, X11
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X3, X7
	VPADDQ 4*16(Stack), X7, X7
	MOVQ Rounds, Tmp0

	VMOVDQA 6*16(Stack), X13 // Load 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X14 // Load 8 bit rotate-left constant

CHACHA_LOOP_128:
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_128

	VPADDD X0, X8, X8
	VPADDD X1, X9, X9
	VPADDD X2, X10, X10
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream

	// One block in X4-X7; X8-X10 carry the macro's extra operands.
GENERATE_KEYSTREAM_64:
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X3, X7
	MOVQ Rounds, Tmp0

	VMOVDQA 6*16(Stack), X9 // Load 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X10 // Load 8 bit rotate-left constant

CHACHA_LOOP_64:
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_64

	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	CMPQ Len, $64
	JL BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.

	// Reached only with 0 < Len < 64 and the pending block in X4-X7. The
	// whole block goes to *block; FINALIZE then consumes Len bytes of it.
	// Len is copied to Tmp0 so that the return value survives the countdown.
BUFFER_KEYSTREAM:
	VMOVDQU X4, 0*16(Buffer)
	VMOVDQU X5, 1*16(Buffer)
	VMOVDQU X6, 2*16(Buffer)
	VMOVDQU X7, 3*16(Buffer)
	MOVQ Len, Tmp0
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

DONE:
	MOVQ SavedSP, Stack // Restore stack pointer
	VMOVDQU X3, 3*16(State)
	VZEROUPPER // leave the upper vector state clean for non-VEX code
	MOVQ Len, ret+72(FP)
	RET

#undef Dst
#undef Src
#undef Len
#undef Rounds
#undef Buffer
#undef State
#undef Stack
#undef SavedSP
#undef Tmp0
#undef Tmp1
#undef Tmp2