// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
GLOBL p256p<>(SB), RODATA, $32
GLOBL p256ordK0<>(SB), RODATA, $8
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256one<>(SB), RODATA, $32

/* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    MOVQ res+0(FP), res_ptr
    MOVQ in+24(FP), x_ptr

    MOVQ (8*0)(x_ptr), acc0
    MOVQ (8*1)(x_ptr), acc1
    MOVQ (8*2)(x_ptr), acc2
    MOVQ (8*3)(x_ptr), acc3

    BSWAPQ acc0
    BSWAPQ acc1
    BSWAPQ acc2
    BSWAPQ acc3

    MOVQ acc3, (8*0)(res_ptr)
    MOVQ acc2, (8*1)(res_ptr)
    MOVQ acc1, (8*2)(res_ptr)
    MOVQ acc0, (8*3)(res_ptr)

    RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0, res = b; otherwise res = a
TEXT ·p256MovCond(SB),NOSPLIT,$0
    MOVQ res+0(FP), res_ptr
    MOVQ a+24(FP), x_ptr
    MOVQ b+48(FP), y_ptr
    MOVQ cond+72(FP), X12

    PXOR X13, X13
    PSHUFD $0, X12, X12
    PCMPEQL X13, X12

    MOVOU X12, X0
    MOVOU (16*0)(x_ptr), X6
    PANDN X6, X0
    MOVOU X12, X1
    MOVOU (16*1)(x_ptr), X7
    PANDN X7, X1
    MOVOU X12, X2
    MOVOU (16*2)(x_ptr), X8
    PANDN X8, X2
    MOVOU X12, X3
    MOVOU (16*3)(x_ptr), X9
    PANDN X9, X3
    MOVOU X12, X4
    MOVOU (16*4)(x_ptr), X10
    PANDN X10, X4
    MOVOU X12, X5
    MOVOU (16*5)(x_ptr), X11
    PANDN X11, X5

    MOVOU (16*0)(y_ptr), X6
    MOVOU (16*1)(y_ptr), X7
    MOVOU (16*2)(y_ptr), X8
    MOVOU (16*3)(y_ptr), X9
    MOVOU (16*4)(y_ptr), X10
    MOVOU (16*5)(y_ptr), X11

    PAND X12, X6
    PAND X12, X7
    PAND X12, X8
    PAND X12, X9
    PAND X12, X10
    PAND X12, X11

    PXOR X6, X0
    PXOR X7, X1
    PXOR X8, X2
    PXOR X9, X3
    PXOR X10, X4
    PXOR X11, X5

    MOVOU X0, (16*0)(res_ptr)
    MOVOU X1, (16*1)(res_ptr)
    MOVOU X2, (16*2)(res_ptr)
    MOVOU X3, (16*3)(res_ptr)
    MOVOU X4, (16*4)(res_ptr)
    MOVOU X5, (16*5)(res_ptr)

    RET
/* ---------------------------------------*/
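// Note: p256MovCond above never branches on cond; both inputs are always
// loaded and combined through SSE masks, so the memory access pattern is
// secret-independent. A rough Go sketch of the same semantics (illustration
// only, not part of this package; the explicit branch is just for clarity):
//
//	func movCond(res, a, b []uint64, cond int) {
//		mask := uint64(0) // all zeros selects b
//		if cond != 0 {
//			mask = ^uint64(0) // all ones selects a
//		}
//		for i := range res {
//			res[i] = (a[i] & mask) | (b[i] &^ mask)
//		}
//	}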
// func p256NegCond(val []uint64, cond int)
TEXT ·p256NegCond(SB),NOSPLIT,$0
    MOVQ val+0(FP), res_ptr
    MOVQ cond+24(FP), t0
    // acc = poly
    MOVQ $-1, acc0
    MOVQ p256p<>+0x08(SB), acc1
    MOVQ $-1, acc2
    MOVQ p256p<>+0x18(SB), acc3
    // Load the original value
    MOVQ (8*0)(res_ptr), acc5
    MOVQ (8*1)(res_ptr), x_ptr
    MOVQ (8*2)(res_ptr), y_ptr
    MOVQ (8*3)(res_ptr), t1
    // Speculatively subtract
    SUBQ acc5, acc0
    SBBQ x_ptr, acc1
    SBBQ y_ptr, acc2
    SBBQ t1, acc3
    // If condition is 0, keep original value
    TESTQ t0, t0
    CMOVQEQ acc5, acc0
    CMOVQEQ x_ptr, acc1
    CMOVQEQ y_ptr, acc2
    CMOVQEQ t1, acc3
    // Store result
    MOVQ acc0, (8*0)(res_ptr)
    MOVQ acc1, (8*1)(res_ptr)
    MOVQ acc2, (8*2)(res_ptr)
    MOVQ acc3, (8*3)(res_ptr)

    RET
/* ---------------------------------------*/
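// p256NegCond above conditionally replaces val with p - val. A math/big
// sketch of the intended result (not constant time; the prime is
// reconstructed from the p256p<> limbs above and is an assumption of this
// note):
//
//	var p, _ = new(big.Int).SetString(
//		"fffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff", 16)
//
//	func negCond(val *big.Int, cond int) {
//		if cond != 0 {
//			val.Sub(p, val) // assumes 0 <= val < p
//		}
//	}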
// func p256Sqr(res, in []uint64, n int)
TEXT ·p256Sqr(SB),NOSPLIT,$0
    MOVQ res+0(FP), res_ptr
    MOVQ in+24(FP), x_ptr
    MOVQ n+48(FP), BX

sqrLoop:

    // y[1:] * y[0]
    MOVQ (8*0)(x_ptr), t0

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    MOVQ AX, acc1
    MOVQ DX, acc2

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, acc3

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, acc4
    // y[2:] * y[1]
    MOVQ (8*1)(x_ptr), t0

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, acc5
    // y[3] * y[2]
    MOVQ (8*2)(x_ptr), t0

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc5
    ADCQ $0, DX
    MOVQ DX, y_ptr
    XORQ t1, t1
    // *2
    ADDQ acc1, acc1
    ADCQ acc2, acc2
    ADCQ acc3, acc3
    ADCQ acc4, acc4
    ADCQ acc5, acc5
    ADCQ y_ptr, y_ptr
    ADCQ $0, t1
    // Missing products
    MOVQ (8*0)(x_ptr), AX
    MULQ AX
    MOVQ AX, acc0
    MOVQ DX, t0

    MOVQ (8*1)(x_ptr), AX
    MULQ AX
    ADDQ t0, acc1
    ADCQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t0

    MOVQ (8*2)(x_ptr), AX
    MULQ AX
    ADDQ t0, acc3
    ADCQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, t0

    MOVQ (8*3)(x_ptr), AX
    MULQ AX
    ADDQ t0, acc5
    ADCQ AX, y_ptr
    ADCQ DX, t1
    MOVQ t1, x_ptr
    // First reduction step
    MOVQ acc0, AX
    MOVQ acc0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3
    ADCQ $0, acc0

    SUBQ AX, acc1
    SBBQ DX, acc2
    SBBQ AX, acc3
    SBBQ DX, acc0
    // Second reduction step
    MOVQ acc1, AX
    MOVQ acc1, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc1, acc2
    ADCQ $0, acc3
    ADCQ $0, acc0
    ADCQ $0, acc1

    SUBQ AX, acc2
    SBBQ DX, acc3
    SBBQ AX, acc0
    SBBQ DX, acc1
    // Third reduction step
    MOVQ acc2, AX
    MOVQ acc2, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc2, acc3
    ADCQ $0, acc0
    ADCQ $0, acc1
    ADCQ $0, acc2

    SUBQ AX, acc3
    SBBQ DX, acc0
    SBBQ AX, acc1
    SBBQ DX, acc2
    // Last reduction step
    XORQ t0, t0
    MOVQ acc3, AX
    MOVQ acc3, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc3, acc0
    ADCQ $0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3

    SUBQ AX, acc0
    SBBQ DX, acc1
    SBBQ AX, acc2
    SBBQ DX, acc3

    // Add bits [511:256] of the sqr result
    ADCQ acc4, acc0
    ADCQ acc5, acc1
    ADCQ y_ptr, acc2
    ADCQ x_ptr, acc3
    ADCQ $0, t0

    MOVQ acc0, acc4
    MOVQ acc1, acc5
    MOVQ acc2, y_ptr
    MOVQ acc3, t1
    // Subtract p256
    SUBQ $-1, acc0
    SBBQ p256p<>+0x08(SB), acc1
    SBBQ $-1, acc2
    SBBQ p256p<>+0x018(SB), acc3
    SBBQ $0, t0

    CMOVQCS acc4, acc0
    CMOVQCS acc5, acc1
    CMOVQCS y_ptr, acc2
    CMOVQCS t1, acc3

    MOVQ acc0, (8*0)(res_ptr)
    MOVQ acc1, (8*1)(res_ptr)
    MOVQ acc2, (8*2)(res_ptr)
    MOVQ acc3, (8*3)(res_ptr)
    MOVQ res_ptr, x_ptr
    DECQ BX
    JNE sqrLoop

    RET
/* ---------------------------------------*/
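// The reduction steps in p256Sqr/p256Mul rely on the shape of the SM2 prime,
// p = 2^256 - 2^224 - 2^96 + 2^64 - 1: since p ≡ -1 (mod 2^64), the Montgomery
// factor for each limb is the limb itself, and the multiple of p is folded in
// with 32-bit shifts instead of extra multiplications. Functionally, p256Sqr
// is a Montgomery square repeated n times; a math/big cross-check (sketch
// only, reusing p from the note above):
//
//	var rInv = new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p)
//
//	func montSqr(in *big.Int, n int) *big.Int {
//		out := new(big.Int).Set(in)
//		for i := 0; i < n; i++ {
//			out.Mul(out, out)
//			out.Mul(out, rInv)
//			out.Mod(out, p)
//		}
//		return out
//	}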
// func p256Mul(res, in1, in2 []uint64)
TEXT ·p256Mul(SB),NOSPLIT,$0
    MOVQ res+0(FP), res_ptr
    MOVQ in1+24(FP), x_ptr
    MOVQ in2+48(FP), y_ptr
    // x * y[0]
    MOVQ (8*0)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    MOVQ AX, acc0
    MOVQ DX, acc1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc1
    ADCQ $0, DX
    MOVQ DX, acc2

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, acc3

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, acc4
    XORQ acc5, acc5
    // First reduction step
    MOVQ acc0, AX
    MOVQ acc0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3
    ADCQ acc0, acc4
    ADCQ $0, acc5

    SUBQ AX, acc1
    SBBQ DX, acc2
    SBBQ AX, acc3
    SBBQ DX, acc4
    SBBQ $0, acc5
    XORQ acc0, acc0

    // x * y[1]
    MOVQ (8*1)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc1
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc2
    ADCQ $0, DX
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc3
    ADCQ $0, DX
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ DX, acc5
    ADCQ $0, acc0
    // Second reduction step
    MOVQ acc1, AX
    MOVQ acc1, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc1, acc2
    ADCQ $0, acc3
    ADCQ $0, acc4
    ADCQ acc1, acc5
    ADCQ $0, acc0

    SUBQ AX, acc2
    SBBQ DX, acc3
    SBBQ AX, acc4
    SBBQ DX, acc5
    SBBQ $0, acc0
    XORQ acc1, acc1

    // x * y[2]
    MOVQ (8*2)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc3
    ADCQ $0, DX
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc5
    ADCQ $0, DX
    ADDQ AX, acc5
    ADCQ DX, acc0
    ADCQ $0, acc1
    // Third reduction step
    MOVQ acc2, AX
    MOVQ acc2, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc2, acc3
    ADCQ $0, acc4
    ADCQ $0, acc5
    ADCQ acc2, acc0
    ADCQ $0, acc1

    SUBQ AX, acc3
    SBBQ DX, acc4
    SBBQ AX, acc5
    SBBQ DX, acc0
    SBBQ $0, acc1
    XORQ acc2, acc2
    // x * y[3]
    MOVQ (8*3)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc5
    ADCQ $0, DX
    ADDQ AX, acc5
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc0
    ADCQ $0, DX
    ADDQ AX, acc0
    ADCQ DX, acc1
    ADCQ $0, acc2
    // Last reduction step
    MOVQ acc3, AX
    MOVQ acc3, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc3, acc4
    ADCQ $0, acc5
    ADCQ $0, acc0
    ADCQ acc3, acc1
    ADCQ $0, acc2

    SUBQ AX, acc4
    SBBQ DX, acc5
    SBBQ AX, acc0
    SBBQ DX, acc1
    SBBQ $0, acc2
    // Copy result [255:0]
    MOVQ acc4, x_ptr
    MOVQ acc5, acc3
    MOVQ acc0, t0
    MOVQ acc1, t1
    // Subtract p256
    SUBQ $-1, acc4
    SBBQ p256p<>+0x08(SB), acc5
    SBBQ $-1, acc0
    SBBQ p256p<>+0x018(SB), acc1
    SBBQ $0, acc2

    CMOVQCS x_ptr, acc4
    CMOVQCS acc3, acc5
    CMOVQCS t0, acc0
    CMOVQCS t1, acc1

    MOVQ acc4, (8*0)(res_ptr)
    MOVQ acc5, (8*1)(res_ptr)
    MOVQ acc0, (8*2)(res_ptr)
    MOVQ acc1, (8*3)(res_ptr)

    RET
/* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
TEXT ·p256FromMont(SB),NOSPLIT,$0
    MOVQ res+0(FP), res_ptr
    MOVQ in+24(FP), x_ptr

    MOVQ (8*0)(x_ptr), acc0
    MOVQ (8*1)(x_ptr), acc1
    MOVQ (8*2)(x_ptr), acc2
    MOVQ (8*3)(x_ptr), acc3
    XORQ acc4, acc4

    // Only reduce, no multiplications are needed
    // First stage
    MOVQ acc0, AX
    MOVQ acc0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3
    ADCQ acc0, acc4

    SUBQ AX, acc1
    SBBQ DX, acc2
    SBBQ AX, acc3
    SBBQ DX, acc4
    XORQ acc5, acc5

    // Second stage
    MOVQ acc1, AX
    MOVQ acc1, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc1, acc2
    ADCQ $0, acc3
    ADCQ $0, acc4
    ADCQ acc1, acc5

    SUBQ AX, acc2
    SBBQ DX, acc3
    SBBQ AX, acc4
    SBBQ DX, acc5
    XORQ acc0, acc0
    // Third stage
    MOVQ acc2, AX
    MOVQ acc2, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc2, acc3
    ADCQ $0, acc4
    ADCQ $0, acc5
    ADCQ acc2, acc0

    SUBQ AX, acc3
    SBBQ DX, acc4
    SBBQ AX, acc5
    SBBQ DX, acc0
    XORQ acc1, acc1
    // Last stage
    MOVQ acc3, AX
    MOVQ acc3, DX
    SHLQ $32, AX
    SHRQ $32, DX

    ADDQ acc3, acc4
    ADCQ $0, acc5
    ADCQ $0, acc0
    ADCQ acc3, acc1

    SUBQ AX, acc4
    SBBQ DX, acc5
    SBBQ AX, acc0
    SBBQ DX, acc1

    MOVQ acc4, x_ptr
    MOVQ acc5, acc3
    MOVQ acc0, t0
    MOVQ acc1, t1

    SUBQ $-1, acc4
    SBBQ p256p<>+0x08(SB), acc5
    SBBQ $-1, acc0
    SBBQ p256p<>+0x018(SB), acc1

    CMOVQCS x_ptr, acc4
    CMOVQCS acc3, acc5
    CMOVQCS t0, acc0
    CMOVQCS t1, acc1

    MOVQ acc4, (8*0)(res_ptr)
    MOVQ acc5, (8*1)(res_ptr)
    MOVQ acc0, (8*2)(res_ptr)
    MOVQ acc1, (8*3)(res_ptr)

    RET
/* ---------------------------------------*/
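// p256FromMont is just the four Montgomery reduction stages with no
// multiplication, i.e. res = in * 2^-256 mod p. Equivalent math/big sketch
// (reusing p and rInv from the notes above):
//
//	func fromMont(in *big.Int) *big.Int {
//		out := new(big.Int).Mul(in, rInv)
//		return out.Mod(out, p)
//	}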
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point, table []uint64, idx int)
TEXT ·p256Select(SB),NOSPLIT,$0
    MOVQ idx+48(FP),AX
    MOVQ table+24(FP),DI
    MOVQ point+0(FP),DX

    PXOR X15, X15    // X15 = 0
    PCMPEQL X14, X14 // X14 = -1
    PSUBL X14, X15   // X15 = 1
    MOVL AX, X14
    PSHUFD $0, X14, X14

    PXOR X0, X0
    PXOR X1, X1
    PXOR X2, X2
    PXOR X3, X3
    PXOR X4, X4
    PXOR X5, X5
    MOVQ $16, AX

    MOVOU X15, X13

loop_select:

    MOVOU X13, X12
    PADDL X15, X13
    PCMPEQL X14, X12

    MOVOU (16*0)(DI), X6
    MOVOU (16*1)(DI), X7
    MOVOU (16*2)(DI), X8
    MOVOU (16*3)(DI), X9
    MOVOU (16*4)(DI), X10
    MOVOU (16*5)(DI), X11
    ADDQ $(16*6), DI

    PAND X12, X6
    PAND X12, X7
    PAND X12, X8
    PAND X12, X9
    PAND X12, X10
    PAND X12, X11

    PXOR X6, X0
    PXOR X7, X1
    PXOR X8, X2
    PXOR X9, X3
    PXOR X10, X4
    PXOR X11, X5

    DECQ AX
    JNE loop_select

    MOVOU X0, (16*0)(DX)
    MOVOU X1, (16*1)(DX)
    MOVOU X2, (16*2)(DX)
    MOVOU X3, (16*3)(DX)
    MOVOU X4, (16*4)(DX)
    MOVOU X5, (16*5)(DX)

    RET
/* ---------------------------------------*/
// Constant time point access to base point table.
// func p256SelectBase(point *[12]uint64, table string, idx int)
TEXT ·p256SelectBase(SB),NOSPLIT,$0
    MOVQ idx+24(FP),AX
    MOVQ table+8(FP),DI
    MOVQ point+0(FP),DX

    PXOR X15, X15    // X15 = 0
    PCMPEQL X14, X14 // X14 = -1
    PSUBL X14, X15   // X15 = 1
    MOVL AX, X14
    PSHUFD $0, X14, X14

    PXOR X0, X0
    PXOR X1, X1
    PXOR X2, X2
    PXOR X3, X3
    MOVQ $16, AX

    MOVOU X15, X13

loop_select_base:

    MOVOU X13, X12
    PADDL X15, X13
    PCMPEQL X14, X12

    MOVOU (16*0)(DI), X4
    MOVOU (16*1)(DI), X5
    MOVOU (16*2)(DI), X6
    MOVOU (16*3)(DI), X7

    MOVOU (16*4)(DI), X8
    MOVOU (16*5)(DI), X9
    MOVOU (16*6)(DI), X10
    MOVOU (16*7)(DI), X11

    ADDQ $(16*8), DI

    PAND X12, X4
    PAND X12, X5
    PAND X12, X6
    PAND X12, X7

    MOVOU X13, X12
    PADDL X15, X13
    PCMPEQL X14, X12

    PAND X12, X8
    PAND X12, X9
    PAND X12, X10
    PAND X12, X11

    PXOR X4, X0
    PXOR X5, X1
    PXOR X6, X2
    PXOR X7, X3

    PXOR X8, X0
    PXOR X9, X1
    PXOR X10, X2
    PXOR X11, X3

    DECQ AX
    JNE loop_select_base

    MOVOU X0, (16*0)(DX)
    MOVOU X1, (16*1)(DX)
    MOVOU X2, (16*2)(DX)
    MOVOU X3, (16*3)(DX)

    RET
/* ---------------------------------------*/
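// Both selection routines above scan the whole table and accumulate the
// entry whose counter matches idx through mask operations, so timing and
// memory access do not depend on idx. A Go sketch of p256Select's semantics
// (illustrative only; each table entry is len(point) limbs, one Jacobian
// point):
//
//	func selectPoint(point, table []uint64, idx int) {
//		for i := range point {
//			point[i] = 0
//		}
//		for i := 0; i < 16; i++ {
//			mask := uint64(0)
//			if i+1 == idx { // the branch stands in for the PCMPEQL mask
//				mask = ^uint64(0)
//			}
//			for j := range point {
//				point[j] |= table[i*len(point)+j] & mask
//			}
//		}
//	}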
// func p256OrdMul(res, in1, in2 []uint64)
TEXT ·p256OrdMul(SB),NOSPLIT,$0
    MOVQ res+0(FP), res_ptr
    MOVQ in1+24(FP), x_ptr
    MOVQ in2+48(FP), y_ptr
    // x * y[0]
    MOVQ (8*0)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    MOVQ AX, acc0
    MOVQ DX, acc1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc1
    ADCQ $0, DX
    MOVQ DX, acc2

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, acc3

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, acc4
    XORQ acc5, acc5
    // First reduction step
    MOVQ acc0, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc0
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc1
    ADCQ $0, DX
    ADDQ AX, acc1
    ADCQ DX, acc2
    ADCQ $0, acc3
    ADCQ t0, acc4
    ADCQ $0, acc5

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc2
    SBBQ AX, acc3
    SBBQ DX, acc4
    SBBQ $0, acc5
    // x * y[1]
    MOVQ (8*1)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc1
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc2
    ADCQ $0, DX
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc3
    ADCQ $0, DX
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ DX, acc5
    ADCQ $0, acc0
    // Second reduction step
    MOVQ acc1, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc1
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc2
    ADCQ $0, DX
    ADDQ AX, acc2
    ADCQ DX, acc3
    ADCQ $0, acc4
    ADCQ t0, acc5
    ADCQ $0, acc0

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc3
    SBBQ AX, acc4
    SBBQ DX, acc5
    SBBQ $0, acc0
    // x * y[2]
    MOVQ (8*2)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc3
    ADCQ $0, DX
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc5
    ADCQ $0, DX
    ADDQ AX, acc5
    ADCQ DX, acc0
    ADCQ $0, acc1
    // Third reduction step
    MOVQ acc2, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc3
    ADCQ $0, DX
    ADDQ AX, acc3
    ADCQ DX, acc4
    ADCQ $0, acc5
    ADCQ t0, acc0
    ADCQ $0, acc1

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc4
    SBBQ AX, acc5
    SBBQ DX, acc0
    SBBQ $0, acc1
    // x * y[3]
    MOVQ (8*3)(y_ptr), t0

    MOVQ (8*0)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc5
    ADCQ $0, DX
    ADDQ AX, acc5
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc0
    ADCQ $0, DX
    ADDQ AX, acc0
    ADCQ DX, acc1
    ADCQ $0, acc2
    // Last reduction step
    MOVQ acc3, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ DX, acc5
    ADCQ $0, acc0
    ADCQ t0, acc1
    ADCQ $0, acc2

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc5
    SBBQ AX, acc0
    SBBQ DX, acc1
    SBBQ $0, acc2
    // Copy result [255:0]
    MOVQ acc4, x_ptr
    MOVQ acc5, acc3
    MOVQ acc0, t0
    MOVQ acc1, t1
    // Subtract the order
    SUBQ p256ord<>+0x00(SB), acc4
    SBBQ p256ord<>+0x08(SB), acc5
    SBBQ p256ord<>+0x10(SB), acc0
    SBBQ p256ord<>+0x18(SB), acc1
    SBBQ $0, acc2

    CMOVQCS x_ptr, acc4
    CMOVQCS acc3, acc5
    CMOVQCS t0, acc0
    CMOVQCS t1, acc1

    MOVQ acc4, (8*0)(res_ptr)
    MOVQ acc5, (8*1)(res_ptr)
    MOVQ acc0, (8*2)(res_ptr)
    MOVQ acc1, (8*3)(res_ptr)

    RET
/* ---------------------------------------*/
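// p256OrdMul/p256OrdSqr are Montgomery arithmetic modulo the group order
// rather than the field prime; p256ordK0 plays the usual role of
// -ord^-1 mod 2^64. Functionally (math/big sketch; the order is
// reconstructed from the p256ord<> limbs and R = 2^256 as before):
//
//	var ord, _ = new(big.Int).SetString(
//		"fffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123", 16)
//
//	func ordMul(a, b *big.Int) *big.Int {
//		rInvOrd := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), ord)
//		out := new(big.Int).Mul(a, b)
//		out.Mul(out, rInvOrd)
//		return out.Mod(out, ord)
//	}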
// func p256OrdSqr(res, in []uint64, n int)
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
    MOVQ res+0(FP), res_ptr
    MOVQ in+24(FP), x_ptr
    MOVQ n+48(FP), BX

ordSqrLoop:

    // y[1:] * y[0]
    MOVQ (8*0)(x_ptr), t0

    MOVQ (8*1)(x_ptr), AX
    MULQ t0
    MOVQ AX, acc1
    MOVQ DX, acc2

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, acc3

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, acc4
    // y[2:] * y[1]
    MOVQ (8*1)(x_ptr), t0

    MOVQ (8*2)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ t1, acc4
    ADCQ $0, DX
    ADDQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, acc5
    // y[3] * y[2]
    MOVQ (8*2)(x_ptr), t0

    MOVQ (8*3)(x_ptr), AX
    MULQ t0
    ADDQ AX, acc5
    ADCQ $0, DX
    MOVQ DX, y_ptr
    XORQ t1, t1
    // *2
    ADDQ acc1, acc1
    ADCQ acc2, acc2
    ADCQ acc3, acc3
    ADCQ acc4, acc4
    ADCQ acc5, acc5
    ADCQ y_ptr, y_ptr
    ADCQ $0, t1
    // Missing products
    MOVQ (8*0)(x_ptr), AX
    MULQ AX
    MOVQ AX, acc0
    MOVQ DX, t0

    MOVQ (8*1)(x_ptr), AX
    MULQ AX
    ADDQ t0, acc1
    ADCQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t0

    MOVQ (8*2)(x_ptr), AX
    MULQ AX
    ADDQ t0, acc3
    ADCQ AX, acc4
    ADCQ $0, DX
    MOVQ DX, t0

    MOVQ (8*3)(x_ptr), AX
    MULQ AX
    ADDQ t0, acc5
    ADCQ AX, y_ptr
    ADCQ DX, t1
    MOVQ t1, x_ptr
    // First reduction step
    MOVQ acc0, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0   // Y = t0 = (k0 * acc0) mod 2^64

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
    ADCQ $0, DX   // DX = carry1 + H(t0 * ord0)
    MOVQ DX, t1   // t1 = carry1 + H(t0 * ord0)
    MOVQ t0, acc0

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
    ADCQ $0, DX   // DX = carry2 + H(t0*ord1)

    ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
    ADCQ DX, acc2
    ADCQ $0, acc3
    ADCQ $0, acc0

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc2
    SBBQ AX, acc3
    SBBQ DX, acc0
    // Second reduction step
    MOVQ acc1, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc1
    ADCQ $0, DX
    MOVQ DX, t1
    MOVQ t0, acc1

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc2
    ADCQ $0, DX

    ADDQ AX, acc2
    ADCQ DX, acc3
    ADCQ $0, acc0
    ADCQ $0, acc1

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc3
    SBBQ AX, acc0
    SBBQ DX, acc1
    // Third reduction step
    MOVQ acc2, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc2
    ADCQ $0, DX
    MOVQ DX, t1
    MOVQ t0, acc2

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc3
    ADCQ $0, DX

    ADDQ AX, acc3
    ADCQ DX, acc0
    ADCQ $0, acc1
    ADCQ $0, acc2

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc0
    SBBQ AX, acc1
    SBBQ DX, acc2
    // Last reduction step
    MOVQ acc3, AX
    MULQ p256ordK0<>(SB)
    MOVQ AX, t0

    MOVQ p256ord<>+0x00(SB), AX
    MULQ t0
    ADDQ AX, acc3
    ADCQ $0, DX
    MOVQ DX, t1
    MOVQ t0, acc3

    MOVQ p256ord<>+0x08(SB), AX
    MULQ t0
    ADDQ t1, acc0
    ADCQ $0, DX

    ADDQ AX, acc0
    ADCQ DX, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3

    MOVQ t0, AX
    MOVQ t0, DX
    SHLQ $32, AX
    SHRQ $32, DX

    SUBQ t0, acc1
    SBBQ AX, acc2
    SBBQ DX, acc3

    XORQ t0, t0
    // Add bits [511:256] of the sqr result
    ADCQ acc4, acc0
    ADCQ acc5, acc1
    ADCQ y_ptr, acc2
    ADCQ x_ptr, acc3
    ADCQ $0, t0

    MOVQ acc0, acc4
    MOVQ acc1, acc5
    MOVQ acc2, y_ptr
    MOVQ acc3, t1
    // Subtract the order
    SUBQ p256ord<>+0x00(SB), acc0
    SBBQ p256ord<>+0x08(SB), acc1
    SBBQ p256ord<>+0x10(SB), acc2
    SBBQ p256ord<>+0x18(SB), acc3
    SBBQ $0, t0

    CMOVQCS acc4, acc0
    CMOVQCS acc5, acc1
    CMOVQCS y_ptr, acc2
    CMOVQCS t1, acc3

    MOVQ acc0, (8*0)(res_ptr)
    MOVQ acc1, (8*1)(res_ptr)
    MOVQ acc2, (8*2)(res_ptr)
    MOVQ acc3, (8*3)(res_ptr)
    MOVQ res_ptr, x_ptr
    DECQ BX
    JNE ordSqrLoop

    RET
/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
// sm2P256SubInternal computes [acc4..acc7] = [acc4..acc7] - [t0..t3] mod p.
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
    XORQ mul0, mul0
    SUBQ t0, acc4
    SBBQ t1, acc5
    SBBQ t2, acc6
    SBBQ t3, acc7
    SBBQ $0, mul0

    MOVQ acc4, acc0
    MOVQ acc5, acc1
    MOVQ acc6, acc2
    MOVQ acc7, acc3

    ADDQ $-1, acc4
    ADCQ p256p<>+0x08(SB), acc5
    ADCQ $-1, acc6
    ADCQ p256p<>+0x018(SB), acc7
    ANDQ $1, mul0

    CMOVQEQ acc0, acc4
    CMOVQEQ acc1, acc5
    CMOVQEQ acc2, acc6
    CMOVQEQ acc3, acc7

    RET
/* ---------------------------------------*/
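// The internal routines below use a register-based convention: one operand
// in acc4..acc7, the other in t0..t3, result back in acc4..acc7. The
// subtraction above is the usual subtract-then-conditionally-add-p pattern;
// in plain Go terms (math/big sketch, not constant time, p as defined in the
// note near p256NegCond):
//
//	func subMod(a, b *big.Int) *big.Int { // assumes a, b in [0, p)
//		out := new(big.Int).Sub(a, b)
//		if out.Sign() < 0 {
//			out.Add(out, p)
//		}
//		return out
//	}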
// sm2P256MulInternal computes the Montgomery product
// [acc4..acc7] = [acc4..acc7] * [t0..t3] * 2^-256 mod p.
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
    MOVQ acc4, mul0
    MULQ t0
    MOVQ mul0, acc0
    MOVQ mul1, acc1

    MOVQ acc4, mul0
    MULQ t1
    ADDQ mul0, acc1
    ADCQ $0, mul1
    MOVQ mul1, acc2

    MOVQ acc4, mul0
    MULQ t2
    ADDQ mul0, acc2
    ADCQ $0, mul1
    MOVQ mul1, acc3

    MOVQ acc4, mul0
    MULQ t3
    ADDQ mul0, acc3
    ADCQ $0, mul1
    MOVQ mul1, acc4

    MOVQ acc5, mul0
    MULQ t0
    ADDQ mul0, acc1
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc5, mul0
    MULQ t1
    ADDQ hlp, acc2
    ADCQ $0, mul1
    ADDQ mul0, acc2
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc5, mul0
    MULQ t2
    ADDQ hlp, acc3
    ADCQ $0, mul1
    ADDQ mul0, acc3
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc5, mul0
    MULQ t3
    ADDQ hlp, acc4
    ADCQ $0, mul1
    ADDQ mul0, acc4
    ADCQ $0, mul1
    MOVQ mul1, acc5

    MOVQ acc6, mul0
    MULQ t0
    ADDQ mul0, acc2
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc6, mul0
    MULQ t1
    ADDQ hlp, acc3
    ADCQ $0, mul1
    ADDQ mul0, acc3
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc6, mul0
    MULQ t2
    ADDQ hlp, acc4
    ADCQ $0, mul1
    ADDQ mul0, acc4
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc6, mul0
    MULQ t3
    ADDQ hlp, acc5
    ADCQ $0, mul1
    ADDQ mul0, acc5
    ADCQ $0, mul1
    MOVQ mul1, acc6

    MOVQ acc7, mul0
    MULQ t0
    ADDQ mul0, acc3
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc7, mul0
    MULQ t1
    ADDQ hlp, acc4
    ADCQ $0, mul1
    ADDQ mul0, acc4
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc7, mul0
    MULQ t2
    ADDQ hlp, acc5
    ADCQ $0, mul1
    ADDQ mul0, acc5
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc7, mul0
    MULQ t3
    ADDQ hlp, acc6
    ADCQ $0, mul1
    ADDQ mul0, acc6
    ADCQ $0, mul1
    MOVQ mul1, acc7
    // First reduction step
    MOVQ acc0, mul0
    MOVQ acc0, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3
    ADCQ $0, acc0

    SUBQ mul0, acc1
    SBBQ mul1, acc2
    SBBQ mul0, acc3
    SBBQ mul1, acc0
    // Second reduction step
    MOVQ acc1, mul0
    MOVQ acc1, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc1, acc2
    ADCQ $0, acc3
    ADCQ $0, acc0
    ADCQ $0, acc1

    SUBQ mul0, acc2
    SBBQ mul1, acc3
    SBBQ mul0, acc0
    SBBQ mul1, acc1
    // Third reduction step
    MOVQ acc2, mul0
    MOVQ acc2, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc2, acc3
    ADCQ $0, acc0
    ADCQ $0, acc1
    ADCQ $0, acc2

    SUBQ mul0, acc3
    SBBQ mul1, acc0
    SBBQ mul0, acc1
    SBBQ mul1, acc2
    // Last reduction step
    MOVQ acc3, mul0
    MOVQ acc3, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc3, acc0
    ADCQ $0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3

    SUBQ mul0, acc0
    SBBQ mul1, acc1
    SBBQ mul0, acc2
    SBBQ mul1, acc3
    MOVQ $0, BP
    // Add bits [511:256] of the result
    ADCQ acc0, acc4
    ADCQ acc1, acc5
    ADCQ acc2, acc6
    ADCQ acc3, acc7
    ADCQ $0, hlp
    // Copy result
    MOVQ acc4, acc0
    MOVQ acc5, acc1
    MOVQ acc6, acc2
    MOVQ acc7, acc3
    // Subtract p256
    SUBQ $-1, acc4
    SBBQ p256p<>+0x08(SB), acc5
    SBBQ $-1, acc6
    SBBQ p256p<>+0x018(SB), acc7
    SBBQ $0, hlp
    // If the result of the subtraction is negative, restore the previous result
    CMOVQCS acc0, acc4
    CMOVQCS acc1, acc5
    CMOVQCS acc2, acc6
    CMOVQCS acc3, acc7

    RET
/* ---------------------------------------*/
// sm2P256SqrInternal computes the Montgomery square
// [acc4..acc7] = [acc4..acc7]^2 * 2^-256 mod p.
TEXT sm2P256SqrInternal(SB),NOSPLIT,$8

    MOVQ acc4, mul0
    MULQ acc5
    MOVQ mul0, acc1
    MOVQ mul1, acc2

    MOVQ acc4, mul0
    MULQ acc6
    ADDQ mul0, acc2
    ADCQ $0, mul1
    MOVQ mul1, acc3

    MOVQ acc4, mul0
    MULQ acc7
    ADDQ mul0, acc3
    ADCQ $0, mul1
    MOVQ mul1, t0

    MOVQ acc5, mul0
    MULQ acc6
    ADDQ mul0, acc3
    ADCQ $0, mul1
    MOVQ mul1, hlp

    MOVQ acc5, mul0
    MULQ acc7
    ADDQ hlp, t0
    ADCQ $0, mul1
    ADDQ mul0, t0
    ADCQ $0, mul1
    MOVQ mul1, t1

    MOVQ acc6, mul0
    MULQ acc7
    ADDQ mul0, t1
    ADCQ $0, mul1
    MOVQ mul1, t2
    XORQ t3, t3
    // *2
    ADDQ acc1, acc1
    ADCQ acc2, acc2
    ADCQ acc3, acc3
    ADCQ t0, t0
    ADCQ t1, t1
    ADCQ t2, t2
    ADCQ $0, t3
    // Missing products
    MOVQ acc4, mul0
    MULQ mul0
    MOVQ mul0, acc0
    MOVQ DX, acc4

    MOVQ acc5, mul0
    MULQ mul0
    ADDQ acc4, acc1
    ADCQ mul0, acc2
    ADCQ $0, DX
    MOVQ DX, acc4

    MOVQ acc6, mul0
    MULQ mul0
    ADDQ acc4, acc3
    ADCQ mul0, t0
    ADCQ $0, DX
    MOVQ DX, acc4

    MOVQ acc7, mul0
    MULQ mul0
    ADDQ acc4, t1
    ADCQ mul0, t2
    ADCQ DX, t3
    // First reduction step
    MOVQ acc0, mul0
    MOVQ acc0, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3
    ADCQ $0, acc0

    SUBQ mul0, acc1
    SBBQ mul1, acc2
    SBBQ mul0, acc3
    SBBQ mul1, acc0
    // Second reduction step
    MOVQ acc1, mul0
    MOVQ acc1, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc1, acc2
    ADCQ $0, acc3
    ADCQ $0, acc0
    ADCQ $0, acc1

    SUBQ mul0, acc2
    SBBQ mul1, acc3
    SBBQ mul0, acc0
    SBBQ mul1, acc1
    // Third reduction step
    MOVQ acc2, mul0
    MOVQ acc2, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc2, acc3
    ADCQ $0, acc0
    ADCQ $0, acc1
    ADCQ $0, acc2

    SUBQ mul0, acc3
    SBBQ mul1, acc0
    SBBQ mul0, acc1
    SBBQ mul1, acc2
    // Last reduction step
    MOVQ acc3, mul0
    MOVQ acc3, mul1
    SHLQ $32, mul0
    SHRQ $32, mul1

    ADDQ acc3, acc0
    ADCQ $0, acc1
    ADCQ $0, acc2
    ADCQ $0, acc3

    SUBQ mul0, acc0
    SBBQ mul1, acc1
    SBBQ mul0, acc2
    SBBQ mul1, acc3
    MOVQ $0, BP
    // Add bits [511:256] of the result
    ADCQ acc0, t0
    ADCQ acc1, t1
    ADCQ acc2, t2
    ADCQ acc3, t3
    ADCQ $0, hlp
    // Copy result
    MOVQ t0, acc4
    MOVQ t1, acc5
    MOVQ t2, acc6
    MOVQ t3, acc7
    // Subtract p256
    SUBQ $-1, acc4
    SBBQ p256p<>+0x08(SB), acc5
    SBBQ $-1, acc6
    SBBQ p256p<>+0x018(SB), acc7
    SBBQ $0, hlp
    // If the result of the subtraction is negative, restore the previous result
    CMOVQCS t0, acc4
    CMOVQCS t1, acc5
    CMOVQCS t2, acc6
    CMOVQCS t3, acc7

    RET
/* ---------------------------------------*/
#define p256MulBy2Inline\
    XORQ mul0, mul0;\
    ADDQ acc4, acc4;\
    ADCQ acc5, acc5;\
    ADCQ acc6, acc6;\
    ADCQ acc7, acc7;\
    ADCQ $0, mul0;\
    MOVQ acc4, t0;\
    MOVQ acc5, t1;\
    MOVQ acc6, t2;\
    MOVQ acc7, t3;\
    SUBQ $-1, t0;\
    SBBQ p256p<>+0x08(SB), t1;\
    SBBQ $-1, t2;\
    SBBQ p256p<>+0x018(SB), t3;\
    SBBQ $0, mul0;\
    CMOVQCS acc4, t0;\
    CMOVQCS acc5, t1;\
    CMOVQCS acc6, t2;\
    CMOVQCS acc7, t3;
/* ---------------------------------------*/
#define p256AddInline \
    XORQ mul0, mul0;\
    ADDQ t0, acc4;\
    ADCQ t1, acc5;\
    ADCQ t2, acc6;\
    ADCQ t3, acc7;\
    ADCQ $0, mul0;\
    MOVQ acc4, t0;\
    MOVQ acc5, t1;\
    MOVQ acc6, t2;\
    MOVQ acc7, t3;\
    SUBQ $-1, t0;\
    SBBQ p256p<>+0x08(SB), t1;\
    SBBQ $-1, t2;\
    SBBQ p256p<>+0x018(SB), t3;\
    SBBQ $0, mul0;\
    CMOVQCS acc4, t0;\
    CMOVQCS acc5, t1;\
    CMOVQCS acc6, t2;\
    CMOVQCS acc7, t3;
/* ---------------------------------------*/
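// p256AddInline computes acc + t mod p and leaves the result in t0..t3;
// p256MulBy2Inline is the same with both operands equal to acc. In plain Go
// terms (math/big sketch, not constant time):
//
//	func addMod(a, b *big.Int) *big.Int { // mulBy2 is addMod(a, a)
//		out := new(big.Int).Add(a, b)
//		if out.Cmp(p) >= 0 {
//			out.Sub(out, p)
//		}
//		return out
//	}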
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)
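// The routine below is a mixed Jacobian + affine addition; with (x1, y1, z1)
// in Jacobian coordinates and (x2, y2) affine, the step comments in the code
// correspond to the standard formulas:
//
//	z1z1 = z1^2,  u2 = x2*z1z1,  s2 = y2*z1*z1z1
//	h = u2 - x1,  r = s2 - y1
//	x3 = r^2 - h^3 - 2*x1*h^2
//	y3 = r*(x1*h^2 - x3) - y1*h^3
//	z3 = h*z1
//
// sel == 0 selects the stored in1 instead of the sum, and zero == 0 selects
// (x2, y2, 1) in Montgomery form (p256one), both via constant-time masks.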
// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-96
    // Move input to stack in order to free registers
    MOVQ res+0(FP), AX
    MOVQ in1+24(FP), BX
    MOVQ in2+48(FP), CX
    MOVQ sign+72(FP), DX
    MOVQ sel+80(FP), t1
    MOVQ zero+88(FP), t2

    MOVOU (16*0)(BX), X0
    MOVOU (16*1)(BX), X1
    MOVOU (16*2)(BX), X2
    MOVOU (16*3)(BX), X3
    MOVOU (16*4)(BX), X4
    MOVOU (16*5)(BX), X5

    MOVOU X0, x1in(16*0)
    MOVOU X1, x1in(16*1)
    MOVOU X2, y1in(16*0)
    MOVOU X3, y1in(16*1)
    MOVOU X4, z1in(16*0)
    MOVOU X5, z1in(16*1)

    MOVOU (16*0)(CX), X0
    MOVOU (16*1)(CX), X1

    MOVOU X0, x2in(16*0)
    MOVOU X1, x2in(16*1)
    // Store pointer to result
    MOVQ mul0, rptr
    MOVL t1, sel_save
    MOVL t2, zero_save
    // Negate y2in based on sign
    MOVQ (16*2 + 8*0)(CX), acc4
    MOVQ (16*2 + 8*1)(CX), acc5
    MOVQ (16*2 + 8*2)(CX), acc6
    MOVQ (16*2 + 8*3)(CX), acc7
    MOVQ $-1, acc0
    MOVQ p256p<>+0x08(SB), acc1
    MOVQ $-1, acc2
    MOVQ p256p<>+0x018(SB), acc3
    XORQ mul0, mul0
    // Speculatively subtract
    SUBQ acc4, acc0
    SBBQ acc5, acc1
    SBBQ acc6, acc2
    SBBQ acc7, acc3
    SBBQ $0, mul0
    MOVQ acc0, t0
    MOVQ acc1, t1
    MOVQ acc2, t2
    MOVQ acc3, t3
    // Add in case the operand was > p256
    ADDQ $-1, acc0
    ADCQ p256p<>+0x08(SB), acc1
    ADCQ $-1, acc2
    ADCQ p256p<>+0x018(SB), acc3
    ADCQ $0, mul0
    CMOVQNE t0, acc0
    CMOVQNE t1, acc1
    CMOVQNE t2, acc2
    CMOVQNE t3, acc3
    // If condition is 0, keep original value
    TESTQ DX, DX
    CMOVQEQ acc4, acc0
    CMOVQEQ acc5, acc1
    CMOVQEQ acc6, acc2
    CMOVQEQ acc7, acc3
    // Store result
    MOVQ acc0, y2in(8*0)
    MOVQ acc1, y2in(8*1)
    MOVQ acc2, y2in(8*2)
    MOVQ acc3, y2in(8*3)
    // Begin point add
    LDacc (z1in)
    CALL sm2P256SqrInternal(SB) // z1ˆ2
    ST (z1sqr)

    LDt (x2in)
    CALL sm2P256MulInternal(SB) // x2 * z1ˆ2

    LDt (x1in)
    CALL sm2P256SubInternal(SB) // h = u2 - u1
    ST (h)

    LDt (z1in)
    CALL sm2P256MulInternal(SB) // z3 = h * z1
    ST (zout)

    LDacc (z1sqr)
    CALL sm2P256MulInternal(SB) // z1ˆ3

    LDt (y2in)
    CALL sm2P256MulInternal(SB) // s2 = y2 * z1ˆ3
    ST (s2)

    LDt (y1in)
    CALL sm2P256SubInternal(SB) // r = s2 - s1
    ST (r)

    CALL sm2P256SqrInternal(SB) // rsqr = rˆ2
    ST (rsqr)

    LDacc (h)
    CALL sm2P256SqrInternal(SB) // hsqr = hˆ2
    ST (hsqr)

    LDt (h)
    CALL sm2P256MulInternal(SB) // hcub = hˆ3
    ST (hcub)

    LDt (y1in)
    CALL sm2P256MulInternal(SB) // y1 * hˆ3
    ST (s2)

    LDacc (x1in)
    LDt (hsqr)
    CALL sm2P256MulInternal(SB) // u1 * hˆ2
    ST (h)

    p256MulBy2Inline            // u1 * hˆ2 * 2, inline
    LDacc (rsqr)
    CALL sm2P256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2

    LDt (hcub)
    CALL sm2P256SubInternal(SB)
    ST (xout)

    MOVQ acc4, t0
    MOVQ acc5, t1
    MOVQ acc6, t2
    MOVQ acc7, t3
    LDacc (h)
    CALL sm2P256SubInternal(SB)

    LDt (r)
    CALL sm2P256MulInternal(SB)

    LDt (s2)
    CALL sm2P256SubInternal(SB)
    ST (yout)
    // Load stored values from stack
    MOVQ rptr, AX
    MOVL sel_save, BX
    MOVL zero_save, CX
    // The result is not valid if (sel == 0), conditional choose
    MOVOU xout(16*0), X0
    MOVOU xout(16*1), X1
    MOVOU yout(16*0), X2
    MOVOU yout(16*1), X3
    MOVOU zout(16*0), X4
    MOVOU zout(16*1), X5

    MOVL BX, X6
    MOVL CX, X7

    PXOR X8, X8
    PCMPEQL X9, X9

    PSHUFD $0, X6, X6
    PSHUFD $0, X7, X7

    PCMPEQL X8, X6
    PCMPEQL X8, X7

    MOVOU X6, X15
    PANDN X9, X15

    MOVOU x1in(16*0), X9
    MOVOU x1in(16*1), X10
    MOVOU y1in(16*0), X11
    MOVOU y1in(16*1), X12
    MOVOU z1in(16*0), X13
    MOVOU z1in(16*1), X14

    PAND X15, X0
    PAND X15, X1
    PAND X15, X2
    PAND X15, X3
    PAND X15, X4
    PAND X15, X5

    PAND X6, X9
    PAND X6, X10
    PAND X6, X11
    PAND X6, X12
    PAND X6, X13
    PAND X6, X14

    PXOR X9, X0
    PXOR X10, X1
    PXOR X11, X2
    PXOR X12, X3
    PXOR X13, X4
    PXOR X14, X5
    // Similarly if zero == 0
    PCMPEQL X9, X9
    MOVOU X7, X15
    PANDN X9, X15

    MOVOU x2in(16*0), X9
    MOVOU x2in(16*1), X10
    MOVOU y2in(16*0), X11
    MOVOU y2in(16*1), X12
    MOVOU p256one<>+0x00(SB), X13
    MOVOU p256one<>+0x10(SB), X14

    PAND X15, X0
    PAND X15, X1
    PAND X15, X2
    PAND X15, X3
    PAND X15, X4
    PAND X15, X5

    PAND X7, X9
    PAND X7, X10
    PAND X7, X11
    PAND X7, X12
    PAND X7, X13
    PAND X7, X14

    PXOR X9, X0
    PXOR X10, X1
    PXOR X11, X2
    PXOR X12, X3
    PXOR X13, X4
    PXOR X14, X5
    // Finally output the result
    MOVOU X0, (16*0)(AX)
    MOVOU X1, (16*1)(AX)
    MOVOU X2, (16*2)(AX)
    MOVOU X3, (16*3)(AX)
    MOVOU X4, (16*4)(AX)
    MOVOU X5, (16*5)(AX)
    MOVQ $0, rptr

    RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save

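// sm2P256IsZero below reports 1 when [acc4..acc7] is 0 or equals p, i.e.
// when it represents 0 mod p. In plain Go terms (math/big sketch):
//
//	func isZeroModP(v *big.Int) int { // assumes 0 <= v <= p
//		if v.Sign() == 0 || v.Cmp(p) == 0 {
//			return 1
//		}
//		return 0
//	}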
// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
TEXT sm2P256IsZero(SB),NOSPLIT,$0
    // AX contains a flag that is set if the input is zero.
    XORQ AX, AX
    MOVQ $1, t1

    // Check whether [acc4..acc7] are all zero.
    MOVQ acc4, t0
    ORQ acc5, t0
    ORQ acc6, t0
    ORQ acc7, t0

    // Set the zero flag if so. (CMOV of a constant to a register doesn't
    // appear to be supported in Go. Thus t1 = 1.)
    CMOVQEQ t1, AX

    // XOR [acc4..acc7] with P and compare with zero again.
    XORQ $-1, acc4
    XORQ p256p<>+0x08(SB), acc5
    XORQ $-1, acc6
    XORQ p256p<>+0x018(SB), acc7
    ORQ acc5, acc4
    ORQ acc6, acc4
    ORQ acc7, acc4

    // Set the zero flag if so.
    CMOVQEQ t1, AX
    RET

/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)
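// The full Jacobian addition below follows the add-2007-bl layout referenced
// in the code; the step comments correspond to:
//
//	z1z1 = z1^2,  z2z2 = z2^2
//	u1 = x1*z2z2,  u2 = x2*z1z1
//	s1 = y1*z2*z2z2,  s2 = y2*z1*z1z1
//	h = u2 - u1,  r = s2 - s1
//	x3 = r^2 - h^3 - 2*u1*h^2
//	y3 = r*(u1*h^2 - x3) - s1*h^3
//	z3 = z1*z2*h
//
// The int return value (points_eq) is 1 when both r and h reduce to zero,
// i.e. the two inputs were the same point, so the caller can redo the
// computation with the doubling routine.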
//func p256PointAddAsm(res, in1, in2 []uint64) int
TEXT ·p256PointAddAsm(SB),0,$680-80
    // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
    // Move input to stack in order to free registers
    MOVQ res+0(FP), AX
    MOVQ in1+24(FP), BX
    MOVQ in2+48(FP), CX

    MOVOU (16*0)(BX), X0
    MOVOU (16*1)(BX), X1
    MOVOU (16*2)(BX), X2
    MOVOU (16*3)(BX), X3
    MOVOU (16*4)(BX), X4
    MOVOU (16*5)(BX), X5

    MOVOU X0, x1in(16*0)
    MOVOU X1, x1in(16*1)
    MOVOU X2, y1in(16*0)
    MOVOU X3, y1in(16*1)
    MOVOU X4, z1in(16*0)
    MOVOU X5, z1in(16*1)

    MOVOU (16*0)(CX), X0
    MOVOU (16*1)(CX), X1
    MOVOU (16*2)(CX), X2
    MOVOU (16*3)(CX), X3
    MOVOU (16*4)(CX), X4
    MOVOU (16*5)(CX), X5

    MOVOU X0, x2in(16*0)
    MOVOU X1, x2in(16*1)
    MOVOU X2, y2in(16*0)
    MOVOU X3, y2in(16*1)
    MOVOU X4, z2in(16*0)
    MOVOU X5, z2in(16*1)
    // Store pointer to result
    MOVQ AX, rptr
    // Begin point add
    LDacc (z2in)
    CALL sm2P256SqrInternal(SB) // z2ˆ2
    ST (z2sqr)
    LDt (z2in)
    CALL sm2P256MulInternal(SB) // z2ˆ3
    LDt (y1in)
    CALL sm2P256MulInternal(SB) // s1 = z2ˆ3*y1
    ST (s1)

    LDacc (z1in)
    CALL sm2P256SqrInternal(SB) // z1ˆ2
    ST (z1sqr)
    LDt (z1in)
    CALL sm2P256MulInternal(SB) // z1ˆ3
    LDt (y2in)
    CALL sm2P256MulInternal(SB) // s2 = z1ˆ3*y2
    ST (s2)

    LDt (s1)
    CALL sm2P256SubInternal(SB) // r = s2 - s1
    ST (r)
    CALL sm2P256IsZero(SB)
    MOVQ AX, points_eq

    LDacc (z2sqr)
    LDt (x1in)
    CALL sm2P256MulInternal(SB) // u1 = x1 * z2ˆ2
    ST (u1)
    LDacc (z1sqr)
    LDt (x2in)
    CALL sm2P256MulInternal(SB) // u2 = x2 * z1ˆ2
    ST (u2)

    LDt (u1)
    CALL sm2P256SubInternal(SB) // h = u2 - u1
    ST (h)
    CALL sm2P256IsZero(SB)
    ANDQ points_eq, AX
    MOVQ AX, points_eq

    LDacc (r)
    CALL sm2P256SqrInternal(SB) // rsqr = rˆ2
    ST (rsqr)

    LDacc (h)
    CALL sm2P256SqrInternal(SB) // hsqr = hˆ2
    ST (hsqr)

    LDt (h)
    CALL sm2P256MulInternal(SB) // hcub = hˆ3
    ST (hcub)

    LDt (s1)
    CALL sm2P256MulInternal(SB)
    ST (s2)

    LDacc (z1in)
    LDt (z2in)
    CALL sm2P256MulInternal(SB) // z1 * z2
    LDt (h)
    CALL sm2P256MulInternal(SB) // z1 * z2 * h
    ST (zout)

    LDacc (hsqr)
    LDt (u1)
    CALL sm2P256MulInternal(SB) // hˆ2 * u1
    ST (u2)

    p256MulBy2Inline            // u1 * hˆ2 * 2, inline
    LDacc (rsqr)
    CALL sm2P256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2

    LDt (hcub)
    CALL sm2P256SubInternal(SB)
    ST (xout)

    MOVQ acc4, t0
    MOVQ acc5, t1
    MOVQ acc6, t2
    MOVQ acc7, t3
    LDacc (u2)
    CALL sm2P256SubInternal(SB)

    LDt (r)
    CALL sm2P256MulInternal(SB)

    LDt (s2)
    CALL sm2P256SubInternal(SB)
    ST (yout)

    MOVOU xout(16*0), X0
    MOVOU xout(16*1), X1
    MOVOU yout(16*0), X2
    MOVOU yout(16*1), X3
    MOVOU zout(16*0), X4
    MOVOU zout(16*1), X5
    // Finally output the result
    MOVQ rptr, AX
    MOVQ $0, rptr
    MOVOU X0, (16*0)(AX)
    MOVOU X1, (16*1)(AX)
    MOVOU X2, (16*2)(AX)
    MOVOU X3, (16*3)(AX)
    MOVOU X4, (16*4)(AX)
    MOVOU X5, (16*5)(AX)

    MOVQ points_eq, AX
    MOVQ AX, ret+72(FP)

    RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)
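// The doubling below uses the a = -3 shortcut of the curve equation;
// matching the step comments:
//
//	zsqr = z^2,  m = 3*(x - zsqr)*(x + zsqr)
//	z3 = 2*y*z
//	s = 4*x*y^2,  y' = 8*y^4
//	x3 = m^2 - 2*s
//	y3 = m*(s - x3) - y'
//
// The "Divide by 2" block halves 16*y^4 modulo p by conditionally adding p
// before the right shift when the value is odd.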
//func p256PointDoubleAsm(res, in []uint64)
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
    // Move input to stack in order to free registers
    MOVQ res+0(FP), AX
    MOVQ in+24(FP), BX

    MOVOU (16*0)(BX), X0
    MOVOU (16*1)(BX), X1
    MOVOU (16*2)(BX), X2
    MOVOU (16*3)(BX), X3
    MOVOU (16*4)(BX), X4
    MOVOU (16*5)(BX), X5

    MOVOU X0, x(16*0)
    MOVOU X1, x(16*1)
    MOVOU X2, y(16*0)
    MOVOU X3, y(16*1)
    MOVOU X4, z(16*0)
    MOVOU X5, z(16*1)
    // Store pointer to result
    MOVQ AX, rptr
    // Begin point double
    LDacc (z)
    CALL sm2P256SqrInternal(SB)
    ST (zsqr)

    LDt (x)
    p256AddInline
    STt (m)

    LDacc (z)
    LDt (y)
    CALL sm2P256MulInternal(SB)
    p256MulBy2Inline
    MOVQ rptr, AX
    // Store z
    MOVQ t0, (16*4 + 8*0)(AX)
    MOVQ t1, (16*4 + 8*1)(AX)
    MOVQ t2, (16*4 + 8*2)(AX)
    MOVQ t3, (16*4 + 8*3)(AX)

    LDacc (x)
    LDt (zsqr)
    CALL sm2P256SubInternal(SB)
    LDt (m)
    CALL sm2P256MulInternal(SB)
    ST (m)
    // Multiply by 3
    p256MulBy2Inline
    LDacc (m)
    p256AddInline
    STt (m)
    ////////////////////////
    LDacc (y)
    p256MulBy2Inline
    t2acc
    CALL sm2P256SqrInternal(SB)
    ST (s)
    CALL sm2P256SqrInternal(SB)
    // Divide by 2
    XORQ mul0, mul0
    MOVQ acc4, t0
    MOVQ acc5, t1
    MOVQ acc6, t2
    MOVQ acc7, t3

    ADDQ $-1, acc4
    ADCQ p256p<>+0x08(SB), acc5
    ADCQ $-1, acc6
    ADCQ p256p<>+0x018(SB), acc7
    ADCQ $0, mul0
    TESTQ $1, t0

    CMOVQEQ t0, acc4
    CMOVQEQ t1, acc5
    CMOVQEQ t2, acc6
    CMOVQEQ t3, acc7
    ANDQ t0, mul0

    SHRQ $1, acc5, acc4
    SHRQ $1, acc6, acc5
    SHRQ $1, acc7, acc6
    SHRQ $1, mul0, acc7
    ST (y)
    /////////////////////////
    LDacc (x)
    LDt (s)
    CALL sm2P256MulInternal(SB)
    ST (s)
    p256MulBy2Inline
    STt (tmp)

    LDacc (m)
    CALL sm2P256SqrInternal(SB)
    LDt (tmp)
    CALL sm2P256SubInternal(SB)

    MOVQ rptr, AX
    // Store x
    MOVQ acc4, (16*0 + 8*0)(AX)
    MOVQ acc5, (16*0 + 8*1)(AX)
    MOVQ acc6, (16*0 + 8*2)(AX)
    MOVQ acc7, (16*0 + 8*3)(AX)

    acc2t
    LDacc (s)
    CALL sm2P256SubInternal(SB)

    LDt (m)
    CALL sm2P256MulInternal(SB)

    LDt (y)
    CALL sm2P256SubInternal(SB)
    MOVQ rptr, AX
    // Store y
    MOVQ acc4, (16*2 + 8*0)(AX)
    MOVQ acc5, (16*2 + 8*1)(AX)
    MOVQ acc6, (16*2 + 8*2)(AX)
    MOVQ acc7, (16*2 + 8*3)(AX)
    ///////////////////////
    MOVQ $0, rptr

    RET
/* ---------------------------------------*/