// Origin: github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp2_g1_amd64.s
//go:build !(purego || plugin)

#include "textflag.h"

/* ---------------------------------------*/
// Register aliases shared by every routine in this file.
//   mul0 / mul1 (AX / DX): MULQ takes its implicit operand from AX and leaves the
//                          product in DX:AX; MULXQ takes its implicit operand from DX.
//   acc0 .. acc7         : 64-bit limbs of the working accumulator
//                          (acc4..acc7 carry the 256-bit argument / result).
//   t0 .. t3             : second 256-bit operand, also used as scratch.
//   hlp (BP)             : one-limb scratch register.
// NOTE(review): the routines that use hlp declare an $8 frame, presumably so the
// assembler preserves BP around them - confirm.
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) - (t3, t2, t1, t0)
//
// The raw 256-bit difference is kept in acc0..acc3 while ·p2 is added to
// acc4..acc7; the raw difference is restored when the subtraction did not borrow.
// Clobbers mul0, acc0..acc3 and the flags. t0..t3 are only read.
TEXT gfpSubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0 // mul0 = 0 when there was no borrow, all ones otherwise

	// Keep the uncorrected difference
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	// Speculatively add ·p2 back
	ADDQ ·p2+0(SB), acc4
	ADCQ ·p2+8(SB), acc5
	ADCQ ·p2+16(SB), acc6
	ADCQ ·p2+24(SB), acc7
	ANDQ $1, mul0 // ZF = 1 exactly when the subtraction did not borrow

	// CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET

/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) * (t3, t2, t1, t0)
// t0, t1 will be overwritten after this function call
//
// When ·supportADX is non-zero the MULXQ/ADCXQ/ADOXQ path is used, otherwise
// control jumps to the MULQ-based path at noAdxMul. Both paths have the same shape:
//   1. 4x4 limb product into T = [acc7 .. acc0];
//   2. four reduction steps, each multiplying the lowest live limb by ·np and
//      adding that multiple of ·p2 to T;
//   3. the high half is added in and ·p2 is conditionally subtracted once.
// NOTE(review): this has the shape of a word-by-word Montgomery reduction, so the
// operands and result are presumably in Montgomery form - confirm against the
// definitions of ·np / ·p2 and the Go callers.
// Clobbers mul0, mul1, acc0..acc3, hlp, t0 (plus t1 on the ADX path) and flags.
TEXT gfpMulInternal(SB),NOSPLIT,$8
	CMPB ·supportADX(SB), $0
	JE noAdxMul

	// [t3, t2, t1, t0] * acc4
	MOVQ acc4, mul1
	MULXQ t0, acc0, acc1

	MULXQ t1, mul0, acc2
	ADDQ mul0, acc1

	MULXQ t2, mul0, acc3
	ADCQ mul0, acc2

	MULXQ t3, mul0, acc4
	ADCQ mul0, acc3
	ADCQ $0, acc4

	// [t3, t2, t1, t0] * acc5
	MOVQ acc5, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc1
	ADCQ hlp, acc2

	// MULXQ leaves the flags alone, so the carry of the previous ADCQ is
	// folded into the fresh high limb (hlp) by the ADCQ $0 that follows.
	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc2
	ADCQ hlp, acc3

	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4

	MULXQ t3, mul0, acc5
	ADCQ $0, acc5
	ADDQ mul0, acc4
	ADCQ $0, acc5

	// [t3, t2, t1, t0] * acc6
	MOVQ acc6, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc2
	ADCQ hlp, acc3

	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4

	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc4
	ADCQ hlp, acc5

	MULXQ t3, mul0, acc6
	ADCQ $0, acc6
	ADDQ mul0, acc5
	ADCQ $0, acc6

	// [t3, t2, t1, t0] * acc7
	MOVQ acc7, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4

	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc4
	ADCQ hlp, acc5

	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc5
	ADCQ hlp, acc6

	MULXQ t3, mul0, acc7
	ADCQ $0, acc7
	ADDQ mul0, acc6
	ADCQ $0, acc7

	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
	// First reduction step
	XORQ t1, t1 // t1 = 0 for the chain tails below; also clears CF and OF
	MOVQ acc0, mul1
	MULXQ ·np+0x00(SB), mul1, mul0 // mul1 = low limb of acc0 * np; mul0 is overwritten next

	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc0 // acc0 is overwritten below; only the OF carry of this sum is used

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc1

	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc2

	MULXQ ·p2+0x18(SB), mul0, acc0
	ADCXQ t0, mul0
	ADOXQ mul0, acc3
	ADCXQ t1, acc0
	ADOXQ t1, acc0

	// Second reduction step
	MOVQ acc1, mul1
	MULXQ ·np+0x00(SB), mul1, mul0

	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc1

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc2

	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc3

	MULXQ ·p2+0x18(SB), mul0, acc1
	ADCXQ t0, mul0
	ADOXQ mul0, acc0
	ADCXQ t1, acc1
	ADOXQ t1, acc1

	// Third reduction step
	MOVQ acc2, mul1
	MULXQ ·np+0x00(SB), mul1, mul0

	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc2

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc3

	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc0

	MULXQ ·p2+0x18(SB), mul0, acc2
	ADCXQ t0, mul0
	ADOXQ mul0, acc1
	ADCXQ t1, acc2
	ADOXQ t1, acc2

	// Last reduction step
	MOVQ acc3, mul1
	MULXQ ·np+0x00(SB), mul1, mul0

	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc3

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc0

	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc1

	MULXQ ·p2+0x18(SB), mul0, acc3
	ADCXQ t0, mul0
	ADOXQ mul0, acc2
	ADCXQ t1, acc3
	ADOXQ t1, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET

// Fallback taken when ·supportADX is zero: same steps using MULQ (DX:AX = AX * operand).
noAdxMul:
	// [t3, t2, t1, t0] * acc4
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	// [t3, t2, t1, t0] * acc5
	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp // hlp carries the high limb into the next column

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	// [t3, t2, t1, t0] * acc6
	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	// [t3, t2, t1, t0] * acc7
	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
	// First reduction step
	MOVQ acc0, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp // hlp = low limb of acc0 * np, the multiplier for this step

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc0, acc0 // acc0 is recycled as the new top limb of the window

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ mul1, acc0

	// Second reduction step
	MOVQ acc1, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc1, acc1

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ mul1, acc1

	// Third reduction step
	MOVQ acc2, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc2, acc2

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ mul1, acc2

	// Last reduction step
	MOVQ acc3, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc3, acc3

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ mul1, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET

/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) ^ 2
//
// Same structure as gfpMulInternal, but the cross products are computed once and
// doubled ("*2"), then the four squares of the individual limbs ("Missing
// products") are added. The 512-bit value lives in [t3, t2, t1, t0, acc3..acc0]
// before the four reduction steps. ·supportADX selects the MULXQ/ADCXQ/ADOXQ
// path; otherwise control jumps to noAdxSqr.
// Clobbers mul0, mul1, acc0..acc3, t0..t3, hlp and flags; the result is left in
// acc4..acc7.
TEXT gfpSqrInternal(SB),NOSPLIT,$8
	CMPB ·supportADX(SB), $0
	JE noAdxSqr

	XORQ t3, t3 // t3 = 0; also clears CF and OF before the ADCX/ADOX chains

	// [acc7, acc6, acc5] * acc4
	MOVQ acc4, mul1
	MULXQ acc5, acc1, acc2

	MULXQ acc6, mul0, acc3
	ADOXQ mul0, acc2

	MULXQ acc7, mul0, t0
	ADOXQ mul0, acc3
	ADOXQ t3, t0

	// [acc7, acc6] * acc5
	MOVQ acc5, mul1
	MULXQ acc6, mul0, hlp
	ADOXQ mul0, acc3

	MULXQ acc7, mul0, t1
	ADCXQ hlp, mul0
	ADOXQ mul0, t0
	ADCXQ t3, t1

	// acc7 * acc6
	MOVQ acc6, mul1
	MULXQ acc7, mul0, t2
	ADOXQ mul0, t1
	ADOXQ t3, t2

	// *2
	ADOXQ acc1, acc1
	ADOXQ acc2, acc2
	ADOXQ acc3, acc3
	ADOXQ t0, t0
	ADOXQ t1, t1
	ADOXQ t2, t2
	ADOXQ t3, t3

	// Missing products
	MOVQ acc4, mul1
	MULXQ mul1, acc0, acc4
	ADCXQ acc4, acc1

	MOVQ acc5, mul1
	MULXQ mul1, mul0, acc4
	ADCXQ mul0, acc2
	ADCXQ acc4, acc3

	MOVQ acc6, mul1
	MULXQ mul1, mul0, acc4
	ADCXQ mul0, t0
	ADCXQ acc4, t1

	MOVQ acc7, mul1
	MULXQ mul1, mul0, acc4
	ADCXQ mul0, t2
	ADCXQ acc4, t3

	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	// First reduction step
	XORQ acc5, acc5 // acc5 = 0 for the chain tails below; also clears CF and OF
	MOVQ acc0, mul1
	MULXQ ·np+0x00(SB), mul1, mul0 // mul1 = low limb of acc0 * np; mul0 is overwritten next

	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc0 // acc0 is overwritten below; only the OF carry of this sum is used

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc1

	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc2

	MULXQ ·p2+0x18(SB), mul0, acc0
	ADCXQ acc4, mul0
	ADOXQ mul0, acc3
	ADCXQ acc5, acc0
	ADOXQ acc5, acc0

	// Second reduction step
	MOVQ acc1, mul1
	MULXQ ·np+0x00(SB), mul1, mul0

	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc1

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc2

	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc3

	MULXQ ·p2+0x18(SB), mul0, acc1
	ADCXQ acc4, mul0
	ADOXQ mul0, acc0
	ADCXQ acc5, acc1
	ADOXQ acc5, acc1

	// Third reduction step
	MOVQ acc2, mul1
	MULXQ ·np+0x00(SB), mul1, mul0

	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc2

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc3

	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc0

	MULXQ ·p2+0x18(SB), mul0, acc2
	ADCXQ acc4, mul0
	ADOXQ mul0, acc1
	ADCXQ acc5, acc2
	ADOXQ acc5, acc2

	// Last reduction step
	MOVQ acc3, mul1
	MULXQ ·np+0x00(SB), mul1, mul0

	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc3

	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc0

	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc1

	MULXQ ·p2+0x18(SB), mul0, acc3
	ADCXQ acc4, mul0
	ADOXQ mul0, acc2
	ADCXQ acc5, acc3
	ADOXQ acc5, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET

// Fallback taken when ·supportADX is zero: same steps using MULQ (DX:AX = AX * operand).
noAdxSqr:
	// Cross products acc4*acc5, acc4*acc6, acc4*acc7, acc5*acc6, acc5*acc7, acc6*acc7
	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4 // acc4 is reused as the carry limb between the squares

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	// First reduction step
	MOVQ acc0, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp // hlp = low limb of acc0 * np, the multiplier for this step

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc5 // acc5 carries the high limb into the next column
	XORQ acc0, acc0 // acc0 is recycled as the new top limb of the window

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ mul1, acc0

	// Second reduction step
	MOVQ acc1, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc5
	XORQ acc1, acc1

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ mul1, acc1

	// Third reduction step
	MOVQ acc2, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc5
	XORQ acc2, acc2

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ mul1, acc2

	// Last reduction step
	MOVQ acc3, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp

	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc5
	XORQ acc3, acc3

	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ mul1, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET

/* ---------------------------------------*/
// (t3, t2, t1, t0) = 2(acc7, acc6, acc5, acc4)
// gfpMulBy2Inline side effects: acc4..acc7 are doubled in place (they end up
// holding the low 256 bits of the doubled value, before the conditional
// subtraction of ·p2); mul0 and the flags are clobbered.
#define gfpMulBy2Inline \
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ ·p2+0(SB), t0;\
	SBBQ ·p2+8(SB), t1;\
	SBBQ ·p2+16(SB), t2;\
	SBBQ ·p2+24(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

// (acc7, acc6, acc5, acc4) = 2(acc7, acc6, acc5, acc4)
// Same computation as gfpMulBy2Inline with the roles swapped: the result stays in
// acc4..acc7 and t0..t3 are overwritten with the doubled value taken before the
// subtraction of ·p2. mul0 and the flags are clobbered.
#define gfpMulBy2Inline2 \
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ ·p2+0(SB), acc4;\
	SBBQ ·p2+8(SB), acc5;\
	SBBQ ·p2+16(SB), acc6;\
	SBBQ ·p2+24(SB), acc7;\
	SBBQ $0, mul0;\
	CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS t1, acc5;\
	CMOVQCS t2, acc6;\
	CMOVQCS t3, acc7;

/* ---------------------------------------*/
// (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0)
// The sum is accumulated in acc4..acc7 first, so those registers are left holding
// the low 256 bits of the sum before the conditional subtraction of ·p2.
// mul0 and the flags are clobbered.
#define gfpAddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ ·p2+0(SB), t0;\
	SBBQ ·p2+8(SB), t1;\
	SBBQ ·p2+16(SB), t2;\
	SBBQ ·p2+24(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

/* ---------------------------------------*/
// Four-limb load/store helpers. src / dst is the name of a one-argument offset
// macro (for example a stack-slot macro taking a byte offset); it is invoked with
// offsets 0, 8, 16 and 24.
//   LDacc: load acc4..acc7 from src
//   LDt  : load t0..t3 from src
//   ST   : store acc4..acc7 to dst
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
// STt  : store t0..t3 to the four 8-byte slots addressed through the offset macro dst
// acc2t: copy acc4..acc7 into t0..t3
// t2acc: copy t0..t3 into acc4..acc7
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7

/* ---------------------------------------*/
// Stack slots (32 bytes each, addressed from SP) used by gfp2Mul and gfp2MulU.
// axin/ayin and bxin/byin hold copies of the two inputs (bytes 0..31 and 32..63
// of each), tmp0/tmp1/cxout are temporaries, rptr is the saved result pointer.
#define axin(off) (32*0 + off)(SP)
#define ayin(off) (32*1 + off)(SP)
#define bxin(off) (32*2 + off)(SP)
#define byin(off) (32*3 + off)(SP)
#define tmp0(off) (32*4 + off)(SP)
#define tmp1(off) (32*5 + off)(SP)
#define cxout(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

// gfp2Mul(res, in1, in2): three pointer arguments at res+0(FP), in1+8(FP), in2+16(FP).
// With a = *in1, b = *in2, x = bytes 0..31 and y = bytes 32..63, and every
// operation performed by the gfp*Internal / gfp*Inline helpers, it stores
//   res.x = (a.x + a.y)(b.x + b.y) - a.y*b.y - a.x*b.x
//   res.y = a.y*b.y - 2*a.x*b.x
// Both inputs are copied to the stack before anything is written through res.
TEXT ·gfp2Mul(SB),NOSPLIT,$256-24
	// Move input to stack in order to free registers
	MOVQ res+0(FP), CX
	MOVQ in1+8(FP), AX
	MOVQ in2+16(FP), BX

	MOVOU (16*0)(AX), X0
	MOVOU (16*1)(AX), X1
	MOVOU (16*2)(AX), X2
	MOVOU (16*3)(AX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, bxin(16*0)
	MOVOU X1, bxin(16*1)
	MOVOU X2, byin(16*0)
	MOVOU X3, byin(16*1)

	// Store pointer to result
	MOVQ CX, rptr

	// tmp0 = a.y * b.y
	LDacc (ayin)
	LDt (byin)
	CALL gfpMulInternal(SB)
	ST (tmp0)

	// tmp1 = a.x * b.x
	LDacc (axin)
	LDt (bxin)
	CALL gfpMulInternal(SB)
	ST (tmp1)

	// cxout = a.x + a.y
	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cxout)

	// t = b.x + b.y
	LDacc (bxin)
	LDt (byin)
	gfpAddInline

	// acc = (a.x + a.y)(b.x + b.y) - tmp0 - tmp1
	LDacc (cxout)
	CALL gfpMulInternal(SB)
	LDt (tmp0)
	CALL gfpSubInternal(SB)
	LDt (tmp1)
	CALL gfpSubInternal(SB)

	// Store x
	MOVQ rptr, AX
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	// acc = tmp0 - tmp1 - tmp1. t0..t3 still hold tmp1 from the load above
	// (gfpSubInternal does not write them), so the reload stays commented out.
	LDacc (tmp0)
	//LDt (tmp1)
	CALL gfpSubInternal(SB)
	CALL gfpSubInternal(SB)
	MOVQ rptr, AX
	///////////////////////
	// Clear the saved result pointer - presumably so no stale pointer stays in
	// the frame; confirm.
	MOVQ $0, rptr
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)

	RET

// gfp2MulU(res, in1, in2): same argument layout as gfp2Mul. With
//   X = (a.x + a.y)(b.x + b.y) - a.y*b.y - a.x*b.x
//   Y = a.y*b.y - 2*a.x*b.x
// (the two values gfp2Mul stores as x and y) it stores
//   res.y = 0 - 2*X
//   res.x = Y
TEXT ·gfp2MulU(SB),NOSPLIT,$256-24
	// Move input to stack in order to free registers
	MOVQ res+0(FP), CX
	MOVQ in1+8(FP), AX
	MOVQ in2+16(FP), BX

	MOVOU (16*0)(AX), X0
	MOVOU (16*1)(AX), X1
	MOVOU (16*2)(AX), X2
	MOVOU (16*3)(AX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, bxin(16*0)
	MOVOU X1, bxin(16*1)
	MOVOU X2, byin(16*0)
	MOVOU X3, byin(16*1)

	// Store pointer to result
	MOVQ CX, rptr

	// tmp0 = a.y * b.y
	LDacc (ayin)
	LDt (byin)
	CALL gfpMulInternal(SB)
	ST (tmp0)

	// tmp1 = a.x * b.x
	LDacc (axin)
	LDt (bxin)
	CALL gfpMulInternal(SB)
	ST (tmp1)

	// cxout = a.x + a.y
	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cxout)

	// t = b.x + b.y
	LDacc (bxin)
	LDt (byin)
	gfpAddInline

	// acc = X = (a.x + a.y)(b.x + b.y) - tmp0 - tmp1
	LDacc (cxout)
	CALL gfpMulInternal(SB)
	LDt (tmp0)
	CALL gfpSubInternal(SB)
	LDt (tmp1)
	CALL gfpSubInternal(SB)
	// t = 2*X, then acc = 0 - t
	gfpMulBy2Inline
	XORQ acc4, acc4
	XORQ acc5, acc5
	XORQ acc6, acc6
	XORQ acc7, acc7
	CALL gfpSubInternal(SB)

	// Store y
	MOVQ rptr, AX
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)

	// acc = tmp0 - tmp1 - tmp1
	LDacc (tmp0)
	LDt (tmp1)
	CALL gfpSubInternal(SB)
	CALL gfpSubInternal(SB)
	MOVQ rptr, AX
	///////////////////////
	// Clear the saved result pointer - presumably so no stale pointer stays in
	// the frame; confirm.
	MOVQ $0, rptr
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	RET

#undef axin
#undef ayin
#undef bxin
#undef byin
#undef tmp0
#undef tmp1
#undef cxout
#undef rptr

// gfp2MulU1(res, in1): two pointer arguments at res+0(FP) and in1+8(FP).
// With a = *in1, x = bytes 0..31 and y = bytes 32..63, it stores
//   res.x = a.y        (copied through X2/X3)
//   res.y = 0 - 2*a.x
// This routine has no stack frame ($0); everything stays in registers.
TEXT ·gfp2MulU1(SB),NOSPLIT,$0-16
	// Load the arguments: mul1 (DX) = res, AX = in1
	MOVQ res+0(FP), mul1
	MOVQ in1+8(FP), AX

	//LDacc (axin)
	MOVOU (16*2)(AX), X2
	MOVOU (16*3)(AX), X3
	MOVQ (16*0 + 8*0)(AX), acc4
	MOVQ (16*0 + 8*1)(AX), acc5
	MOVQ (16*0 + 8*2)(AX), acc6
	MOVQ (16*0 + 8*3)(AX), acc7

	// t = 2*a.x, then acc = 0 - t. Neither gfpMulBy2Inline nor gfpSubInternal
	// writes mul1, so the result pointer survives.
	gfpMulBy2Inline
	XORQ acc4, acc4
	XORQ acc5, acc5
	XORQ acc6, acc6
	XORQ acc7, acc7
	CALL gfpSubInternal(SB)

	MOVOU X2, (16*0)(mul1)
	MOVOU X3, (16*1)(mul1)
	MOVQ acc4, (16*2 + 8*0)(mul1)
	MOVQ acc5, (16*2 + 8*1)(mul1)
	MOVQ acc6, (16*2 + 8*2)(mul1)
	MOVQ acc7, (16*2 + 8*3)(mul1)

	RET

// Stack slots (32 bytes each, addressed from SP) used by gfp2Square and
// gfp2SquareU: axin/ayin hold the copied input, cxout/cyout are temporaries,
// rptr is the saved result pointer.
#define axin(off) (32*0 + off)(SP)
#define ayin(off) (32*1 + off)(SP)
#define cxout(off) (32*2 + off)(SP)
#define cyout(off) (32*3 + off)(SP)
#define rptr (32*4)(SP)

// gfp2Square(res, in1): two pointer arguments at res+0(FP) and in1+8(FP).
// With a = *in1, x = bytes 0..31 and y = bytes 32..63, it stores
//   res.y = (a.y - 2*a.x)(a.x + a.y) + a.x*a.y
//   res.x = 2*a.x*a.y
TEXT ·gfp2Square(SB),NOSPLIT,$160-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)

	// Store pointer to result
	MOVQ AX, rptr

	// cyout = a.x + a.y
	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cyout)

	// acc = a.y - 2*a.x
	LDacc (axin)
	gfpMulBy2Inline
	LDacc (ayin)
	CALL gfpSubInternal(SB)

	// cyout = (a.y - 2*a.x)(a.x + a.y)
	LDt (cyout)
	CALL gfpMulInternal(SB)
	ST (cyout)

	// cxout = a.x * a.y
	LDacc (axin)
	LDt (ayin)
	CALL gfpMulInternal(SB)
	ST (cxout)

	// t = cxout + cyout
	LDt (cyout)
	gfpAddInline
	// Store y
	MOVQ rptr, AX
	MOVQ t0, (16*2 + 8*0)(AX)
	MOVQ t1, (16*2 + 8*1)(AX)
	MOVQ t2, (16*2 + 8*2)(AX)
	MOVQ t3, (16*2 + 8*3)(AX)

	// t = 2 * cxout
	LDacc (cxout)
	gfpMulBy2Inline
	// Store x
	MOVQ rptr, AX
	///////////////////////
	// Clear the saved result pointer - presumably so no stale pointer stays in
	// the frame; confirm.
	MOVQ $0, rptr
	MOVQ t0, (16*0 + 8*0)(AX)
	MOVQ t1, (16*0 + 8*1)(AX)
	MOVQ t2, (16*0 + 8*2)(AX)
	MOVQ t3, (16*0 + 8*3)(AX)

	RET

// gfp2SquareU(res, in1): same argument layout as gfp2Square. It stores
//   res.x = (a.y - 2*a.x)(a.x + a.y) + a.x*a.y
//   res.y = 0 - 4*a.x*a.y
TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)

	// Store pointer to result
	MOVQ AX, rptr

	// cxout = a.x + a.y
	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cxout)

	// t = 2*a.x
	LDacc (axin)
	gfpMulBy2Inline

	// acc = a.y - 2*a.x
	LDacc (ayin)
	CALL gfpSubInternal(SB)

	// cxout = (a.y - 2*a.x)(a.x + a.y)
	LDt (cxout)
	CALL gfpMulInternal(SB)
	ST (cxout)

	// cyout = a.x * a.y
	LDacc (axin)
	LDt (ayin)
	CALL gfpMulInternal(SB)
	ST (cyout)

	// t = cyout + cxout
	LDt (cxout)
	gfpAddInline

	// Store x
	MOVQ rptr, AX
	MOVQ t0, (16*0 + 8*0)(AX)
	MOVQ t1, (16*0 + 8*1)(AX)
	MOVQ t2, (16*0 + 8*2)(AX)
	MOVQ t3, (16*0 + 8*3)(AX)

	// t = 4 * cyout, then acc = 0 - t
	LDacc (cyout)
	gfpMulBy2Inline2
	gfpMulBy2Inline
	XORQ acc4, acc4
	XORQ acc5, acc5
	XORQ acc6, acc6
	XORQ acc7, acc7
	CALL gfpSubInternal(SB)

	// Store y
	MOVQ rptr, AX
	///////////////////////
	// Clear the saved result pointer - presumably so no stale pointer stays in
	// the frame; confirm.
	MOVQ $0, rptr
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)

	RET

#undef axin
#undef ayin
#undef cxout
#undef cyout
#undef rptr

/* ---------------------------------------*/
// Stack slots (32 bytes each, addressed from SP) used by curvePointDoubleComplete:
// xin/yin/zin hold the copied input, xout/yout/zout and tmp0/tmp2 are
// temporaries, rptr is the saved result pointer.
#define xin(off) (32*0 + off)(SP)
#define yin(off) (32*1 + off)(SP)
#define zin(off) (32*2 + off)(SP)

#define xout(off) (32*3 + off)(SP)
#define yout(off) (32*4 + off)(SP)
#define zout(off) (32*5 + off)(SP)
#define tmp0(off) (32*6 + off)(SP)
#define tmp2(off) (32*7 + off)(SP)
#define rptr (32*8)(SP)

// func curvePointDoubleComplete(c, a *curvePoint)
//
// Reads 96 bytes from the input pointer (X at byte 0, Y at 32, Z at 64) and
// writes X3, Y3, Z3 to the same offsets of the result pointer, following the
// step comments below. The input is copied to the stack before the first store.
// The "3b * t2" step is computed as 16*t2 - t2 (four doublings, then one
// subtraction), i.e. the factor 15 is hard-wired.
TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, xin(16*0)
	MOVOU X1, xin(16*1)
	MOVOU X2, yin(16*0)
	MOVOU X3, yin(16*1)
	MOVOU X4, zin(16*0)
	MOVOU X5, zin(16*1)

	// Store pointer to result
	MOVQ AX, rptr

	LDacc (yin)
	CALL gfpSqrInternal(SB) // t0 := Y^2
	ST (tmp0)

	gfpMulBy2Inline2 // Z3 := t0 + t0
	gfpMulBy2Inline2 // Z3 := Z3 + Z3
	gfpMulBy2Inline // Z3 := Z3 + Z3
	STt (zout)

	LDacc (zin)
	CALL gfpSqrInternal(SB) // t2 := Z^2
	// Park t2 in acc0..acc3, which the doubling macros do not touch
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	CALL gfpSubInternal(SB) // t2 := 3b * t2
	ST (tmp2)
	LDt (zout)
	CALL gfpMulInternal(SB) // X3 := Z3 * t2
	ST (xout)

	LDacc (tmp0)
	LDt (tmp2)
	gfpAddInline // Y3 := t0 + t2
	STt (yout)

	LDacc (yin)
	LDt (zin)
	CALL gfpMulInternal(SB) // t1 := YZ
	LDt (zout)
	CALL gfpMulInternal(SB) // Z3 := t1 * Z3
	MOVQ rptr, AX
	// Store Z
	MOVQ acc4, (16*4 + 8*0)(AX)
	MOVQ acc5, (16*4 + 8*1)(AX)
	MOVQ acc6, (16*4 + 8*2)(AX)
	MOVQ acc7, (16*4 + 8*3)(AX)

	LDacc (tmp2)
	gfpMulBy2Inline
	LDacc (tmp2)
	gfpAddInline // t2 := t2 + t2 + t2
	LDacc (tmp0)
	CALL gfpSubInternal(SB) // t0 := t0 - t2
	ST (tmp0)
	LDt (yout)
	CALL gfpMulInternal(SB) // Y3 = t0 * Y3
	LDt (xout)
	gfpAddInline // Y3 := X3 + Y3
	MOVQ rptr, AX
	// Store y
	MOVQ t0, (16*2 + 8*0)(AX)
	MOVQ t1, (16*2 + 8*1)(AX)
	MOVQ t2, (16*2 + 8*2)(AX)
	MOVQ t3, (16*2 + 8*3)(AX)

	LDacc (xin)
	LDt (yin)
	CALL gfpMulInternal(SB) // t1 := XY
	LDt (tmp0)
	CALL gfpMulInternal(SB) // X3 := t0 * t1
	gfpMulBy2Inline // X3 := X3 + X3
	MOVQ rptr, AX
	MOVQ $0, rptr
	// Store x
	MOVQ t0, (16*0 + 8*0)(AX)
	MOVQ t1, (16*0 + 8*1)(AX)
	MOVQ t2, (16*0 + 8*2)(AX)
	MOVQ t3, (16*0 + 8*3)(AX)

	RET

#undef xin
#undef yin
#undef zin
#undef xout
#undef yout
#undef zout
#undef tmp0
#undef tmp2
#undef rptr

/* ---------------------------------------*/
// Stack slots (32 bytes each, addressed from SP) used by
// curvePointAddCompleteInline: x1in..z1in and x2in..z2in are expected to hold
// the two input points, xout/yout/zout and tmp0..tmp4 are temporaries, rptr
// holds the result pointer.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)
#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)
#define tmp0(off) (32*9 + off)(SP)
#define tmp1(off) (32*10 + off)(SP)
#define tmp2(off) (32*11 + off)(SP)
#define tmp3(off) (32*12 + off)(SP)
#define tmp4(off) (32*13 + off)(SP)
#define rptr (32*14)(SP)

// curvePointAddCompleteInline: point addition on the operands already stored in
// the x1in..z2in slots, following the step comments on each line. X3, Y3 and Z3
// are written to byte offsets 0, 32 and 64 of the pointer saved in rptr, and
// rptr is zeroed before the final store. As in curvePointDoubleComplete, each
// "3b * v" step is computed as 16*v - v.
// The macro body ends at the first line without a trailing backslash, so the
// blank line after the last store must be kept.
#define curvePointAddCompleteInline \
	LDacc (x1in) \
	LDt (x2in) \
	CALL gfpMulInternal(SB) \ // t0 := X1X2
	ST (tmp0) \
	LDacc (y1in) \
	LDt (y2in) \
	CALL gfpMulInternal(SB) \ // t1 := Y1Y2
	ST (tmp1) \
	LDacc (z1in) \
	LDt (z2in) \
	CALL gfpMulInternal(SB) \ // t2 := Z1Z2
	ST (tmp2) \
	\
	LDacc (x1in) \
	LDt (y1in) \
	gfpAddInline \
	STt (tmp3) \ // t3 := X1 + Y1
	LDacc (x2in) \
	LDt (y2in) \
	gfpAddInline \
	LDacc (tmp3) \
	CALL gfpMulInternal(SB) \ // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
	ST (tmp3) \
	LDacc (tmp0) \
	LDt (tmp1) \
	gfpAddInline \
	LDacc (tmp3) \
	CALL gfpSubInternal(SB) \ // t3 := t3 - t4 = X1Y2 + X2Y1
	ST (tmp3) \
	\
	LDacc (y1in) \
	LDt (z1in) \
	gfpAddInline \ // t4 := Y1 + Z1
	STt (tmp4) \
	LDacc (y2in) \
	LDt (z2in) \
	gfpAddInline \
	LDacc (tmp4) \
	CALL gfpMulInternal(SB) \ // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
	ST (tmp4) \
	LDacc (tmp1) \
	LDt (tmp2) \
	gfpAddInline \
	LDacc (tmp4) \
	CALL gfpSubInternal(SB) \ // t4 := t4 - X3 = Y1Z2 + Y2Z1
	ST (tmp4) \
	\
	LDacc (z1in) \
	LDt (x1in) \
	gfpAddInline \ // X3 := X1 + Z1
	STt (xout) \
	LDacc (z2in) \
	LDt (x2in) \
	gfpAddInline \
	LDacc (xout) \
	CALL gfpMulInternal(SB) \ // X3 := X3 * Y3
	ST (xout) \
	LDacc (tmp0) \
	LDt (tmp2) \
	gfpAddInline \
	LDacc (xout) \
	CALL gfpSubInternal(SB) \ // Y3 := X3 - Y3 = X1Z2 + X2Z1
	ST (yout) \
	\
	LDacc (tmp0) \
	gfpMulBy2Inline \
	LDacc (tmp0) \
	gfpAddInline \ // t0 := t0 + t0 + t0 = 3X1X2
	STt (tmp0) \
	\
	LDacc (tmp2) \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	LDt (tmp2) \
	CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2
	ST (tmp2) \
	\
	LDt (tmp1) \
	gfpAddInline \ // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
	STt (zout) \
	\
	LDacc (tmp1) \
	LDt (tmp2) \
	CALL gfpSubInternal(SB) \ // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
	ST (tmp1) \
	\
	LDacc (yout) \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	LDt (yout) \
	CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
	ST (yout) \
	\
	LDt (tmp4) \
	CALL gfpMulInternal(SB) \ // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
	ST (xout) \
	\
	LDacc (tmp1) \
	LDt (tmp3) \
	CALL gfpMulInternal(SB) \ // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
	LDt (xout) \
	CALL gfpSubInternal(SB) \ // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
	MOVQ rptr, AX \
	\// Store x
	MOVQ acc4, (16*0 + 8*0)(AX) \
	MOVQ acc5, (16*0 + 8*1)(AX) \
	MOVQ acc6, (16*0 + 8*2)(AX) \
	MOVQ acc7, (16*0 + 8*3)(AX) \
	\
	LDacc (yout) \
	LDt (tmp0) \
	CALL gfpMulInternal(SB) \ // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
	ST (yout) \
	\
	LDacc (tmp1) \
	LDt (zout) \
	CALL gfpMulInternal(SB) \ // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
	LDt (yout) \
	gfpAddInline \ // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
	MOVQ rptr, AX \
	\// Store y
	MOVQ t0, (16*2 + 8*0)(AX) \
	MOVQ t1, (16*2 + 8*1)(AX) \
	MOVQ t2, (16*2 + 8*2)(AX) \
	MOVQ t3, (16*2 + 8*3)(AX) \
	\
	LDacc (tmp0) \
	LDt (tmp3) \
	CALL gfpMulInternal(SB) \ // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
	ST (tmp0) \
	LDacc (zout) \
	LDt (tmp4) \
	CALL gfpMulInternal(SB) \ // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
	LDt (tmp0) \
	gfpAddInline \ // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
	MOVQ rptr, AX \
	MOVQ $0, rptr \
	\// Store z
	MOVQ t0, (16*4 + 8*0)(AX) \
	MOVQ t1, (16*4 + 8*1)(AX) \
	MOVQ t2, (16*4 + 8*2)(AX) \
	MOVQ t3, (16*4 + 8*3)(AX) \

// func curvePointAddComplete(c, a, b *curvePoint)
TEXT ·curvePointAddComplete(SB),0,$480-24
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ pointadd_avx2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU
(16*1)(CX), X1 1682 MOVOU (16*2)(CX), X2 1683 MOVOU (16*3)(CX), X3 1684 MOVOU (16*4)(CX), X4 1685 MOVOU (16*5)(CX), X5 1686 1687 MOVOU X0, x2in(16*0) 1688 MOVOU X1, x2in(16*1) 1689 MOVOU X2, y2in(16*0) 1690 MOVOU X3, y2in(16*1) 1691 MOVOU X4, z2in(16*0) 1692 MOVOU X5, z2in(16*1) 1693 // Store pointer to result 1694 MOVQ AX, rptr 1695 1696 curvePointAddCompleteInline 1697 1698 RET 1699 1700 pointadd_avx2: 1701 VMOVDQU (32*0)(BX), Y0 1702 VMOVDQU (32*1)(BX), Y1 1703 VMOVDQU (32*2)(BX), Y2 1704 1705 VMOVDQU Y0, x1in(32*0) 1706 VMOVDQU Y1, y1in(32*0) 1707 VMOVDQU Y2, z1in(32*0) 1708 1709 VMOVDQU (32*0)(CX), Y0 1710 VMOVDQU (32*1)(CX), Y1 1711 VMOVDQU (32*2)(CX), Y2 1712 1713 VMOVDQU Y0, x2in(32*0) 1714 VMOVDQU Y1, y2in(32*0) 1715 VMOVDQU Y2, z2in(32*0) 1716 1717 // Store pointer to result 1718 MOVQ AX, rptr 1719 curvePointAddCompleteInline 1720 1721 VZEROUPPER 1722 RET 1723 1724 #undef x1in 1725 #undef y1in 1726 #undef z1in 1727 #undef x2in 1728 #undef y2in 1729 #undef z2in 1730 #undef xout 1731 #undef yout 1732 #undef zout 1733 #undef tmp0 1734 #undef tmp1 1735 #undef tmp2 1736 #undef tmp3 1737 #undef tmp4 1738 #undef rptr 1739 1740 /* ---------------------------------------*/ 1741 /* 1742 // gfpIsZero returns 1 in AX if [acc4..acc7] represents zero and zero 1743 // otherwise. It writes to [acc4..acc7], t0 and t1. 1744 TEXT gfpIsZero(SB),NOSPLIT,$0 1745 // AX contains a flag that is set if the input is zero. 1746 XORQ AX, AX 1747 MOVQ $1, t1 1748 1749 // Check whether [acc4..acc7] are all zero. 1750 MOVQ acc4, t0 1751 ORQ acc5, t0 1752 ORQ acc6, t0 1753 ORQ acc7, t0 1754 1755 // Set the zero flag if so. (CMOV of a constant to a register doesn't 1756 // appear to be supported in Go. Thus t1 = 1.) 1757 CMOVQEQ t1, AX 1758 1759 // XOR [acc4..acc7] with P and compare with zero again. 
	XORQ ·p2+0(SB), acc4
	XORQ ·p2+8(SB), acc5
	XORQ ·p2+16(SB), acc6
	XORQ ·p2+24(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET

#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off)    (32*9 + off)(SP)
#define u2(off)    (32*10 + off)(SP)
#define s1(off)    (32*11 + off)(SP)
#define s2(off)    (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off)     (32*15 + off)(SP)
#define r(off)     (32*16 + off)(SP)
#define hsqr(off)  (32*17 + off)(SP)
#define rsqr(off)  (32*18 + off)(SP)
#define hcub(off)  (32*19 + off)(SP)
#define rptr       (32*20)(SP)
#define points_eq  (32*20+8)(SP)

#define curvePointAddInline \
	\ // Begin point add
	LDacc (z2in)            \
	CALL gfpSqrInternal(SB) \ // z2^2
	ST (z2sqr)              \
	LDt (z2in)              \
	CALL gfpMulInternal(SB) \ // z2^3
	LDt (y1in)              \
	CALL gfpMulInternal(SB) \ // s1 = z2^3*y1
	ST (s1)                 \
	                        \
	LDacc (z1in)            \
	CALL gfpSqrInternal(SB) \ // z1^2
	ST (z1sqr)              \
	LDt (z1in)              \
	CALL gfpMulInternal(SB) \ // z1^3
	LDt (y2in)              \
	CALL gfpMulInternal(SB) \ // s2 = z1^3*y2
	ST (s2)                 \
	                        \
	LDt (s1)                \
	CALL gfpSubInternal(SB) \ // r = s2 - s1
	ST (r)                  \
	CALL gfpIsZero(SB)      \
	MOVQ AX, points_eq      \
	                        \
	LDacc (z2sqr)           \
	LDt (x1in)              \
	CALL gfpMulInternal(SB) \ // u1 = x1 * z2^2
	ST (u1)                 \
	LDacc (z1sqr)           \
	LDt (x2in)              \
	CALL gfpMulInternal(SB) \ // u2 = x2 * z1^2
	ST (u2)                 \
	                        \
	LDt (u1)                \
	CALL gfpSubInternal(SB) \ // h = u2 - u1
	ST (h)                  \
	CALL gfpIsZero(SB)      \
	ANDQ points_eq, AX      \
	MOVQ AX, points_eq      \
	                        \
	LDacc (r)               \
	CALL gfpSqrInternal(SB) \ // rsqr = r^2
	ST (rsqr)               \
	                        \
	LDacc (h)               \
	CALL gfpSqrInternal(SB) \ // hsqr = h^2
	ST (hsqr)               \
	                        \
	LDt (h)                 \
	CALL gfpMulInternal(SB) \ // hcub = h^3
	ST (hcub)               \
	                        \
	LDt (s1)                \
	CALL gfpMulInternal(SB) \
	ST (s2)                 \
	                        \
	LDacc (z1in)            \
	LDt (z2in)              \
	CALL gfpMulInternal(SB) \ // z1 * z2
	LDt (h)                 \
	CALL gfpMulInternal(SB) \ // z1 * z2 * h
	ST (zout)               \
	                        \
	LDacc (hsqr)            \
	LDt (u1)                \
	CALL gfpMulInternal(SB) \ // h^2 * u1
	ST (u2)                 \
	                        \
	gfpMulBy2Inline         \ // u1 * h^2 * 2, inline
	LDacc (rsqr)            \
	CALL gfpSubInternal(SB) \ // r^2 - u1 * h^2 * 2
	                        \
	LDt (hcub)              \
	CALL gfpSubInternal(SB) \
	ST (xout)               \
	                        \
	MOVQ acc4, t0           \
	MOVQ acc5, t1           \
	MOVQ acc6, t2           \
	MOVQ acc7, t3           \
	LDacc (u2)              \
	CALL gfpSubInternal(SB) \
	                        \
	LDt (r)                 \
	CALL gfpMulInternal(SB) \
	                        \
	LDt (s2)                \
	CALL gfpSubInternal(SB) \
	ST (yout)               \

// func curvePointAdd(c, a, b *curvePoint) int
TEXT ·curvePointAdd(SB),0,$680-32
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr

	curvePointAddInline

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	RET
*/