//go:build !purego

// ARM64 (Go assembler syntax) routines for 256-bit modular arithmetic, the
// quadratic-extension helpers gfp2Mul / gfp2MulU / gfp2MulU1 / gfp2Square /
// gfp2SquareU, and the projective point routines curvePointDoubleComplete /
// curvePointAddComplete.
//
// Conventions shared by every routine in this file:
//   - A field element is four 64-bit little-endian limbs (32 bytes).
//   - const3..const0 hold the four limbs loaded from ·p2. The code uses them
//     as the modulus: it is added back after a borrowing subtraction and
//     conditionally subtracted after additions and multiplications.
//   - hlp1 (an alias of res_ptr, R0) holds the word loaded from ·np. The
//     multiply/square kernels use it to derive the per-limb reduction
//     multiplier (presumably -p^-1 mod 2^64; confirm against the Go source).
//     Because it shares R0 with res_ptr, the result pointer may only be
//     loaded into R0 after the last multiplication of a function.
//   - Operands travel in the register quads x3..x0 and y3..y0.

#include "textflag.h"

#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
#define t0 R11
#define t1 R12
#define const0 R13
#define const1 R14
#define const2 R15
#define const3 R16

#define hlp0 R17
#define hlp1 res_ptr

#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

/* ---------------------------------------*/
// gfpSubInternal: (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
//
// In:    x, y quads; const3..const0 = modulus limbs.
// Out:   x quad. If the raw subtraction borrowed, the modulus is added back.
// Keeps: y quad.
// Clobb: acc0-acc7, t0, flags.
TEXT gfpSubInternal(SB),NOSPLIT,$0
	// acc3..acc0 = y - x, t0 = 0 - borrow (all ones when a borrow occurred).
	SUBS x0, y0, acc0
	SBCS x1, y1, acc1
	SBCS x2, y2, acc2
	SBCS x3, y3, acc3
	SBC $0, ZR, t0

	// acc7..acc4 = (y - x) + modulus, used only when the subtraction borrowed.
	ADDS const0, acc0, acc4
	ADCS const1, acc1, acc5
	ADCS const2, acc2, acc6
	ADC const3, acc3, acc7

	// EQ (bit 0 of t0 clear) means no borrow: keep the raw difference.
	ANDS $1, t0
	CSEL EQ, acc0, acc4, x0
	CSEL EQ, acc1, acc5, x1
	CSEL EQ, acc2, acc6, x2
	CSEL EQ, acc3, acc7, x3

	RET

/* ---------------------------------------*/
// gfpMulInternal: (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
//
// Word-by-word multiplication interleaved with four reduction steps. Each
// step multiplies the current lowest limb by hlp1 to obtain hlp0, then adds
// hlp0 * modulus so that this limb is cancelled. The reduction limbs rotate
// through acc0..acc3 while acc4..acc7 collect the upper product limbs; the
// two are summed at the end and the modulus is subtracted once if the sum
// does not borrow.
//
// In:    x, y quads; const3..const0 = modulus limbs; hlp1 = ·np word.
// Out:   y quad.
// Keeps: x quad, const0-const3, hlp1.
// Clobb: acc0-acc7, t0, t1, hlp0, flags (y0/y1 are also used as scratch
//        once their limb has been consumed).
TEXT gfpMulInternal(SB),NOSPLIT,$0
	// y[0] * x
	MUL y0, x0, acc0
	UMULH y0, x0, acc1

	MUL y0, x1, t0
	ADDS t0, acc1
	UMULH y0, x1, acc2

	MUL y0, x2, t0
	ADCS t0, acc2
	UMULH y0, x2, acc3

	MUL y0, x3, t0
	ADCS t0, acc3
	UMULH y0, x3, acc4
	ADC $0, acc4
	// First reduction step
	MUL acc0, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3

	UMULH const3, hlp0, hlp0
	ADC $0, acc4

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0

	// y[1] * x
	MUL y1, x0, t0
	ADDS t0, acc1
	UMULH y1, x0, t1

	MUL y1, x1, t0
	ADCS t0, acc2
	UMULH y1, x1, y0

	MUL y1, x2, t0
	ADCS t0, acc3
	UMULH y1, x2, hlp0

	MUL y1, x3, t0
	ADCS t0, acc4
	UMULH y1, x3, y1
	ADC $0, ZR, acc5

	ADDS t1, acc2
	ADCS y0, acc3
	ADCS hlp0, acc4
	ADC y1, acc5
	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0

	UMULH const3, hlp0, hlp0
	ADC $0, acc5

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1

	// y[2] * x
	MUL y2, x0, t0
	ADDS t0, acc2
	UMULH y2, x0, t1

	MUL y2, x1, t0
	ADCS t0, acc3
	UMULH y2, x1, y0

	MUL y2, x2, t0
	ADCS t0, acc4
	UMULH y2, x2, y1

	MUL y2, x3, t0
	ADCS t0, acc5
	UMULH y2, x3, hlp0
	ADC $0, ZR, acc6

	ADDS t1, acc3
	ADCS y0, acc4
	ADCS y1, acc5
	ADC hlp0, acc6
	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1

	UMULH const3, hlp0, hlp0
	ADC $0, acc6

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2
	// y[3] * x
	MUL y3, x0, t0
	ADDS t0, acc3
	UMULH y3, x0, t1

	MUL y3, x1, t0
	ADCS t0, acc4
	UMULH y3, x1, y0

	MUL y3, x2, t0
	ADCS t0, acc5
	UMULH y3, x2, y1

	MUL y3, x3, t0
	ADCS t0, acc6
	UMULH y3, x3, hlp0
	ADC $0, ZR, acc7

	ADDS t1, acc4
	ADCS y0, acc5
	ADCS y1, acc6
	ADC hlp0, acc7
	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2

	UMULH const3, hlp0, hlp0
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3

	// Add bits [511:256] of the mul result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Trial subtraction of the modulus; acc4 carries the fifth limb.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc6
	SBCS const3, acc3, acc7
	SBCS $0, acc4, acc4

	// CS (no borrow) means the sum was >= modulus: take the reduced value.
	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc6, acc2, y2
	CSEL CS, acc7, acc3, y3

	RET

/* ---------------------------------------*/
// gfpSqrInternal: (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
//
// Builds the full 512-bit square in acc7..acc0 (off-diagonal products, doubled,
// plus the diagonal squares), then runs the same four reduction steps as
// gfpMulInternal and a final conditional subtraction of the modulus.
//
// In:    x quad; const3..const0 = modulus limbs; hlp1 = ·np word.
// Out:   y quad.
// Keeps: x quad, const0-const3, hlp1.
// Clobb: acc0-acc7, t0, t1, hlp0, flags (y0 is used as scratch).
TEXT gfpSqrInternal(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL x0, x1, acc1
	UMULH x0, x1, acc2

	MUL x0, x2, t0
	ADDS t0, acc2, acc2
	UMULH x0, x2, acc3

	MUL x0, x3, t0
	ADCS t0, acc3, acc3
	UMULH x0, x3, acc4
	ADC $0, acc4, acc4
	// x[2:] * x[1]
	MUL x1, x2, t0
	ADDS t0, acc3
	UMULH x1, x2, t1
	ADCS t1, acc4
	ADC $0, ZR, acc5

	MUL x1, x3, t0
	ADDS t0, acc4
	UMULH x1, x3, t1
	ADC t1, acc5
	// x[3] * x[2]
	MUL x2, x3, t0
	ADDS t0, acc5
	UMULH x2, x3, acc6
	ADC $0, acc6

	MOVD $0, acc7
	// *2
	ADDS acc1, acc1
	ADCS acc2, acc2
	ADCS acc3, acc3
	ADCS acc4, acc4
	ADCS acc5, acc5
	ADCS acc6, acc6
	ADC $0, acc7
	// Missing products
	MUL x0, x0, acc0
	UMULH x0, x0, t0
	ADDS t0, acc1, acc1

	MUL x1, x1, t0
	ADCS t0, acc2, acc2
	UMULH x1, x1, t1
	ADCS t1, acc3, acc3

	MUL x2, x2, t0
	ADCS t0, acc4, acc4
	UMULH x2, x2, t1
	ADCS t1, acc5, acc5

	MUL x3, x3, t0
	ADCS t0, acc6, acc6
	UMULH x3, x3, t1
	ADCS t1, acc7, acc7
	// First reduction step
	MUL acc0, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0
	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1
	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2

	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2

	// Unlike the earlier steps, this carry is folded into acc7 (the top
	// limb of the square), which sits at the same limb position as hlp0.
	UMULH const3, hlp0, hlp0
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3
	// Add bits [511:256] of the sqr result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Trial subtraction of the modulus; acc4 carries the fifth limb.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc6
	SBCS const3, acc3, acc7
	SBCS $0, acc4, acc4

	// CS (no borrow) means the sum was >= modulus: take the reduced value.
	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc6, acc2, y2
	CSEL CS, acc7, acc3, y3
	RET

/* ---------------------------------------*/
// gfpMulBy2Inline: (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
//
// Doubles y into x with a carry word in hlp0, trial-subtracts the modulus
// into acc3..acc0 and keeps the unsubtracted value when the subtraction
// borrows (CC). Keeps the y quad. Clobbers acc0-acc3, hlp0, flags.
#define gfpMulBy2Inline \
	ADDS y0, y0, x0; \
	ADCS y1, y1, x1; \
	ADCS y2, y2, x2; \
	ADCS y3, y3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS const0, x0, acc0; \
	SBCS const1, x1, acc1; \
	SBCS const2, x2, acc2; \
	SBCS const3, x3, acc3; \
	SBCS $0, hlp0, hlp0; \
	CSEL CC, x0, acc0, x0; \
	CSEL CC, x1, acc1, x1; \
	CSEL CC, x2, acc2, x2; \
	CSEL CC, x3, acc3, x3;

// gfpMulBy2Inline2: (y3, y2, y1, y0) = 2(y3, y2, y1, y0)
//
// Same computation as gfpMulBy2Inline, but the selected result is written
// back to the y quad. The x quad is used for the unreduced sum and is
// therefore overwritten as well. Clobbers x0-x3, acc0-acc3, hlp0, flags.
#define gfpMulBy2Inline2 \
	ADDS y0, y0, x0; \
	ADCS y1, y1, x1; \
	ADCS y2, y2, x2; \
	ADCS y3, y3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS const0, x0, acc0; \
	SBCS const1, x1, acc1; \
	SBCS const2, x2, acc2; \
	SBCS const3, x3, acc3; \
	SBCS $0, hlp0, hlp0; \
	CSEL CC, x0, acc0, y0; \
	CSEL CC, x1, acc1, y1; \
	CSEL CC, x2, acc2, y2; \
	CSEL CC, x3, acc3, y3;

/* ---------------------------------------*/
// gfpAddInline: (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
//
// Adds y into x with a carry word in hlp0, trial-subtracts the modulus into
// acc3..acc0 and keeps the unsubtracted sum when the subtraction borrows (CC).
// Keeps the y quad. Clobbers acc0-acc3, hlp0, flags.
#define gfpAddInline \
	ADDS y0, x0, x0; \
	ADCS y1, x1, x1; \
	ADCS y2, x2, x2; \
	ADCS y3, x3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS const0, x0, acc0; \
	SBCS const1, x1, acc1; \
	SBCS const2, x2, acc2; \
	SBCS const3, x3, acc3; \
	SBCS $0, hlp0, hlp0; \
	CSEL CC, x0, acc0, x0; \
	CSEL CC, x1, acc1, x1; \
	CSEL CC, x2, acc2, x2; \
	CSEL CC, x3, acc3, x3;

/* ---------------------------------------*/
// Operand addressing. Each *in/*out macro selects one 32-byte element at
// offset 0, 32 or 64 from a_ptr, b_ptr or res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define y2in(off) (off + 32)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store a whole element into/from the x or y register quad.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
// Register-to-register copies between the two quads.
#define y2x MOVD y0, x0; MOVD y1, x1; MOVD y2, x2; MOVD y3, x3
#define x2y MOVD x0, y0; MOVD x1, y1; MOVD x2, y2; MOVD x3, y3

/* ---------------------------------------*/
// 32-byte stack slots. They start at 8(RSP); a function using slots 0..k
// needs a frame of 32*(k+1) + 8 bytes.
#define tmp0(off) (32*0 + 8 + off)(RSP)
#define tmp1(off) (32*1 + 8 + off)(RSP)
#define tmp2(off) (32*2 + 8 + off)(RSP)

// func gfp2Mul(c, a, b *gfP2)
//
// With a = (a0, a1) and b = (b0, b1) denoting the elements at offsets 0 and
// 32, stores
//   c0 = (a0 + a1)(b0 + b1) - a1*b1 - a0*b0   (= a0*b1 + a1*b0)
//   c1 = a1*b1 - 2*a0*b0
// All inputs are read before c is written.
TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	// tmp0 := a1 * b1
	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)
	STy (tmp0)

	// tmp1 := a0 * b0
	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)
	STy (tmp1)

	// tmp2 := a0 + a1
	LDx (x1in)
	LDy (y1in)
	gfpAddInline
	STx (tmp2)

	// y := (b0 + b1) * tmp2
	LDx (x2in)
	LDy (y2in)
	gfpAddInline
	LDy (tmp2)
	CALL gfpMulInternal(SB)

	// x := y - tmp0 - tmp1
	LDx (tmp0)
	CALL gfpSubInternal(SB)
	x2y
	LDx (tmp1)
	CALL gfpSubInternal(SB)
	// No multiplication follows, so R0 (hlp1) can now hold the result pointer.
	MOVD res+0(FP), res_ptr
	STx (x3out)

	// x := tmp0 - 2*tmp1
	LDy (tmp1)
	gfpMulBy2Inline
	LDy (tmp0)
	CALL gfpSubInternal(SB)
	STx (y3out)

	RET

// func gfp2MulU(c, a, b *gfP2)
//
// Computes the same two products as gfp2Mul and stores them swapped, with
// the cross term doubled and negated:
//   c0 = a1*b1 - 2*a0*b0
//   c1 = 0 - 2*(a0*b1 + a1*b0)
TEXT ·gfp2MulU(SB),NOSPLIT,$104-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	// tmp0 := a1 * b1
	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)
	STy (tmp0)

	// tmp1 := a0 * b0
	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)
	STy (tmp1)

	// tmp2 := a0 + a1
	LDx (x1in)
	LDy (y1in)
	gfpAddInline
	STx (tmp2)

	// y := (b0 + b1) * tmp2
	LDx (x2in)
	LDy (y2in)
	gfpAddInline
	LDy (tmp2)
	CALL gfpMulInternal(SB)

	// x := y - tmp0 - tmp1
	LDx (tmp0)
	CALL gfpSubInternal(SB)
	x2y
	LDx (tmp1)
	CALL gfpSubInternal(SB)
	// x := 0 - 2*x
	x2y
	gfpMulBy2Inline
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	CALL gfpSubInternal(SB)
	// No multiplication follows, so R0 (hlp1) can now hold the result pointer.
	MOVD res+0(FP), res_ptr
	STx (y3out)

	// x := tmp0 - 2*tmp1
	LDy (tmp1)
	gfpMulBy2Inline
	LDy (tmp0)
	CALL gfpSubInternal(SB)
	STx (x3out)

	RET

// func gfp2MulU1(c, a *gfP2)
//
// Stores c0 = a1 (copied through V0/V1) and c1 = 0 - 2*a0.
// No multiplication is performed, so ·np is not loaded. a0 and a1 are both
// read before the corresponding halves of c are written.
TEXT ·gfp2MulU1(SB),NOSPLIT,$0-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	// x := 0 - 2*a0
	LDy (x1in)
	gfpMulBy2Inline
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	CALL gfpSubInternal(SB)

	// c0 := a1 (a_ptr is advanced and not used afterwards)
	ADD $32, a_ptr, a_ptr
	VLD1 (a_ptr), [V0.B16, V1.B16]
	VST1 [V0.B16, V1.B16], (b_ptr)
	STx (y2in)

	RET

// func gfp2Square(c, a *gfP2)
//
// With a = (a0, a1) at offsets 0 and 32, stores
//   c1 = (a0 + a1)(a1 - 2*a0) + a0*a1
//   c0 = 2*a0*a1
// Only stack slot tmp0 is used, hence the 40-byte frame.
TEXT ·gfp2Square(SB),NOSPLIT,$40-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	// tmp0 := a1 + a0
	LDx (y1in)
	LDy (x1in)
	gfpAddInline
	STx (tmp0)
	// x := a1 - 2*a0
	gfpMulBy2Inline
	LDy (y1in)
	CALL gfpSubInternal(SB)
	// tmp0 := x * tmp0
	LDy (tmp0)
	CALL gfpMulInternal(SB)
	STy (tmp0)

	// y := a1 * a0
	LDx (y1in)
	LDy (x1in)
	CALL gfpMulInternal(SB)
	// c1 := tmp0 + y
	LDx (tmp0)
	gfpAddInline
	STx (y2in)

	// c0 := 2*y
	gfpMulBy2Inline
	STx (x2in)

	RET

// func gfp2SquareU(c, a *gfP2)
//
// With a = (a0, a1) at offsets 0 and 32, stores
//   c0 = (a0 + a1)(a1 - 2*a0) + a0*a1
//   c1 = 0 - 4*a0*a1
// Only stack slot tmp0 is used, hence the 40-byte frame.
TEXT ·gfp2SquareU(SB),NOSPLIT,$40-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	// tmp0 := a1 + a0
	LDx (y1in)
	LDy (x1in)
	gfpAddInline
	STx (tmp0)
	// x := a1 - 2*a0
	gfpMulBy2Inline
	LDy (y1in)
	CALL gfpSubInternal(SB)
	// tmp0 := x * tmp0
	LDy (tmp0)
	CALL gfpMulInternal(SB)
	STy (tmp0)

	// y := a1 * a0
	LDx (y1in)
	LDy (x1in)
	CALL gfpMulInternal(SB)
	// c0 := tmp0 + y (a is not read again after this store)
	LDx (tmp0)
	gfpAddInline
	STx (x2in)

	// c1 := 0 - 4*y
	gfpMulBy2Inline2
	gfpMulBy2Inline
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	CALL gfpSubInternal(SB)
	STx (y2in)

	RET

/* ---------------------------------------*/
#undef tmp2
#define x3t(off) (32*2 + 8 + off)(RSP)
#define y3t(off) (32*3 + 8 + off)(RSP)
#define z3t(off) (32*4 + 8 + off)(RSP)

// func curvePointDoubleComplete(c, a *curvePoint)
//
// Doubles the point whose X, Y, Z coordinates sit at offsets 0, 32, 64 of a
// and writes the result to c. The step comments name the temporaries of the
// formula being evaluated; the factor written as 3b is obtained as
// 16*t - t = 15*t. Z3 is stored before X and Y of a are read for the last
// time, but Z of a is not needed after that point.
TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	CALL gfpSqrInternal(SB)    // t0 := Y^2
	STy (tmp0)

	gfpMulBy2Inline2           // Z3 := t0 + t0
	gfpMulBy2Inline2           // Z3 := Z3 + Z3
	gfpMulBy2Inline            // Z3 := Z3 + Z3
	STx (z3t)

	LDx (z1in)
	CALL gfpSqrInternal(SB)    // t2 := Z^2
	STy (tmp1)
	// y := 16 * t2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	LDx (tmp1)
	CALL gfpSubInternal(SB)    // t2 := 3b * t2 = 3bZ^2
	STx (tmp1)
	LDy (z3t)
	CALL gfpMulInternal(SB)    // X3 := t2 * Z3
	STy (x3t)

	LDx (tmp0)
	LDy (tmp1)
	gfpAddInline               // Y3 := t0 + t2
	STx (y3t)
	gfpMulBy2Inline
	gfpAddInline               // t2 := t2 + t2 + t2
	STx (tmp1)
	LDy (tmp0)
	CALL gfpSubInternal(SB)    // t0 := t0 - t2
	STx (tmp0)
	LDy (y3t)
	CALL gfpMulInternal(SB)    // Y3 := t0 * Y3
	LDx (x3t)
	gfpAddInline               // Y3 := X3 + Y3
	STx (y3t)

	LDx (y1in)
	LDy (z1in)
	CALL gfpMulInternal(SB)    // t1 := YZ
	LDx (z3t)
	CALL gfpMulInternal(SB)    // Z3 := t1 * Z3
	STy (z2in)                 // Store Z3

	LDx (x1in)
	LDy (y1in)
	CALL gfpMulInternal(SB)    // t1 := XY
	LDx (tmp0)
	CALL gfpMulInternal(SB)    // X3 := t0 * t1
	gfpMulBy2Inline            // X3 := X3 + X3
	STx (x2in)                 // Store X3
	// Store Y3
	LDx (y3t)
	STx (y2in)

	RET

/* ---------------------------------------*/
#undef x3t
#undef y3t
#undef z3t

#define tmp2(off) (32*2 + 8 + off)(RSP)
#define tmp3(off) (32*3 + 8 + off)(RSP)
#define tmp4(off) (32*4 + 8 + off)(RSP)
#define x3t(off) (32*5 + 8 + off)(RSP)
#define y3t(off) (32*6 + 8 + off)(RSP)
#define z3t(off) (32*7 + 8 + off)(RSP)

// func curvePointAddComplete(c, a, b *curvePoint)
//
// Adds the points a and b (X, Y, Z at offsets 0, 32, 64) and writes the sum
// to c. The step comments name the temporaries of the formula being
// evaluated; the factor written as 3b is obtained as 16*t - t = 15*t.
// b_ptr is repointed at c only after the last read of a and b, and every
// value needed afterwards lives in a stack slot.
TEXT ·curvePointAddComplete(SB),0,$264-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)    // t0 := X1X2
	STy (tmp0)
	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)    // t1 := Y1Y2
	STy (tmp1)
	LDx (z1in)
	LDy (z2in)
	CALL gfpMulInternal(SB)    // t2 := Z1Z2
	STy (tmp2)

	LDx (x1in)
	LDy (y1in)
	gfpAddInline               // t3 := X1 + Y1
	STx (tmp3)

	LDx (x2in)
	LDy (y2in)
	gfpAddInline               // t4 := X2 + Y2
	LDy (tmp3)
	CALL gfpMulInternal(SB)    // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
	STy (tmp3)

	LDx (tmp0)
	LDy (tmp1)
	gfpAddInline               // t4 := t0 + t1
	LDy (tmp3)
	CALL gfpSubInternal(SB)    // t3 := t3 - t4 = X1Y2 + X2Y1
	STx (tmp3)

	LDx (y1in)
	LDy (z1in)
	gfpAddInline               // t4 := Y1 + Z1
	STx (tmp4)

	LDx (y2in)
	LDy (z2in)
	gfpAddInline               // X3 := Y2 + Z2
	LDy (tmp4)
	CALL gfpMulInternal(SB)    // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
	STy (tmp4)

	LDx (tmp1)
	LDy (tmp2)
	gfpAddInline               // X3 := t1 + t2
	LDy (tmp4)
	CALL gfpSubInternal(SB)    // t4 := t4 - X3 = Y1Z2 + Y2Z1
	STx (tmp4)

	LDx (x1in)
	LDy (z1in)
	gfpAddInline               // X3 := X1 + Z1
	STx (x3t)

	LDx (x2in)
	LDy (z2in)
	gfpAddInline               // Y3 := X2 + Z2
	LDy (x3t)
	CALL gfpMulInternal(SB)    // X3 := X3 * Y3
	STy (x3t)

	LDx (tmp0)
	LDy (tmp2)
	gfpAddInline               // Y3 := t0 + t2
	LDy (x3t)
	CALL gfpSubInternal(SB)    // Y3 := X3 - Y3 = X1Z2 + X2Z1
	STx (y3t)

	LDy (tmp0)
	gfpMulBy2Inline
	gfpAddInline               // t0 := t0 + t0 + t0 = 3X1X2
	STx (tmp0)

	LDy (tmp2)
	// y := 16 * t2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	LDx (tmp2)
	CALL gfpSubInternal(SB)    // t2 := 3b * t2 = 3bZ1Z2
	STx (tmp2)

	LDy (tmp1)
	gfpAddInline               // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
	STx (z3t)

	LDx (tmp2)
	CALL gfpSubInternal(SB)    // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
	STx (tmp1)

	LDy (y3t)
	// y := 16 * Y3
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	LDx (y3t)
	CALL gfpSubInternal(SB)    // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
	STx (y3t)

	LDy (tmp4)
	CALL gfpMulInternal(SB)    // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
	STy (x3t)

	// a and b are not read past this point; b_ptr now addresses the result.
	MOVD res+0(FP), b_ptr

	LDx (tmp3)
	LDy (tmp1)
	CALL gfpMulInternal(SB)    // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
	LDx (x3t)
	CALL gfpSubInternal(SB)    // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
	STx (x2in)

	LDy (y3t)
	LDx (tmp0)
	CALL gfpMulInternal(SB)    // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
	STy (y3t)

	LDx (tmp1)
	LDy (z3t)
	CALL gfpMulInternal(SB)    // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
	LDx (y3t)
	gfpAddInline               // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
	STx (y2in)

	LDx (tmp0)
	LDy (tmp3)
	CALL gfpMulInternal(SB)    // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
	STy (tmp0)

	LDx (tmp4)
	LDy (z3t)
	CALL gfpMulInternal(SB)    // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
	LDx (tmp0)
	gfpAddInline               // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
	STx (z2in)

	RET