github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_amd64.s

// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
//go:build !(purego || plugin)

#include "textflag.h"
#include "p256_macros_amd64.s"
#define t1 R15

/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ  sqrBMI2

sqrLoop:
	p256SqrRound(t1)
	DECQ BX
	JNE  sqrLoop
	RET

sqrBMI2:
	p256SqrRoundAdx(t1)
	DECQ BX
	JNE  sqrBMI2
	RET

/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX

	CMPB ·supportBMI2+0(SB), $0x01
	JEQ  ordSqrLoopBMI2

ordSqrLoop:
	p256OrdSqrRound(t1)
	DECQ BX
	JNE  ordSqrLoop

	RET

ordSqrLoopBMI2:
	p256OrdSqrRoundAdx(t1)
	DECQ BX
	JNE  ordSqrLoopBMI2

	RET

/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP

/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ  internalMulBMI2

	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

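	// The three register rows above accumulate acc4*b, acc5*b and acc6*b;
	// the final row below folds acc7*[t3:t0] into acc3..acc7, completing
	// the plain 512-bit product [acc7:acc0] = [acc7:acc4] * [t3:t0].
	// A rough Go sketch of one such row using math/bits (illustration
	// only, not part of this file's build):
	//
	//	var c uint64 // running carry word for this row
	//	for j := 0; j < 4; j++ {
	//		hi, lo := bits.Mul64(a[i], b[j])     // 64x64 -> 128-bit product
	//		s, c1 := bits.Add64(acc[i+j], lo, 0) // add the low word
	//		s, c2 := bits.Add64(s, c, 0)         // add the carry word
	//		acc[i+j] = s
	//		c = hi + c1 + c2 // never overflows: x*y + x + y < 2^128
	//	}
	//	acc[i+4] = c
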
	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	sm2P256MulReductionInline

	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256p<>+0x08(SB), acc5
	SBBQ $-1, acc6
	SBBQ p256p<>+0x018(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
internalMulBMI2:
	MOVQ acc4, mul1
	MULXQ t0, acc0, acc1

	MULXQ t1, mul0, acc2
	ADDQ mul0, acc1

	MULXQ t2, mul0, acc3
	ADCQ mul0, acc2

	MULXQ t3, mul0, acc4
	ADCQ mul0, acc3
	ADCQ $0, acc4

	MOVQ acc5, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc1
	ADCQ hlp, acc2

	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc2
	ADCQ hlp, acc3

	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4

	MULXQ t3, mul0, acc5
	ADCQ $0, acc5
	ADDQ mul0, acc4
	ADCQ $0, acc5

	MOVQ acc6, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc2
	ADCQ hlp, acc3

	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4

	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc4
	ADCQ hlp, acc5

	MULXQ t3, mul0, acc6
	ADCQ $0, acc6
	ADDQ mul0, acc5
	ADCQ $0, acc6

	MOVQ acc7, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4

	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc4
	ADCQ hlp, acc5

	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc5
	ADCQ hlp, acc6

	MULXQ t3, mul0, acc7
	ADCQ $0, acc7
	ADDQ mul0, acc6
	ADCQ $0, acc7

	sm2P256MulReductionInline
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256p<>+0x08(SB), acc5
	SBBQ $-1, acc6
	SBBQ p256p<>+0x018(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET

/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2
TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ  internalSqrBMI2

	p256SqrInternalInline
	RET

internalSqrBMI2:
	p256SqrInternalInlineAdx
	RET

/* ---------------------------------------*/
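// Note: sm2P256MulInternal and sm2P256SqrInternal return the Montgomery
// product/square, i.e. a*b*2^-256 mod p, where p is the SM2 prime
// 0xfffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff
// (this is what the WWMM reduction above computes). A math/big
// cross-check sketch in Go (illustration only; a and b are hypothetical
// *big.Int values already reduced mod p):
//
//	p, _ := new(big.Int).SetString("fffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff", 16)
//	rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p)
//	want := new(big.Int).Mul(a, b)
//	want.Mul(want, rInv).Mod(want, p) // should equal the [acc7:acc4] output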
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)

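// The (32*n)(SP) defines above lay out the 512-byte frame used by the
// affine point addition below. Shown as a hypothetical Go struct for
// orientation only (the assembly addresses these slots by offset):
//
//	type affineAddFrame struct {
//		x1in, y1in, z1in  [4]uint64 // Jacobian input point
//		x2in, y2in        [4]uint64 // affine input point (y possibly negated)
//		xout, yout, zout  [4]uint64 // Jacobian result
//		s2, z1sqr, h, r   [4]uint64 // intermediates
//		hsqr, rsqr, hcub  [4]uint64
//		rptr              uintptr   // saved result pointer
//		selSave, zeroSave uint32    // saved sel and zero arguments
//	}
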
#define p256PointAddAffineInline() \
	\// Store pointer to result
	MOVQ mul0, rptr             \
	MOVL t1, sel_save           \
	MOVL t2, zero_save          \
	\// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4 \
	MOVQ (16*2 + 8*1)(CX), acc5 \
	MOVQ (16*2 + 8*2)(CX), acc6 \
	MOVQ (16*2 + 8*3)(CX), acc7 \
	MOVQ $-1, acc0              \
	MOVQ p256p<>+0x08(SB), acc1 \
	MOVQ $-1, acc2              \
	MOVQ p256p<>+0x018(SB), acc3 \
	XORQ mul0, mul0             \
	\// Speculatively subtract
	SUBQ acc4, acc0             \
	SBBQ acc5, acc1             \
	SBBQ acc6, acc2             \
	SBBQ acc7, acc3             \
	SBBQ $0, mul0               \
	MOVQ acc0, t0               \
	MOVQ acc1, t1               \
	MOVQ acc2, t2               \
	MOVQ acc3, t3               \
	\// Add in case the operand was > p256
	ADDQ $-1, acc0              \
	ADCQ p256p<>+0x08(SB), acc1 \
	ADCQ $-1, acc2              \
	ADCQ p256p<>+0x018(SB), acc3 \
	ADCQ $0, mul0               \ // ZF := 1 if mul0 == 0 after ADC
	CMOVQNE t0, acc0            \ // CMOVQNE: Move if not equal (ZF == 0)
	CMOVQNE t1, acc1            \
	CMOVQNE t2, acc2            \
	CMOVQNE t3, acc3            \
	\// If condition is 0, keep original value
	TESTQ DX, DX                \ // ZF := 1 if (DX AND DX == 0)
	CMOVQEQ acc4, acc0          \ // CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ acc5, acc1          \
	CMOVQEQ acc6, acc2          \
	CMOVQEQ acc7, acc3          \
	\// Store result
	MOVQ acc0, y2in(8*0)        \
	MOVQ acc1, y2in(8*1)        \
	MOVQ acc2, y2in(8*2)        \
	MOVQ acc3, y2in(8*3)        \
	\// Begin point add
	LDacc (z1in)                \
	CALL sm2P256SqrInternal(SB) \// z1ˆ2
	ST (z1sqr)                  \
	\
	LDt (x2in)                  \
	CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
	\
	LDt (x1in)                  \
	p256SubInline2              \// h = u2 - x1
	ST (h)                      \
	\
	LDt (z1in)                  \
	CALL sm2P256MulInternal(SB) \// z3 = h * z1
	ST (zout)                   \
	\
	LDacc (z1sqr)               \
	CALL sm2P256MulInternal(SB) \// z1ˆ3
	\
	LDt (y2in)                  \
	CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3
	ST (s2)                     \
	\
	LDt (y1in)                  \
	p256SubInline2              \// r = s2 - y1
	ST (r)                      \
	\
	CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
	ST (rsqr)                   \
	\
	LDacc (h)                   \
	CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2
	ST (hsqr)                   \
	\
	LDt (h)                     \
	CALL sm2P256MulInternal(SB) \// hcub = hˆ3
	ST (hcub)                   \
	\
	LDt (y1in)                  \
	CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
	ST (s2)                     \
	\
	LDacc (x1in)                \
	LDt (hsqr)                  \
	CALL sm2P256MulInternal(SB) \// x1 * hˆ2
	ST (h)                      \
	\
	p256MulBy2Inline            \// x1 * hˆ2 * 2, inline
	LDacc (rsqr)                \
	p256SubInline2              \// rˆ2 - x1 * hˆ2 * 2
	\
	LDt (hcub)                  \
	p256SubInline               \
	STt (xout)                  \// xout = rˆ2 - 2 * x1 * hˆ2 - hˆ3
	LDacc (h)                   \
	p256SubInline2              \
	\
	LDt (r)                     \
	CALL sm2P256MulInternal(SB) \
	\
	LDt (s2)                    \
	p256SubInline2              \
	ST (yout)                   \
	\// Load stored values from stack
	MOVQ rptr, AX               \

// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), t1
	MOVQ zero+40(FP), t2

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  pointaddaffine_avx2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)

	p256PointAddAffineInline()
	// The result is not valid if (sel == 0), conditional choose
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	MOVL sel_save, X6  // sel
	MOVL zero_save, X7 // zero

	PXOR X8, X8    // X8's bits are all 0
	PCMPEQL X9, X9 // X9's bits are all 1

	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else are 0
	PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else are 0

	MOVOU X6, X15
	PANDN X9, X15 // X15 = NOT(X6)

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15 // X15 = NOT(X7)

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	MOVQ $0, rptr

	RET
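// The PAND/PANDN/PXOR sequences above (and the VPANDN/VPAND forms in the
// AVX2 path below) form a branchless select: with mask all-ones or
// all-zeros, out = (computed AND NOT mask) XOR (fallback AND mask). A Go
// sketch per 64-bit word (illustrative; the branch is shown for clarity,
// while the vector code derives mask without branching via PCMPEQL):
//
//	var mask uint64
//	if sel == 0 {
//		mask = ^uint64(0)
//	}
//	out := (computed &^ mask) | (fallback & mask)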
pointaddaffine_avx2:
	VMOVDQU (32*0)(BX), Y0
	VMOVDQU (32*1)(BX), Y1
	VMOVDQU (32*2)(BX), Y2

	VMOVDQU Y0, x1in(32*0)
	VMOVDQU Y1, y1in(32*0)
	VMOVDQU Y2, z1in(32*0)

	VMOVDQU (32*0)(CX), Y0
	VMOVDQU Y0, x2in(32*0)

	p256PointAddAffineInline()
	// The result is not valid if (sel == 0), conditional choose
	VPXOR Y8, Y8, Y8           // Y8's bits are all 0
	VPBROADCASTD sel_save, Y6  // sel
	VPBROADCASTD zero_save, Y7 // zero

	VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
	VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0

	VPANDN xout(32*0), Y6, Y0
	VPANDN yout(32*0), Y6, Y1
	VPANDN zout(32*0), Y6, Y2

	VPAND x1in(32*0), Y6, Y9
	VPAND y1in(32*0), Y6, Y10
	VPAND z1in(32*0), Y6, Y11

	VPXOR Y9, Y0, Y0
	VPXOR Y10, Y1, Y1
	VPXOR Y11, Y2, Y2

	// Similarly if zero == 0
	VPANDN Y0, Y7, Y0
	VPANDN Y1, Y7, Y1
	VPANDN Y2, Y7, Y2

	VPAND x2in(32*0), Y7, Y9
	VPAND y2in(32*0), Y7, Y10
	VPAND p256one<>+0x00(SB), Y7, Y11

	VPXOR Y9, Y0, Y0
	VPXOR Y10, Y1, Y1
	VPXOR Y11, Y2, Y2

	// Finally output the result
	VMOVDQU Y0, (32*0)(AX)
	VMOVDQU Y1, (32*1)(AX)
	VMOVDQU Y2, (32*2)(AX)
	MOVQ $0, rptr

	VZEROUPPER
	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save

/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)

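// p256PointAddInline leaves a flag in points_eq that is nonzero iff
// r == 0 and h == 0, i.e. the two inputs represent the same point, in
// which case add-2007-bl degenerates and the caller must double instead.
// A sketch of the assumed caller-side contract in Go (wrapper names from
// this package's Go side):
//
//	if p256PointAddAsm(res, in1, in2) == 1 {
//		p256PointDoubleAsm(res, in1) // the addition formula is undefined for P == Q
//	}
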
#define p256PointAddInline() \
	\// Begin point add
	LDacc (z2in)                \
	CALL sm2P256SqrInternal(SB) \// z2ˆ2
	ST (z2sqr)                  \
	LDt (z2in)                  \
	CALL sm2P256MulInternal(SB) \// z2ˆ3
	LDt (y1in)                  \
	CALL sm2P256MulInternal(SB) \// s1 = z2ˆ3*y1
	ST (s1)                     \
	\
	LDacc (z1in)                \
	CALL sm2P256SqrInternal(SB) \// z1ˆ2
	ST (z1sqr)                  \
	LDt (z1in)                  \
	CALL sm2P256MulInternal(SB) \// z1ˆ3
	LDt (y2in)                  \
	CALL sm2P256MulInternal(SB) \// s2 = z1ˆ3*y2
	ST (s2)                     \
	\
	LDt (s1)                    \
	p256SubInline2              \// r = s2 - s1
	ST (r)                      \
	p256IsZeroInline            \
	MOVQ AX, points_eq          \
	\
	LDacc (z2sqr)               \
	LDt (x1in)                  \
	CALL sm2P256MulInternal(SB) \// u1 = x1 * z2ˆ2
	ST (u1)                     \
	LDacc (z1sqr)               \
	LDt (x2in)                  \
	CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
	ST (u2)                     \
	\
	LDt (u1)                    \
	p256SubInline2              \// h = u2 - u1
	ST (h)                      \
	p256IsZeroInline            \
	ANDQ points_eq, AX          \
	MOVQ AX, points_eq          \
	\
	LDacc (r)                   \
	CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
	ST (rsqr)                   \
	\
	LDacc (h)                   \
	CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2
	ST (hsqr)                   \
	\
	LDt (h)                     \
	CALL sm2P256MulInternal(SB) \// hcub = hˆ3
	ST (hcub)                   \
	\
	LDt (s1)                    \
	CALL sm2P256MulInternal(SB) \
	ST (s2)                     \
	\
	LDacc (z1in)                \
	LDt (z2in)                  \
	CALL sm2P256MulInternal(SB) \// z1 * z2
	LDt (h)                     \
	CALL sm2P256MulInternal(SB) \// z1 * z2 * h
	ST (zout)                   \
	\
	LDacc (hsqr)                \
	LDt (u1)                    \
	CALL sm2P256MulInternal(SB) \// hˆ2 * u1
	ST (u2)                     \
	\
	p256MulBy2Inline            \// u1 * hˆ2 * 2, inline
	LDacc (rsqr)                \
	p256SubInline2              \// rˆ2 - u1 * hˆ2 * 2
	\
	LDt (hcub)                  \
	p256SubInline               \
	STt (xout)                  \
	LDacc (u2)                  \
	p256SubInline2              \
	\
	LDt (r)                     \
	CALL sm2P256MulInternal(SB) \
	\
	LDt (s2)                    \
	p256SubInline2              \
	ST (yout)                   \

//func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
TEXT ·p256PointAddAsm(SB),0,$680-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  pointadd_avx2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	p256PointAddInline()

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	RET
pointadd_avx2:
	VMOVDQU (32*0)(BX), Y0
	VMOVDQU (32*1)(BX), Y1
	VMOVDQU (32*2)(BX), Y2

	VMOVDQU Y0, x1in(32*0)
	VMOVDQU Y1, y1in(32*0)
	VMOVDQU Y2, z1in(32*0)

	VMOVDQU (32*0)(CX), Y0
	VMOVDQU (32*1)(CX), Y1
	VMOVDQU (32*2)(CX), Y2

	VMOVDQU Y0, x2in(32*0)
	VMOVDQU Y1, y2in(32*0)
	VMOVDQU Y2, z2in(32*0)

	// Store pointer to result
	MOVQ AX, rptr
	p256PointAddInline()

	VMOVDQU xout(32*0), Y0
	VMOVDQU yout(32*0), Y1
	VMOVDQU zout(32*0), Y2
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	VMOVDQU Y0, (32*0)(AX)
	VMOVDQU Y1, (32*1)(AX)
	VMOVDQU Y2, (32*2)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	VZEROUPPER
	RET

#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

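// calZ, calX and calY below implement one Jacobian doubling following
// dbl-2007-bl, specialized to curve parameter a = -3 (which holds for
// the SM2 curve):
//
//	ZZ = Z1^2
//	M  = 3*(X1 - ZZ)*(X1 + ZZ) // = 3*X1^2 + a*Z1^4 for a = -3
//	S  = 4*X1*YY               // YY = Y1^2
//	X3 = M^2 - 2*S
//	Y3 = M*(S - X3) - 8*YYYY   // YYYY = Y1^4
//	Z3 = 2*Y1*Z1
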
#define calZ() \
	LDacc (z)                   \
	CALL sm2P256SqrInternal(SB) \
	ST (zsqr)                   \ // ZZ = Z1^2
	\
	LDt (x)                     \
	p256AddInline               \
	STt (m)                     \ // M = ZZ + X1
	\
	LDacc (z)                   \
	LDt (y)                     \
	CALL sm2P256MulInternal(SB) \ // Z1 * Y1
	p256MulBy2Inline            \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2

#define calX() \
	LDacc (x)                   \
	LDt (zsqr)                  \
	p256SubInline2              \ // X1 - ZZ
	LDt (m)                     \
	CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
	ST (m)                      \
	\// Multiply by 3
	p256TripleInline            \
	STt (m)                     \ // M = 3 * (X1^2 - ZZ^2)
	\////////////////////////
	LDacc (y)                   \
	p256MulBy2Inline2           \
	CALL sm2P256SqrInternal(SB) \ // 4 * YY = (2*Y1)^2
	ST (s)                      \ // S = 4 * YY
	CALL sm2P256SqrInternal(SB) \ // (4 * YY)^2 = 16 * YYYY
	\// Divide by 2
	XORQ mul0, mul0             \
	MOVQ acc4, t0               \
	MOVQ acc5, t1               \
	MOVQ acc6, t2               \
	MOVQ acc7, t3               \
	\ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
	ADDQ $-1, acc4              \
	ADCQ p256p<>+0x08(SB), acc5 \
	ADCQ $-1, acc6              \
	ADCQ p256p<>+0x018(SB), acc7 \
	ADCQ $0, mul0               \
	TESTQ $1, t0                \ // ZF := 1 if (t0 AND 1 == 0)
	\ // CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ t0, acc4            \ // acc4 := t0 if (ZF == 1)
	CMOVQEQ t1, acc5            \ // acc5 := t1 if (ZF == 1)
	CMOVQEQ t2, acc6            \ // acc6 := t2 if (ZF == 1)
	CMOVQEQ t3, acc7            \ // acc7 := t3 if (ZF == 1)
	ANDQ t0, mul0               \ // mul0 := t0 AND mul0 (mul0 := 0 if (ZF == 1), else it keeps its original value 0 or 1)
	\ // Divide the now-even value by 2
	SHRQ $1, acc5, acc4         \ // acc4 := acc4 >> 1 | acc5 << 63
	SHRQ $1, acc6, acc5         \ // acc5 := acc5 >> 1 | acc6 << 63
	SHRQ $1, acc7, acc6         \ // acc6 := acc6 >> 1 | acc7 << 63
	SHRQ $1, mul0, acc7         \ // acc7 := acc7 >> 1 | mul0 << 63
	ST (y)                      \ // Y3 = 8 * YYYY
	\/////////////////////////
	LDacc (x)                   \
	LDt (s)                     \
	CALL sm2P256MulInternal(SB) \ // X1 * 4 * YY
	ST (s)                      \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
	p256MulBy2Inline            \
	STt (tmp)                   \ // tmp = 2*S = 8 * X1 * YY
	\
	LDacc (m)                   \
	CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
	LDt (tmp)                   \
	p256SubInline2              \ // X3 = M^2 - 2*S

#define calY() \
	acc2t                       \
	LDacc (s)                   \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
	p256SubInline2              \ // S - X3
	\
	LDt (m)                     \
	CALL sm2P256MulInternal(SB) \ // M * (S - X3)
	\
	LDt (y)                     \
	p256SubInline2              \ // Y3 = M * (S - X3) - 8 * YYYY

#define lastP256PointDouble() \
	\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
	calZ()                      \
	MOVQ rptr, AX               \
	\// Store z
	MOVQ t0, (16*4 + 8*0)(AX)   \
	MOVQ t1, (16*4 + 8*1)(AX)   \
	MOVQ t2, (16*4 + 8*2)(AX)   \
	MOVQ t3, (16*4 + 8*3)(AX)   \
	\
	calX()                      \
	MOVQ rptr, AX               \
	\// Store x
	MOVQ acc4, (16*0 + 8*0)(AX) \
	MOVQ acc5, (16*0 + 8*1)(AX) \
	MOVQ acc6, (16*0 + 8*2)(AX) \
	MOVQ acc7, (16*0 + 8*3)(AX) \
	\
	calY()                      \
	MOVQ rptr, AX               \
	\// Store y
	MOVQ acc4, (16*2 + 8*0)(AX) \
	MOVQ acc5, (16*2 + 8*1)(AX) \
	MOVQ acc6, (16*2 + 8*2)(AX) \
	MOVQ acc7, (16*2 + 8*3)(AX) \
	\///////////////////////
	MOVQ $0, rptr               \

//func p256PointDoubleAsm(res, in *SM2P256Point)
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	p256PointDoubleInit()
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	lastP256PointDouble()

	RET

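// The storeTmp helpers below spill the freshly computed coordinates back
// into this frame's x/y/z slots, so p256PointDouble6TimesAsm can chain
// doubling rounds entirely on the stack; only the final round (via
// lastP256PointDouble) writes through rptr into the caller's point.
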
#define storeTmpX() \
	MOVQ acc4, x(8*0) \
	MOVQ acc5, x(8*1) \
	MOVQ acc6, x(8*2) \
	MOVQ acc7, x(8*3) \

#define storeTmpY() \
	MOVQ acc4, y(8*0) \
	MOVQ acc5, y(8*1) \
	MOVQ acc6, y(8*2) \
	MOVQ acc7, y(8*3) \

#define storeTmpZ() \
	MOVQ t0, z(8*0) \
	MOVQ t1, z(8*1) \
	MOVQ t2, z(8*2) \
	MOVQ t3, z(8*3) \

#define p256PointDoubleRound() \
	calZ()      \
	storeTmpZ() \
	calX()      \
	storeTmpX() \
	calY()      \
	storeTmpY() \

//func p256PointDouble6TimesAsm(res, in *SM2P256Point)
TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$256-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	p256PointDoubleInit()
	// Store pointer to result
	MOVQ AX, rptr

	// point double rounds 1-5
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()

	// last point double round
	lastP256PointDouble()

	RET
/* ---------------------------------------*/
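// Six chained doublings compute res = 2^6 * in, matching one step of a
// 6-bit windowed scalar multiplication (an inference from the name; the
// callers live on the Go side of this package). A functionally equivalent
// Go-level sketch using the routine declared above:
//
//	p256PointDoubleAsm(res, in)
//	for i := 0; i < 5; i++ {
//		p256PointDoubleAsm(res, res)
//	}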