// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
//go:build plugin && !purego

// Plugin mode - DO NOT use the R15 register.
// The following functions differ from their non-plugin counterparts:
// 1. p256Sqr
// 2. p256OrdSqr
// 3. sm2P256MulInternal
// 4. sm2P256SqrInternal
// The most affected is sm2P256MulInternal, which uses SIMD register X0 as
// temporary storage.

#include "textflag.h"

#include "p256_macros_amd64.s"
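
// The SM2 prime is p = 2^256 - 2^224 - 2^96 + 2^64 - 1, i.e. as 64-bit
// limbs, least significant first:
//   p = [0xffffffffffffffff, 0xffffffff00000000,
//        0xffffffffffffffff, 0xfffffffeffffffff]
// Limbs 0 and 2 are all-ones, which is why the reduction code below uses
// the immediate $-1 for them and loads only p256p<>+0x08 (limb 1) and
// p256p<>+0x018 (limb 3) from memory.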

/* ---------------------------------------*/
// This function is the same as in non-plugin mode, except that it uses BP
// to store n and does not use R15.
//
// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BP

	CMPB ·supportBMI2+0(SB), $0x01
	JEQ sqrBMI2

sqrLoop:
	p256SqrRound(BX)
	DECQ BP
	JNE sqrLoop
	RET

sqrBMI2:
	p256SqrRoundAdx(BX)
	DECQ BP
	JNE sqrBMI2
	RET

/* ---------------------------------------*/
// This function is the same as in non-plugin mode, except that it uses BP
// to store n and does not use R15.
//
// func p256OrdSqr(res, in *p256OrdElement, n int)
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BP

	CMPB ·supportBMI2+0(SB), $0x01
	JEQ ordSqrLoopBMI2

ordSqrLoop:
	p256OrdSqrRound(BX)
	DECQ BP
	JNE ordSqrLoop

	RET

ordSqrLoopBMI2:
	p256OrdSqrRoundAdx(BX)
	DECQ BP
	JNE ordSqrLoopBMI2

	RET

/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
/* ---------------------------------------*/
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 BP
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 DI
#define t2 SI
#define t3 R9

/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ internalMulBMI2

	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, X0 // uses X0 as temp register/storage
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc5, mul0
	MULQ t1
	ADDQ acc0, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc5, mul0
	MULQ t2
	ADDQ acc0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc5, mul0
	MULQ t3
	ADDQ acc0, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc6, mul0
	MULQ t1
	ADDQ acc0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc6, mul0
	MULQ t2
	ADDQ acc0, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc6, mul0
	MULQ t3
	ADDQ acc0, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc7, mul0
	MULQ t1
	ADDQ acc0, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc7, mul0
	MULQ t2
	ADDQ acc0, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc0

	MOVQ acc7, mul0
	MULQ t3
	ADDQ acc0, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7

	PEXTRQ $0, X0, acc0
	sm2P256MulReductionInline
	MOVQ $0, mul0
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, mul0
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256p<>+0x08(SB), acc5
	SBBQ $-1, acc6
	SBBQ p256p<>+0x018(SB), acc7
	SBBQ $0, mul0
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
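
// The BMI2 path below computes the same 256x256->512 schoolbook product
// as above, but with MULXQ, which writes the 128-bit product to two
// destination registers without touching the flags. That keeps the
// ADDQ/ADCQ carry chain uninterrupted; the recurring pattern is:
//
//	MULXQ t1, mul0, acc0 // acc0:mul0 = mul1 * t1, flags untouched
//	ADCQ $0, acc0        // fold the pending carry into the high half
//	ADDQ mul0, acc2      // add the low half into the accumulator
//	ADCQ acc0, acc3      // add the high half plus carry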
internalMulBMI2:
	MOVQ acc4, mul1
	MULXQ t0, acc0, acc1
	MOVQ acc0, X0 // uses X0 as temp register/storage

	MULXQ t1, mul0, acc2
	ADDQ mul0, acc1

	MULXQ t2, mul0, acc3
	ADCQ mul0, acc2

	MULXQ t3, mul0, acc4
	ADCQ mul0, acc3
	ADCQ $0, acc4

	MOVQ acc5, mul1
	MULXQ t0, mul0, acc0
	ADDQ mul0, acc1
	ADCQ acc0, acc2

	MULXQ t1, mul0, acc0
	ADCQ $0, acc0
	ADDQ mul0, acc2
	ADCQ acc0, acc3

	MULXQ t2, mul0, acc0
	ADCQ $0, acc0
	ADDQ mul0, acc3
	ADCQ acc0, acc4

	MULXQ t3, mul0, acc5
	ADCQ $0, acc5
	ADDQ mul0, acc4
	ADCQ $0, acc5

	MOVQ acc6, mul1
	MULXQ t0, mul0, acc0
	ADDQ mul0, acc2
	ADCQ acc0, acc3

	MULXQ t1, mul0, acc0
	ADCQ $0, acc0
	ADDQ mul0, acc3
	ADCQ acc0, acc4

	MULXQ t2, mul0, acc0
	ADCQ $0, acc0
	ADDQ mul0, acc4
	ADCQ acc0, acc5

	MULXQ t3, mul0, acc6
	ADCQ $0, acc6
	ADDQ mul0, acc5
	ADCQ $0, acc6

	MOVQ acc7, mul1
	MULXQ t0, mul0, acc0
	ADDQ mul0, acc3
	ADCQ acc0, acc4

	MULXQ t1, mul0, acc0
	ADCQ $0, acc0
	ADDQ mul0, acc4
	ADCQ acc0, acc5

	MULXQ t2, mul0, acc0
	ADCQ $0, acc0
	ADDQ mul0, acc5
	ADCQ acc0, acc6

	MULXQ t3, mul0, acc7
	ADCQ $0, acc7
	ADDQ mul0, acc6
	ADCQ $0, acc7

	PEXTRQ $0, X0, acc0
	sm2P256MulReductionInline
	MOVQ $0, mul0
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, mul0
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256p<>+0x08(SB), acc5
	SBBQ $-1, acc6
	SBBQ p256p<>+0x018(SB), acc7
	SBBQ $0, mul0
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET

/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2
TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ internalSqrBMI2

	p256SqrInternalInline
	RET

internalSqrBMI2:
	p256SqrInternalInlineAdx
	RET

// The code below is the same as in non-plugin mode.
/* ---------------------------------------*/
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)
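
// For reference, p256PointAddAffineInline computes the standard mixed
// Jacobian+affine addition (all arithmetic mod p):
//
//	Z1Z1 = Z1^2    U2 = X2*Z1Z1       H = U2 - X1
//	Z3 = H*Z1      S2 = Y2*Z1*Z1Z1    R = S2 - Y1
//	X3 = R^2 - H^3 - 2*X1*H^2
//	Y3 = R*(X1*H^2 - X3) - Y1*H^3
//
// y2in is first negated conditionally, so the same sequence implements
// both addition and subtraction of the affine point depending on sign.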

#define p256PointAddAffineInline() \
	\// Store pointer to result
	MOVQ mul0, rptr \
	MOVL t1, sel_save \
	MOVL t2, zero_save \
	\// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4 \
	MOVQ (16*2 + 8*1)(CX), acc5 \
	MOVQ (16*2 + 8*2)(CX), acc6 \
	MOVQ (16*2 + 8*3)(CX), acc7 \
	MOVQ $-1, acc0 \
	MOVQ p256p<>+0x08(SB), acc1 \
	MOVQ $-1, acc2 \
	MOVQ p256p<>+0x018(SB), acc3 \
	XORQ mul0, mul0 \
	\// Speculatively subtract
	SUBQ acc4, acc0 \
	SBBQ acc5, acc1 \
	SBBQ acc6, acc2 \
	SBBQ acc7, acc3 \
	SBBQ $0, mul0 \
	MOVQ acc0, t0 \
	MOVQ acc1, t1 \
	MOVQ acc2, t2 \
	MOVQ acc3, t3 \
	\// Add in case the operand was > p256
	ADDQ $-1, acc0 \
	ADCQ p256p<>+0x08(SB), acc1 \
	ADCQ $-1, acc2 \
	ADCQ p256p<>+0x018(SB), acc3 \
	ADCQ $0, mul0 \ // ZF := 1 if mul0 == 0 after ADC
	CMOVQNE t0, acc0 \ // CMOVQNE: Move if not equal (ZF == 0)
	CMOVQNE t1, acc1 \
	CMOVQNE t2, acc2 \
	CMOVQNE t3, acc3 \
	\// If condition is 0, keep original value
	TESTQ DX, DX \ // ZF := 1 if (DX AND DX == 0)
	CMOVQEQ acc4, acc0 \ // CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ acc5, acc1 \
	CMOVQEQ acc6, acc2 \
	CMOVQEQ acc7, acc3 \
	\// Store result
	MOVQ acc0, y2in(8*0) \
	MOVQ acc1, y2in(8*1) \
	MOVQ acc2, y2in(8*2) \
	MOVQ acc3, y2in(8*3) \
	\// Begin point add
	LDacc (z1in) \
	CALL sm2P256SqrInternal(SB) \// z1ˆ2
	ST (z1sqr) \
	\
	LDt (x2in) \
	CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
	\
	LDt (x1in) \
	p256SubInline2 \// h = u2 - x1
	ST (h) \
	\
	LDt (z1in) \
	CALL sm2P256MulInternal(SB) \// z3 = h * z1
	ST (zout) \
	\
	LDacc (z1sqr) \
	CALL sm2P256MulInternal(SB) \// z1ˆ3
	\
	LDt (y2in) \
	CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3
	ST (s2) \
	\
	LDt (y1in) \
	p256SubInline2 \// r = s2 - y1
	ST (r) \
	\
	CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
	ST (rsqr) \
	\
	LDacc (h) \
	CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2
	ST (hsqr) \
	\
	LDt (h) \
	CALL sm2P256MulInternal(SB) \// hcub = hˆ3
	ST (hcub) \
	\
	LDt (y1in) \
	CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
	ST (s2) \
	\
	LDacc (x1in) \
	LDt (hsqr) \
	CALL sm2P256MulInternal(SB) \// x1 * hˆ2
	ST (h) \
	\
	p256MulBy2Inline \// x1 * hˆ2 * 2, inline
	LDacc (rsqr) \
	p256SubInline2 \// rˆ2 - x1 * hˆ2 * 2
	\
	LDt (hcub) \
	p256SubInline \
	STt (xout) \// xout = rˆ2 - 2 * x1 * hˆ2 - hˆ3
	LDacc (h) \
	p256SubInline2 \
	\
	LDt (r) \
	CALL sm2P256MulInternal(SB) \
	\
	LDt (s2) \
	p256SubInline2 \
	ST (yout) \
	\// Load stored values from stack
	MOVQ rptr, AX \
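
// Both paths of p256PointAddAffineAsm end with a constant-time select.
// Conceptually (an illustrative Go sketch, not part of this file):
//
//	var mask uint64 // all-ones when sel == 0, all-zeros otherwise
//	if sel == 0 {
//		mask = ^uint64(0)
//	}
//	out[i] = computed[i]&^mask | original[i]&mask
//
// PCMPEQL/VPCMPEQD against zero materializes mask without branching,
// PAND/PANDN apply the two halves, and PXOR merges them (an OR of
// values masked with disjoint masks).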

// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), t1
	MOVQ zero+40(FP), t2

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ pointaddaffine_avx2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)

	p256PointAddAffineInline()
	// The result is not valid if (sel == 0): select the original point instead
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	MOVL sel_save, X6  // sel
	MOVL zero_save, X7 // zero

	PXOR X8, X8    // X8's bits are all 0
	PCMPEQL X9, X9 // X9's bits are all 1

	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else all 0
	PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else all 0

	MOVOU X6, X15
	PANDN X9, X15 // X15 = NOT(X6)

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// Similarly if zero == 0
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15 // X15 = NOT(X7)

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	MOVQ $0, rptr

	RET

pointaddaffine_avx2:
	VMOVDQU (32*0)(BX), Y0
	VMOVDQU (32*1)(BX), Y1
	VMOVDQU (32*2)(BX), Y2

	VMOVDQU Y0, x1in(32*0)
	VMOVDQU Y1, y1in(32*0)
	VMOVDQU Y2, z1in(32*0)

	VMOVDQU (32*0)(CX), Y0
	VMOVDQU Y0, x2in(32*0)

	p256PointAddAffineInline()
	// The result is not valid if (sel == 0): select the original point instead
	VPXOR Y8, Y8, Y8           // Y8's bits are all 0
	VPBROADCASTD sel_save, Y6  // sel
	VPBROADCASTD zero_save, Y7 // zero

	VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else all 0
	VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else all 0

	VPANDN xout(32*0), Y6, Y0
	VPANDN yout(32*0), Y6, Y1
	VPANDN zout(32*0), Y6, Y2

	VPAND x1in(32*0), Y6, Y9
	VPAND y1in(32*0), Y6, Y10
	VPAND z1in(32*0), Y6, Y11

	VPXOR Y9, Y0, Y0
	VPXOR Y10, Y1, Y1
	VPXOR Y11, Y2, Y2

	// Similarly if zero == 0
	VPANDN Y0, Y7, Y0
	VPANDN Y1, Y7, Y1
	VPANDN Y2, Y7, Y2

	VPAND x2in(32*0), Y7, Y9
	VPAND y2in(32*0), Y7, Y10
	VPAND p256one<>+0x00(SB), Y7, Y11

	VPXOR Y9, Y0, Y0
	VPXOR Y10, Y1, Y1
	VPXOR Y11, Y2, Y2

	// Finally output the result
	VMOVDQU Y0, (32*0)(AX)
	VMOVDQU Y1, (32*1)(AX)
	VMOVDQU Y2, (32*2)(AX)
	MOVQ $0, rptr

	VZEROUPPER
	RET

#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save

/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)
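
// For reference, p256PointAddInline computes the full Jacobian addition
// (the add-2007-bl shape cited below, all arithmetic mod p):
//
//	Z2Z2 = Z2^2    S1 = Y1*Z2*Z2Z2    U1 = X1*Z2Z2
//	Z1Z1 = Z1^2    S2 = Y2*Z1*Z1Z1    U2 = X2*Z1Z1
//	H = U2 - U1    R = S2 - S1
//	X3 = R^2 - H^3 - 2*U1*H^2
//	Y3 = R*(U1*H^2 - X3) - S1*H^3
//	Z3 = Z1*Z2*H
//
// points_eq is set to 1 only when R == 0 and H == 0, i.e. the two inputs
// represent the same point and the formulas above are degenerate.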

#define p256PointAddInline() \
	\// Begin point add
	LDacc (z2in) \
	CALL sm2P256SqrInternal(SB) \// z2ˆ2
	ST (z2sqr) \
	LDt (z2in) \
	CALL sm2P256MulInternal(SB) \// z2ˆ3
	LDt (y1in) \
	CALL sm2P256MulInternal(SB) \// s1 = z2ˆ3*y1
	ST (s1) \
	\
	LDacc (z1in) \
	CALL sm2P256SqrInternal(SB) \// z1ˆ2
	ST (z1sqr) \
	LDt (z1in) \
	CALL sm2P256MulInternal(SB) \// z1ˆ3
	LDt (y2in) \
	CALL sm2P256MulInternal(SB) \// s2 = z1ˆ3*y2
	ST (s2) \
	\
	LDt (s1) \
	p256SubInline2 \// r = s2 - s1
	ST (r) \
	p256IsZeroInline \
	MOVQ AX, points_eq \
	\
	LDacc (z2sqr) \
	LDt (x1in) \
	CALL sm2P256MulInternal(SB) \// u1 = x1 * z2ˆ2
	ST (u1) \
	LDacc (z1sqr) \
	LDt (x2in) \
	CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
	ST (u2) \
	\
	LDt (u1) \
	p256SubInline2 \// h = u2 - u1
	ST (h) \
	p256IsZeroInline \
	ANDQ points_eq, AX \
	MOVQ AX, points_eq \
	\
	LDacc (r) \
	CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
	ST (rsqr) \
	\
	LDacc (h) \
	CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2
	ST (hsqr) \
	\
	LDt (h) \
	CALL sm2P256MulInternal(SB) \// hcub = hˆ3
	ST (hcub) \
	\
	LDt (s1) \
	CALL sm2P256MulInternal(SB) \// s2 = s1 * hˆ3
	ST (s2) \
	\
	LDacc (z1in) \
	LDt (z2in) \
	CALL sm2P256MulInternal(SB) \// z1 * z2
	LDt (h) \
	CALL sm2P256MulInternal(SB) \// z1 * z2 * h
	ST (zout) \
	\
	LDacc (hsqr) \
	LDt (u1) \
	CALL sm2P256MulInternal(SB) \// hˆ2 * u1
	ST (u2) \
	\
	p256MulBy2Inline \// u1 * hˆ2 * 2, inline
	LDacc (rsqr) \
	p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
	\
	LDt (hcub) \
	p256SubInline \
	STt (xout) \// xout = rˆ2 - 2 * u1 * hˆ2 - hˆ3
	LDacc (u2) \
	p256SubInline2 \
	\
	LDt (r) \
	CALL sm2P256MulInternal(SB) \
	\
	LDt (s2) \
	p256SubInline2 \
	ST (yout) \

//func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
TEXT ·p256PointAddAsm(SB),0,$680-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ pointadd_avx2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	p256PointAddInline()

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	RET

pointadd_avx2:
	VMOVDQU (32*0)(BX), Y0
	VMOVDQU (32*1)(BX), Y1
	VMOVDQU (32*2)(BX), Y2

	VMOVDQU Y0, x1in(32*0)
	VMOVDQU Y1, y1in(32*0)
	VMOVDQU Y2, z1in(32*0)

	VMOVDQU (32*0)(CX), Y0
	VMOVDQU (32*1)(CX), Y1
	VMOVDQU (32*2)(CX), Y2

	VMOVDQU Y0, x2in(32*0)
	VMOVDQU Y1, y2in(32*0)
	VMOVDQU Y2, z2in(32*0)

	// Store pointer to result
	MOVQ AX, rptr
	p256PointAddInline()

	VMOVDQU xout(32*0), Y0
	VMOVDQU yout(32*0), Y1
	VMOVDQU zout(32*0), Y2
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	VMOVDQU Y0, (32*0)(AX)
	VMOVDQU Y1, (32*1)(AX)
	VMOVDQU Y2, (32*2)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	VZEROUPPER
	RET
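
// The return value is points_eq, so a caller is expected to fall back to
// doubling when the inputs coincide, along the lines of (an illustrative
// Go sketch, not part of this file):
//
//	if p256PointAddAsm(res, in1, in2) == 1 {
//		p256PointDoubleAsm(res, in1)
//	}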

#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

#define calZ() \
	LDacc (z) \
	CALL sm2P256SqrInternal(SB) \
	ST (zsqr) \ // ZZ = Z1^2
	\
	LDt (x) \
	p256AddInline \
	STt (m) \ // M = ZZ + X1
	\
	LDacc (z) \
	LDt (y) \
	CALL sm2P256MulInternal(SB) \ // Z1 * Y1
	p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2

#define calX() \
	LDacc (x) \
	LDt (zsqr) \
	p256SubInline2 \ // X1 - ZZ
	LDt (m) \
	CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
	ST (m) \
	\// Multiply by 3
	p256TripleInline \
	STt (m) \ // M = 3 * (X1^2 - ZZ^2)
	\////////////////////////
	LDacc (y) \
	p256MulBy2Inline2 \
	CALL sm2P256SqrInternal(SB) \ // 4 * YY = (2*Y1)^2
	ST (s) \ // S = 4 * YY
	CALL sm2P256SqrInternal(SB) \ // (4 * YY)^2 = 16 * YYYY
	\// Divide by 2
	XORQ mul0, mul0 \
	MOVQ acc4, t0 \
	MOVQ acc5, t1 \
	MOVQ acc6, t2 \
	MOVQ acc7, t3 \
	\ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
	ADDQ $-1, acc4 \
	ADCQ p256p<>+0x08(SB), acc5 \
	ADCQ $-1, acc6 \
	ADCQ p256p<>+0x018(SB), acc7 \
	ADCQ $0, mul0 \
	TESTQ $1, t0 \ // ZF := 1 if (t0 AND 1 == 0)
	\ // CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ t0, acc4 \ // acc4 := t0 if (ZF == 1)
	CMOVQEQ t1, acc5 \ // acc5 := t1 if (ZF == 1)
	CMOVQEQ t2, acc6 \ // acc6 := t2 if (ZF == 1)
	CMOVQEQ t3, acc7 \ // acc7 := t3 if (ZF == 1)
	ANDQ t0, mul0 \ // mul0 := t0 AND mul0 (drop the carry if the original value was even)
	\ // Divide the even value by 2
	SHRQ $1, acc5, acc4 \ // acc4 := acc4 >> 1 | acc5 << 63
	SHRQ $1, acc6, acc5 \ // acc5 := acc5 >> 1 | acc6 << 63
	SHRQ $1, acc7, acc6 \ // acc6 := acc6 >> 1 | acc7 << 63
	SHRQ $1, mul0, acc7 \ // acc7 := acc7 >> 1 | mul0 << 63
	ST (y) \ // y = 8 * YYYY
	\/////////////////////////
	LDacc (x) \
	LDt (s) \
	CALL sm2P256MulInternal(SB) \ // X1 * 4 * YY
	ST (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
	p256MulBy2Inline \
	STt (tmp) \ // tmp = 2*S = 8 * X1 * YY
	\
	LDacc (m) \
	CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
	LDt (tmp) \
	p256SubInline2 \ // X3 = M^2 - 2*S

#define calY() \
	acc2t \
	LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
	p256SubInline2 \ // S - X3
	\
	LDt (m) \
	CALL sm2P256MulInternal(SB) \ // M * (S - X3)
	\
	LDt (y) \
	p256SubInline2 \ // Y3 = M * (S - X3) - 8 * YYYY

#define lastP256PointDouble() \
	\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
	calZ() \
	MOVQ rptr, AX \
	\// Store z
	MOVQ t0, (16*4 + 8*0)(AX) \
	MOVQ t1, (16*4 + 8*1)(AX) \
	MOVQ t2, (16*4 + 8*2)(AX) \
	MOVQ t3, (16*4 + 8*3)(AX) \
	\
	calX() \
	MOVQ rptr, AX \
	\// Store x
	MOVQ acc4, (16*0 + 8*0)(AX) \
	MOVQ acc5, (16*0 + 8*1)(AX) \
	MOVQ acc6, (16*0 + 8*2)(AX) \
	MOVQ acc7, (16*0 + 8*3)(AX) \
	\
	calY() \
	MOVQ rptr, AX \
	\// Store y
	MOVQ acc4, (16*2 + 8*0)(AX) \
	MOVQ acc5, (16*2 + 8*1)(AX) \
	MOVQ acc6, (16*2 + 8*2)(AX) \
	MOVQ acc7, (16*2 + 8*3)(AX) \
	\///////////////////////
	MOVQ $0, rptr \
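
// For reference, calZ/calX/calY together compute the a = -3 Jacobian
// doubling (the dbl-2007-bl shape cited above, all arithmetic mod p):
//
//	ZZ = Z1^2    M = 3*(X1 - ZZ)*(X1 + ZZ)    S = 4*X1*Y1^2
//	X3 = M^2 - 2*S
//	Y3 = M*(S - X3) - 8*Y1^4
//	Z3 = 2*Y1*Z1
//
// The "divide by 2" block in calX halves (4*Y1^2)^2 mod p without
// branching: it speculatively adds p (odd + odd = even), keeps the
// original value if that was already even, then shifts the 257-bit
// result right by one, pulling the carry back in from the top.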

//func p256PointDoubleAsm(res, in *SM2P256Point)
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	p256PointDoubleInit()
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	lastP256PointDouble()

	RET

#define storeTmpX() \
	MOVQ acc4, x(8*0) \
	MOVQ acc5, x(8*1) \
	MOVQ acc6, x(8*2) \
	MOVQ acc7, x(8*3) \

#define storeTmpY() \
	MOVQ acc4, y(8*0) \
	MOVQ acc5, y(8*1) \
	MOVQ acc6, y(8*2) \
	MOVQ acc7, y(8*3) \

#define storeTmpZ() \
	MOVQ t0, z(8*0) \
	MOVQ t1, z(8*1) \
	MOVQ t2, z(8*2) \
	MOVQ t3, z(8*3) \

#define p256PointDoubleRound() \
	calZ() \
	storeTmpZ() \
	calX() \
	storeTmpX() \
	calY() \
	storeTmpY() \

// p256PointDouble6TimesAsm doubles the input point six times in a row,
// keeping the intermediate point on the stack between rounds.
//
//func p256PointDouble6TimesAsm(res, in *SM2P256Point)
TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$256-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	p256PointDoubleInit()
	// Store pointer to result
	MOVQ AX, rptr

	// point double rounds 1-5
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()

	// last point double round, writing the result to rptr
	lastP256PointDouble()

	RET
/* ---------------------------------------*/