github.com/hellobchain/newcryptosm@v0.0.0-20221019060107-edb949a317e9/sm2/sm2_asm_amd64.s (about) 1 2 #include "textflag.h" 3 4 #define res_ptr DI 5 #define x_ptr SI 6 #define y_ptr CX 7 8 #define acc0 R8 9 #define acc1 R9 10 #define acc2 R10 11 #define acc3 R11 12 #define acc4 R12 13 #define acc5 R13 14 #define t0 R14 15 #define t1 R15 16 17 //sm2 fffffffe ffffffff ffffffff ffffffff 18 // ffffffff 00000000 ffffffff ffffffff 19 DATA sm2const0<>+0x00(SB)/8, $0xffffffff00000000 20 DATA sm2const1<>+0x00(SB)/8, $0xfffffffeffffffff 21 DATA sm2ordK0<>+0x00(SB)/8, $0x327f9e8872350975 22 DATA sm2ord<>+0x00(SB)/8, $0x53bbf40939d54123 23 DATA sm2ord<>+0x08(SB)/8, $0x7203df6b21c6052b 24 DATA sm2ord<>+0x10(SB)/8, $0xffffffffffffffff 25 DATA sm2ord<>+0x18(SB)/8, $0xfffffffeffffffff 26 DATA sm2one<>+0x00(SB)/8, $0x0000000000000001 27 DATA sm2one<>+0x08(SB)/8, $0x00000000ffffffff 28 DATA sm2one<>+0x10(SB)/8, $0x0000000000000000 29 DATA sm2one<>+0x18(SB)/8, $0x0000000100000000 30 GLOBL sm2const0<>(SB), 8, $8 31 GLOBL sm2const1<>(SB), 8, $8 32 GLOBL sm2ordK0<>(SB), 8, $8 33 GLOBL sm2ord<>(SB), 8, $32 34 GLOBL sm2one<>(SB), 8, $32 35 36 /* ---------------------------------------*/ 37 // func sm2LittleToBig(res []byte, in []uint64) 38 TEXT ·sm2LittleToBig(SB),NOSPLIT,$0 39 JMP ·sm2BigToLittle(SB) 40 /* ---------------------------------------*/ 41 // func sm2BigToLittle(res []uint64, in []byte) 42 TEXT ·sm2BigToLittle(SB),NOSPLIT,$0 43 MOVQ res+0(FP), res_ptr 44 MOVQ in+24(FP), x_ptr 45 46 MOVQ (8*0)(x_ptr), acc0 47 MOVQ (8*1)(x_ptr), acc1 48 MOVQ (8*2)(x_ptr), acc2 49 MOVQ (8*3)(x_ptr), acc3 50 51 BSWAPQ acc0 52 BSWAPQ acc1 53 BSWAPQ acc2 54 BSWAPQ acc3 55 56 MOVQ acc3, (8*0)(res_ptr) 57 MOVQ acc2, (8*1)(res_ptr) 58 MOVQ acc1, (8*2)(res_ptr) 59 MOVQ acc0, (8*3)(res_ptr) 60 61 RET 62 /* ---------------------------------------*/ 63 // func sm2MovCond(res, a, b []uint64, cond int) 64 // If cond == 0 res=b, else res=a 65 TEXT ·sm2MovCond(SB),NOSPLIT,$0 66 MOVQ res+0(FP), res_ptr 67 MOVQ 
a+24(FP), x_ptr 68 MOVQ b+48(FP), y_ptr 69 MOVQ cond+72(FP), X12 70 71 PXOR X13, X13 72 PSHUFD $0, X12, X12 73 PCMPEQL X13, X12 74 75 MOVOU X12, X0 76 MOVOU (16*0)(x_ptr), X6 77 PANDN X6, X0 78 MOVOU X12, X1 79 MOVOU (16*1)(x_ptr), X7 80 PANDN X7, X1 81 MOVOU X12, X2 82 MOVOU (16*2)(x_ptr), X8 83 PANDN X8, X2 84 MOVOU X12, X3 85 MOVOU (16*3)(x_ptr), X9 86 PANDN X9, X3 87 MOVOU X12, X4 88 MOVOU (16*4)(x_ptr), X10 89 PANDN X10, X4 90 MOVOU X12, X5 91 MOVOU (16*5)(x_ptr), X11 92 PANDN X11, X5 93 94 MOVOU (16*0)(y_ptr), X6 95 MOVOU (16*1)(y_ptr), X7 96 MOVOU (16*2)(y_ptr), X8 97 MOVOU (16*3)(y_ptr), X9 98 MOVOU (16*4)(y_ptr), X10 99 MOVOU (16*5)(y_ptr), X11 100 101 PAND X12, X6 102 PAND X12, X7 103 PAND X12, X8 104 PAND X12, X9 105 PAND X12, X10 106 PAND X12, X11 107 108 PXOR X6, X0 109 PXOR X7, X1 110 PXOR X8, X2 111 PXOR X9, X3 112 PXOR X10, X4 113 PXOR X11, X5 114 115 MOVOU X0, (16*0)(res_ptr) 116 MOVOU X1, (16*1)(res_ptr) 117 MOVOU X2, (16*2)(res_ptr) 118 MOVOU X3, (16*3)(res_ptr) 119 MOVOU X4, (16*4)(res_ptr) 120 MOVOU X5, (16*5)(res_ptr) 121 122 RET 123 /* ---------------------------------------*/ 124 // func sm2NegCond(val []uint64, cond int) 125 TEXT ·sm2NegCond(SB),NOSPLIT,$0 126 MOVQ val+0(FP), res_ptr 127 MOVQ cond+24(FP), t0 128 // acc = poly 129 MOVQ $-1, acc0 130 MOVQ sm2const0<>(SB), acc1 131 MOVQ $-1, acc2 132 MOVQ sm2const1<>(SB), acc3 133 // Load the original value 134 MOVQ (8*0)(res_ptr), acc5 135 MOVQ (8*1)(res_ptr), x_ptr 136 MOVQ (8*2)(res_ptr), y_ptr 137 MOVQ (8*3)(res_ptr), t1 138 // Speculatively subtract 139 SUBQ acc5, acc0 140 SBBQ x_ptr, acc1 141 SBBQ y_ptr, acc2 142 SBBQ t1, acc3 143 // If condition is 0, keep original value 144 TESTQ t0, t0 145 CMOVQEQ acc5, acc0 146 CMOVQEQ x_ptr, acc1 147 CMOVQEQ y_ptr, acc2 148 CMOVQEQ t1, acc3 149 // Store result 150 MOVQ acc0, (8*0)(res_ptr) 151 MOVQ acc1, (8*1)(res_ptr) 152 MOVQ acc2, (8*2)(res_ptr) 153 MOVQ acc3, (8*3)(res_ptr) 154 155 RET 156 /* ---------------------------------------*/ 157 // func 
sm2Sqr(res, in []uint64, n int) 158 TEXT ·sm2Sqr(SB),NOSPLIT,$0 159 MOVQ res+0(FP), res_ptr 160 MOVQ in+24(FP), x_ptr 161 MOVQ n+48(FP), BX 162 163 sqrLoop: 164 165 // y[1:] * y[0] 166 MOVQ (8*0)(x_ptr), t0 167 168 MOVQ (8*1)(x_ptr), AX 169 MULQ t0 170 MOVQ AX, acc1 171 MOVQ DX, acc2 172 173 MOVQ (8*2)(x_ptr), AX 174 MULQ t0 175 ADDQ AX, acc2 176 ADCQ $0, DX 177 MOVQ DX, acc3 178 179 MOVQ (8*3)(x_ptr), AX 180 MULQ t0 181 ADDQ AX, acc3 182 ADCQ $0, DX 183 MOVQ DX, acc4 184 // y[2:] * y[1] 185 MOVQ (8*1)(x_ptr), t0 186 187 MOVQ (8*2)(x_ptr), AX 188 MULQ t0 189 ADDQ AX, acc3 190 ADCQ $0, DX 191 MOVQ DX, t1 192 193 MOVQ (8*3)(x_ptr), AX 194 MULQ t0 195 ADDQ t1, acc4 196 ADCQ $0, DX 197 ADDQ AX, acc4 198 ADCQ $0, DX 199 MOVQ DX, acc5 200 // y[3] * y[2] 201 MOVQ (8*2)(x_ptr), t0 202 203 MOVQ (8*3)(x_ptr), AX 204 MULQ t0 205 ADDQ AX, acc5 206 ADCQ $0, DX 207 MOVQ DX, y_ptr 208 XORQ t1, t1 209 // *2 210 ADDQ acc1, acc1 211 ADCQ acc2, acc2 212 ADCQ acc3, acc3 213 ADCQ acc4, acc4 214 ADCQ acc5, acc5 215 ADCQ y_ptr, y_ptr 216 ADCQ $0, t1 217 // Missing products 218 MOVQ (8*0)(x_ptr), AX 219 MULQ AX 220 MOVQ AX, acc0 221 MOVQ DX, t0 222 223 MOVQ (8*1)(x_ptr), AX 224 MULQ AX 225 ADDQ t0, acc1 226 ADCQ AX, acc2 227 ADCQ $0, DX 228 MOVQ DX, t0 229 230 MOVQ (8*2)(x_ptr), AX 231 MULQ AX 232 ADDQ t0, acc3 233 ADCQ AX, acc4 234 ADCQ $0, DX 235 MOVQ DX, t0 236 237 MOVQ (8*3)(x_ptr), AX 238 MULQ AX 239 ADDQ t0, acc5 240 ADCQ AX, y_ptr 241 ADCQ DX, t1 242 MOVQ t1, x_ptr 243 // First reduction step 244 MOVQ acc0, AX 245 MOVQ acc0, t1 246 ADDQ acc0, acc1 247 ADCQ $0, acc2 248 ADCQ $0, acc3 249 ADCQ $0, AX 250 SHLQ $32, t1 251 SHRQ $32, acc0 252 SUBQ t1, acc1 253 SBBQ acc0, acc2 254 SBBQ t1, acc3 255 SBBQ acc0, AX 256 MOVQ AX, acc0 257 258 // Second reduction step 259 MOVQ acc1, AX 260 MOVQ acc1, t1 261 ADDQ acc1, acc2 262 ADCQ $0, acc3 263 ADCQ $0, acc0 264 ADCQ $0, AX 265 SHLQ $32, t1 266 SHRQ $32, acc1 267 SUBQ t1, acc2 268 SBBQ acc1, acc3 269 SBBQ t1, acc0 270 SBBQ acc1, AX 271 MOVQ 
AX, acc1 272 273 // Third reduction step 274 MOVQ acc2, AX 275 MOVQ acc2, t1 276 ADDQ acc2, acc3 277 ADCQ $0, acc0 278 ADCQ $0, acc1 279 ADCQ $0, AX 280 SHLQ $32, t1 281 SHRQ $32, acc2 282 SUBQ t1, acc3 283 SBBQ acc2, acc0 284 SBBQ t1, acc1 285 SBBQ acc2, AX 286 MOVQ AX, acc2 287 288 // Last reduction step 289 XORQ t0, t0 290 MOVQ acc3, AX 291 MOVQ acc3, t1 292 ADDQ acc3, acc0 293 ADCQ $0, acc1 294 ADCQ $0, acc2 295 ADCQ $0, AX 296 SHLQ $32, t1 297 SHRQ $32, acc3 298 SUBQ t1, acc0 299 SBBQ acc3, acc1 300 SBBQ t1, acc2 301 SBBQ acc3, AX 302 MOVQ AX, acc3 303 304 // Add bits [511:256] of the sqr result 305 ADCQ acc4, acc0 306 ADCQ acc5, acc1 307 ADCQ y_ptr, acc2 308 ADCQ x_ptr, acc3 309 ADCQ $0, t0 310 311 MOVQ acc0, acc4 312 MOVQ acc1, acc5 313 MOVQ acc2, y_ptr 314 MOVQ acc3, t1 315 // Subtract sm2-p 316 SUBQ $-1, acc0 317 SBBQ sm2const0<>(SB) ,acc1 318 SBBQ $-1, acc2 319 SBBQ sm2const1<>(SB), acc3 320 SBBQ $0, t0 321 322 CMOVQCS acc4, acc0 323 CMOVQCS acc5, acc1 324 CMOVQCS y_ptr, acc2 325 CMOVQCS t1, acc3 326 327 MOVQ acc0, (8*0)(res_ptr) 328 MOVQ acc1, (8*1)(res_ptr) 329 MOVQ acc2, (8*2)(res_ptr) 330 MOVQ acc3, (8*3)(res_ptr) 331 MOVQ res_ptr, x_ptr 332 DECQ BX 333 JNE sqrLoop 334 335 RET 336 /* ---------------------------------------*/ 337 // func sm2Mul(res, in1, in2 []uint64) 338 TEXT ·sm2Mul(SB),NOSPLIT,$0 339 MOVQ res+0(FP), res_ptr 340 MOVQ in1+24(FP), x_ptr 341 MOVQ in2+48(FP), y_ptr 342 // x * y[0] 343 MOVQ (8*0)(y_ptr), t0 344 345 MOVQ (8*0)(x_ptr), AX 346 MULQ t0 347 MOVQ AX, acc0 348 MOVQ DX, acc1 349 350 MOVQ (8*1)(x_ptr), AX 351 MULQ t0 352 ADDQ AX, acc1 353 ADCQ $0, DX 354 MOVQ DX, acc2 355 356 MOVQ (8*2)(x_ptr), AX 357 MULQ t0 358 ADDQ AX, acc2 359 ADCQ $0, DX 360 MOVQ DX, acc3 361 362 MOVQ (8*3)(x_ptr), AX 363 MULQ t0 364 ADDQ AX, acc3 365 ADCQ $0, DX 366 MOVQ DX, acc4 367 XORQ acc5, acc5 368 // First reduction step 369 MOVQ acc0, t1 370 ADDQ acc0, acc1 371 ADCQ $0, acc2 372 ADCQ $0, acc3 373 ADCQ acc0, acc4 374 ADCQ $0, acc5 375 SHLQ $32, t1 376 
SHRQ $32, acc0 377 SUBQ t1, acc1 378 SBBQ acc0, acc2 379 SBBQ t1, acc3 380 SBBQ acc0, acc4 381 SBBQ $0, acc5 382 XORQ acc0, acc0 383 // x * y[1] 384 MOVQ (8*1)(y_ptr), t0 385 386 MOVQ (8*0)(x_ptr), AX 387 MULQ t0 388 ADDQ AX, acc1 389 ADCQ $0, DX 390 MOVQ DX, t1 391 392 MOVQ (8*1)(x_ptr), AX 393 MULQ t0 394 ADDQ t1, acc2 395 ADCQ $0, DX 396 ADDQ AX, acc2 397 ADCQ $0, DX 398 MOVQ DX, t1 399 400 MOVQ (8*2)(x_ptr), AX 401 MULQ t0 402 ADDQ t1, acc3 403 ADCQ $0, DX 404 ADDQ AX, acc3 405 ADCQ $0, DX 406 MOVQ DX, t1 407 408 MOVQ (8*3)(x_ptr), AX 409 MULQ t0 410 ADDQ t1, acc4 411 ADCQ $0, DX 412 ADDQ AX, acc4 413 ADCQ DX, acc5 414 ADCQ $0, acc0 415 // Second reduction step 416 MOVQ acc1, t1 417 ADDQ acc1, acc2 418 ADCQ $0, acc3 419 ADCQ $0, acc4 420 ADCQ acc1, acc5 421 ADCQ $0, acc0 422 SHLQ $32, t1 423 SHRQ $32, acc1 424 SUBQ t1, acc2 425 SBBQ acc1, acc3 426 SBBQ t1, acc4 427 SBBQ acc1, acc5 428 SBBQ $0, acc0 429 XORQ acc1, acc1 430 // x * y[2] 431 MOVQ (8*2)(y_ptr), t0 432 433 MOVQ (8*0)(x_ptr), AX 434 MULQ t0 435 ADDQ AX, acc2 436 ADCQ $0, DX 437 MOVQ DX, t1 438 439 MOVQ (8*1)(x_ptr), AX 440 MULQ t0 441 ADDQ t1, acc3 442 ADCQ $0, DX 443 ADDQ AX, acc3 444 ADCQ $0, DX 445 MOVQ DX, t1 446 447 MOVQ (8*2)(x_ptr), AX 448 MULQ t0 449 ADDQ t1, acc4 450 ADCQ $0, DX 451 ADDQ AX, acc4 452 ADCQ $0, DX 453 MOVQ DX, t1 454 455 MOVQ (8*3)(x_ptr), AX 456 MULQ t0 457 ADDQ t1, acc5 458 ADCQ $0, DX 459 ADDQ AX, acc5 460 ADCQ DX, acc0 461 ADCQ $0, acc1 462 // Third reduction step 463 MOVQ acc2, t1 464 ADDQ acc2, acc3 465 ADCQ $0, acc4 466 ADCQ $0, acc5 467 ADCQ acc2, acc0 468 ADCQ $0, acc1 469 SHLQ $32, t1 470 SHRQ $32, acc2 471 SUBQ t1, acc3 472 SBBQ acc2, acc4 473 SBBQ t1, acc5 474 SBBQ acc2, acc0 475 SBBQ $0, acc1 476 XORQ acc2, acc2 477 // x * y[3] 478 MOVQ (8*3)(y_ptr), t0 479 480 MOVQ (8*0)(x_ptr), AX 481 MULQ t0 482 ADDQ AX, acc3 483 ADCQ $0, DX 484 MOVQ DX, t1 485 486 MOVQ (8*1)(x_ptr), AX 487 MULQ t0 488 ADDQ t1, acc4 489 ADCQ $0, DX 490 ADDQ AX, acc4 491 ADCQ $0, DX 492 MOVQ DX, 
t1 493 494 MOVQ (8*2)(x_ptr), AX 495 MULQ t0 496 ADDQ t1, acc5 497 ADCQ $0, DX 498 ADDQ AX, acc5 499 ADCQ $0, DX 500 MOVQ DX, t1 501 502 MOVQ (8*3)(x_ptr), AX 503 MULQ t0 504 ADDQ t1, acc0 505 ADCQ $0, DX 506 ADDQ AX, acc0 507 ADCQ DX, acc1 508 ADCQ $0, acc2 509 // Last reduction step 510 MOVQ acc3, t1 511 ADDQ acc3, acc4 512 ADCQ $0, acc5 513 ADCQ $0, acc0 514 ADCQ acc3, acc1 515 ADCQ $0, acc2 516 SHLQ $32, t1 517 SHRQ $32, acc3 518 SUBQ t1, acc4 519 SBBQ acc3, acc5 520 SBBQ t1, acc0 521 SBBQ acc3, acc1 522 SBBQ $0, acc2 523 // Copy result [255:0] 524 MOVQ acc4, x_ptr 525 MOVQ acc5, acc3 526 MOVQ acc0, t0 527 MOVQ acc1, t1 528 // Subtract sm2-p 529 SUBQ $-1, acc4 530 SBBQ sm2const0<>(SB) ,acc5 531 SBBQ $-1, acc0 532 SBBQ sm2const1<>(SB), acc1 533 SBBQ $0, acc2 534 535 CMOVQCS x_ptr, acc4 536 CMOVQCS acc3, acc5 537 CMOVQCS t0, acc0 538 CMOVQCS t1, acc1 539 540 MOVQ acc4, (8*0)(res_ptr) 541 MOVQ acc5, (8*1)(res_ptr) 542 MOVQ acc0, (8*2)(res_ptr) 543 MOVQ acc1, (8*3)(res_ptr) 544 545 RET 546 /* ---------------------------------------*/ 547 // func sm2FromMont(res, in []uint64) 548 TEXT ·sm2FromMont(SB),NOSPLIT,$0 549 MOVQ res+0(FP), res_ptr 550 MOVQ in+24(FP), x_ptr 551 552 MOVQ (8*0)(x_ptr), acc0 553 MOVQ (8*1)(x_ptr), acc1 554 MOVQ (8*2)(x_ptr), acc2 555 MOVQ (8*3)(x_ptr), acc3 556 XORQ acc4, acc4 557 558 // Only reduce, no multiplications are needed 559 // First stage 560 MOVQ acc0, t1 561 ADDQ acc0, acc1 562 ADCQ $0, acc2 563 ADCQ $0, acc3 564 ADCQ acc0, acc4 565 SHLQ $32, t1 566 SHRQ $32, acc0 567 SUBQ t1, acc1 568 SBBQ acc0, acc2 569 SBBQ t1, acc3 570 SBBQ acc0, acc4 571 XORQ acc5, acc5 572 // Second stage 573 MOVQ acc1, t1 574 ADDQ acc1, acc2 575 ADCQ $0, acc3 576 ADCQ $0, acc4 577 ADCQ acc1, acc5 578 SHLQ $32, t1 579 SHRQ $32, acc1 580 SUBQ t1, acc2 581 SBBQ acc1, acc3 582 SBBQ t1, acc4 583 SBBQ acc1, acc5 584 XORQ acc0, acc0 585 // Third stage 586 MOVQ acc2, t1 587 ADDQ acc2, acc3 588 ADCQ $0, acc4 589 ADCQ $0, acc5 590 ADCQ acc2, acc0 591 SHLQ $32, t1 592 
SHRQ $32, acc2 593 SUBQ t1, acc3 594 SBBQ acc2, acc4 595 SBBQ t1, acc5 596 SBBQ acc2, acc0 597 XORQ acc1, acc1 598 // Last stage 599 MOVQ acc3, t1 600 ADDQ acc3, acc4 601 ADCQ $0, acc5 602 ADCQ $0, acc0 603 ADCQ acc3, acc1 604 SHLQ $32, t1 605 SHRQ $32, acc3 606 SUBQ t1, acc4 607 SBBQ acc3, acc5 608 SBBQ t1, acc0 609 SBBQ acc3, acc1 610 611 MOVQ acc4, x_ptr 612 MOVQ acc5, acc3 613 MOVQ acc0, t0 614 MOVQ acc1, t1 615 616 SUBQ $-1, acc4 617 SBBQ sm2const0<>(SB), acc5 618 SBBQ $-1, acc0 619 SBBQ sm2const1<>(SB), acc1 620 621 CMOVQCS x_ptr, acc4 622 CMOVQCS acc3, acc5 623 CMOVQCS t0, acc0 624 CMOVQCS t1, acc1 625 626 MOVQ acc4, (8*0)(res_ptr) 627 MOVQ acc5, (8*1)(res_ptr) 628 MOVQ acc0, (8*2)(res_ptr) 629 MOVQ acc1, (8*3)(res_ptr) 630 631 RET 632 /* ---------------------------------------*/ 633 // Constant time point access to arbitrary point table. 634 // Indexed from 1 to 15, with -1 offset 635 // (index 0 is implicitly point at infinity) 636 // func sm2Select(point, table []uint64, idx int) 637 TEXT ·sm2Select(SB),NOSPLIT,$0 638 MOVQ idx+48(FP),AX 639 MOVQ table+24(FP),DI 640 MOVQ point+0(FP),DX 641 642 PXOR X15, X15 // X15 = 0 643 PCMPEQL X14, X14 // X14 = -1 644 PSUBL X14, X15 // X15 = 1 645 MOVL AX, X14 646 PSHUFD $0, X14, X14 647 648 PXOR X0, X0 649 PXOR X1, X1 650 PXOR X2, X2 651 PXOR X3, X3 652 PXOR X4, X4 653 PXOR X5, X5 654 MOVQ $16, AX 655 656 MOVOU X15, X13 657 658 loop_select: 659 660 MOVOU X13, X12 661 PADDL X15, X13 662 PCMPEQL X14, X12 663 664 MOVOU (16*0)(DI), X6 665 MOVOU (16*1)(DI), X7 666 MOVOU (16*2)(DI), X8 667 MOVOU (16*3)(DI), X9 668 MOVOU (16*4)(DI), X10 669 MOVOU (16*5)(DI), X11 670 ADDQ $(16*6), DI 671 672 PAND X12, X6 673 PAND X12, X7 674 PAND X12, X8 675 PAND X12, X9 676 PAND X12, X10 677 PAND X12, X11 678 679 PXOR X6, X0 680 PXOR X7, X1 681 PXOR X8, X2 682 PXOR X9, X3 683 PXOR X10, X4 684 PXOR X11, X5 685 686 DECQ AX 687 JNE loop_select 688 689 MOVOU X0, (16*0)(DX) 690 MOVOU X1, (16*1)(DX) 691 MOVOU X2, (16*2)(DX) 692 MOVOU X3, (16*3)(DX) 
693 MOVOU X4, (16*4)(DX) 694 MOVOU X5, (16*5)(DX) 695 696 RET 697 /* ---------------------------------------*/ 698 // Constant time point access to base point table. 699 // func sm2SelectBase(point, table []uint64, idx int) 700 TEXT ·sm2SelectBase(SB),NOSPLIT,$0 701 MOVQ idx+48(FP),AX 702 MOVQ table+24(FP),DI 703 MOVQ point+0(FP),DX 704 705 PXOR X15, X15 // X15 = 0 706 PCMPEQL X14, X14 // X14 = -1 707 PSUBL X14, X15 // X15 = 1 708 MOVL AX, X14 709 PSHUFD $0, X14, X14 710 711 PXOR X0, X0 712 PXOR X1, X1 713 PXOR X2, X2 714 PXOR X3, X3 715 MOVQ $16, AX 716 717 MOVOU X15, X13 718 719 loop_select_base: 720 721 MOVOU X13, X12 722 PADDL X15, X13 723 PCMPEQL X14, X12 724 725 MOVOU (16*0)(DI), X4 726 MOVOU (16*1)(DI), X5 727 MOVOU (16*2)(DI), X6 728 MOVOU (16*3)(DI), X7 729 730 MOVOU (16*4)(DI), X8 731 MOVOU (16*5)(DI), X9 732 MOVOU (16*6)(DI), X10 733 MOVOU (16*7)(DI), X11 734 735 ADDQ $(16*8), DI 736 737 PAND X12, X4 738 PAND X12, X5 739 PAND X12, X6 740 PAND X12, X7 741 742 MOVOU X13, X12 743 PADDL X15, X13 744 PCMPEQL X14, X12 745 746 PAND X12, X8 747 PAND X12, X9 748 PAND X12, X10 749 PAND X12, X11 750 751 PXOR X4, X0 752 PXOR X5, X1 753 PXOR X6, X2 754 PXOR X7, X3 755 756 PXOR X8, X0 757 PXOR X9, X1 758 PXOR X10, X2 759 PXOR X11, X3 760 761 DECQ AX 762 JNE loop_select_base 763 764 MOVOU X0, (16*0)(DX) 765 MOVOU X1, (16*1)(DX) 766 MOVOU X2, (16*2)(DX) 767 MOVOU X3, (16*3)(DX) 768 769 RET 770 /* ---------------------------------------*/ 771 // func sm2OrdMul(res, in1, in2 []uint64) 772 TEXT ·sm2OrdMul(SB),NOSPLIT,$0 773 MOVQ res+0(FP), res_ptr 774 MOVQ in1+24(FP), x_ptr 775 MOVQ in2+48(FP), y_ptr 776 // x * y[0] 777 MOVQ (8*0)(y_ptr), t0 778 779 MOVQ (8*0)(x_ptr), AX 780 MULQ t0 781 MOVQ AX, acc0 782 MOVQ DX, acc1 783 784 MOVQ (8*1)(x_ptr), AX 785 MULQ t0 786 ADDQ AX, acc1 787 ADCQ $0, DX 788 MOVQ DX, acc2 789 790 MOVQ (8*2)(x_ptr), AX 791 MULQ t0 792 ADDQ AX, acc2 793 ADCQ $0, DX 794 MOVQ DX, acc3 795 796 MOVQ (8*3)(x_ptr), AX 797 MULQ t0 798 ADDQ AX, acc3 799 ADCQ 
$0, DX 800 MOVQ DX, acc4 801 XORQ acc5, acc5 802 // First reduction step 803 MOVQ acc0, AX 804 MULQ sm2ordK0<>(SB) 805 MOVQ AX, t0 806 807 MOVQ sm2ord<>+0x00(SB), AX 808 MULQ t0 809 ADDQ AX, acc0 810 ADCQ $0, DX 811 MOVQ DX, t1 812 813 MOVQ sm2ord<>+0x08(SB), AX 814 MULQ t0 815 ADDQ t1, acc1 816 ADCQ $0, DX 817 ADDQ AX, acc1 818 ADCQ $0, DX 819 MOVQ DX, t1 820 821 MOVQ sm2ord<>+0x10(SB), AX 822 MULQ t0 823 ADDQ t1, acc2 824 ADCQ $0, DX 825 ADDQ AX, acc2 826 ADCQ $0, DX 827 MOVQ DX, t1 828 829 MOVQ sm2ord<>+0x18(SB), AX 830 MULQ t0 831 ADDQ t1, acc3 832 ADCQ $0, DX 833 ADDQ AX, acc3 834 ADCQ DX, acc4 835 ADCQ $0, acc5 836 // x * y[1] 837 MOVQ (8*1)(y_ptr), t0 838 839 MOVQ (8*0)(x_ptr), AX 840 MULQ t0 841 ADDQ AX, acc1 842 ADCQ $0, DX 843 MOVQ DX, t1 844 845 MOVQ (8*1)(x_ptr), AX 846 MULQ t0 847 ADDQ t1, acc2 848 ADCQ $0, DX 849 ADDQ AX, acc2 850 ADCQ $0, DX 851 MOVQ DX, t1 852 853 MOVQ (8*2)(x_ptr), AX 854 MULQ t0 855 ADDQ t1, acc3 856 ADCQ $0, DX 857 ADDQ AX, acc3 858 ADCQ $0, DX 859 MOVQ DX, t1 860 861 MOVQ (8*3)(x_ptr), AX 862 MULQ t0 863 ADDQ t1, acc4 864 ADCQ $0, DX 865 ADDQ AX, acc4 866 ADCQ DX, acc5 867 ADCQ $0, acc0 868 // Second reduction step 869 MOVQ acc1, AX 870 MULQ sm2ordK0<>(SB) 871 MOVQ AX, t0 872 873 MOVQ sm2ord<>+0x00(SB), AX 874 MULQ t0 875 ADDQ AX, acc1 876 ADCQ $0, DX 877 MOVQ DX, t1 878 879 MOVQ sm2ord<>+0x08(SB), AX 880 MULQ t0 881 ADDQ t1, acc2 882 ADCQ $0, DX 883 ADDQ AX, acc2 884 ADCQ $0, DX 885 MOVQ DX, t1 886 887 MOVQ sm2ord<>+0x10(SB), AX 888 MULQ t0 889 ADDQ t1, acc3 890 ADCQ $0, DX 891 ADDQ AX, acc3 892 ADCQ $0, DX 893 MOVQ DX, t1 894 895 MOVQ sm2ord<>+0x18(SB), AX 896 MULQ t0 897 ADDQ t1, acc4 898 ADCQ $0, DX 899 ADDQ AX, acc4 900 ADCQ DX, acc5 901 ADCQ $0, acc0 902 // x * y[2] 903 MOVQ (8*2)(y_ptr), t0 904 905 MOVQ (8*0)(x_ptr), AX 906 MULQ t0 907 ADDQ AX, acc2 908 ADCQ $0, DX 909 MOVQ DX, t1 910 911 MOVQ (8*1)(x_ptr), AX 912 MULQ t0 913 ADDQ t1, acc3 914 ADCQ $0, DX 915 ADDQ AX, acc3 916 ADCQ $0, DX 917 MOVQ DX, t1 918 919 MOVQ 
(8*2)(x_ptr), AX 920 MULQ t0 921 ADDQ t1, acc4 922 ADCQ $0, DX 923 ADDQ AX, acc4 924 ADCQ $0, DX 925 MOVQ DX, t1 926 927 MOVQ (8*3)(x_ptr), AX 928 MULQ t0 929 ADDQ t1, acc5 930 ADCQ $0, DX 931 ADDQ AX, acc5 932 ADCQ DX, acc0 933 ADCQ $0, acc1 934 // Third reduction step 935 MOVQ acc2, AX 936 MULQ sm2ordK0<>(SB) 937 MOVQ AX, t0 938 939 MOVQ sm2ord<>+0x00(SB), AX 940 MULQ t0 941 ADDQ AX, acc2 942 ADCQ $0, DX 943 MOVQ DX, t1 944 945 MOVQ sm2ord<>+0x08(SB), AX 946 MULQ t0 947 ADDQ t1, acc3 948 ADCQ $0, DX 949 ADDQ AX, acc3 950 ADCQ $0, DX 951 MOVQ DX, t1 952 953 MOVQ sm2ord<>+0x10(SB), AX 954 MULQ t0 955 ADDQ t1, acc4 956 ADCQ $0, DX 957 ADDQ AX, acc4 958 ADCQ $0, DX 959 MOVQ DX, t1 960 961 MOVQ sm2ord<>+0x18(SB), AX 962 MULQ t0 963 ADDQ t1, acc5 964 ADCQ $0, DX 965 ADDQ AX, acc5 966 ADCQ DX, acc0 967 ADCQ $0, acc1 968 // x * y[3] 969 MOVQ (8*3)(y_ptr), t0 970 971 MOVQ (8*0)(x_ptr), AX 972 MULQ t0 973 ADDQ AX, acc3 974 ADCQ $0, DX 975 MOVQ DX, t1 976 977 MOVQ (8*1)(x_ptr), AX 978 MULQ t0 979 ADDQ t1, acc4 980 ADCQ $0, DX 981 ADDQ AX, acc4 982 ADCQ $0, DX 983 MOVQ DX, t1 984 985 MOVQ (8*2)(x_ptr), AX 986 MULQ t0 987 ADDQ t1, acc5 988 ADCQ $0, DX 989 ADDQ AX, acc5 990 ADCQ $0, DX 991 MOVQ DX, t1 992 993 MOVQ (8*3)(x_ptr), AX 994 MULQ t0 995 ADDQ t1, acc0 996 ADCQ $0, DX 997 ADDQ AX, acc0 998 ADCQ DX, acc1 999 ADCQ $0, acc2 1000 // Last reduction step 1001 MOVQ acc3, AX 1002 MULQ sm2ordK0<>(SB) 1003 MOVQ AX, t0 1004 1005 MOVQ sm2ord<>+0x00(SB), AX 1006 MULQ t0 1007 ADDQ AX, acc3 1008 ADCQ $0, DX 1009 MOVQ DX, t1 1010 1011 MOVQ sm2ord<>+0x08(SB), AX 1012 MULQ t0 1013 ADDQ t1, acc4 1014 ADCQ $0, DX 1015 ADDQ AX, acc4 1016 ADCQ $0, DX 1017 MOVQ DX, t1 1018 1019 MOVQ sm2ord<>+0x10(SB), AX 1020 MULQ t0 1021 ADDQ t1, acc5 1022 ADCQ $0, DX 1023 ADDQ AX, acc5 1024 ADCQ $0, DX 1025 MOVQ DX, t1 1026 1027 MOVQ sm2ord<>+0x18(SB), AX 1028 MULQ t0 1029 ADDQ t1, acc0 1030 ADCQ $0, DX 1031 ADDQ AX, acc0 1032 ADCQ DX, acc1 1033 ADCQ $0, acc2 1034 // Copy result [255:0] 1035 MOVQ acc4, 
x_ptr 1036 MOVQ acc5, acc3 1037 MOVQ acc0, t0 1038 MOVQ acc1, t1 1039 // Subtract sm2 1040 SUBQ sm2ord<>+0x00(SB), acc4 1041 SBBQ sm2ord<>+0x08(SB) ,acc5 1042 SBBQ sm2ord<>+0x10(SB), acc0 1043 SBBQ sm2ord<>+0x18(SB), acc1 1044 SBBQ $0, acc2 1045 1046 CMOVQCS x_ptr, acc4 1047 CMOVQCS acc3, acc5 1048 CMOVQCS t0, acc0 1049 CMOVQCS t1, acc1 1050 1051 MOVQ acc4, (8*0)(res_ptr) 1052 MOVQ acc5, (8*1)(res_ptr) 1053 MOVQ acc0, (8*2)(res_ptr) 1054 MOVQ acc1, (8*3)(res_ptr) 1055 1056 RET 1057 /* ---------------------------------------*/ 1058 // func sm2OrdSqr(res, in []uint64, n int) 1059 TEXT ·sm2OrdSqr(SB),NOSPLIT,$0 1060 MOVQ res+0(FP), res_ptr 1061 MOVQ in+24(FP), x_ptr 1062 MOVQ n+48(FP), BX 1063 1064 ordSqrLoop: 1065 1066 // y[1:] * y[0] 1067 MOVQ (8*0)(x_ptr), t0 1068 1069 MOVQ (8*1)(x_ptr), AX 1070 MULQ t0 1071 MOVQ AX, acc1 1072 MOVQ DX, acc2 1073 1074 MOVQ (8*2)(x_ptr), AX 1075 MULQ t0 1076 ADDQ AX, acc2 1077 ADCQ $0, DX 1078 MOVQ DX, acc3 1079 1080 MOVQ (8*3)(x_ptr), AX 1081 MULQ t0 1082 ADDQ AX, acc3 1083 ADCQ $0, DX 1084 MOVQ DX, acc4 1085 // y[2:] * y[1] 1086 MOVQ (8*1)(x_ptr), t0 1087 1088 MOVQ (8*2)(x_ptr), AX 1089 MULQ t0 1090 ADDQ AX, acc3 1091 ADCQ $0, DX 1092 MOVQ DX, t1 1093 1094 MOVQ (8*3)(x_ptr), AX 1095 MULQ t0 1096 ADDQ t1, acc4 1097 ADCQ $0, DX 1098 ADDQ AX, acc4 1099 ADCQ $0, DX 1100 MOVQ DX, acc5 1101 // y[3] * y[2] 1102 MOVQ (8*2)(x_ptr), t0 1103 1104 MOVQ (8*3)(x_ptr), AX 1105 MULQ t0 1106 ADDQ AX, acc5 1107 ADCQ $0, DX 1108 MOVQ DX, y_ptr 1109 XORQ t1, t1 1110 // *2 1111 ADDQ acc1, acc1 1112 ADCQ acc2, acc2 1113 ADCQ acc3, acc3 1114 ADCQ acc4, acc4 1115 ADCQ acc5, acc5 1116 ADCQ y_ptr, y_ptr 1117 ADCQ $0, t1 1118 // Missing products 1119 MOVQ (8*0)(x_ptr), AX 1120 MULQ AX 1121 MOVQ AX, acc0 1122 MOVQ DX, t0 1123 1124 MOVQ (8*1)(x_ptr), AX 1125 MULQ AX 1126 ADDQ t0, acc1 1127 ADCQ AX, acc2 1128 ADCQ $0, DX 1129 MOVQ DX, t0 1130 1131 MOVQ (8*2)(x_ptr), AX 1132 MULQ AX 1133 ADDQ t0, acc3 1134 ADCQ AX, acc4 1135 ADCQ $0, DX 1136 MOVQ DX, t0 1137 
1138 MOVQ (8*3)(x_ptr), AX 1139 MULQ AX 1140 ADDQ t0, acc5 1141 ADCQ AX, y_ptr 1142 ADCQ DX, t1 1143 MOVQ t1, x_ptr 1144 // First reduction step 1145 MOVQ acc0, AX 1146 MULQ sm2ordK0<>(SB) 1147 MOVQ AX, t0 1148 1149 MOVQ sm2ord<>+0x00(SB), AX 1150 MULQ t0 1151 ADDQ AX, acc0 1152 ADCQ $0, DX 1153 MOVQ DX, t1 1154 1155 MOVQ sm2ord<>+0x08(SB), AX 1156 MULQ t0 1157 ADDQ t1, acc1 1158 ADCQ $0, DX 1159 ADDQ AX, acc1 1160 1161 //MOVQ t0, t1 1162 ADCQ DX, acc2 1163 ADCQ $0, acc3 1164 SUBQ t0, acc2 1165 SBBQ $0, acc3 1166 1167 MOVQ t0, AX 1168 MOVQ t0, DX 1169 MOVQ t0, acc0 1170 SBBQ $0, acc0 1171 SHLQ $32, AX 1172 SHRQ $32, DX 1173 1174 //ADDQ t1, acc3 1175 SUBQ AX, acc3 1176 SBBQ DX, acc0 1177 // Second reduction step 1178 MOVQ acc1, AX 1179 MULQ sm2ordK0<>(SB) 1180 MOVQ AX, t0 1181 1182 MOVQ sm2ord<>+0x00(SB), AX 1183 MULQ t0 1184 ADDQ AX, acc1 1185 ADCQ $0, DX 1186 MOVQ DX, t1 1187 1188 MOVQ sm2ord<>+0x08(SB), AX 1189 MULQ t0 1190 ADDQ t1, acc2 1191 ADCQ $0, DX 1192 ADDQ AX, acc2 1193 1194 //MOVQ t0, t1 1195 ADCQ DX, acc3 1196 ADCQ $0, acc0 1197 SUBQ t0, acc3 1198 SBBQ $0, acc0 1199 1200 MOVQ t0, AX 1201 MOVQ t0, DX 1202 MOVQ t0, acc1 1203 SBBQ $0, acc1 1204 SHLQ $32, AX 1205 SHRQ $32, DX 1206 1207 //ADDQ t1, acc0 1208 SUBQ AX, acc0 1209 SBBQ DX, acc1 1210 // Third reduction step 1211 MOVQ acc2, AX 1212 MULQ sm2ordK0<>(SB) 1213 MOVQ AX, t0 1214 1215 MOVQ sm2ord<>+0x00(SB), AX 1216 MULQ t0 1217 ADDQ AX, acc2 1218 ADCQ $0, DX 1219 MOVQ DX, t1 1220 1221 MOVQ sm2ord<>+0x08(SB), AX 1222 MULQ t0 1223 ADDQ t1, acc3 1224 ADCQ $0, DX 1225 ADDQ AX, acc3 1226 1227 //MOVQ t0, t1 1228 ADCQ DX, acc0 1229 ADCQ $0, acc1 1230 SUBQ t0, acc0 1231 SBBQ $0, acc1 1232 1233 MOVQ t0, AX 1234 MOVQ t0, DX 1235 MOVQ t0, acc2 1236 SBBQ $0, acc2 1237 SHLQ $32, AX 1238 SHRQ $32, DX 1239 1240 //ADDQ t1, acc1 1241 SUBQ AX, acc1 1242 SBBQ DX, acc2 1243 // Last reduction step 1244 MOVQ acc3, AX 1245 MULQ sm2ordK0<>(SB) 1246 MOVQ AX, t0 1247 1248 MOVQ sm2ord<>+0x00(SB), AX 1249 MULQ t0 1250 ADDQ AX, acc3 
1251 ADCQ $0, DX 1252 MOVQ DX, t1 1253 1254 MOVQ sm2ord<>+0x08(SB), AX 1255 MULQ t0 1256 ADDQ t1, acc0 1257 ADCQ $0, DX 1258 ADDQ AX, acc0 1259 //ADCQ $0, DX 1260 //MOVQ DX, t1 1261 1262 //MOVQ t0, t1 1263 ADCQ DX, acc1 1264 ADCQ $0, acc2 1265 SUBQ t0, acc1 1266 SBBQ $0, acc2 1267 1268 MOVQ t0, AX 1269 MOVQ t0, DX 1270 MOVQ t0, acc3 1271 SBBQ $0, acc3 1272 SHLQ $32, AX 1273 SHRQ $32, DX 1274 1275 //ADDQ t1, acc2 1276 SUBQ AX, acc2 1277 SBBQ DX, acc3 1278 XORQ t0, t0 1279 // Add bits [511:256] of the sqr result 1280 ADDQ acc4, acc0 1281 ADCQ acc5, acc1 1282 ADCQ y_ptr, acc2 1283 ADCQ x_ptr, acc3 1284 ADCQ $0, t0 1285 1286 MOVQ acc0, acc4 1287 MOVQ acc1, acc5 1288 MOVQ acc2, y_ptr 1289 MOVQ acc3, t1 1290 // Subtract sm2 1291 SUBQ sm2ord<>+0x00(SB), acc0 1292 SBBQ sm2ord<>+0x08(SB) ,acc1 1293 SBBQ sm2ord<>+0x10(SB), acc2 1294 SBBQ sm2ord<>+0x18(SB), acc3 1295 SBBQ $0, t0 1296 1297 CMOVQCS acc4, acc0 1298 CMOVQCS acc5, acc1 1299 CMOVQCS y_ptr, acc2 1300 CMOVQCS t1, acc3 1301 1302 MOVQ acc0, (8*0)(res_ptr) 1303 MOVQ acc1, (8*1)(res_ptr) 1304 MOVQ acc2, (8*2)(res_ptr) 1305 MOVQ acc3, (8*3)(res_ptr) 1306 MOVQ res_ptr, x_ptr 1307 DECQ BX 1308 JNE ordSqrLoop 1309 1310 RET 1311 /* ---------------------------------------*/ 1312 #undef res_ptr 1313 #undef x_ptr 1314 #undef y_ptr 1315 1316 #undef acc0 1317 #undef acc1 1318 #undef acc2 1319 #undef acc3 1320 #undef acc4 1321 #undef acc5 1322 #undef t0 1323 #undef t1 1324 /* ---------------------------------------*/ 1325 #define mul0 AX 1326 #define mul1 DX 1327 #define acc0 BX 1328 #define acc1 CX 1329 #define acc2 R8 1330 #define acc3 R9 1331 #define acc4 R10 1332 #define acc5 R11 1333 #define acc6 R12 1334 #define acc7 R13 1335 #define t0 R14 1336 #define t1 R15 1337 #define t2 DI 1338 #define t3 SI 1339 #define hlp BP 1340 /* ---------------------------------------*/ 1341 TEXT sm2SubInternal(SB),NOSPLIT | DUPOK,$0 1342 XORQ mul0, mul0 1343 SUBQ t0, acc4 1344 SBBQ t1, acc5 1345 SBBQ t2, acc6 1346 SBBQ t3, acc7 1347 SBBQ $0, 
mul0 1348 1349 MOVQ acc4, acc0 1350 MOVQ acc5, acc1 1351 MOVQ acc6, acc2 1352 MOVQ acc7, acc3 1353 1354 ADDQ $-1, acc4 1355 ADCQ sm2const0<>(SB), acc5 1356 ADCQ $-1, acc6 1357 ADCQ sm2const1<>(SB), acc7 1358 ANDQ $1, mul0 1359 1360 CMOVQEQ acc0, acc4 1361 CMOVQEQ acc1, acc5 1362 CMOVQEQ acc2, acc6 1363 CMOVQEQ acc3, acc7 1364 1365 RET 1366 /* ---------------------------------------*/ 1367 TEXT sm2MulInternal(SB),NOSPLIT | DUPOK,$0 1368 MOVQ acc4, mul0 1369 MULQ t0 1370 MOVQ mul0, acc0 1371 MOVQ mul1, acc1 1372 1373 MOVQ acc4, mul0 1374 MULQ t1 1375 ADDQ mul0, acc1 1376 ADCQ $0, mul1 1377 MOVQ mul1, acc2 1378 1379 MOVQ acc4, mul0 1380 MULQ t2 1381 ADDQ mul0, acc2 1382 ADCQ $0, mul1 1383 MOVQ mul1, acc3 1384 1385 MOVQ acc4, mul0 1386 MULQ t3 1387 ADDQ mul0, acc3 1388 ADCQ $0, mul1 1389 MOVQ mul1, acc4 1390 1391 MOVQ acc5, mul0 1392 MULQ t0 1393 ADDQ mul0, acc1 1394 ADCQ $0, mul1 1395 MOVQ mul1, hlp 1396 1397 MOVQ acc5, mul0 1398 MULQ t1 1399 ADDQ hlp, acc2 1400 ADCQ $0, mul1 1401 ADDQ mul0, acc2 1402 ADCQ $0, mul1 1403 MOVQ mul1, hlp 1404 1405 MOVQ acc5, mul0 1406 MULQ t2 1407 ADDQ hlp, acc3 1408 ADCQ $0, mul1 1409 ADDQ mul0, acc3 1410 ADCQ $0, mul1 1411 MOVQ mul1, hlp 1412 1413 MOVQ acc5, mul0 1414 MULQ t3 1415 ADDQ hlp, acc4 1416 ADCQ $0, mul1 1417 ADDQ mul0, acc4 1418 ADCQ $0, mul1 1419 MOVQ mul1, acc5 1420 1421 MOVQ acc6, mul0 1422 MULQ t0 1423 ADDQ mul0, acc2 1424 ADCQ $0, mul1 1425 MOVQ mul1, hlp 1426 1427 MOVQ acc6, mul0 1428 MULQ t1 1429 ADDQ hlp, acc3 1430 ADCQ $0, mul1 1431 ADDQ mul0, acc3 1432 ADCQ $0, mul1 1433 MOVQ mul1, hlp 1434 1435 MOVQ acc6, mul0 1436 MULQ t2 1437 ADDQ hlp, acc4 1438 ADCQ $0, mul1 1439 ADDQ mul0, acc4 1440 ADCQ $0, mul1 1441 MOVQ mul1, hlp 1442 1443 MOVQ acc6, mul0 1444 MULQ t3 1445 ADDQ hlp, acc5 1446 ADCQ $0, mul1 1447 ADDQ mul0, acc5 1448 ADCQ $0, mul1 1449 MOVQ mul1, acc6 1450 1451 MOVQ acc7, mul0 1452 MULQ t0 1453 ADDQ mul0, acc3 1454 ADCQ $0, mul1 1455 MOVQ mul1, hlp 1456 1457 MOVQ acc7, mul0 1458 MULQ t1 1459 ADDQ hlp, acc4 
1460 ADCQ $0, mul1 1461 ADDQ mul0, acc4 1462 ADCQ $0, mul1 1463 MOVQ mul1, hlp 1464 1465 MOVQ acc7, mul0 1466 MULQ t2 1467 ADDQ hlp, acc5 1468 ADCQ $0, mul1 1469 ADDQ mul0, acc5 1470 ADCQ $0, mul1 1471 MOVQ mul1, hlp 1472 1473 MOVQ acc7, mul0 1474 MULQ t3 1475 ADDQ hlp, acc6 1476 ADCQ $0, mul1 1477 ADDQ mul0, acc6 1478 ADCQ $0, mul1 1479 MOVQ mul1, acc7 1480 // First reduction step 1481 MOVQ acc0, mul0 1482 MOVQ acc0, hlp 1483 ADDQ acc0, acc1 1484 ADCQ $0, acc2 1485 ADCQ $0, acc3 1486 ADCQ $0, mul0 1487 SHLQ $32, hlp 1488 SHRQ $32, acc0 1489 SUBQ hlp, acc1 1490 SBBQ acc0, acc2 1491 SBBQ hlp, acc3 1492 SBBQ acc0, mul0 1493 MOVQ mul0, acc0 1494 // Second reduction step 1495 MOVQ acc1, mul0 1496 MOVQ acc1, hlp 1497 ADDQ acc1, acc2 1498 ADCQ $0, acc3 1499 ADCQ $0, acc0 1500 ADCQ $0, mul0 1501 SHLQ $32, hlp 1502 SHRQ $32, acc1 1503 SUBQ hlp, acc2 1504 SBBQ acc1, acc3 1505 SBBQ hlp, acc0 1506 SBBQ acc1, mul0 1507 MOVQ mul0, acc1 1508 // Third reduction step 1509 MOVQ acc2, mul0 1510 MOVQ acc2, hlp 1511 ADDQ acc2, acc3 1512 ADCQ $0, acc0 1513 ADCQ $0, acc1 1514 ADCQ $0, mul0 1515 SHLQ $32, hlp 1516 SHRQ $32, acc2 1517 SUBQ hlp, acc3 1518 SBBQ acc2, acc0 1519 SBBQ hlp, acc1 1520 SBBQ acc2, mul0 1521 MOVQ mul0, acc2 1522 // Last reduction step 1523 MOVQ acc3, mul0 1524 MOVQ acc3, hlp 1525 ADDQ acc3, acc0 1526 ADCQ $0, acc1 1527 ADCQ $0, acc2 1528 ADCQ $0, mul0 1529 SHLQ $32, hlp 1530 SHRQ $32, acc3 1531 SUBQ hlp, acc0 1532 SBBQ acc3, acc1 1533 SBBQ hlp, acc2 1534 SBBQ acc3, mul0 1535 MOVQ mul0, acc3 1536 MOVQ $0, BP 1537 // Add bits [511:256] of the result 1538 ADCQ acc0, acc4 1539 ADCQ acc1, acc5 1540 ADCQ acc2, acc6 1541 ADCQ acc3, acc7 1542 ADCQ $0, hlp 1543 // Copy result 1544 MOVQ acc4, acc0 1545 MOVQ acc5, acc1 1546 MOVQ acc6, acc2 1547 MOVQ acc7, acc3 1548 // Subtract sm2 1549 SUBQ $-1, acc4 1550 SBBQ sm2const0<>(SB) ,acc5 1551 SBBQ $-1, acc6 1552 SBBQ sm2const1<>(SB), acc7 1553 SBBQ $0, hlp 1554 // If the result of the subtraction is negative, restore the previous 
result 1555 CMOVQCS acc0, acc4 1556 CMOVQCS acc1, acc5 1557 CMOVQCS acc2, acc6 1558 CMOVQCS acc3, acc7 1559 1560 RET 1561 /* ---------------------------------------*/ 1562 TEXT sm2SqrInternal(SB),NOSPLIT | DUPOK,$0 1563 1564 MOVQ acc4, mul0 1565 MULQ acc5 1566 MOVQ mul0, acc1 1567 MOVQ mul1, acc2 1568 1569 MOVQ acc4, mul0 1570 MULQ acc6 1571 ADDQ mul0, acc2 1572 ADCQ $0, mul1 1573 MOVQ mul1, acc3 1574 1575 MOVQ acc4, mul0 1576 MULQ acc7 1577 ADDQ mul0, acc3 1578 ADCQ $0, mul1 1579 MOVQ mul1, t0 1580 1581 MOVQ acc5, mul0 1582 MULQ acc6 1583 ADDQ mul0, acc3 1584 ADCQ $0, mul1 1585 MOVQ mul1, hlp 1586 1587 MOVQ acc5, mul0 1588 MULQ acc7 1589 ADDQ hlp, t0 1590 ADCQ $0, mul1 1591 ADDQ mul0, t0 1592 ADCQ $0, mul1 1593 MOVQ mul1, t1 1594 1595 MOVQ acc6, mul0 1596 MULQ acc7 1597 ADDQ mul0, t1 1598 ADCQ $0, mul1 1599 MOVQ mul1, t2 1600 XORQ t3, t3 1601 // *2 1602 ADDQ acc1, acc1 1603 ADCQ acc2, acc2 1604 ADCQ acc3, acc3 1605 ADCQ t0, t0 1606 ADCQ t1, t1 1607 ADCQ t2, t2 1608 ADCQ $0, t3 1609 // Missing products 1610 MOVQ acc4, mul0 1611 MULQ mul0 1612 MOVQ mul0, acc0 1613 MOVQ DX, acc4 1614 1615 MOVQ acc5, mul0 1616 MULQ mul0 1617 ADDQ acc4, acc1 1618 ADCQ mul0, acc2 1619 ADCQ $0, DX 1620 MOVQ DX, acc4 1621 1622 MOVQ acc6, mul0 1623 MULQ mul0 1624 ADDQ acc4, acc3 1625 ADCQ mul0, t0 1626 ADCQ $0, DX 1627 MOVQ DX, acc4 1628 1629 MOVQ acc7, mul0 1630 MULQ mul0 1631 ADDQ acc4, t1 1632 ADCQ mul0, t2 1633 ADCQ DX, t3 1634 // First reduction step 1635 MOVQ acc0, mul0 1636 MOVQ acc0, hlp 1637 ADDQ acc0, acc1 1638 ADCQ $0, acc2 1639 ADCQ $0, acc3 1640 ADCQ $0, mul0 1641 SHLQ $32, hlp 1642 SHRQ $32, acc0 1643 SUBQ hlp, acc1 1644 SBBQ acc0, acc2 1645 SBBQ hlp, acc3 1646 SBBQ acc0, mul0 1647 MOVQ mul0, acc0 1648 // Second reduction step 1649 MOVQ acc1, mul0 1650 MOVQ acc1, hlp 1651 ADDQ acc1, acc2 1652 ADCQ $0, acc3 1653 ADCQ $0, acc0 1654 ADCQ $0, mul0 1655 SHLQ $32, hlp 1656 SHRQ $32, acc1 1657 SUBQ hlp, acc2 1658 SBBQ acc1, acc3 1659 SBBQ hlp, acc0 1660 SBBQ acc1, mul0 1661 MOVQ 
	mul0, acc1                      // (continuation of an instruction whose start lies before this chunk)
	// --- Tail of a Montgomery-style reduction routine whose TEXT header is
	// above this chunk.  It folds the low 256 bits into the SM2 prime p via
	// four shift/subtract reduction steps, adds the high 256 bits, then does
	// one conditional final subtraction of p so the result in acc4..acc7 is
	// fully reduced.  NOTE(review): the register aliases mul0/hlp/acc6/acc7/
	// t2/t3 are #defined earlier in the file and are not visible here;
	// hlp appears to alias BP (see "MOVQ $0, BP" below) — confirm against
	// the #define block at the top of the file.
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	ADDQ acc2, acc3
	ADCQ $0, acc0
	ADCQ $0, acc1
	ADCQ $0, mul0
	SHLQ $32, hlp
	SHRQ $32, acc2
	SUBQ hlp, acc3
	SBBQ acc2, acc0
	SBBQ hlp, acc1
	SBBQ acc2, mul0
	MOVQ mul0, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	ADDQ acc3, acc0
	ADCQ $0, acc1
	ADCQ $0, acc2
	ADCQ $0, mul0
	SHLQ $32, hlp
	SHRQ $32, acc3
	SUBQ hlp, acc0
	SBBQ acc3, acc1
	SBBQ hlp, acc2
	SBBQ acc3, mul0
	MOVQ mul0, acc3
	// Zero the carry accumulator (BP presumably == hlp; MOVQ leaves flags
	// intact so the ADCQ chain below still sees the carry state).
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract sm2 (p = 0xfffffffe_ffffffff_ffffffff_ffffffff_ffffffff_00000000_ffffffff_ffffffff,
	// encoded as -1, sm2const0, -1, sm2const1 from least-significant limb up)
	SUBQ $-1, acc4
	SBBQ sm2const0<>(SB) ,acc5
	SBBQ $-1, acc6
	SBBQ sm2const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
/* ---------------------------------------*/
// sm2MulBy2Inline: t0..t3 = 2 * (acc4..acc7) mod p.
// Doubles via an add-with-carry chain, then conditionally subtracts p
// (branch-free, constant time).  Clobbers mul0.  Inputs acc4..acc7 are
// preserved; the reduced result lands in t0..t3.
#define sm2MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ sm2const0<>(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ sm2const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
// sm2AddInline: t0..t3 = (acc4..acc7 + t0..t3) mod p.
// Same conditional-subtract pattern as sm2MulBy2Inline; branch-free.
// Clobbers mul0 and acc4..acc7 (acc4..acc7 hold the unreduced sum).
#define sm2AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ sm2const0<>(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ sm2const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
// 256-bit load/store/copy helpers.  A field element is four 64-bit limbs,
// least-significant first.  acc4..acc7 and t0..t3 are the two operand
// register quadruples used by the *Internal arithmetic routines.
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
// Stack frame layout for sm2PointAddAffineAsm: 32-byte (4-limb) slots for
// the Jacobian input point (x1,y1,z1), the affine input point (x2,y2),
// the output point, and the add-formula temporaries, followed by the saved
// result pointer and the sel/zero flags (stored as 32-bit values).
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)

// func sm2PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
//
// Mixed Jacobian + affine point addition: res = in1 + in2, where in1 is a
// Jacobian point (x,y,z) and in2 is an affine point (x,y).  If sign != 0,
// in2.y is negated first (i.e. a subtraction).  The final SSE selection
// makes the routine branch-free with respect to the secret flags:
// if sel == 0 the original in1 is returned, and if zero == 0 the result is
// (in2.x, in2.y, 1) — i.e. in1 was the point at infinity.
TEXT ·sm2PointAddAffineAsm(SB),0,$512-96
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX
	MOVQ sign+72(FP), DX
	MOVQ sel+80(FP), t1
	MOVQ zero+88(FP), t2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	// Store pointer to result
	// NOTE(review): this stores mul0, but the result pointer was loaded into
	// AX above — this only works if the (not visible here) "#define mul0"
	// aliases AX, as in the crypto/elliptic P-256 original.  Confirm.
	MOVQ mul0, rptr
	MOVL t1, sel_save
	MOVL t2, zero_save
	// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4
	MOVQ (16*2 + 8*1)(CX), acc5
	MOVQ (16*2 + 8*2)(CX), acc6
	MOVQ (16*2 + 8*3)(CX), acc7
	// acc0..acc3 = p (the SM2 field prime), limb by limb
	MOVQ $-1, acc0
	MOVQ sm2const0<>(SB), acc1
	MOVQ $-1, acc2
	MOVQ sm2const1<>(SB), acc3
	XORQ mul0, mul0
	// Speculatively subtract: acc0..acc3 = p - y2
	SUBQ acc4, acc0
	SBBQ acc5, acc1
	SBBQ acc6, acc2
	SBBQ acc7, acc3
	SBBQ $0, mul0
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	// Add in case the operand was > sm2-p
	ADDQ $-1, acc0
	ADCQ sm2const0<>(SB), acc1
	ADCQ $-1, acc2
	ADCQ sm2const1<>(SB), acc3
	ADCQ $0, mul0
	CMOVQNE t0, acc0
	CMOVQNE t1, acc1
	CMOVQNE t2, acc2
	CMOVQNE t3, acc3
	// If condition is 0, keep original value
	TESTQ DX, DX
	CMOVQEQ acc4, acc0
	CMOVQEQ acc5, acc1
	CMOVQEQ acc6, acc2
	CMOVQEQ acc7, acc3
	// Store result
	MOVQ acc0, y2in(8*0)
	MOVQ acc1, y2in(8*1)
	MOVQ acc2, y2in(8*2)
	MOVQ acc3, y2in(8*3)
	// Begin point add.  The *Internal routines take operands in acc4..acc7
	// and t0..t3 and return the result in acc4..acc7, so many results are
	// chained without reloading.
	LDacc (z1in)
	CALL sm2SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)

	LDt (x2in)
	CALL sm2MulInternal(SB)	// x2 * z1ˆ2

	LDt (x1in)
	CALL sm2SubInternal(SB)	// h = u2 - u1
	ST (h)

	LDt (z1in)
	CALL sm2MulInternal(SB)	// z3 = h * z1
	ST (zout)

	LDacc (z1sqr)
	CALL sm2MulInternal(SB)	// z1ˆ3

	LDt (y2in)
	CALL sm2MulInternal(SB)	// s2 = y2 * z1ˆ3
	ST (s2)

	LDt (y1in)
	CALL sm2SubInternal(SB)	// r = s2 - s1
	ST (r)

	CALL sm2SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL sm2SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL sm2MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (y1in)
	CALL sm2MulInternal(SB)	// y1 * hˆ3
	ST (s2)

	LDacc (x1in)
	LDt (hsqr)
	CALL sm2MulInternal(SB)	// u1 * hˆ2
	ST (h)

	sm2MulBy2Inline	// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL sm2SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL sm2SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (h)
	CALL sm2SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL sm2MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL sm2SubInternal(SB)	// y3 = r*(u1*hˆ2 - x3) - y1*hˆ3
	ST (yout)
	// Load stored values from stack
	MOVQ rptr, AX
	MOVL sel_save, BX
	MOVL zero_save, CX
	// The result is not valid if (sel == 0), conditional choose
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	// Broadcast sel and zero into full-width SSE masks:
	// X6 = (sel == 0) ? all-ones : all-zeros, likewise X7 for zero.
	MOVL BX, X6
	MOVL CX, X7

	PXOR X8, X8
	PCMPEQL X9, X9

	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	PCMPEQL X8, X6
	PCMPEQL X8, X7

	// X15 = ~X6, i.e. the "sel != 0, keep computed result" mask
	MOVOU X6, X15
	PANDN X9, X15

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	// Constant-time select: result = (sel != 0) ? computed : in1
	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0: select (x2, y2, 1) — sm2one is the Montgomery
	// representation of 1.
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU sm2one<>+0x00(SB), X13
	MOVOU sm2one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	// Clear the saved pointer so stale stack data is not left behind.
	MOVQ $0, rptr

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save

// sm2IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
// (File-local helper — no "·" prefix, so it is not visible from Go code.
// Also reports 1 when the input equals p itself, since p ≡ 0 mod p.)
TEXT sm2IsZero(SB),NOSPLIT | DUPOK,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	XORQ $-1, acc4
	XORQ sm2const0<>(SB), acc5
	XORQ $-1, acc6
	XORQ sm2const1<>(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET

/* ---------------------------------------*/
// Stack frame layout for sm2PointAddAsm: two full Jacobian input points,
// the output point, and the general add-formula temporaries, followed by
// the saved result pointer and the points-equal flag.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)

//func sm2PointAddAsm(res, in1, in2 []uint64) int
//
// Full Jacobian + Jacobian point addition: res = in1 + in2.  Returns 1
// (points_eq) when in1 == in2, in which case the stored result is NOT the
// correct sum — the caller must detect this and use point doubling instead.
TEXT ·sm2PointAddAsm(SB),0,$680-80
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point add
	LDacc (z2in)
	CALL sm2SqrInternal(SB)	// z2ˆ2
	ST (z2sqr)
	LDt (z2in)
	CALL sm2MulInternal(SB)	// z2ˆ3
	LDt (y1in)
	CALL sm2MulInternal(SB)	// s1 = z2ˆ3*y1
	ST (s1)

	LDacc (z1in)
	CALL sm2SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)
	LDt (z1in)
	CALL sm2MulInternal(SB)	// z1ˆ3
	LDt (y2in)
	CALL sm2MulInternal(SB)	// s2 = z1ˆ3*y2
	ST (s2)

	LDt (s1)
	CALL sm2SubInternal(SB)	// r = s2 - s1
	ST (r)
	CALL sm2IsZero(SB)
	MOVQ AX, points_eq	// r == 0, i.e. the y-coordinates match

	LDacc (z2sqr)
	LDt (x1in)
	CALL sm2MulInternal(SB)	// u1 = x1 * z2ˆ2
	ST (u1)
	LDacc (z1sqr)
	LDt (x2in)
	CALL sm2MulInternal(SB)	// u2 = x2 * z1ˆ2
	ST (u2)

	LDt (u1)
	CALL sm2SubInternal(SB)	// h = u2 - u1
	ST (h)
	CALL sm2IsZero(SB)
	// points_eq = (r == 0) && (h == 0): the affine points coincide, so the
	// add formula degenerates and the caller must double instead.
	ANDQ points_eq, AX
	MOVQ AX, points_eq

	LDacc (r)
	CALL sm2SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL sm2SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL sm2MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (s1)
	CALL sm2MulInternal(SB)	// s1 * hˆ3
	ST (s2)

	LDacc (z1in)
	LDt (z2in)
	CALL sm2MulInternal(SB)	// z1 * z2
	LDt (h)
	CALL sm2MulInternal(SB)	// z1 * z2 * h
	ST (zout)

	LDacc (hsqr)
	LDt (u1)
	CALL sm2MulInternal(SB)	// hˆ2 * u1
	ST (u2)

	sm2MulBy2Inline	// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL sm2SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL sm2SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (u2)
	CALL sm2SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL sm2MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL sm2SubInternal(SB)	// y3 = r*(u1*hˆ2 - x3) - s1*hˆ3
	ST (yout)

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+72(FP)

	RET
// NOTE(review): points_eq is never #undef'd here, unlike every other frame
// macro — harmless only as long as no later code reuses the name.
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
// Stack frame layout for sm2PointDoubleAsm.
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

//func sm2PointDoubleAsm(res, in []uint64)
//
// Jacobian point doubling: res = 2*in.  Uses the standard a = -3 doubling
// formulas (M = 3*(x-zˆ2)*(x+zˆ2), S = ..., see inline comments); res and
// in may alias since the input is copied to the stack first.
TEXT ·sm2PointDoubleAsm(SB),NOSPLIT,$256-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+24(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x(16*0)
	MOVOU X1, x(16*1)
	MOVOU X2, y(16*0)
	MOVOU X3, y(16*1)
	MOVOU X4, z(16*0)
	MOVOU X5, z(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	LDacc (z)
	CALL sm2SqrInternal(SB)	// zˆ2
	ST (zsqr)

	LDt (x)
	sm2AddInline	// x + zˆ2
	STt (m)

	LDacc (z)
	LDt (y)
	CALL sm2MulInternal(SB)	// y * z
	sm2MulBy2Inline	// z3 = 2*y*z
	MOVQ rptr, AX
	// Store z
	MOVQ t0, (16*4 + 8*0)(AX)
	MOVQ t1, (16*4 + 8*1)(AX)
	MOVQ t2, (16*4 + 8*2)(AX)
	MOVQ t3, (16*4 + 8*3)(AX)

	LDacc (x)
	LDt (zsqr)
	CALL sm2SubInternal(SB)	// x - zˆ2
	LDt (m)
	CALL sm2MulInternal(SB)	// (x - zˆ2) * (x + zˆ2)
	ST (m)
	// Multiply by 3: m = 3*(xˆ2 - zˆ4)
	sm2MulBy2Inline
	LDacc (m)
	sm2AddInline
	STt (m)
	////////////////////////
	LDacc (y)
	sm2MulBy2Inline	// 2*y
	t2acc
	CALL sm2SqrInternal(SB)	// 4*yˆ2
	ST (s)	// s = 4*x*yˆ2 is finished below; here s holds 4*yˆ2
	CALL sm2SqrInternal(SB)	// 16*yˆ4
	// Divide by 2: if the value is odd, add p first so the shift is exact.
	XORQ mul0, mul0
	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3

	ADDQ $-1, acc4
	ADCQ sm2const0<>(SB), acc5
	ADCQ $-1, acc6
	ADCQ sm2const1<>(SB), acc7
	ADCQ $0, mul0
	TESTQ $1, t0

	// Keep the un-added value when the low bit was 0 (even input).
	CMOVQEQ t0, acc4
	CMOVQEQ t1, acc5
	CMOVQEQ t2, acc6
	CMOVQEQ t3, acc7
	ANDQ t0, mul0

	// 257-bit right shift by one across the limb chain (legacy colon
	// syntax for the double-width shift).
	SHRQ $1, acc4:acc5
	SHRQ $1, acc5:acc6
	SHRQ $1, acc6:acc7
	SHRQ $1, acc7:mul0
	ST (y)	// y-slot now holds 8*yˆ4
	/////////////////////////
	LDacc (x)
	LDt (s)
	CALL sm2MulInternal(SB)	// s = 4*x*yˆ2
	ST (s)
	sm2MulBy2Inline	// 2*s
	STt (tmp)

	LDacc (m)
	CALL sm2SqrInternal(SB)	// mˆ2
	LDt (tmp)
	CALL sm2SubInternal(SB)	// x3 = mˆ2 - 2*s

	MOVQ rptr, AX
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	acc2t
	LDacc (s)
	CALL sm2SubInternal(SB)	// s - x3

	LDt (m)
	CALL sm2MulInternal(SB)	// m * (s - x3)

	LDt (y)
	CALL sm2SubInternal(SB)	// y3 = m*(s - x3) - 8*yˆ4
	MOVQ rptr, AX
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	///////////////////////
	// Clear the saved pointer so stale stack data is not left behind.
	MOVQ $0, rptr

	RET
/* ---------------------------------------*/