// github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_common_amd64.s
//
// AMD64 assembly for the SM2 P-256 curve: byte-order conversion,
// constant-time conditional move / negate / table lookup, and Montgomery
// multiplication and reduction modulo the field prime p and the group
// order n. All data-dependent paths are branchless (CMOV / SIMD masking)
// so execution time does not depend on secret values.
//
// NOTE(review): the register aliases used below (res_ptr, x_ptr, y_ptr,
// acc0..acc5, t0), the data symbols p256p<>, p256ord<>, p256ordK0<>, and
// the macros p256PrimReduce / p256OrdReduceInline are not defined in this
// file — presumably they come from the included p256_macros_amd64.s;
// confirm there when reading this file in isolation.

//go:build !purego

#include "textflag.h"

#include "p256_macros_amd64.s"

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// The conversion (byte-swap each 64-bit limb, reverse limb order) is its
// own inverse, so all four conversion entry points below jump to the one
// implementation in p256BigToLittle.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Converts a 32-byte big-endian value into four little-endian 64-bit
// limbs (and, being an involution, vice versa): each quadword is
// byte-swapped and the quadword order is reversed.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	// Store in reversed limb order.
	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *SM2P256Point, cond int)
//
// Constant-time point select: res = b if cond == 0, res = a otherwise.
// A point is 96 bytes (three 4-limb coordinates); both the SSE2 and the
// AVX2 path compute res = (a AND NOT mask) XOR (b AND mask) with
// mask = (cond == 0) ? all-ones : all-zeros, so no branch depends on cond.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ move_avx2

	// SSE2 path: broadcast cond to all lanes, then compare against zero
	// to build the selection mask in X12.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// X0..X5 = a AND NOT mask (the six 16-byte halves of the point).
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	// X6..X11 = b AND mask, then combine with XOR.
	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET

move_avx2:
	// AVX2 path: same masking scheme with three 32-byte lanes.
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12

	VPANDN (32*0)(x_ptr), Y12, Y0
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2

	VPAND (32*0)(y_ptr), Y12, Y3
	VPAND (32*1)(y_ptr), Y12, Y4
	VPAND (32*2)(y_ptr), Y12, Y5

	VPXOR Y3, Y0, Y0
	VPXOR Y4, Y1, Y1
	VPXOR Y5, Y2, Y2

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)

	VZEROUPPER
	RET

/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
//
// Constant-time conditional negation: val = p - val if cond != 0,
// val unchanged otherwise. The subtraction is always performed and the
// result is selected with CMOV, so timing does not depend on cond.
// Limbs 0 and 2 of the SM2 prime are all-ones, hence the $-1 immediates;
// limbs 1 and 3 are loaded from the p256p<> table.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+8(FP), t0
	// acc = poly
	MOVQ $-1, acc0
	MOVQ p256p<>+0x08(SB), acc1
	MOVQ $-1, acc2
	MOVQ p256p<>+0x18(SB), acc3
	// Load the original value
	MOVQ (8*0)(res_ptr), acc4
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), acc5
	// Speculatively subtract
	SUBQ acc4, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ acc5, acc3
	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc4, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ acc5, acc3
	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET

/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
//
// Montgomery multiplication modulo the SM2 field prime:
// res = in1 * in2 * R^-1 mod p (R = 2^256). Two implementations:
// a portable MULQ/ADCQ path, and a faster MULXQ/ADCXQ/ADOXQ path taken
// when BMI2 (with ADX) is available. Both interleave four rounds of
// schoolbook multiplication by one limb of in2 with four reduction steps;
// each reduction step folds the lowest limb back in using only 32-bit
// shifts and add/sub chains — NOTE(review): this shift-only reduction
// relies on the special form of the SM2 prime encoded in p256p<> /
// p256_macros_amd64.s; confirm the constants there.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr

	CMPB ·supportBMI2+0(SB), $0x01
	JEQ mulBMI2

	// x * y[0]: acc0..acc4 = x * y[0], BX used as carry temp throughout.
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step: fold acc0 into the higher limbs via shifts.
	MOVQ acc0, AX
	MOVQ acc0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc1
	SBBQ DX, acc2
	SBBQ AX, acc3
	MOVQ acc0, AX
	SBBQ DX, acc0

	ADDQ AX, acc1
	ADCQ $0, acc2
	ADCQ $0, acc3
	ADCQ acc0, acc4
	ADCQ $0, acc5

	XORQ acc0, acc0
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc2
	SBBQ DX, acc3
	SBBQ AX, acc4
	MOVQ acc1, AX
	SBBQ DX, acc1

	ADDQ AX, acc2
	ADCQ $0, acc3
	ADCQ $0, acc4
	ADCQ acc1, acc5
	ADCQ $0, acc0

	XORQ acc1, acc1
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc3
	SBBQ DX, acc4
	SBBQ AX, acc5
	MOVQ acc2, AX
	SBBQ DX, acc2

	ADDQ AX, acc3
	ADCQ $0, acc4
	ADCQ $0, acc5
	ADCQ acc2, acc0
	ADCQ $0, acc1

	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc4
	SBBQ DX, acc5
	SBBQ AX, acc0
	MOVQ acc3, AX
	SBBQ DX, acc3

	ADDQ AX, acc4
	ADCQ $0, acc5
	ADCQ $0, acc0
	ADCQ acc3, acc1
	ADCQ $0, acc2

	// Final conditional subtraction of p and store to res (macro).
	MOVQ res+0(FP), res_ptr
	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
	RET

mulBMI2:
	// BMI2/ADX path. res_ptr is zeroed and used as a constant 0 source
	// for the ADCX/ADOX carry chains; the XORQs also clear CF and OF.
	XORQ acc5, acc5
	XORQ res_ptr, res_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), DX
	MULXQ (8*0)(x_ptr), acc0, acc1

	MULXQ (8*1)(x_ptr), AX, acc2
	ADCXQ AX, acc1

	MULXQ (8*2)(x_ptr), AX, acc3
	ADCXQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADCXQ AX, acc3
	ADCXQ acc5, acc4

	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc1
	SBBQ DX, acc2
	SBBQ AX, acc3
	MOVQ acc0, AX
	SBBQ DX, acc0

	ADOXQ AX, acc1
	ADOXQ res_ptr, acc2
	ADOXQ res_ptr, acc3
	ADOXQ acc0, acc4
	ADOXQ res_ptr, acc5

	XORQ acc0, acc0
	// x * y[1]: ADCX and ADOX run two independent carry chains in parallel.
	MOVQ (8*1)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc1

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4
	ADCXQ acc0, BX
	ADOXQ BX, acc5
	ADOXQ res_ptr, acc0

	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc2
	SBBQ DX, acc3
	SBBQ AX, acc4
	MOVQ acc1, AX
	SBBQ DX, acc1

	ADOXQ AX, acc2
	ADOXQ res_ptr, acc3
	ADOXQ res_ptr, acc4
	ADOXQ acc1, acc5
	ADOXQ res_ptr, acc0

	XORQ acc1, acc1
	// x * y[2]
	MOVQ (8*2)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc2

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc4

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc5
	ADCXQ res_ptr, BX
	ADOXQ BX, acc0
	ADOXQ res_ptr, acc1

	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc3
	SBBQ DX, acc4
	SBBQ AX, acc5
	MOVQ acc2, AX
	SBBQ DX, acc2

	ADOXQ AX, acc3
	ADOXQ res_ptr, acc4
	ADOXQ res_ptr, acc5
	ADOXQ acc2, acc0
	ADOXQ res_ptr, acc1

	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc3

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc5

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0
	ADCXQ res_ptr, BX
	ADOXQ BX, acc1
	ADOXQ res_ptr, acc2

	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc4
	SBBQ DX, acc5
	SBBQ AX, acc0
	MOVQ acc3, AX
	SBBQ DX, acc3

	ADOXQ AX, acc4
	ADOXQ res_ptr, acc5
	ADOXQ res_ptr, acc0
	ADOXQ acc3, acc1
	ADOXQ res_ptr, acc2

	// Final conditional subtraction of p and store to res (macro).
	MOVQ res+0(FP), res_ptr
	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
	RET

/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
//
// Converts out of Montgomery form: res = in * R^-1 mod p. Equivalent to a
// Montgomery multiplication by 1, so only the four shift-based reduction
// stages are needed, followed by a constant-time conditional subtraction
// of p (always computed, result selected with CMOV).
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ acc0, AX
	MOVQ acc0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc1
	SBBQ DX, acc2
	SBBQ AX, acc3
	MOVQ acc0, AX
	SBBQ DX, acc0

	ADDQ AX, acc1
	ADCQ $0, acc2
	ADCQ $0, acc3
	ADCQ acc0, acc4

	XORQ acc5, acc5
	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc2
	SBBQ DX, acc3
	SBBQ AX, acc4
	MOVQ acc1, AX
	SBBQ DX, acc5

	ADDQ AX, acc2
	ADCQ $0, acc3
	ADCQ $0, acc4
	ADCQ acc1, acc5

	XORQ acc0, acc0
	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc3
	SBBQ DX, acc4
	SBBQ AX, acc5
	MOVQ acc2, AX
	SBBQ DX, acc2

	ADDQ AX, acc3
	ADCQ $0, acc4
	ADCQ $0, acc5
	ADCQ acc2, acc0

	XORQ acc1, acc1
	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ AX, acc4
	SBBQ DX, acc5
	SBBQ AX, acc0
	MOVQ acc3, AX
	SBBQ DX, acc3

	ADDQ AX, acc4
	ADCQ $0, acc5
	ADCQ $0, acc0
	ADCQ acc3, acc1

	// Save the pre-subtraction value, subtract p, and select the
	// non-negative result branchlessly (CMOVQCS restores on borrow).
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, BX

	SUBQ $-1, acc4
	SBBQ p256p<>+0x08(SB), acc5
	SBBQ $-1, acc0
	SBBQ p256p<>+0x018(SB), acc1 // NOTE(review): 0x018 == 0x18, same offset as used elsewhere

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS BX, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
//
// Constant-time lookup of entry idx from a table of `limit` 96-byte
// points. Every entry is read and masked; the comparison counter starts
// at 1, so entries are numbered 1..limit and idx == 0 selects nothing,
// leaving res all-zero.
TEXT ·p256Select(SB),NOSPLIT,$0
	//MOVQ idx+16(FP),AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ select_avx2

	PXOR X15, X15    // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15   // X15 = 1
	MOVL idx+16(FP), X14
	PSHUFD $0, X14, X14

	// Zero the accumulator registers for the selected point.
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ limit+24(FP), AX

	MOVOU X15, X13 // X13 = running entry counter, starts at 1

loop_select:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12 // X12 = (counter == idx) ? all-ones : 0

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET

select_avx2:
	// AVX2 path: identical scheme with three 32-byte lanes per entry.
	VPXOR Y15, Y15, Y15
	VPCMPEQD Y14, Y14, Y14
	VPSUBD Y14, Y15, Y15 // Y15 = 1
	VPBROADCASTD idx+16(FP), Y14

	MOVQ limit+24(FP), AX
	VMOVDQU Y15, Y13

	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2

loop_select_avx2:
	VPCMPEQD Y14, Y13, Y12
	VPADDD Y15, Y13, Y13

	VPAND (32*0)(DI), Y12, Y3
	VPAND (32*1)(DI), Y12, Y4
	VPAND (32*2)(DI), Y12, Y5

	ADDQ $(32*3), DI

	VPXOR Y3, Y0, Y0
	VPXOR Y4, Y1, Y1
	VPXOR Y5, Y2, Y2

	DECQ AX
	JNE loop_select_avx2

	VMOVDQU Y0, (32*0)(DX)
	VMOVDQU Y1, (32*1)(DX)
	VMOVDQU Y2, (32*2)(DX)
	VZEROUPPER
	RET

/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Constant-time lookup of a 64-byte affine point from a 32-entry table.
// Processes two entries per iteration over a fixed 16 iterations; as in
// p256Select, the counter starts at 1 so idx == 0 yields an all-zero res.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ select_base_avx2

	PXOR X15, X15        // X15 = 0
	PCMPEQL X14, X14     // X14 = -1
	PSUBL X14, X15       // X15 = 1
	MOVL idx+16(FP), X14 // x14 = idx
	PSHUFD $0, X14, X14

	MOVQ $16, AX // 16 iterations x 2 entries = 32 table entries
	MOVOU X15, X13

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3

loop_select_base:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	// First entry of the pair (64 bytes) ...
	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	// ... and the second entry.
	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	// Recompute the mask for the second entry of the pair.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET

select_base_avx2:
	VPXOR Y15, Y15, Y15
	VPCMPEQD Y14, Y14, Y14
	VPSUBD Y14, Y15, Y15
	VPBROADCASTD idx+16(FP), Y14

	MOVQ $16, AX
	VMOVDQU Y15, Y13
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1

loop_select_base_avx2:
	VPCMPEQD Y14, Y13, Y12
	VPADDD Y15, Y13, Y13

	VPAND (32*0)(DI), Y12, Y2
	VPAND (32*1)(DI), Y12, Y3

	VPCMPEQD Y14, Y13, Y12
	VPADDD Y15, Y13, Y13

	VPAND (32*2)(DI), Y12, Y4
	VPAND (32*3)(DI), Y12, Y5

	ADDQ $(32*4), DI

	VPXOR Y2, Y0, Y0
	VPXOR Y3, Y1, Y1

	VPXOR Y4, Y0, Y0
	VPXOR Y5, Y1, Y1

	DECQ AX
	JNE loop_select_base_avx2

	VMOVDQU Y0, (32*0)(DX)
	VMOVDQU Y1, (32*1)(DX)
	VZEROUPPER
	RET

//func p256OrdReduce(s *p256OrdElement)
//
// In-place conditional reduction of s modulo the group order n, delegated
// to the p256OrdReduceInline macro with a zero top limb (acc4).
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
	MOVQ s+0(FP), res_ptr
	MOVQ (8*0)(res_ptr), acc0
	MOVQ (8*1)(res_ptr), acc1
	MOVQ (8*2)(res_ptr), acc2
	MOVQ (8*3)(res_ptr), acc3
	XORQ acc4, acc4
	p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr)
	RET

// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Montgomery multiplication modulo the group order n:
// res = in1 * in2 * R^-1 mod n (R = 2^256). Unlike the field prime, the
// order has no all-special form, so each reduction step computes the
// Montgomery factor t0 = acc_i * k0 (k0 = p256ordK0<>, presumably
// -n^-1 mod 2^64 — confirm in the macro file) and multiplies it by the
// limbs of n from p256ord<>, with a shift shortcut for the middle limbs.
// Same two-path structure as p256Mul: portable MULQ vs. BMI2/ADX.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ ordMulBMI2

	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0 // t0 = Montgomery factor for this step

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ t0, acc0
	MOVQ t0, AX
	MOVQ t0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ t0, acc2
	SBBQ AX, acc3
	SBBQ DX, acc0

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ DX, acc2
	ADCQ $0, acc3
	ADCQ acc0, acc4
	ADCQ $0, acc5

	XORQ acc0, acc0 // It seems this line is optional.
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ t0, acc1
	MOVQ t0, AX
	MOVQ t0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ t0, acc3
	SBBQ AX, acc4
	SBBQ DX, acc1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ DX, acc3
	ADCQ $0, acc4
	ADCQ acc1, acc5
	ADCQ $0, acc0

	XORQ acc1, acc1 // It seems this line is optional.
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ t0, acc2
	MOVQ t0, AX
	MOVQ t0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ t0, acc4
	SBBQ AX, acc5
	SBBQ DX, acc2

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	ADCQ acc2, acc0
	ADCQ $0, acc1

	XORQ acc2, acc2 // It seems this line is optional.
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ t0, acc3
	MOVQ t0, AX
	MOVQ t0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	SUBQ t0, acc5
	SBBQ AX, acc0
	SBBQ DX, acc3

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	ADCQ acc3, acc1
	ADCQ $0, acc2

	// Final conditional subtraction of n and store to res (macro).
	MOVQ res+0(FP), res_ptr
	p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)

	RET

ordMulBMI2:
	// BMI2/ADX path; res_ptr doubles as the constant 0 for carry chains.
	XORQ acc5, acc5
	XORQ res_ptr, res_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), DX
	MULXQ (8*0)(x_ptr), acc0, acc1

	MULXQ (8*1)(x_ptr), AX, acc2
	ADCXQ AX, acc1

	MULXQ (8*2)(x_ptr), AX, acc3
	ADCXQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADCXQ AX, acc3
	ADCXQ acc5, acc4

	// First reduction step
	MOVQ acc0, DX
	MULXQ p256ordK0<>(SB), DX, AX

	MULXQ p256ord<>+0x00(SB), AX, t0
	ADOXQ AX, acc0

	MULXQ p256ord<>+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc1

	MULXQ p256ord<>+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc2

	MULXQ p256ord<>+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	ADCXQ res_ptr, BX
	ADOXQ BX, acc4
	ADOXQ res_ptr, acc5
	XORQ acc0, acc0 // It seems this line is optional.

	// x * y[1]
	MOVQ (8*1)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc1

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	ADCXQ acc0, BX
	ADOXQ BX, acc5
	ADOXQ res_ptr, acc0

	// Second reduction step
	MOVQ acc1, DX
	MULXQ p256ordK0<>(SB), DX, AX

	MULXQ p256ord<>+0x00(SB), AX, t0
	ADOXQ AX, acc1

	MULXQ p256ord<>+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ p256ord<>+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ p256ord<>+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	ADCXQ res_ptr, BX
	ADOXQ BX, acc5
	ADOXQ res_ptr, acc0
	XORQ acc1, acc1 // It seems this line is optional.

	// x * y[2]
	MOVQ (8*2)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc2

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc4

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc5

	ADCXQ res_ptr, BX
	ADOXQ BX, acc0
	ADOXQ res_ptr, acc1

	// Third reduction step
	MOVQ acc2, DX
	MULXQ p256ordK0<>(SB), DX, AX

	MULXQ p256ord<>+0x00(SB), AX, t0
	ADOXQ AX, acc2

	MULXQ p256ord<>+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ p256ord<>+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc4

	MULXQ p256ord<>+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc5

	ADCXQ res_ptr, BX
	ADOXQ BX, acc0
	ADOXQ res_ptr, acc1
	XORQ acc2, acc2 // It seems this line is optional.

	// x * y[3]
	MOVQ (8*3)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc3

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc5

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0

	ADCXQ res_ptr, BX
	ADOXQ BX, acc1
	ADOXQ res_ptr, acc2

	// Last reduction step
	MOVQ acc3, DX
	MULXQ p256ordK0<>(SB), DX, AX

	MULXQ p256ord<>+0x00(SB), AX, t0
	ADOXQ AX, acc3

	MULXQ p256ord<>+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	MULXQ p256ord<>+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc5

	MULXQ p256ord<>+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0

	ADCXQ res_ptr, BX
	ADOXQ BX, acc1
	ADOXQ res_ptr, acc2

	// Final conditional subtraction of n and store to res (macro).
	MOVQ res+0(FP), res_ptr
	p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)

	RET