// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register aliases shared by the exported field routines below.
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

// p256const0/p256const1 are the two non-trivial 64-bit limbs of the P-256
// prime p (its other limbs are all-ones and zero; see the "acc = poly" load
// in p256NegCond below). p256ord holds the four limbs of the group order n,
// and p256ordK0 is the Montgomery reduction multiplier used with it
// (presumably -n^-1 mod 2^64 — standard Montgomery; confirm against the Go
// source this fork derives from). p256one is presumably 1 in the Montgomery
// domain; it is not referenced in this chunk.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
// The conversion (byte-swap each limb + reverse limb order) is its own
// inverse, so both directions share p256BigToLittle's body.
TEXT ·p256LittleToBig(SB),18,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
TEXT ·p256BigToLittle(SB),18,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	// Byte-swap each 64-bit limb...
	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	// ...and reverse the limb order.
	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0 res=b, else res=a
// Constant time: both inputs are always read; the selection is done with
// an all-ones/all-zeros mask (PAND/PANDN), never a branch.
TEXT ·p256MovCond(SB),18,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+24(FP), x_ptr
	MOVQ b+48(FP), y_ptr
	MOVQ cond+72(FP), X12

	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12                // X12 = (cond == 0) ? all-ones : all-zeros

	// res = (a AND NOT mask) ...
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	// ... XOR (b AND mask)
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val []uint64, cond int)
// val = p - val if cond != 0, else val is left unchanged. The subtraction
// is always performed; CMOV selects the result, keeping this constant time.
TEXT ·p256NegCond(SB),18,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+24(FP), t0
	// acc = poly (the P-256 prime, limbs: -1, p256const0, 0, p256const1)
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	// Load the original value
	MOVQ (8*0)(res_ptr), acc5
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1
	// Speculatively subtract
	SUBQ acc5, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3
	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc5, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3
	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in []uint64, n int)
// Montgomery square, repeated n times: res = in^(2^n) in the Montgomery
// domain. Cross products are computed once and doubled, then the squares
// are added and the 512-bit result is reduced mod p.
TEXT ·p256Sqr(SB),18,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVQ n+48(FP), BX

sqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2 (double all cross products)
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products (the squares y[i]*y[i] on the diagonal)
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc0
	ADCQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2
	// Last reduction step
	XORQ t0, t0
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc0
	ADCQ t1, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3
	// Add bits [511:256] of the sqr result (carry from the ADCQ above
	// is live here: MOVQ does not touch flags)
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256
	SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
	SBBQ $0, acc2
	SBBQ p256const1<>(SB), acc3
	SBBQ $0, t0

	// If the subtraction borrowed, the pre-subtraction value was < p: keep it.
	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	MOVQ res_ptr, x_ptr            // next iteration squares the result in place
	DECQ BX
	JNE sqrLoop

	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 []uint64)
// Montgomery multiplication mod p: schoolbook 4x4 limb product interleaved
// with four reduction steps, followed by a final conditional subtraction.
TEXT ·p256Mul(SB),18,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	XORQ acc0, acc0
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	XORQ acc1, acc1
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1
	SBBQ $0, acc2

	// Borrow => pre-subtraction value was already < p: keep it.
	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
// Converts out of the Montgomery domain: a Montgomery multiplication by 1,
// i.e. four reduction steps with no limb products, then a conditional
// subtraction of p.
TEXT ·p256FromMont(SB),18,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5
	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	XORQ acc0, acc0
	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	XORQ acc1, acc1
	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1

	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	// Conditional subtraction of p; borrow => keep pre-subtraction value.
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point, table []uint64, idx int)
// Scans all 16 entries unconditionally; the entry whose running counter
// equals idx is accumulated via mask-and-XOR, so memory access pattern is
// independent of idx.
TEXT ·p256Select(SB),18,$0
	MOVQ idx+48(FP),AX
	MOVQ table+24(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15    // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15   // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $16, AX

	MOVOU X15, X13

loop_select:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12               // X12 = (counter == idx) ? mask : 0

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET
/*
---------------------------------------*/
// Constant time point access to base point table.
// func p256SelectBase(point, table []uint64, idx int)
// Like p256Select but for affine (x, y only) base-point entries: 32 table
// entries scanned in 16 iterations, two entries per pass.
TEXT ·p256SelectBase(SB),18,$0
	MOVQ idx+48(FP),AX
	MOVQ table+24(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15    // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15   // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $16, AX

	MOVOU X15, X13

loop_select_base:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12               // mask for the first entry of this pass

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12               // mask for the second entry of this pass

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 []uint64)
// Montgomery multiplication modulo the group order n (p256ord), using the
// multiplier p256ordK0 in each reduction step.
TEXT ·p256OrdMul(SB),18,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0                    // t0 = Montgomery factor for this step

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256ord (the group order, NOT the field prime)
	SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
	SBBQ p256ord<>+0x10(SB), acc0
	SBBQ p256ord<>+0x18(SB), acc1
	SBBQ $0, acc2

	// Borrow => pre-subtraction value was already < n: keep it.
	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in []uint64, n int)
// Montgomery square modulo the group order, repeated n times. The reduction
// steps exploit the shape of the order's upper limbs (0xffff...ffff and
// 0xffffffff00000000) via add/sub and 32-bit shifts instead of full
// multiplications by those limbs.
TEXT ·p256OrdSqr(SB),18,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVQ n+48(FP), BX

ordSqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2 (double all cross products)
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products (the squares on the diagonal)
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1

	// ord[2] = 0xffff...ffff: multiply-by-(-1) done as add then subtract.
	MOVQ t0, t1
	ADCQ DX, acc2
	ADCQ $0, t1
	SUBQ t0, acc2
	SBBQ $0, t1

	// ord[3] = 0xffffffff00000000: product built from 32-bit shifts of t0.
	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc0
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc3
	ADCQ $0, acc0
	SUBQ AX, acc3
	SBBQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2

	MOVQ t0, t1
	ADCQ DX, acc3
	ADCQ $0, t1
	SUBQ t0, acc3
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc1
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc0
	ADCQ $0, acc1
	SUBQ AX, acc0
	SBBQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3

	MOVQ t0, t1
	ADCQ DX, acc0
	ADCQ $0, t1
	SUBQ t0, acc0
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc2
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc1
	ADCQ $0, acc2
	SUBQ AX, acc1
	SBBQ DX, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1                    // NOTE(review): dead store — t1 is
	                               // overwritten on the next line; the extra
	                               // ADCQ above already folded the carry
	                               // into DX, so behavior is unaffected.

	MOVQ t0, t1
	ADCQ DX, acc1
	ADCQ $0, t1
	SUBQ t0, acc1
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc3
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc2
	ADCQ $0, acc3
	SUBQ AX, acc2
	SBBQ DX, acc3
	XORQ t0, t0
	// Add bits [511:256] of the sqr result (XORQ cleared CF, so the first
	// ADCQ here is a plain add)
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256ord (the group order, NOT the field prime)
	SUBQ p256ord<>+0x00(SB), acc0
	SBBQ p256ord<>+0x08(SB), acc1
	SBBQ p256ord<>+0x10(SB), acc2
	SBBQ p256ord<>+0x18(SB), acc3
	SBBQ $0, t0

	// Borrow => pre-subtraction value was already < n: keep it.
	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	MOVQ res_ptr, x_ptr            // next iteration squares the result in place
	DECQ BX
	JNE ordSqrLoop

	RET
/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
// Register convention for the internal (non-ABI) helpers below:
//   inputs:  acc4-acc7 (first operand) and t0-t3 (second operand)
//   output:  acc4-acc7
//   scratch: acc0-acc3, mul0/mul1 (AX/DX), hlp (BP)
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
// p256SubInternal: acc4..acc7 = (acc4..acc7 - t0..t3) mod p.
// Computes the raw difference, speculatively adds p back, then selects the
// raw difference when there was no borrow (ANDQ sets ZF from the borrow bit).
TEXT p256SubInternal(SB),18,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0                  // mul0 = -1 on borrow, else 0

	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	// Speculatively add p back
	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ANDQ $1, mul0                  // ZF set iff there was no borrow

	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
/* ---------------------------------------*/
// p256MulInternal: acc4..acc7 = (acc4..acc7 * t0..t3) mod p, Montgomery.
// Full 4x4 schoolbook product into acc0..acc7, four reduction steps, add
// the high half, then conditional subtraction of p.
TEXT p256MulInternal(SB),18,$0
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	MOVQ $0, BP                    // clear hlp; MOVQ preserves the live carry
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
/* ---------------------------------------*/
// p256SqrInternal: acc4..acc7 = (acc4..acc7)^2 mod p, Montgomery.
// Same structure as p256Sqr but operating entirely in registers; the high
// half accumulates in t0..t3.
TEXT p256SqrInternal(SB),18,$0

	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2 (double all cross products)
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products (the squares on the diagonal)
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	MOVQ $0, BP                    // clear hlp; MOVQ preserves the live carry
	// Add bits [511:256] of the result
	ADCQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
/* ---------------------------------------*/
// p256MulBy2Inline: t0..t3 = 2*(acc4..acc7) mod p (acc4..acc7 preserved).
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
// p256AddInline: t0..t3 = (acc4..acc7 + t0..t3) mod p (acc4..acc7 clobbered
// with the raw sum).
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
// Load/store helpers between memory and the internal register convention.
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
// STt stores [t0..t3] into a 256-bit stack slot.
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
// acc2t / t2acc copy between the acc4..acc7 and t0..t3 register quadruples.
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
// Stack frame layout for p256PointAddAffineAsm: one 32-byte (4x64-bit)
// slot per field element, followed by the saved result pointer and the
// 32-bit sel/zero flags.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)

// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
// Mixed Jacobian + affine point addition. y2 is conditionally negated
// according to sign, and the final result is conditionally replaced by
// in1 (when zero == 0) or by (x2, y2, 1) (when sel == 0) using branch-free
// SIMD masking, so the memory/instruction trace is input-independent.
TEXT ·p256PointAddAffineAsm(SB),0,$512-96
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX
	MOVQ sign+72(FP), DX
	MOVQ sel+80(FP), t1
	MOVQ zero+88(FP), t2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	// Store pointer to result
	// NOTE(review): mul0 is a register alias defined earlier in the file
	// (outside this excerpt); from the surrounding code it aliases AX,
	// which still holds res — confirm against the alias definitions.
	MOVQ mul0, rptr
	MOVL t1, sel_save
	MOVL t2, zero_save
	// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4
	MOVQ (16*2 + 8*1)(CX), acc5
	MOVQ (16*2 + 8*2)(CX), acc6
	MOVQ (16*2 + 8*3)(CX), acc7
	// [acc0..acc3] = p256 = {-1, p256const0, 0, p256const1}
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	XORQ mul0, mul0
	// Speculatively subtract: p - y2
	SUBQ acc4, acc0
	SBBQ acc5, acc1
	SBBQ acc6, acc2
	SBBQ acc7, acc3
	SBBQ $0, mul0
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	// Add in case the operand was > p256
	ADDQ $-1, acc0
	ADCQ p256const0<>(SB), acc1
	ADCQ $0, acc2
	ADCQ p256const1<>(SB), acc3
	ADCQ $0, mul0
	// Keep the un-corrected difference when no borrow propagated (mul0 != 0
	// after the add-back sets ZF accordingly).
	CMOVQNE t0, acc0
	CMOVQNE t1, acc1
	CMOVQNE t2, acc2
	CMOVQNE t3, acc3
	// If condition is 0, keep original value
	TESTQ DX, DX
	CMOVQEQ acc4, acc0
	CMOVQEQ acc5, acc1
	CMOVQEQ acc6, acc2
	CMOVQEQ acc7, acc3
	// Store result
	MOVQ acc0, y2in(8*0)
	MOVQ acc1, y2in(8*1)
	MOVQ acc2, y2in(8*2)
	MOVQ acc3, y2in(8*3)
	// Begin point add
	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)

	LDt (x2in)
	CALL p256MulInternal(SB)	// x2 * z1ˆ2

	LDt (x1in)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)

	LDt (z1in)
	CALL p256MulInternal(SB)	// z3 = h * z1
	ST (zout)

	LDacc (z1sqr)
	CALL p256MulInternal(SB)	// z1ˆ3

	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
	ST (s2)

	LDt (y1in)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)

	CALL p256SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (y1in)
	CALL p256MulInternal(SB)	// y1 * hˆ3
	ST (s2)

	LDacc (x1in)
	LDt (hsqr)
	CALL p256MulInternal(SB)	// u1 * hˆ2
	ST (h)

	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (h)
	CALL p256SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r*(u1*hˆ2 - x3) - y1*hˆ3
	ST (yout)
	// Load stored values from stack
	MOVQ rptr, AX
	MOVL sel_save, BX
	MOVL zero_save, CX
	// The result is not valid if (sel == 0), conditional choose
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	MOVL BX, X6
	MOVL CX, X7

	// Build all-zero (X8) and all-ones (X9) masks, then broadcast and
	// compare so X6 = (sel == 0) ? ~0 : 0 per lane, X7 likewise for zero.
	PXOR X8, X8
	PCMPEQL X9, X9

	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	PCMPEQL X8, X6
	PCMPEQL X8, X7

	MOVOU X6, X15
	PANDN X9, X15	// X15 = ~X6, i.e. keep-computed-result mask

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	// Merge: result = (computed & ~mask) | (in1 & mask)
	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0, select (x2, y2, 1) instead
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13	// Montgomery representation of 1 (z = 1)
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	MOVQ $0, rptr	// clear the saved pointer from the stack

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save

// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
// Both 0 and p are tested, since field elements here may be non-reduced
// representatives of zero.
TEXT p256IsZero(SB),18,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	// (acc6 needs no XOR: the third 64-bit word of p is zero.)
	XORQ $-1, acc4
	XORQ p256const0<>(SB), acc5
	XORQ p256const1<>(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET

/* ---------------------------------------*/
// Stack frame layout for p256PointAddAsm.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)

//func p256PointAddAsm(res, in1, in2 []uint64) int
// Full Jacobian + Jacobian point addition. Returns 1 when the two inputs
// represent the same point (r == 0 and h == 0), in which case the caller
// must use doubling instead; the addition formula is degenerate there.
TEXT ·p256PointAddAsm(SB),0,$680-80
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point add
	LDacc (z2in)
	CALL p256SqrInternal(SB)	// z2ˆ2
	ST (z2sqr)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z2ˆ3
	LDt (y1in)
	CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
	ST (s1)

	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)
	LDt (z1in)
	CALL p256MulInternal(SB)	// z1ˆ3
	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
	ST (s2)

	LDt (s1)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)
	CALL p256IsZero(SB)
	MOVQ AX, points_eq	// remember whether s1 == s2

	LDacc (z2sqr)
	LDt (x1in)
	CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
	ST (u1)
	LDacc (z1sqr)
	LDt (x2in)
	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
	ST (u2)

	LDt (u1)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)
	CALL p256IsZero(SB)
	ANDQ points_eq, AX	// points equal iff r == 0 AND h == 0
	MOVQ AX, points_eq

	LDacc (r)
	CALL p256SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (s1)
	CALL p256MulInternal(SB)	// s1 * hˆ3
	ST (s2)

	LDacc (z1in)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z1 * z2
	LDt (h)
	CALL p256MulInternal(SB)	// z1 * z2 * h
	ST (zout)

	LDacc (hsqr)
	LDt (u1)
	CALL p256MulInternal(SB)	// hˆ2 * u1
	ST (u2)

	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (u2)
	CALL p256SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r*(u1*hˆ2 - x3) - s1*hˆ3
	ST (yout)

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+72(FP)

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
// Stack frame layout for p256PointDoubleAsm.
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

//func p256PointDoubleAsm(res, in []uint64)
// Jacobian point doubling using the a = -3 curve shortcut:
// m = 3*(x - zˆ2)*(x + zˆ2).
TEXT ·p256PointDoubleAsm(SB),18,$256-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+24(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x(16*0)
	MOVOU X1, x(16*1)
	MOVOU X2, y(16*0)
	MOVOU X3, y(16*1)
	MOVOU X4, z(16*0)
	MOVOU X5, z(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	LDacc (z)
	CALL p256SqrInternal(SB)	// zˆ2
	ST (zsqr)

	LDt (x)
	p256AddInline	// x + zˆ2, result in t0..t3
	STt (m)

	LDacc (z)
	LDt (y)
	CALL p256MulInternal(SB)	// y * z
	p256MulBy2Inline	// z3 = 2*y*z
	MOVQ rptr, AX
	// Store z
	MOVQ t0, (16*4 + 8*0)(AX)
	MOVQ t1, (16*4 + 8*1)(AX)
	MOVQ t2, (16*4 + 8*2)(AX)
	MOVQ t3, (16*4 + 8*3)(AX)

	LDacc (x)
	LDt (zsqr)
	CALL p256SubInternal(SB)	// x - zˆ2
	LDt (m)
	CALL p256MulInternal(SB)	// (x - zˆ2) * (x + zˆ2)
	ST (m)
	// Multiply by 3
	p256MulBy2Inline
	LDacc (m)
	p256AddInline	// m = 3*(x - zˆ2)*(x + zˆ2)
	STt (m)
	////////////////////////
	LDacc (y)
	p256MulBy2Inline	// 2*y
	t2acc
	CALL p256SqrInternal(SB)	// s = (2*y)ˆ2 = 4*yˆ2
	ST (s)
	CALL p256SqrInternal(SB)	// sˆ2 = 16*yˆ4
	// Divide by 2: if odd, add p first so the shift stays exact; the
	// carry out of the add-back is kept in mul0 and shifted in on top.
	XORQ mul0, mul0
	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3

	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ADCQ $0, mul0
	TESTQ $1, t0

	// Even input: keep the original value (ZF set by TESTQ).
	CMOVQEQ t0, acc4
	CMOVQEQ t1, acc5
	CMOVQEQ t2, acc6
	CMOVQEQ t3, acc7
	ANDQ t0, mul0	// keep the top carry only when the input was odd

	// 256-bit right shift by one, funneling each word's low bit down.
	SHRQ $1, acc5, acc4
	SHRQ $1, acc6, acc5
	SHRQ $1, acc7, acc6
	SHRQ $1, mul0, acc7
	ST (y)	// y3 partial: 8*yˆ4
	/////////////////////////
	LDacc (x)
	LDt (s)
	CALL p256MulInternal(SB)	// s = x * 4*yˆ2
	ST (s)
	p256MulBy2Inline	// 2*s
	STt (tmp)

	LDacc (m)
	CALL p256SqrInternal(SB)	// mˆ2
	LDt (tmp)
	CALL p256SubInternal(SB)	// x3 = mˆ2 - 2*s

	MOVQ rptr, AX
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	acc2t
	LDacc (s)
	CALL p256SubInternal(SB)	// s - x3

	LDt (m)
	CALL p256MulInternal(SB)	// m * (s - x3)

	LDt (y)
	CALL p256SubInternal(SB)	// y3 = m*(s - x3) - 8*yˆ4
	MOVQ rptr, AX
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	///////////////////////
	MOVQ $0, rptr	// clear the saved pointer from the stack

	RET
/* ---------------------------------------*/