github.com/euank/go@v0.0.0-20160829210321-495514729181/src/crypto/elliptic/p256_asm_amd64.s (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file contains constant-time, 64-bit assembly implementation of 6 // P256. The optimizations performed here are described in detail in: 7 // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with 8 // 256-bit primes" 9 // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x 10 // https://eprint.iacr.org/2013/816.pdf 11 12 #include "textflag.h" 13 14 #define res_ptr DI 15 #define x_ptr SI 16 #define y_ptr CX 17 18 #define acc0 R8 19 #define acc1 R9 20 #define acc2 R10 21 #define acc3 R11 22 #define acc4 R12 23 #define acc5 R13 24 #define t0 R14 25 #define t1 R15 26 27 DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff 28 DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 29 DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f 30 DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 31 DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84 32 DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff 33 DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 34 DATA p256one<>+0x00(SB)/8, $0x0000000000000001 35 DATA p256one<>+0x08(SB)/8, $0xffffffff00000000 36 DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff 37 DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe 38 GLOBL p256const0<>(SB), 8, $8 39 GLOBL p256const1<>(SB), 8, $8 40 GLOBL p256ordK0<>(SB), 8, $8 41 GLOBL p256ord<>(SB), 8, $32 42 GLOBL p256one<>(SB), 8, $32 43 44 /* ---------------------------------------*/ 45 // func p256LittleToBig(res []byte, in []uint64) 46 TEXT ·p256LittleToBig(SB),NOSPLIT,$0 47 JMP ·p256BigToLittle(SB) 48 /* ---------------------------------------*/ 49 // func p256BigToLittle(res []uint64, in []byte) 50 TEXT ·p256BigToLittle(SB),NOSPLIT,$0 51 MOVQ res+0(FP), res_ptr 52 MOVQ in+24(FP), x_ptr 53 54 MOVQ (8*0)(x_ptr), acc0 55 MOVQ (8*1)(x_ptr), acc1 56 MOVQ (8*2)(x_ptr), acc2 57 MOVQ (8*3)(x_ptr), acc3 58 59 BSWAPQ acc0 60 BSWAPQ acc1 61 BSWAPQ acc2 62 BSWAPQ acc3 63 64 MOVQ acc3, (8*0)(res_ptr) 65 MOVQ acc2, (8*1)(res_ptr) 66 MOVQ acc1, (8*2)(res_ptr) 67 MOVQ acc0, (8*3)(res_ptr) 68 69 RET 70 /* ---------------------------------------*/ 71 // func p256MovCond(res, a, b []uint64, cond int) 72 // If cond == 0 res=b, else res=a 73 TEXT ·p256MovCond(SB),NOSPLIT,$0 74 MOVQ res+0(FP), res_ptr 75 MOVQ a+24(FP), x_ptr 76 MOVQ b+48(FP), y_ptr 77 MOVQ cond+72(FP), X12 78 79 PXOR X13, X13 80 PSHUFD $0, X12, X12 81 PCMPEQL X13, X12 82 83 MOVOU X12, X0 84 PANDN (16*0)(x_ptr), X0 85 MOVOU X12, X1 86 PANDN (16*1)(x_ptr), X1 87 MOVOU X12, X2 88 PANDN (16*2)(x_ptr), X2 89 MOVOU X12, X3 90 PANDN (16*3)(x_ptr), X3 91 MOVOU X12, X4 92 PANDN (16*4)(x_ptr), X4 93 MOVOU X12, X5 94 PANDN (16*5)(x_ptr), X5 95 96 MOVOU (16*0)(y_ptr), X6 97 MOVOU (16*1)(y_ptr), X7 98 MOVOU (16*2)(y_ptr), X8 99 MOVOU (16*3)(y_ptr), X9 100 MOVOU (16*4)(y_ptr), X10 101 MOVOU (16*5)(y_ptr), X11 102 103 PAND X12, X6 104 PAND X12, X7 105 PAND X12, X8 106 PAND X12, X9 107 PAND X12, X10 108 PAND X12, X11 109 110 PXOR X6, X0 111 PXOR X7, X1 112 PXOR X8, X2 113 PXOR X9, X3 114 PXOR X10, X4 115 PXOR X11, X5 116 117 MOVOU X0, (16*0)(res_ptr) 118 MOVOU X1, (16*1)(res_ptr) 119 MOVOU X2, (16*2)(res_ptr) 120 MOVOU X3, (16*3)(res_ptr) 121 MOVOU X4, (16*4)(res_ptr) 122 MOVOU X5, (16*5)(res_ptr) 123 124 RET 125 /* ---------------------------------------*/ 126 // func p256NegCond(val []uint64, cond int) 127 TEXT ·p256NegCond(SB),NOSPLIT,$0 128 MOVQ val+0(FP), res_ptr 129 MOVQ cond+24(FP), t0 130 // acc = poly 131 MOVQ $-1, acc0 132 MOVQ p256const0<>(SB), acc1 133 MOVQ $0, acc2 134 MOVQ p256const1<>(SB), acc3 135 // Load the original value 136 MOVQ (8*0)(res_ptr), acc5 137 MOVQ (8*1)(res_ptr), x_ptr 138 MOVQ (8*2)(res_ptr), y_ptr 139 MOVQ (8*3)(res_ptr), t1 140 // Speculatively subtract 141 SUBQ acc5, acc0 142 SBBQ x_ptr, acc1 143 SBBQ y_ptr, acc2 144 SBBQ t1, acc3 145 // If condition is 0, keep original value 146 TESTQ t0, t0 147 CMOVQEQ acc5, acc0 148 CMOVQEQ x_ptr, acc1 149 CMOVQEQ y_ptr, acc2 150 CMOVQEQ t1, acc3 151 // Store result 152 MOVQ acc0, (8*0)(res_ptr) 153 MOVQ acc1, (8*1)(res_ptr) 154 MOVQ acc2, (8*2)(res_ptr) 155 MOVQ acc3, (8*3)(res_ptr) 156 157 RET 158 /* ---------------------------------------*/ 159 // func p256Sqr(res, in []uint64) 160 TEXT ·p256Sqr(SB),NOSPLIT,$0 161 MOVQ res+0(FP), res_ptr 162 MOVQ in+24(FP), x_ptr 163 // y[1:] * y[0] 164 MOVQ (8*0)(x_ptr), t0 165 166 MOVQ (8*1)(x_ptr), AX 167 MULQ t0 168 MOVQ AX, acc1 169 MOVQ DX, acc2 170 171 MOVQ (8*2)(x_ptr), AX 172 MULQ t0 173 ADDQ AX, acc2 174 ADCQ $0, DX 175 MOVQ DX, acc3 176 177 MOVQ (8*3)(x_ptr), AX 178 MULQ t0 179 ADDQ AX, acc3 180 ADCQ $0, DX 181 MOVQ DX, acc4 182 // y[2:] * y[1] 183 MOVQ (8*1)(x_ptr), t0 184 185 MOVQ (8*2)(x_ptr), AX 186 MULQ t0 187 ADDQ AX, acc3 188 ADCQ $0, DX 189 MOVQ DX, t1 190 191 MOVQ (8*3)(x_ptr), AX 192 MULQ t0 193 ADDQ t1, acc4 194 ADCQ $0, DX 195 ADDQ AX, acc4 196 ADCQ $0, DX 197 MOVQ DX, acc5 198 // y[3] * y[2] 199 MOVQ (8*2)(x_ptr), t0 200 201 MOVQ (8*3)(x_ptr), AX 202 MULQ t0 203 ADDQ AX, acc5 204 ADCQ $0, DX 205 MOVQ DX, y_ptr 206 XORQ t1, t1 207 // *2 208 ADDQ acc1, acc1 209 ADCQ acc2, acc2 210 ADCQ acc3, acc3 211 ADCQ acc4, acc4 212 ADCQ acc5, acc5 213 ADCQ y_ptr, y_ptr 214 ADCQ $0, t1 215 // Missing products 216 MOVQ (8*0)(x_ptr), AX 217 MULQ AX 218 MOVQ AX, acc0 219 MOVQ DX, t0 220 221 MOVQ (8*1)(x_ptr), AX 222 MULQ AX 223 ADDQ t0, acc1 224 ADCQ AX, acc2 225 ADCQ $0, DX 226 MOVQ DX, t0 227 228 MOVQ (8*2)(x_ptr), AX 229 MULQ AX 230 ADDQ t0, acc3 231 ADCQ AX, acc4 232 ADCQ $0, DX 233 MOVQ DX, t0 234 235 MOVQ (8*3)(x_ptr), AX 236 MULQ AX 237 ADDQ t0, acc5 238 ADCQ AX, y_ptr 239 ADCQ DX, t1 240 MOVQ t1, x_ptr 241 // First reduction step 242 MOVQ acc0, AX 243 MOVQ acc0, t1 244 SHLQ $32, acc0 245 MULQ p256const1<>(SB) 246 SHRQ $32, t1 247 ADDQ acc0, acc1 248 ADCQ t1, acc2 249 ADCQ AX, acc3 250 ADCQ $0, DX 251 MOVQ DX, acc0 252 // Second reduction step 253 MOVQ acc1, AX 254 MOVQ acc1, t1 255 SHLQ $32, acc1 256 MULQ p256const1<>(SB) 257 SHRQ $32, t1 258 ADDQ acc1, acc2 259 ADCQ t1, acc3 260 ADCQ AX, acc0 261 ADCQ $0, DX 262 MOVQ DX, acc1 263 // Third reduction step 264 MOVQ acc2, AX 265 MOVQ acc2, t1 266 SHLQ $32, acc2 267 MULQ p256const1<>(SB) 268 SHRQ $32, t1 269 ADDQ acc2, acc3 270 ADCQ t1, acc0 271 ADCQ AX, acc1 272 ADCQ $0, DX 273 MOVQ DX, acc2 274 // Last reduction step 275 XORQ t0, t0 276 MOVQ acc3, AX 277 MOVQ acc3, t1 278 SHLQ $32, acc3 279 MULQ p256const1<>(SB) 280 SHRQ $32, t1 281 ADDQ acc3, acc0 282 ADCQ t1, acc1 283 ADCQ AX, acc2 284 ADCQ $0, DX 285 MOVQ DX, acc3 286 // Add bits [511:256] of the sqr result 287 ADCQ acc4, acc0 288 ADCQ acc5, acc1 289 ADCQ y_ptr, acc2 290 ADCQ x_ptr, acc3 291 ADCQ $0, t0 292 293 MOVQ acc0, acc4 294 MOVQ acc1, acc5 295 MOVQ acc2, y_ptr 296 MOVQ acc3, t1 297 // Subtract p256 298 SUBQ $-1, acc0 299 SBBQ p256const0<>(SB) ,acc1 300 SBBQ $0, acc2 301 SBBQ p256const1<>(SB), acc3 302 SBBQ $0, t0 303 304 CMOVQCS acc4, acc0 305 CMOVQCS acc5, acc1 306 CMOVQCS y_ptr, acc2 307 CMOVQCS t1, acc3 308 309 MOVQ acc0, (8*0)(res_ptr) 310 MOVQ acc1, (8*1)(res_ptr) 311 MOVQ acc2, (8*2)(res_ptr) 312 MOVQ acc3, (8*3)(res_ptr) 313 314 RET 315 /* ---------------------------------------*/ 316 // func p256Mul(res, in1, in2 []uint64) 317 TEXT ·p256Mul(SB),NOSPLIT,$0 318 MOVQ res+0(FP), res_ptr 319 MOVQ in1+24(FP), x_ptr 320 MOVQ in2+48(FP), y_ptr 321 // x * y[0] 322 MOVQ (8*0)(y_ptr), t0 323 324 MOVQ (8*0)(x_ptr), AX 325 MULQ t0 326 MOVQ AX, acc0 327 MOVQ DX, acc1 328 329 MOVQ (8*1)(x_ptr), AX 330 MULQ t0 331 ADDQ AX, acc1 332 ADCQ $0, DX 333 MOVQ DX, acc2 334 335 MOVQ (8*2)(x_ptr), AX 336 MULQ t0 337 ADDQ AX, acc2 338 ADCQ $0, DX 339 MOVQ DX, acc3 340 341 MOVQ (8*3)(x_ptr), AX 342 MULQ t0 343 ADDQ AX, acc3 344 ADCQ $0, DX 345 MOVQ DX, acc4 346 XORQ acc5, acc5 347 // First reduction step 348 MOVQ acc0, AX 349 MOVQ acc0, t1 350 SHLQ $32, acc0 351 MULQ p256const1<>(SB) 352 SHRQ $32, t1 353 ADDQ acc0, acc1 354 ADCQ t1, acc2 355 ADCQ AX, acc3 356 ADCQ DX, acc4 357 ADCQ $0, acc5 358 XORQ acc0, acc0 359 // x * y[1] 360 MOVQ (8*1)(y_ptr), t0 361 362 MOVQ (8*0)(x_ptr), AX 363 MULQ t0 364 ADDQ AX, acc1 365 ADCQ $0, DX 366 MOVQ DX, t1 367 368 MOVQ (8*1)(x_ptr), AX 369 MULQ t0 370 ADDQ t1, acc2 371 ADCQ $0, DX 372 ADDQ AX, acc2 373 ADCQ $0, DX 374 MOVQ DX, t1 375 376 MOVQ (8*2)(x_ptr), AX 377 MULQ t0 378 ADDQ t1, acc3 379 ADCQ $0, DX 380 ADDQ AX, acc3 381 ADCQ $0, DX 382 MOVQ DX, t1 383 384 MOVQ (8*3)(x_ptr), AX 385 MULQ t0 386 ADDQ t1, acc4 387 ADCQ $0, DX 388 ADDQ AX, acc4 389 ADCQ DX, acc5 390 ADCQ $0, acc0 391 // Second reduction step 392 MOVQ acc1, AX 393 MOVQ acc1, t1 394 SHLQ $32, acc1 395 MULQ p256const1<>(SB) 396 SHRQ $32, t1 397 ADDQ acc1, acc2 398 ADCQ t1, acc3 399 ADCQ AX, acc4 400 ADCQ DX, acc5 401 ADCQ $0, acc0 402 XORQ acc1, acc1 403 // x * y[2] 404 MOVQ (8*2)(y_ptr), t0 405 406 MOVQ (8*0)(x_ptr), AX 407 MULQ t0 408 ADDQ AX, acc2 409 ADCQ $0, DX 410 MOVQ DX, t1 411 412 MOVQ (8*1)(x_ptr), AX 413 MULQ t0 414 ADDQ t1, acc3 415 ADCQ $0, DX 416 ADDQ AX, acc3 417 ADCQ $0, DX 418 MOVQ DX, t1 419 420 MOVQ (8*2)(x_ptr), AX 421 MULQ t0 422 ADDQ t1, acc4 423 ADCQ $0, DX 424 ADDQ AX, acc4 425 ADCQ $0, DX 426 MOVQ DX, t1 427 428 MOVQ (8*3)(x_ptr), AX 429 MULQ t0 430 ADDQ t1, acc5 431 ADCQ $0, DX 432 ADDQ AX, acc5 433 ADCQ DX, acc0 434 ADCQ $0, acc1 435 // Third reduction step 436 MOVQ acc2, AX 437 MOVQ acc2, t1 438 SHLQ $32, acc2 439 MULQ p256const1<>(SB) 440 SHRQ $32, t1 441 ADDQ acc2, acc3 442 ADCQ t1, acc4 443 ADCQ AX, acc5 444 ADCQ DX, acc0 445 ADCQ $0, acc1 446 XORQ acc2, acc2 447 // x * y[3] 448 MOVQ (8*3)(y_ptr), t0 449 450 MOVQ (8*0)(x_ptr), AX 451 MULQ t0 452 ADDQ AX, acc3 453 ADCQ $0, DX 454 MOVQ DX, t1 455 456 MOVQ (8*1)(x_ptr), AX 457 MULQ t0 458 ADDQ t1, acc4 459 ADCQ $0, DX 460 ADDQ AX, acc4 461 ADCQ $0, DX 462 MOVQ DX, t1 463 464 MOVQ (8*2)(x_ptr), AX 465 MULQ t0 466 ADDQ t1, acc5 467 ADCQ $0, DX 468 ADDQ AX, acc5 469 ADCQ $0, DX 470 MOVQ DX, t1 471 472 MOVQ (8*3)(x_ptr), AX 473 MULQ t0 474 ADDQ t1, acc0 475 ADCQ $0, DX 476 ADDQ AX, acc0 477 ADCQ DX, acc1 478 ADCQ $0, acc2 479 // Last reduction step 480 MOVQ acc3, AX 481 MOVQ acc3, t1 482 SHLQ $32, acc3 483 MULQ p256const1<>(SB) 484 SHRQ $32, t1 485 ADDQ acc3, acc4 486 ADCQ t1, acc5 487 ADCQ AX, acc0 488 ADCQ DX, acc1 489 ADCQ $0, acc2 490 // Copy result [255:0] 491 MOVQ acc4, x_ptr 492 MOVQ acc5, acc3 493 MOVQ acc0, t0 494 MOVQ acc1, t1 495 // Subtract p256 496 SUBQ $-1, acc4 497 SBBQ p256const0<>(SB) ,acc5 498 SBBQ $0, acc0 499 SBBQ p256const1<>(SB), acc1 500 SBBQ $0, acc2 501 502 CMOVQCS x_ptr, acc4 503 CMOVQCS acc3, acc5 504 CMOVQCS t0, acc0 505 CMOVQCS t1, acc1 506 507 MOVQ acc4, (8*0)(res_ptr) 508 MOVQ acc5, (8*1)(res_ptr) 509 MOVQ acc0, (8*2)(res_ptr) 510 MOVQ acc1, (8*3)(res_ptr) 511 512 RET 513 /* ---------------------------------------*/ 514 // func p256FromMont(res, in []uint64) 515 TEXT ·p256FromMont(SB),NOSPLIT,$0 516 MOVQ res+0(FP), res_ptr 517 MOVQ in+24(FP), x_ptr 518 519 MOVQ (8*0)(x_ptr), acc0 520 MOVQ (8*1)(x_ptr), acc1 521 MOVQ (8*2)(x_ptr), acc2 522 MOVQ (8*3)(x_ptr), acc3 523 XORQ acc4, acc4 524 525 // Only reduce, no multiplications are needed 526 // First stage 527 MOVQ acc0, AX 528 MOVQ acc0, t1 529 SHLQ $32, acc0 530 MULQ p256const1<>(SB) 531 SHRQ $32, t1 532 ADDQ acc0, acc1 533 ADCQ t1, acc2 534 ADCQ AX, acc3 535 ADCQ DX, acc4 536 XORQ acc5, acc5 537 // Second stage 538 MOVQ acc1, AX 539 MOVQ acc1, t1 540 SHLQ $32, acc1 541 MULQ p256const1<>(SB) 542 SHRQ $32, t1 543 ADDQ acc1, acc2 544 ADCQ t1, acc3 545 ADCQ AX, acc4 546 ADCQ DX, acc5 547 XORQ acc0, acc0 548 // Third stage 549 MOVQ acc2, AX 550 MOVQ acc2, t1 551 SHLQ $32, acc2 552 MULQ p256const1<>(SB) 553 SHRQ $32, t1 554 ADDQ acc2, acc3 555 ADCQ t1, acc4 556 ADCQ AX, acc5 557 ADCQ DX, acc0 558 XORQ acc1, acc1 559 // Last stage 560 MOVQ acc3, AX 561 MOVQ acc3, t1 562 SHLQ $32, acc3 563 MULQ p256const1<>(SB) 564 SHRQ $32, t1 565 ADDQ acc3, acc4 566 ADCQ t1, acc5 567 ADCQ AX, acc0 568 ADCQ DX, acc1 569 570 MOVQ acc4, x_ptr 571 MOVQ acc5, acc3 572 MOVQ acc0, t0 573 MOVQ acc1, t1 574 575 SUBQ $-1, acc4 576 SBBQ p256const0<>(SB), acc5 577 SBBQ $0, acc0 578 SBBQ p256const1<>(SB), acc1 579 580 CMOVQCS x_ptr, acc4 581 CMOVQCS acc3, acc5 582 CMOVQCS t0, acc0 583 CMOVQCS t1, acc1 584 585 MOVQ acc4, (8*0)(res_ptr) 586 MOVQ acc5, (8*1)(res_ptr) 587 MOVQ acc0, (8*2)(res_ptr) 588 MOVQ acc1, (8*3)(res_ptr) 589 590 RET 591 /* ---------------------------------------*/ 592 // Constant time point access to arbitrary point table. 593 // Indexed from 1 to 15, with -1 offset 594 // (index 0 is implicitly point at infinity) 595 // func p256Select(point, table []uint64, idx int) 596 TEXT ·p256Select(SB),NOSPLIT,$0 597 MOVQ idx+48(FP),AX 598 MOVQ table+24(FP),DI 599 MOVQ point+0(FP),DX 600 601 PXOR X15, X15 // X15 = 0 602 PCMPEQL X14, X14 // X14 = -1 603 PSUBL X14, X15 // X15 = 1 604 MOVL AX, X14 605 PSHUFD $0, X14, X14 606 607 PXOR X0, X0 608 PXOR X1, X1 609 PXOR X2, X2 610 PXOR X3, X3 611 PXOR X4, X4 612 PXOR X5, X5 613 MOVQ $16, AX 614 615 MOVOU X15, X13 616 617 loop_select: 618 619 MOVOU X13, X12 620 PADDL X15, X13 621 PCMPEQL X14, X12 622 623 MOVOU (16*0)(DI), X6 624 MOVOU (16*1)(DI), X7 625 MOVOU (16*2)(DI), X8 626 MOVOU (16*3)(DI), X9 627 MOVOU (16*4)(DI), X10 628 MOVOU (16*5)(DI), X11 629 ADDQ $(16*6), DI 630 631 PAND X12, X6 632 PAND X12, X7 633 PAND X12, X8 634 PAND X12, X9 635 PAND X12, X10 636 PAND X12, X11 637 638 PXOR X6, X0 639 PXOR X7, X1 640 PXOR X8, X2 641 PXOR X9, X3 642 PXOR X10, X4 643 PXOR X11, X5 644 645 DECQ AX 646 JNE loop_select 647 648 MOVOU X0, (16*0)(DX) 649 MOVOU X1, (16*1)(DX) 650 MOVOU X2, (16*2)(DX) 651 MOVOU X3, (16*3)(DX) 652 MOVOU X4, (16*4)(DX) 653 MOVOU X5, (16*5)(DX) 654 655 RET 656 /* ---------------------------------------*/ 657 // Constant time point access to base point table. 658 // func p256SelectBase(point, table []uint64, idx int) 659 TEXT ·p256SelectBase(SB),NOSPLIT,$0 660 MOVQ idx+48(FP),AX 661 MOVQ table+24(FP),DI 662 MOVQ point+0(FP),DX 663 664 PXOR X15, X15 // X15 = 0 665 PCMPEQL X14, X14 // X14 = -1 666 PSUBL X14, X15 // X15 = 1 667 MOVL AX, X14 668 PSHUFD $0, X14, X14 669 670 PXOR X0, X0 671 PXOR X1, X1 672 PXOR X2, X2 673 PXOR X3, X3 674 MOVQ $32, AX 675 676 MOVOU X15, X13 677 678 loop_select_base: 679 680 MOVOU X13, X12 681 PADDL X15, X13 682 PCMPEQL X14, X12 683 684 MOVOU (16*0)(DI), X4 685 MOVOU (16*1)(DI), X5 686 MOVOU (16*2)(DI), X6 687 MOVOU (16*3)(DI), X7 688 689 MOVOU (16*4)(DI), X8 690 MOVOU (16*5)(DI), X9 691 MOVOU (16*6)(DI), X10 692 MOVOU (16*7)(DI), X11 693 694 ADDQ $(16*8), DI 695 696 PAND X12, X4 697 PAND X12, X5 698 PAND X12, X6 699 PAND X12, X7 700 701 MOVOU X13, X12 702 PADDL X15, X13 703 PCMPEQL X14, X12 704 705 PAND X12, X8 706 PAND X12, X9 707 PAND X12, X10 708 PAND X12, X11 709 710 PXOR X4, X0 711 PXOR X5, X1 712 PXOR X6, X2 713 PXOR X7, X3 714 715 PXOR X8, X0 716 PXOR X9, X1 717 PXOR X10, X2 718 PXOR X11, X3 719 720 DECQ AX 721 JNE loop_select_base 722 723 MOVOU X0, (16*0)(DX) 724 MOVOU X1, (16*1)(DX) 725 MOVOU X2, (16*2)(DX) 726 MOVOU X3, (16*3)(DX) 727 728 RET 729 /* ---------------------------------------*/ 730 // func p256OrdMul(res, in1, in2 []uint64) 731 TEXT ·p256OrdMul(SB),NOSPLIT,$0 732 MOVQ res+0(FP), res_ptr 733 MOVQ in1+24(FP), x_ptr 734 MOVQ in2+48(FP), y_ptr 735 // x * y[0] 736 MOVQ (8*0)(y_ptr), t0 737 738 MOVQ (8*0)(x_ptr), AX 739 MULQ t0 740 MOVQ AX, acc0 741 MOVQ DX, acc1 742 743 MOVQ (8*1)(x_ptr), AX 744 MULQ t0 745 ADDQ AX, acc1 746 ADCQ $0, DX 747 MOVQ DX, acc2 748 749 MOVQ (8*2)(x_ptr), AX 750 MULQ t0 751 ADDQ AX, acc2 752 ADCQ $0, DX 753 MOVQ DX, acc3 754 755 MOVQ (8*3)(x_ptr), AX 756 MULQ t0 757 ADDQ AX, acc3 758 ADCQ $0, DX 759 MOVQ DX, acc4 760 XORQ acc5, acc5 761 // First reduction step 762 MOVQ acc0, AX 763 MULQ p256ordK0<>(SB) 764 MOVQ AX, t0 765 766 MOVQ p256ord<>+0x00(SB), AX 767 MULQ t0 768 ADDQ AX, acc0 769 ADCQ $0, DX 770 MOVQ DX, t1 771 772 MOVQ p256ord<>+0x08(SB), AX 773 MULQ t0 774 ADDQ t1, acc1 775 ADCQ $0, DX 776 ADDQ AX, acc1 777 ADCQ $0, DX 778 MOVQ DX, t1 779 780 MOVQ p256ord<>+0x10(SB), AX 781 MULQ t0 782 ADDQ t1, acc2 783 ADCQ $0, DX 784 ADDQ AX, acc2 785 ADCQ $0, DX 786 MOVQ DX, t1 787 788 MOVQ p256ord<>+0x18(SB), AX 789 MULQ t0 790 ADDQ t1, acc3 791 ADCQ $0, DX 792 ADDQ AX, acc3 793 ADCQ DX, acc4 794 ADCQ $0, acc5 795 // x * y[1] 796 MOVQ (8*1)(y_ptr), t0 797 798 MOVQ (8*0)(x_ptr), AX 799 MULQ t0 800 ADDQ AX, acc1 801 ADCQ $0, DX 802 MOVQ DX, t1 803 804 MOVQ (8*1)(x_ptr), AX 805 MULQ t0 806 ADDQ t1, acc2 807 ADCQ $0, DX 808 ADDQ AX, acc2 809 ADCQ $0, DX 810 MOVQ DX, t1 811 812 MOVQ (8*2)(x_ptr), AX 813 MULQ t0 814 ADDQ t1, acc3 815 ADCQ $0, DX 816 ADDQ AX, acc3 817 ADCQ $0, DX 818 MOVQ DX, t1 819 820 MOVQ (8*3)(x_ptr), AX 821 MULQ t0 822 ADDQ t1, acc4 823 ADCQ $0, DX 824 ADDQ AX, acc4 825 ADCQ DX, acc5 826 ADCQ $0, acc0 827 // Second reduction step 828 MOVQ acc1, AX 829 MULQ p256ordK0<>(SB) 830 MOVQ AX, t0 831 832 MOVQ p256ord<>+0x00(SB), AX 833 MULQ t0 834 ADDQ AX, acc1 835 ADCQ $0, DX 836 MOVQ DX, t1 837 838 MOVQ p256ord<>+0x08(SB), AX 839 MULQ t0 840 ADDQ t1, acc2 841 ADCQ $0, DX 842 ADDQ AX, acc2 843 ADCQ $0, DX 844 MOVQ DX, t1 845 846 MOVQ p256ord<>+0x10(SB), AX 847 MULQ t0 848 ADDQ t1, acc3 849 ADCQ $0, DX 850 ADDQ AX, acc3 851 ADCQ $0, DX 852 MOVQ DX, t1 853 854 MOVQ p256ord<>+0x18(SB), AX 855 MULQ t0 856 ADDQ t1, acc4 857 ADCQ $0, DX 858 ADDQ AX, acc4 859 ADCQ DX, acc5 860 ADCQ $0, acc0 861 // x * y[2] 862 MOVQ (8*2)(y_ptr), t0 863 864 MOVQ (8*0)(x_ptr), AX 865 MULQ t0 866 ADDQ AX, acc2 867 ADCQ $0, DX 868 MOVQ DX, t1 869 870 MOVQ (8*1)(x_ptr), AX 871 MULQ t0 872 ADDQ t1, acc3 873 ADCQ $0, DX 874 ADDQ AX, acc3 875 ADCQ $0, DX 876 MOVQ DX, t1 877 878 MOVQ (8*2)(x_ptr), AX 879 MULQ t0 880 ADDQ t1, acc4 881 ADCQ $0, DX 882 ADDQ AX, acc4 883 ADCQ $0, DX 884 MOVQ DX, t1 885 886 MOVQ (8*3)(x_ptr), AX 887 MULQ t0 888 ADDQ t1, acc5 889 ADCQ $0, DX 890 ADDQ AX, acc5 891 ADCQ DX, acc0 892 ADCQ $0, acc1 893 // Third reduction step 894 MOVQ acc2, AX 895 MULQ p256ordK0<>(SB) 896 MOVQ AX, t0 897 898 MOVQ p256ord<>+0x00(SB), AX 899 MULQ t0 900 ADDQ AX, acc2 901 ADCQ $0, DX 902 MOVQ DX, t1 903 904 MOVQ p256ord<>+0x08(SB), AX 905 MULQ t0 906 ADDQ t1, acc3 907 ADCQ $0, DX 908 ADDQ AX, acc3 909 ADCQ $0, DX 910 MOVQ DX, t1 911 912 MOVQ p256ord<>+0x10(SB), AX 913 MULQ t0 914 ADDQ t1, acc4 915 ADCQ $0, DX 916 ADDQ AX, acc4 917 ADCQ $0, DX 918 MOVQ DX, t1 919 920 MOVQ p256ord<>+0x18(SB), AX 921 MULQ t0 922 ADDQ t1, acc5 923 ADCQ $0, DX 924 ADDQ AX, acc5 925 ADCQ DX, acc0 926 ADCQ $0, acc1 927 // x * y[3] 928 MOVQ (8*3)(y_ptr), t0 929 930 MOVQ (8*0)(x_ptr), AX 931 MULQ t0 932 ADDQ AX, acc3 933 ADCQ $0, DX 934 MOVQ DX, t1 935 936 MOVQ (8*1)(x_ptr), AX 937 MULQ t0 938 ADDQ t1, acc4 939 ADCQ $0, DX 940 ADDQ AX, acc4 941 ADCQ $0, DX 942 MOVQ DX, t1 943 944 MOVQ (8*2)(x_ptr), AX 945 MULQ t0 946 ADDQ t1, acc5 947 ADCQ $0, DX 948 ADDQ AX, acc5 949 ADCQ $0, DX 950 MOVQ DX, t1 951 952 MOVQ (8*3)(x_ptr), AX 953 MULQ t0 954 ADDQ t1, acc0 955 ADCQ $0, DX 956 ADDQ AX, acc0 957 ADCQ DX, acc1 958 ADCQ $0, acc2 959 // Last reduction step 960 MOVQ acc3, AX 961 MULQ p256ordK0<>(SB) 962 MOVQ AX, t0 963 964 MOVQ p256ord<>+0x00(SB), AX 965 MULQ t0 966 ADDQ AX, acc3 967 ADCQ $0, DX 968 MOVQ DX, t1 969 970 MOVQ p256ord<>+0x08(SB), AX 971 MULQ t0 972 ADDQ t1, acc4 973 ADCQ $0, DX 974 ADDQ AX, acc4 975 ADCQ $0, DX 976 MOVQ DX, t1 977 978 MOVQ p256ord<>+0x10(SB), AX 979 MULQ t0 980 ADDQ t1, acc5 981 ADCQ $0, DX 982 ADDQ AX, acc5 983 ADCQ $0, DX 984 MOVQ DX, t1 985 986 MOVQ p256ord<>+0x18(SB), AX 987 MULQ t0 988 ADDQ t1, acc0 989 ADCQ $0, DX 990 ADDQ AX, acc0 991 ADCQ DX, acc1 992 ADCQ $0, acc2 993 // Copy result [255:0] 994 MOVQ acc4, x_ptr 995 MOVQ acc5, acc3 996 MOVQ acc0, t0 997 MOVQ acc1, t1 998 // Subtract p256 999 SUBQ p256ord<>+0x00(SB), acc4 1000 SBBQ p256ord<>+0x08(SB) ,acc5 1001 SBBQ p256ord<>+0x10(SB), acc0 1002 SBBQ p256ord<>+0x18(SB), acc1 1003 SBBQ $0, acc2 1004 1005 CMOVQCS x_ptr, acc4 1006 CMOVQCS acc3, acc5 1007 CMOVQCS t0, acc0 1008 CMOVQCS t1, acc1 1009 1010 MOVQ acc4, (8*0)(res_ptr) 1011 MOVQ acc5, (8*1)(res_ptr) 1012 MOVQ acc0, (8*2)(res_ptr) 1013 MOVQ acc1, (8*3)(res_ptr) 1014 1015 RET 1016 /* ---------------------------------------*/ 1017 // func p256OrdSqr(res, in []uint64, n int) 1018 TEXT ·p256OrdSqr(SB),NOSPLIT,$0 1019 MOVQ res+0(FP), res_ptr 1020 MOVQ in+24(FP), x_ptr 1021 MOVQ n+48(FP), BX 1022 1023 ordSqrLoop: 1024 1025 // y[1:] * y[0] 1026 MOVQ (8*0)(x_ptr), t0 1027 1028 MOVQ (8*1)(x_ptr), AX 1029 MULQ t0 1030 MOVQ AX, acc1 1031 MOVQ DX, acc2 1032 1033 MOVQ (8*2)(x_ptr), AX 1034 MULQ t0 1035 ADDQ AX, acc2 1036 ADCQ $0, DX 1037 MOVQ DX, acc3 1038 1039 MOVQ (8*3)(x_ptr), AX 1040 MULQ t0 1041 ADDQ AX, acc3 1042 ADCQ $0, DX 1043 MOVQ DX, acc4 1044 // y[2:] * y[1] 1045 MOVQ (8*1)(x_ptr), t0 1046 1047 MOVQ (8*2)(x_ptr), AX 1048 MULQ t0 1049 ADDQ AX, acc3 1050 ADCQ $0, DX 1051 MOVQ DX, t1 1052 1053 MOVQ (8*3)(x_ptr), AX 1054 MULQ t0 1055 ADDQ t1, acc4 1056 ADCQ $0, DX 1057 ADDQ AX, acc4 1058 ADCQ $0, DX 1059 MOVQ DX, acc5 1060 // y[3] * y[2] 1061 MOVQ (8*2)(x_ptr), t0 1062 1063 MOVQ (8*3)(x_ptr), AX 1064 MULQ t0 1065 ADDQ AX, acc5 1066 ADCQ $0, DX 1067 MOVQ DX, y_ptr 1068 XORQ t1, t1 1069 // *2 1070 ADDQ acc1, acc1 1071 ADCQ acc2, acc2 1072 ADCQ acc3, acc3 1073 ADCQ acc4, acc4 1074 ADCQ acc5, acc5 1075 ADCQ y_ptr, y_ptr 1076 ADCQ $0, t1 1077 // Missing products 1078 MOVQ (8*0)(x_ptr), AX 1079 MULQ AX 1080 MOVQ AX, acc0 1081 MOVQ DX, t0 1082 1083 MOVQ (8*1)(x_ptr), AX 1084 MULQ AX 1085 ADDQ t0, acc1 1086 ADCQ AX, acc2 1087 ADCQ $0, DX 1088 MOVQ DX, t0 1089 1090 MOVQ (8*2)(x_ptr), AX 1091 MULQ AX 1092 ADDQ t0, acc3 1093 ADCQ AX, acc4 1094 ADCQ $0, DX 1095 MOVQ DX, t0 1096 1097 MOVQ (8*3)(x_ptr), AX 1098 MULQ AX 1099 ADDQ t0, acc5 1100 ADCQ AX, y_ptr 1101 ADCQ DX, t1 1102 MOVQ t1, x_ptr 1103 // First reduction step 1104 MOVQ acc0, AX 1105 MULQ p256ordK0<>(SB) 1106 MOVQ AX, t0 1107 1108 MOVQ p256ord<>+0x00(SB), AX 1109 MULQ t0 1110 ADDQ AX, acc0 1111 ADCQ $0, DX 1112 MOVQ DX, t1 1113 1114 MOVQ p256ord<>+0x08(SB), AX 1115 MULQ t0 1116 ADDQ t1, acc1 1117 ADCQ $0, DX 1118 ADDQ AX, acc1 1119 1120 MOVQ t0, t1 1121 ADCQ DX, acc2 1122 ADCQ $0, t1 1123 SUBQ t0, acc2 1124 SBBQ $0, t1 1125 1126 MOVQ t0, AX 1127 MOVQ t0, DX 1128 MOVQ t0, acc0 1129 SHLQ $32, AX 1130 SHRQ $32, DX 1131 1132 ADDQ t1, acc3 1133 ADCQ $0, acc0 1134 SUBQ AX, acc3 1135 SBBQ DX, acc0 1136 // Second reduction step 1137 MOVQ acc1, AX 1138 MULQ p256ordK0<>(SB) 1139 MOVQ AX, t0 1140 1141 MOVQ p256ord<>+0x00(SB), AX 1142 MULQ t0 1143 ADDQ AX, acc1 1144 ADCQ $0, DX 1145 MOVQ DX, t1 1146 1147 MOVQ p256ord<>+0x08(SB), AX 1148 MULQ t0 1149 ADDQ t1, acc2 1150 ADCQ $0, DX 1151 ADDQ AX, acc2 1152 1153 MOVQ t0, t1 1154 ADCQ DX, acc3 1155 ADCQ $0, t1 1156 SUBQ t0, acc3 1157 SBBQ $0, t1 1158 1159 MOVQ t0, AX 1160 MOVQ t0, DX 1161 MOVQ t0, acc1 1162 SHLQ $32, AX 1163 SHRQ $32, DX 1164 1165 ADDQ t1, acc0 1166 ADCQ $0, acc1 1167 SUBQ AX, acc0 1168 SBBQ DX, acc1 1169 // Third reduction step 1170 MOVQ acc2, AX 1171 MULQ p256ordK0<>(SB) 1172 MOVQ AX, t0 1173 1174 MOVQ p256ord<>+0x00(SB), AX 1175 MULQ t0 1176 ADDQ AX, acc2 1177 ADCQ $0, DX 1178 MOVQ DX, t1 1179 1180 MOVQ p256ord<>+0x08(SB), AX 1181 MULQ t0 1182 ADDQ t1, acc3 1183 ADCQ $0, DX 1184 ADDQ AX, acc3 1185 1186 MOVQ t0, t1 1187 ADCQ DX, acc0 1188 ADCQ $0, t1 1189 SUBQ t0, acc0 1190 SBBQ $0, t1 1191 1192 MOVQ t0, AX 1193 MOVQ t0, DX 1194 MOVQ t0, acc2 1195 SHLQ $32, AX 1196 SHRQ $32, DX 1197 1198 ADDQ t1, acc1 1199 ADCQ $0, acc2 1200 SUBQ AX, acc1 1201 SBBQ DX, acc2 1202 // Last reduction step 1203 MOVQ acc3, AX 1204 MULQ p256ordK0<>(SB) 1205 MOVQ AX, t0 1206 1207 MOVQ p256ord<>+0x00(SB), AX 1208 MULQ t0 1209 ADDQ AX, acc3 1210 ADCQ $0, DX 1211 MOVQ DX, t1 1212 1213 MOVQ p256ord<>+0x08(SB), AX 1214 MULQ t0 1215 ADDQ t1, acc0 1216 ADCQ $0, DX 1217 ADDQ AX, acc0 1218 ADCQ $0, DX 1219 MOVQ DX, t1 1220 1221 MOVQ t0, t1 1222 ADCQ DX, acc1 1223 ADCQ $0, t1 1224 SUBQ t0, acc1 1225 SBBQ $0, t1 1226 1227 MOVQ t0, AX 1228 MOVQ t0, DX 1229 MOVQ t0, acc3 1230 SHLQ $32, AX 1231 SHRQ $32, DX 1232 1233 ADDQ t1, acc2 1234 ADCQ $0, acc3 1235 SUBQ AX, acc2 1236 SBBQ DX, acc3 1237 XORQ t0, t0 1238 // Add bits [511:256] of the sqr result 1239 ADCQ acc4, acc0 1240 ADCQ acc5, acc1 1241 ADCQ y_ptr, acc2 1242 ADCQ x_ptr, acc3 1243 ADCQ $0, t0 1244 1245 MOVQ acc0, acc4 1246 MOVQ acc1, acc5 1247 MOVQ acc2, y_ptr 1248 MOVQ acc3, t1 1249 // Subtract p256 1250 SUBQ p256ord<>+0x00(SB), acc0 1251 SBBQ p256ord<>+0x08(SB) ,acc1 1252 SBBQ p256ord<>+0x10(SB), acc2 1253 SBBQ p256ord<>+0x18(SB), acc3 1254 SBBQ $0, t0 1255 1256 CMOVQCS acc4, acc0 1257 CMOVQCS acc5, acc1 1258 CMOVQCS y_ptr, acc2 1259 CMOVQCS t1, acc3 1260 1261 MOVQ acc0, (8*0)(res_ptr) 1262 MOVQ acc1, (8*1)(res_ptr) 1263 MOVQ acc2, (8*2)(res_ptr) 1264 MOVQ acc3, (8*3)(res_ptr) 1265 MOVQ res_ptr, x_ptr 1266 DECQ BX 1267 JNE ordSqrLoop 1268 1269 RET 1270 /* ---------------------------------------*/ 1271 #undef res_ptr 1272 #undef x_ptr 1273 #undef y_ptr 1274 1275 #undef acc0 1276 #undef acc1 1277 #undef acc2 1278 #undef acc3 1279 #undef acc4 1280 #undef acc5 1281 #undef t0 1282 #undef t1 1283 /* ---------------------------------------*/ 1284 #define mul0 AX 1285 #define mul1 DX 1286 #define acc0 BX 1287 #define acc1 CX 1288 #define acc2 R8 1289 #define acc3 R9 1290 #define acc4 R10 1291 #define acc5 R11 1292 #define acc6 R12 1293 #define acc7 R13 1294 #define t0 R14 1295 #define t1 R15 1296 #define t2 DI 1297 #define t3 SI 1298 #define hlp BP 1299 /* ---------------------------------------*/ 1300 TEXT p256SubInternal(SB),NOSPLIT,$0 1301 XORQ mul0, mul0 1302 SUBQ t0, acc4 1303 SBBQ t1, acc5 1304 SBBQ t2, acc6 1305 SBBQ t3, acc7 1306 SBBQ $0, mul0 1307 1308 MOVQ acc4, acc0 1309 MOVQ acc5, acc1 1310 MOVQ acc6, acc2 1311 MOVQ acc7, acc3 1312 1313 ADDQ $-1, acc4 1314 ADCQ p256const0<>(SB), acc5 1315 ADCQ $0, acc6 1316 ADCQ p256const1<>(SB), acc7 1317 ADCQ $0, mul0 1318 1319 CMOVQNE acc0, acc4 1320 CMOVQNE acc1, acc5 1321 CMOVQNE acc2, acc6 1322 CMOVQNE acc3, acc7 1323 1324 RET 1325 /* ---------------------------------------*/ 1326 TEXT p256MulInternal(SB),NOSPLIT,$0 1327 MOVQ acc4, mul0 1328 MULQ t0 1329 MOVQ mul0, acc0 1330 MOVQ mul1, acc1 1331 1332 MOVQ acc4, mul0 1333 MULQ t1 1334 ADDQ mul0, acc1 1335 ADCQ $0, mul1 1336 MOVQ mul1, acc2 1337 1338 MOVQ acc4, mul0 1339 MULQ t2 1340 ADDQ mul0, acc2 1341 ADCQ $0, mul1 1342 MOVQ mul1, acc3 1343 1344 MOVQ acc4, mul0 1345 MULQ t3 1346 ADDQ mul0, acc3 1347 ADCQ $0, mul1 1348 MOVQ mul1, acc4 1349 1350 MOVQ acc5, mul0 1351 MULQ t0 1352 ADDQ mul0, acc1 1353 ADCQ $0, mul1 1354 MOVQ mul1, hlp 1355 1356 MOVQ acc5, mul0 1357 MULQ t1 1358 ADDQ hlp, acc2 1359 ADCQ $0, mul1 1360 ADDQ mul0, acc2 1361 ADCQ $0, mul1 1362 MOVQ mul1, hlp 1363 1364 MOVQ acc5, mul0 1365 MULQ t2 1366 ADDQ hlp, acc3 1367 ADCQ $0, mul1 1368 ADDQ mul0, acc3 1369 ADCQ $0, mul1 1370 MOVQ mul1, hlp 1371 1372 MOVQ acc5, mul0 1373 MULQ t3 1374 ADDQ hlp, acc4 1375 ADCQ $0, mul1 1376 ADDQ mul0, acc4 1377 ADCQ $0, mul1 1378 MOVQ mul1, acc5 1379 1380 MOVQ acc6, mul0 1381 MULQ t0 1382 ADDQ mul0, acc2 1383 ADCQ $0, mul1 1384 MOVQ mul1, hlp 1385 1386 MOVQ acc6, mul0 1387 MULQ t1 1388 ADDQ hlp, acc3 1389 ADCQ $0, mul1 1390 ADDQ mul0, acc3 1391 ADCQ $0, mul1 1392 MOVQ mul1, hlp 1393 1394 MOVQ acc6, mul0 1395 MULQ t2 1396 ADDQ hlp, acc4 1397 ADCQ $0, mul1 1398 ADDQ mul0, acc4 1399 ADCQ $0, mul1 1400 MOVQ mul1, hlp 1401 1402 MOVQ acc6, mul0 1403 MULQ t3 1404 ADDQ hlp, acc5 1405 ADCQ $0, mul1 1406 ADDQ mul0, acc5 1407 ADCQ $0, mul1 1408 MOVQ mul1, acc6 1409 1410 MOVQ acc7, mul0 1411 MULQ t0 1412 ADDQ mul0, acc3 1413 ADCQ $0, mul1 1414 MOVQ mul1, hlp 1415 1416 MOVQ acc7, mul0 1417 MULQ t1 1418 ADDQ hlp, acc4 1419 ADCQ $0, mul1 1420 ADDQ mul0, acc4 1421 ADCQ $0, mul1 1422 MOVQ mul1, hlp 1423 1424 MOVQ acc7, mul0 1425 MULQ t2 1426 ADDQ hlp, acc5 1427 ADCQ $0, mul1 1428 ADDQ mul0, acc5 1429 ADCQ $0, mul1 1430 MOVQ mul1, hlp 1431 1432 MOVQ acc7, mul0 1433 MULQ t3 1434 ADDQ hlp, acc6 1435 ADCQ $0, mul1 1436 ADDQ mul0, acc6 1437 ADCQ $0, mul1 1438 MOVQ mul1, acc7 1439 // First reduction step 1440 MOVQ acc0, mul0 1441 MOVQ acc0, hlp 1442 SHLQ $32, acc0 1443 MULQ p256const1<>(SB) 1444 SHRQ $32, hlp 1445 ADDQ acc0, acc1 1446 ADCQ hlp, acc2 1447 ADCQ mul0, acc3 1448 ADCQ $0, mul1 1449 MOVQ mul1, acc0 1450 // Second reduction step 1451 MOVQ acc1, mul0 1452 MOVQ acc1, hlp 1453 SHLQ $32, acc1 1454 MULQ p256const1<>(SB) 1455 SHRQ $32, hlp 1456 ADDQ acc1, acc2 1457 ADCQ hlp, acc3 1458 ADCQ mul0, acc0 1459 ADCQ $0, mul1 1460 MOVQ mul1, acc1 1461 // Third reduction step 1462 MOVQ acc2, mul0 1463 MOVQ acc2, hlp 1464 SHLQ $32, acc2 1465 MULQ p256const1<>(SB) 1466 SHRQ $32, hlp 1467 ADDQ acc2, acc3 1468 ADCQ hlp, acc0 1469 ADCQ mul0, acc1 1470 ADCQ $0, mul1 1471 MOVQ mul1, acc2 1472 // Last reduction step 1473 MOVQ acc3, mul0 1474 MOVQ acc3, hlp 1475 SHLQ $32, acc3 1476 MULQ p256const1<>(SB) 1477 SHRQ $32, hlp 1478 ADDQ acc3, acc0 1479 ADCQ hlp, acc1 1480 ADCQ mul0, acc2 1481 ADCQ $0, mul1 1482 MOVQ mul1, acc3 1483 BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP 1484 // Add bits [511:256] of the result 1485 ADCQ acc0, acc4 1486 ADCQ acc1, acc5 1487 ADCQ acc2, acc6 1488 ADCQ acc3, acc7 1489 ADCQ $0, hlp 1490 // Copy result 1491 MOVQ acc4, acc0 1492 MOVQ acc5, acc1 1493 MOVQ acc6, acc2 1494 MOVQ acc7, acc3 1495 // Subtract p256 1496 SUBQ $-1, acc4 1497 SBBQ p256const0<>(SB) ,acc5 1498 SBBQ $0, acc6 1499 SBBQ p256const1<>(SB), acc7 1500 SBBQ $0, hlp 1501 // If the result of the subtraction is negative, restore the previous result 1502 CMOVQCS acc0, acc4 1503 CMOVQCS acc1, acc5 1504 CMOVQCS acc2, acc6 1505 CMOVQCS acc3, acc7 1506 1507 RET 1508 /* ---------------------------------------*/ 1509 TEXT p256SqrInternal(SB),NOSPLIT,$0 1510 1511 MOVQ acc4, mul0 1512 MULQ acc5 1513 MOVQ mul0, acc1 1514 MOVQ mul1, acc2 1515 1516 MOVQ acc4, mul0 1517 MULQ acc6 1518 ADDQ mul0, acc2 1519 ADCQ $0, mul1 1520 MOVQ mul1, acc3 1521 1522 MOVQ acc4, mul0 1523 MULQ acc7 1524 ADDQ mul0, acc3 1525 ADCQ $0, mul1 1526 MOVQ mul1, t0 1527 1528 MOVQ acc5, mul0 1529 MULQ acc6 1530 ADDQ mul0, acc3 1531 ADCQ $0, mul1 1532 MOVQ mul1, hlp 1533 1534 MOVQ acc5, mul0 1535 MULQ acc7 1536 ADDQ hlp, t0 1537 ADCQ $0, mul1 1538 ADDQ mul0, t0 1539 ADCQ $0, mul1 1540 MOVQ mul1, t1 1541 1542 MOVQ acc6, mul0 1543 MULQ acc7 1544 ADDQ mul0, t1 1545 ADCQ $0, mul1 1546 MOVQ mul1, t2 1547 XORQ t3, t3 1548 // *2 1549 ADDQ acc1, acc1 1550 ADCQ acc2, acc2 1551 ADCQ acc3, acc3 1552 ADCQ t0, t0 1553 ADCQ t1, t1 1554 ADCQ t2, t2 1555 ADCQ $0, t3 1556 // Missing products 1557 MOVQ acc4, mul0 1558 MULQ mul0 1559 MOVQ mul0, acc0 1560 MOVQ DX, acc4 1561 1562 MOVQ acc5, mul0 1563 MULQ mul0 1564 ADDQ acc4, acc1 1565 ADCQ mul0, acc2 1566 ADCQ $0, DX 1567 MOVQ DX, acc4 1568 1569 MOVQ acc6, mul0 1570 MULQ mul0 1571 ADDQ acc4, acc3 1572 ADCQ mul0, t0 1573 ADCQ $0, DX 1574 MOVQ DX, acc4 1575 1576 MOVQ acc7, mul0 1577 MULQ mul0 1578 ADDQ acc4, t1 1579 ADCQ mul0, t2 1580 ADCQ DX, t3 1581 // First reduction step 1582 MOVQ acc0, mul0 1583 MOVQ acc0, hlp 1584 SHLQ $32, acc0 1585 MULQ p256const1<>(SB) 1586 SHRQ $32, hlp 1587 ADDQ acc0, acc1 1588 ADCQ hlp, acc2 1589 ADCQ mul0, acc3 1590 ADCQ $0, mul1 1591 MOVQ mul1, acc0 1592 // Second reduction step 1593 MOVQ acc1, mul0 1594 MOVQ acc1, hlp 1595 SHLQ $32, acc1 1596 MULQ p256const1<>(SB) 1597 SHRQ $32, hlp 1598 ADDQ acc1, acc2 1599 ADCQ hlp, acc3 1600 ADCQ mul0, acc0 1601 ADCQ $0, mul1 1602 MOVQ mul1, acc1 1603 // Third reduction step 1604 MOVQ acc2, mul0 1605 MOVQ acc2, hlp 1606 SHLQ $32, acc2 1607 MULQ p256const1<>(SB) 1608 SHRQ $32, hlp 1609 ADDQ acc2, acc3 1610 ADCQ hlp, acc0 1611 ADCQ mul0, acc1 1612 ADCQ $0, mul1 1613 MOVQ mul1, acc2 1614 // Last reduction step 1615 MOVQ acc3, mul0 1616 MOVQ acc3, hlp 1617 SHLQ $32, acc3 1618 MULQ p256const1<>(SB) 1619 SHRQ $32, hlp 1620 ADDQ acc3, acc0 1621 ADCQ hlp, acc1 1622 ADCQ mul0, acc2 1623 ADCQ $0, mul1 1624 MOVQ mul1, acc3 1625 BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP 1626 // Add bits [511:256] of the result 1627 ADCQ acc0, t0 1628 ADCQ acc1, t1 1629 ADCQ acc2, t2 1630 ADCQ acc3, t3 1631 ADCQ $0, hlp 1632 // Copy result 1633 MOVQ t0, acc4 1634 MOVQ t1, acc5 1635 MOVQ t2, acc6 1636 MOVQ t3, acc7 1637 // Subtract p256 1638 SUBQ $-1, acc4 1639 SBBQ p256const0<>(SB) ,acc5 1640 SBBQ $0, acc6 1641 SBBQ p256const1<>(SB), acc7 1642 SBBQ $0, hlp 1643 // If the result of the subtraction is negative, restore the previous result 1644 CMOVQCS t0, acc4 1645 CMOVQCS t1, acc5 1646 CMOVQCS t2, acc6 1647 CMOVQCS t3, acc7 1648 1649 RET 1650 /* ---------------------------------------*/ 1651 #define p256MulBy2Inline\ 1652 XORQ mul0, mul0;\ 1653 ADDQ acc4, acc4;\ 1654 ADCQ acc5, acc5;\ 1655 ADCQ acc6, acc6;\ 1656 ADCQ acc7, acc7;\ 1657 ADCQ $0, mul0;\ 1658 MOVQ acc4, t0;\ 1659 MOVQ acc5, t1;\ 1660 MOVQ acc6, t2;\ 1661 MOVQ acc7, t3;\ 1662 SUBQ $-1, t0;\ 1663 SBBQ p256const0<>(SB), t1;\ 1664 SBBQ $0, t2;\ 1665 SBBQ p256const1<>(SB), t3;\ 1666 SBBQ $0, mul0;\ 1667 CMOVQCS acc4, t0;\ 1668 CMOVQCS acc5, t1;\ 1669 CMOVQCS acc6, t2;\ 1670 CMOVQCS acc7, t3; 1671 /* ---------------------------------------*/ 1672 #define p256AddInline \ 1673 XORQ mul0, mul0;\ 1674 ADDQ t0, acc4;\ 1675 ADCQ t1, acc5;\ 1676 ADCQ t2, acc6;\ 1677 ADCQ t3, acc7;\ 1678 ADCQ $0, mul0;\ 1679 MOVQ acc4, t0;\ 1680 MOVQ acc5, t1;\ 1681 MOVQ acc6, t2;\ 1682 MOVQ acc7, t3;\ 1683 SUBQ $-1, t0;\ 1684 SBBQ p256const0<>(SB), t1;\ 1685 SBBQ $0, t2;\ 1686 SBBQ p256const1<>(SB), t3;\ 1687 SBBQ $0, mul0;\ 1688 CMOVQCS acc4, t0;\ 1689 CMOVQCS acc5, t1;\ 1690 CMOVQCS acc6, t2;\ 1691 CMOVQCS acc7, t3; 1692 /* ---------------------------------------*/ 1693 #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 1694 #define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 1695 #define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) 1696 #define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) 1697 #define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 1698 #define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 1699 /* ---------------------------------------*/ 1700 #define x1in(off) (32*0 + off)(SP) 1701 #define y1in(off) (32*1 + off)(SP) 1702 #define z1in(off) (32*2 + off)(SP) 1703 #define x2in(off) (32*3 + off)(SP) 1704 #define y2in(off) (32*4 + off)(SP) 1705 #define xout(off) (32*5 + off)(SP) 1706 #define yout(off) (32*6 + off)(SP) 1707 #define zout(off) (32*7 + off)(SP) 1708 #define s2(off) (32*8 + off)(SP) 1709 #define z1sqr(off) (32*9 + off)(SP) 1710 #define h(off) (32*10 + off)(SP) 1711 #define r(off) (32*11 + off)(SP) 1712 #define hsqr(off) (32*12 + off)(SP) 1713 #define rsqr(off) (32*13 + off)(SP) 1714 #define hcub(off) (32*14 + off)(SP) 1715 #define rptr (32*15)(SP) 1716 #define sel_save (32*15 + 8)(SP) 1717 #define zero_save (32*15 + 8 + 4)(SP) 1718 1719 // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) 1720 TEXT ·p256PointAddAffineAsm(SB),0,$512-96 1721 // Move input to stack in order to free registers 1722 MOVQ res+0(FP), AX 1723 MOVQ in1+24(FP), BX 1724 MOVQ in2+48(FP), CX 1725 MOVQ sign+72(FP), DX 1726 MOVQ sel+80(FP), t1 1727 MOVQ zero+88(FP), t2 1728 1729 MOVOU (16*0)(BX), X0 1730 MOVOU (16*1)(BX), X1 1731 MOVOU (16*2)(BX), X2 1732 MOVOU (16*3)(BX), X3 1733 MOVOU (16*4)(BX), X4 1734 MOVOU (16*5)(BX), X5 1735 1736 MOVOU X0, x1in(16*0) 1737 MOVOU X1, x1in(16*1) 1738 MOVOU X2, y1in(16*0) 1739 MOVOU X3, y1in(16*1) 1740 MOVOU X4, z1in(16*0) 1741 MOVOU X5, z1in(16*1) 1742 1743 MOVOU (16*0)(CX), X0 1744 MOVOU (16*1)(CX), X1 1745 1746 MOVOU X0, x2in(16*0) 1747 MOVOU X1, x2in(16*1) 1748 // Store pointer to result 1749 MOVQ mul0, rptr 1750 MOVL t1, sel_save 1751 MOVL t2, zero_save 1752 // Negate y2in based on sign 1753 MOVQ (16*2 + 8*0)(CX), acc4 1754 MOVQ (16*2 + 8*1)(CX), acc5 1755 MOVQ (16*2 + 8*2)(CX), acc6 1756 MOVQ (16*2 + 8*3)(CX), acc7 1757 MOVQ $-1, acc0 1758 MOVQ p256const0<>(SB), acc1 1759 MOVQ $0, acc2 1760 MOVQ p256const1<>(SB), acc3 1761 XORQ mul0, mul0 1762 // Speculatively subtract 1763 SUBQ acc4, acc0 1764 SBBQ acc5, acc1 1765 SBBQ acc6, acc2 1766 SBBQ acc7, acc3 1767 SBBQ $0, mul0 1768 MOVQ acc0, t0 1769 MOVQ acc1, t1 1770 MOVQ acc2, t2 1771 MOVQ acc3, t3 1772 // Add in case the operand was > p256 1773 ADDQ $-1, acc0 1774 ADCQ p256const0<>(SB), acc1 1775 ADCQ $0, acc2 1776 ADCQ p256const1<>(SB), acc3 1777 ADCQ $0, mul0 1778 CMOVQNE t0, acc0 1779 CMOVQNE t1, acc1 1780 CMOVQNE t2, acc2 1781 CMOVQNE t3, acc3 1782 // If condition is 0, keep original value 1783 TESTQ DX, DX 1784 CMOVQEQ acc4, acc0 1785 CMOVQEQ acc5, acc1 1786 CMOVQEQ acc6, acc2 1787 CMOVQEQ acc7, acc3 1788 // Store result 1789 MOVQ acc0, y2in(8*0) 1790 MOVQ acc1, y2in(8*1) 1791 MOVQ acc2, y2in(8*2) 1792 MOVQ acc3, y2in(8*3) 1793 // Begin point add 1794 LDacc (z1in) 1795 CALL p256SqrInternal(SB) // z1ˆ2 1796 ST (z1sqr) 1797 1798 LDt (x2in) 1799 CALL p256MulInternal(SB) // x2 * z1ˆ2 1800 1801 LDt (x1in) 1802 CALL p256SubInternal(SB) // h = u2 - u1 1803 ST (h) 1804 1805 LDt (z1in) 1806 CALL p256MulInternal(SB) // z3 = h * z1 1807 ST (zout) 1808 1809 LDacc (z1sqr) 1810 CALL p256MulInternal(SB) // z1ˆ3 1811 1812 LDt (y2in) 1813 CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 1814 ST (s2) 1815 1816 LDt (y1in) 1817 CALL p256SubInternal(SB) // r = s2 - s1 1818 ST (r) 1819 1820 CALL p256SqrInternal(SB) // rsqr = rˆ2 1821 ST (rsqr) 1822 1823 LDacc (h) 1824 CALL p256SqrInternal(SB) // hsqr = hˆ2 1825 ST (hsqr) 1826 1827 LDt (h) 1828 CALL p256MulInternal(SB) // hcub = hˆ3 1829 ST (hcub) 1830 1831 LDt (y1in) 1832 CALL p256MulInternal(SB) // y1 * hˆ3 1833 ST (s2) 1834 1835 LDacc (x1in) 1836 LDt (hsqr) 1837 CALL p256MulInternal(SB) // u1 * hˆ2 1838 ST (h) 1839 1840 p256MulBy2Inline // u1 * hˆ2 * 2, inline 1841 LDacc (rsqr) 1842 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 1843 1844 LDt (hcub) 1845 CALL p256SubInternal(SB) 1846 ST (xout) 1847 1848 MOVQ acc4, t0 1849 MOVQ acc5, t1 1850 MOVQ acc6, t2 1851 MOVQ acc7, t3 1852 LDacc (h) 1853 CALL p256SubInternal(SB) 1854 1855 LDt (r) 1856 CALL p256MulInternal(SB) 1857 1858 LDt (s2) 1859 CALL p256SubInternal(SB) 1860 ST (yout) 1861 // Load stored values from stack 1862 MOVQ rptr, AX 1863 MOVL sel_save, BX 1864 MOVL zero_save, CX 1865 // The result is not valid if (sel == 0), conditional choose 1866 MOVOU xout(16*0), X0 1867 MOVOU xout(16*1), X1 1868 MOVOU yout(16*0), X2 1869 MOVOU yout(16*1), X3 1870 MOVOU zout(16*0), X4 1871 MOVOU zout(16*1), X5 1872 1873 MOVL BX, X6 1874 MOVL CX, X7 1875 1876 PXOR X8, X8 1877 PCMPEQL X9, X9 1878 1879 PSHUFD $0, X6, X6 1880 PSHUFD $0, X7, X7 1881 1882 PCMPEQL X8, X6 1883 PCMPEQL X8, X7 1884 1885 MOVOU X6, X15 1886 PANDN X9, X15 1887 1888 MOVOU x1in(16*0), X9 1889 MOVOU x1in(16*1), X10 1890 MOVOU y1in(16*0), X11 1891 MOVOU y1in(16*1), X12 1892 MOVOU z1in(16*0), X13 1893 MOVOU z1in(16*1), X14 1894 1895 PAND X15, X0 1896 PAND X15, X1 1897 PAND X15, X2 1898 PAND X15, X3 1899 PAND X15, X4 1900 PAND X15, X5 1901 1902 PAND X6, X9 1903 PAND X6, X10 1904 PAND X6, X11 1905 PAND X6, X12 1906 PAND X6, X13 1907 PAND X6, X14 1908 1909 PXOR X9, X0 1910 PXOR X10, X1 1911 PXOR X11, X2 1912 PXOR X12, X3 1913 PXOR X13, X4 1914 PXOR X14, X5 1915 // Similarly if zero == 0 1916 PCMPEQL X9, X9 1917 MOVOU X7, X15 1918 PANDN X9, X15 1919 1920 MOVOU x2in(16*0), X9 1921 MOVOU x2in(16*1), X10 1922 MOVOU y2in(16*0), X11 1923 MOVOU y2in(16*1), X12 1924 MOVOU p256one<>+0x00(SB), X13 1925 MOVOU p256one<>+0x10(SB), X14 1926 1927 PAND X15, X0 1928 PAND X15, X1 1929 PAND X15, X2 1930 PAND X15, X3 1931 PAND X15, X4 1932 PAND X15, X5 1933 1934 PAND X7, X9 1935 PAND X7, X10 1936 PAND X7, X11 1937 PAND X7, X12 1938 PAND X7, X13 1939 PAND X7, X14 1940 1941 PXOR X9, X0 1942 PXOR X10, X1 1943 PXOR X11, X2 1944 PXOR X12, X3 1945 PXOR X13, X4 1946 PXOR X14, X5 1947 // Finally output the result 1948 MOVOU X0, (16*0)(AX) 1949 MOVOU X1, (16*1)(AX) 1950 MOVOU X2, (16*2)(AX) 1951 MOVOU X3, (16*3)(AX) 1952 MOVOU X4, (16*4)(AX) 1953 MOVOU X5, (16*5)(AX) 1954 MOVQ $0, rptr 1955 1956 RET 1957 #undef x1in 1958 #undef y1in 1959 #undef z1in 1960 #undef x2in 1961 #undef y2in 1962 #undef xout 1963 #undef yout 1964 #undef zout 1965 #undef s2 1966 #undef z1sqr 1967 #undef h 1968 #undef r 1969 #undef hsqr 1970 #undef rsqr 1971 #undef hcub 1972 #undef rptr 1973 #undef sel_save 1974 #undef zero_save 1975 /* ---------------------------------------*/ 1976 #define x1in(off) (32*0 + off)(SP) 1977 #define y1in(off) (32*1 + off)(SP) 1978 #define z1in(off) (32*2 + off)(SP) 1979 #define x2in(off) (32*3 + off)(SP) 1980 #define y2in(off) (32*4 + off)(SP) 1981 #define z2in(off) (32*5 + off)(SP) 1982 1983 #define xout(off) (32*6 + off)(SP) 1984 #define yout(off) (32*7 + off)(SP) 1985 #define zout(off) (32*8 + off)(SP) 1986 1987 #define u1(off) (32*9 + off)(SP) 1988 #define u2(off) (32*10 + off)(SP) 1989 #define s1(off) (32*11 + off)(SP) 1990 #define s2(off) (32*12 + off)(SP) 1991 #define z1sqr(off) (32*13 + off)(SP) 1992 #define z2sqr(off) (32*14 + off)(SP) 1993 #define h(off) (32*15 + off)(SP) 1994 #define r(off) (32*16 + off)(SP) 1995 #define hsqr(off) (32*17 + off)(SP) 1996 #define rsqr(off) (32*18 + off)(SP) 1997 #define hcub(off) (32*19 + off)(SP) 1998 #define rptr (32*20)(SP) 1999 2000 //func p256PointAddAsm(res, in1, in2 []uint64) 2001 TEXT ·p256PointAddAsm(SB),0,$672-72 2002 // Move input to stack in order to free registers 2003 MOVQ res+0(FP), AX 2004 MOVQ in1+24(FP), BX 2005 MOVQ in2+48(FP), CX 2006 2007 MOVOU (16*0)(BX), X0 2008 MOVOU (16*1)(BX), X1 2009 MOVOU (16*2)(BX), X2 2010 MOVOU (16*3)(BX), X3 2011 MOVOU (16*4)(BX), X4 2012 MOVOU (16*5)(BX), X5 2013 2014 MOVOU X0, x1in(16*0) 2015 MOVOU X1, x1in(16*1) 2016 MOVOU X2, y1in(16*0) 2017 MOVOU X3, y1in(16*1) 2018 MOVOU X4, z1in(16*0) 2019 MOVOU X5, z1in(16*1) 2020 2021 MOVOU (16*0)(CX), X0 2022 MOVOU (16*1)(CX), X1 2023 MOVOU (16*2)(CX), X2 2024 MOVOU (16*3)(CX), X3 2025 MOVOU (16*4)(CX), X4 2026 MOVOU (16*5)(CX), X5 2027 2028 MOVOU X0, x2in(16*0) 2029 MOVOU X1, x2in(16*1) 2030 MOVOU X2, y2in(16*0) 2031 MOVOU X3, y2in(16*1) 2032 MOVOU X4, z2in(16*0) 2033 MOVOU X5, z2in(16*1) 2034 // Store pointer to result 2035 MOVQ AX, rptr 2036 // Begin point add 2037 LDacc (z2in) 2038 CALL p256SqrInternal(SB) // z2ˆ2 2039 ST (z2sqr) 2040 LDt (z2in) 2041 CALL p256MulInternal(SB) // z2ˆ3 2042 LDt (y1in) 2043 CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 2044 ST (s1) 2045 2046 LDacc (z1in) 2047 CALL p256SqrInternal(SB) // z1ˆ2 2048 ST (z1sqr) 2049 LDt (z1in) 2050 CALL p256MulInternal(SB) // z1ˆ3 2051 LDt (y2in) 2052 CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 2053 ST (s2) 2054 2055 LDt (s1) 2056 CALL p256SubInternal(SB) // r = s2 - s1 2057 ST (r) 2058 2059 LDacc (z2sqr) 2060 LDt (x1in) 2061 CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 2062 ST (u1) 2063 LDacc (z1sqr) 2064 LDt (x2in) 2065 CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 2066 ST (u2) 2067 2068 LDt (u1) 2069 CALL p256SubInternal(SB) // h = u2 - u1 2070 ST (h) 2071 2072 LDacc (r) 2073 CALL p256SqrInternal(SB) // rsqr = rˆ2 2074 ST (rsqr) 2075 2076 LDacc (h) 2077 CALL p256SqrInternal(SB) // hsqr = hˆ2 2078 ST (hsqr) 2079 2080 LDt (h) 2081 CALL p256MulInternal(SB) // hcub = hˆ3 2082 ST (hcub) 2083 2084 LDt (s1) 2085 CALL p256MulInternal(SB) 2086 ST (s2) 2087 2088 LDacc (z1in) 2089 LDt (z2in) 2090 CALL p256MulInternal(SB) // z1 * z2 2091 LDt (h) 2092 CALL p256MulInternal(SB) // z1 * z2 * h 2093 ST (zout) 2094 2095 LDacc (hsqr) 2096 LDt (u1) 2097 CALL p256MulInternal(SB) // hˆ2 * u1 2098 ST (u2) 2099 2100 p256MulBy2Inline // u1 * hˆ2 * 2, inline 2101 LDacc (rsqr) 2102 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 2103 2104 LDt (hcub) 2105 CALL p256SubInternal(SB) 2106 ST (xout) 2107 2108 MOVQ acc4, t0 2109 MOVQ acc5, t1 2110 MOVQ acc6, t2 2111 MOVQ acc7, t3 2112 LDacc (u2) 2113 CALL p256SubInternal(SB) 2114 2115 LDt (r) 2116 CALL p256MulInternal(SB) 2117 2118 LDt (s2) 2119 CALL p256SubInternal(SB) 2120 ST (yout) 2121 2122 MOVOU xout(16*0), X0 2123 MOVOU xout(16*1), X1 2124 MOVOU yout(16*0), X2 2125 MOVOU yout(16*1), X3 2126 MOVOU zout(16*0), X4 2127 MOVOU zout(16*1), X5 2128 // Finally output the result 2129 MOVQ rptr, AX 2130 MOVQ $0, rptr 2131 MOVOU X0, (16*0)(AX) 2132 MOVOU X1, (16*1)(AX) 2133 MOVOU X2, (16*2)(AX) 2134 MOVOU X3, (16*3)(AX) 2135 MOVOU X4, (16*4)(AX) 2136 MOVOU X5, (16*5)(AX) 2137 2138 RET 2139 #undef x1in 2140 #undef y1in 2141 #undef z1in 2142 #undef x2in 2143 #undef y2in 2144 #undef z2in 2145 #undef xout 2146 #undef yout 2147 #undef zout 2148 #undef s1 2149 #undef s2 2150 #undef u1 2151 #undef u2 2152 #undef z1sqr 2153 #undef z2sqr 2154 #undef h 2155 #undef r 2156 #undef hsqr 2157 #undef rsqr 2158 #undef hcub 2159 #undef rptr 2160 /* ---------------------------------------*/ 2161 #define x(off) (32*0 + off)(SP) 2162 #define y(off) (32*1 + off)(SP) 2163 #define z(off) (32*2 + off)(SP) 2164 2165 #define s(off) (32*3 + off)(SP) 2166 #define m(off) (32*4 + off)(SP) 2167 #define zsqr(off) (32*5 + off)(SP) 2168 #define tmp(off) (32*6 + off)(SP) 2169 #define rptr (32*7)(SP) 2170 2171 //func p256PointDoubleAsm(res, in []uint64) 2172 TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48 2173 // Move input to stack in order to free registers 2174 MOVQ res+0(FP), AX 2175 MOVQ in+24(FP), BX 2176 2177 MOVOU (16*0)(BX), X0 2178 MOVOU (16*1)(BX), X1 2179 MOVOU (16*2)(BX), X2 2180 MOVOU (16*3)(BX), X3 2181 MOVOU (16*4)(BX), X4 2182 MOVOU (16*5)(BX), X5 2183 2184 MOVOU X0, x(16*0) 2185 MOVOU X1, x(16*1) 2186 MOVOU X2, y(16*0) 2187 MOVOU X3, y(16*1) 2188 MOVOU X4, z(16*0) 2189 MOVOU X5, z(16*1) 2190 // Store pointer to result 2191 MOVQ AX, rptr 2192 // Begin point double 2193 LDacc (z) 2194 CALL p256SqrInternal(SB) 2195 ST (zsqr) 2196 2197 LDt (x) 2198 p256AddInline 2199 STt (m) 2200 2201 LDacc (z) 2202 LDt (y) 2203 CALL p256MulInternal(SB) 2204 p256MulBy2Inline 2205 MOVQ rptr, AX 2206 // Store z 2207 MOVQ t0, (16*4 + 8*0)(AX) 2208 MOVQ t1, (16*4 + 8*1)(AX) 2209 MOVQ t2, (16*4 + 8*2)(AX) 2210 MOVQ t3, (16*4 + 8*3)(AX) 2211 2212 LDacc (x) 2213 LDt (zsqr) 2214 CALL p256SubInternal(SB) 2215 LDt (m) 2216 CALL p256MulInternal(SB) 2217 ST (m) 2218 // Multiply by 3 2219 p256MulBy2Inline 2220 LDacc (m) 2221 p256AddInline 2222 STt (m) 2223 //////////////////////// 2224 LDacc (y) 2225 p256MulBy2Inline 2226 t2acc 2227 CALL p256SqrInternal(SB) 2228 ST (s) 2229 CALL p256SqrInternal(SB) 2230 // Divide by 2 2231 XORQ mul0, mul0 2232 MOVQ acc4, t0 2233 MOVQ acc5, t1 2234 MOVQ acc6, t2 2235 MOVQ acc7, t3 2236 2237 ADDQ $-1, acc4 2238 ADCQ p256const0<>(SB), acc5 2239 ADCQ $0, acc6 2240 ADCQ p256const1<>(SB), acc7 2241 ADCQ $0, mul0 2242 TESTQ $1, t0 2243 2244 CMOVQEQ t0, acc4 2245 CMOVQEQ t1, acc5 2246 CMOVQEQ t2, acc6 2247 CMOVQEQ t3, acc7 2248 ANDQ t0, mul0 2249 2250 SHRQ $1, acc4:acc5 2251 SHRQ $1, acc5:acc6 2252 SHRQ $1, acc6:acc7 2253 SHRQ $1, acc7:mul0 2254 ST (y) 2255 ///////////////////////// 2256 LDacc (x) 2257 LDt (s) 2258 CALL p256MulInternal(SB) 2259 ST (s) 2260 p256MulBy2Inline 2261 STt (tmp) 2262 2263 LDacc (m) 2264 CALL p256SqrInternal(SB) 2265 LDt (tmp) 2266 CALL p256SubInternal(SB) 2267 2268 MOVQ rptr, AX 2269 // Store x 2270 MOVQ acc4, (16*0 + 8*0)(AX) 2271 MOVQ acc5, (16*0 + 8*1)(AX) 2272 MOVQ acc6, (16*0 + 8*2)(AX) 2273 MOVQ acc7, (16*0 + 8*3)(AX) 2274 2275 acc2t 2276 LDacc (s) 2277 CALL p256SubInternal(SB) 2278 2279 LDt (m) 2280 CALL p256MulInternal(SB) 2281 2282 LDt (y) 2283 CALL p256SubInternal(SB) 2284 MOVQ rptr, AX 2285 // Store y 2286 MOVQ acc4, (16*2 + 8*0)(AX) 2287 MOVQ acc5, (16*2 + 8*1)(AX) 2288 MOVQ acc6, (16*2 + 8*2)(AX) 2289 MOVQ acc7, (16*2 + 8*3)(AX) 2290 /////////////////////// 2291 MOVQ $0, rptr 2292 2293 RET 2294 /* ---------------------------------------*/ 2295