// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register aliases used by the field routines below.
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

// p256const0/p256const1 are the two non-trivial little-endian words of the
// P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1 (the remaining words are
// all-ones and zero; see the SUBQ $-1 / SBBQ $0 sequences below).
// p256ord holds the group order n, little-endian.
// p256ordK0 is the Montgomery reduction multiplier for n
// (presumably -n^-1 mod 2^64 — it feeds the MULQ that starts each
// reduction step in p256OrdMul/p256OrdSqr).
// p256one is 1 in Montgomery form, i.e. 2^256 mod p.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// The flag value 8 is RODATA (from textflag.h).
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
// The transform (byte-swap each word, reverse word order) is its own
// inverse, so this simply shares p256BigToLittle's body.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
// Converts a 32-byte big-endian value into four little-endian 64-bit limbs
// (and vice versa): BSWAPQ each word, then store the words in reverse order.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0 res=b, else res=a
// Constant time: a branch-free select of 96 bytes (a full Jacobian point)
// using an SSE mask instead of a data-dependent jump.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+24(FP), x_ptr
	MOVQ b+48(FP), y_ptr
	MOVQ cond+72(FP), X12

	// X12 = (cond == 0) ? all-ones : all-zeros, replicated to every lane.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// res = (a AND NOT mask) XOR (b AND mask).
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val []uint64, cond int)
// Conditionally negates val modulo p, in place: val = p - val when
// cond != 0, unchanged when cond == 0. Branch-free via CMOV.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+24(FP), t0
	// acc = poly
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	// Load the original value
	MOVQ (8*0)(res_ptr), acc5
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1
	// Speculatively subtract
	SUBQ acc5, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3
	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc5, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3
	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in []uint64)
// Montgomery square: res = in * in * 2^-256 mod p. Schoolbook squaring
// (off-diagonal products doubled, diagonal squares added), then four
// word-wise Montgomery reduction steps exploiting the sparse form of p,
// followed by a single conditional subtraction of p.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products (the diagonal squares y[i]^2)
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc0
	ADCQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2
	// Last reduction step
	XORQ t0, t0
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc0
	ADCQ t1, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3
	// Add bits [511:256] of the sqr result
	// (MOVQ does not touch flags, so the carry from the reduction chain
	// flows into this ADCQ chain; t0 collects the final carry-out.)
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256 (SUBQ $-1 subtracts the all-ones low word of p)
	SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
	SBBQ $0, acc2
	SBBQ p256const1<>(SB), acc3
	SBBQ $0, t0

	// If the subtraction borrowed, the pre-subtraction value was < p:
	// restore it.
	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 []uint64)
// Montgomery multiplication: res = in1 * in2 * 2^-256 mod p.
// Interleaves the four partial products with the four reduction steps,
// then performs one conditional subtraction of p.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	XORQ acc0, acc0
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	XORQ acc1, acc1
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1
	SBBQ $0, acc2

	// Borrow means the result was already reduced; restore it.
	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
// Converts out of Montgomery form: res = in * 2^-256 mod p.
// Equivalent to p256Mul with a second operand of 1, so only the four
// reduction steps are needed.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5
	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	XORQ acc0, acc0
	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	XORQ acc1, acc1
	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1

	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	// Final conditional subtraction of p.
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point, table []uint64, idx int)
// Scans all 16 table entries unconditionally, accumulating only the entry
// whose position matches idx via an equality mask — no memory access
// depends on the secret index.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVQ idx+48(FP),AX
	MOVQ table+24(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15  // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15  // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $16, AX

	MOVOU X15, X13

loop_select:

	// X12 = (counter == idx) ? all-ones : 0; then bump the counter.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET
/* ---------------------------------------*/
// Constant time point access to
// base point table.
// func p256SelectBase(point, table []uint64, idx int)
// Same masked-scan technique as p256Select, but for affine base-point
// table entries (64 bytes each); processes two entries per loop iteration,
// 32 iterations in total.
TEXT ·p256SelectBase(SB),NOSPLIT,$0
	MOVQ idx+48(FP),AX
	MOVQ table+24(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15  // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15  // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $32, AX

	MOVOU X15, X13

loop_select_base:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	// Second mask/counter step for the second entry of this iteration.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 []uint64)
// Montgomery multiplication modulo the group order n:
// res = in1 * in2 * 2^-256 mod n. Unlike the field routines, n has no
// special structure here, so each reduction step multiplies the
// p256ordK0 factor against the full stored order.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256 (here: the group order n)
	SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
	SBBQ p256ord<>+0x10(SB), acc0
	SBBQ p256ord<>+0x18(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in []uint64, n int)
// n successive Montgomery squarings modulo the group order: after the
// first iteration x_ptr is redirected at res, so iterations 2..n square
// the previous output in place.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVQ n+48(FP), BX

ordSqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products (diagonal squares)
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step
	// The high words of n (0xffffffffffffffff, 0xffffffff00000000) are
	// folded in via SUBQ/shifted-t0 arithmetic instead of two more MULQs.
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1

	MOVQ t0, t1
	ADCQ DX, acc2
	ADCQ $0, t1
	SUBQ t0, acc2
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc0
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc3
	ADCQ $0, acc0
	SUBQ AX, acc3
	SBBQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2

	MOVQ t0, t1
	ADCQ DX, acc3
	ADCQ $0, t1
	SUBQ t0, acc3
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc1
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc0
	ADCQ $0, acc1
	SUBQ AX, acc0
	SBBQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3

	MOVQ t0, t1
	ADCQ DX, acc0
	ADCQ $0, t1
	SUBQ t0, acc0
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc2
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc1
	ADCQ $0, acc2
	SUBQ AX, acc1
	SBBQ DX, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ $0, DX
	// NOTE(review): this MOVQ DX, t1 is dead — t1 is overwritten just
	// below (the earlier reduction steps omit this pair); harmless,
	// since ADCQ $0, DX above cannot itself set the carry.
	MOVQ DX, t1

	MOVQ t0, t1
	ADCQ DX, acc1
	ADCQ $0, t1
	SUBQ t0, acc1
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc3
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc2
	ADCQ $0, acc3
	SUBQ AX, acc2
	SBBQ DX, acc3
	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256 (here: the group order n)
	SUBQ p256ord<>+0x00(SB), acc0
	SBBQ p256ord<>+0x08(SB), acc1
	SBBQ p256ord<>+0x10(SB), acc2
	SBBQ p256ord<>+0x18(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	// Subsequent iterations square the freshly written result.
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE ordSqrLoop

	RET
/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
// Register aliases for the point-arithmetic helpers below. These
// routines use a register-based internal calling convention:
// operand 1 in acc4-acc7, operand 2 in t0-t3, result in acc4-acc7.
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
// p256SubInternal computes (acc4..acc7) = (acc4..acc7) - (t0..t3) mod p.
// A borrow means the raw difference went negative; p is then added back
// and the ANDQ/CMOVQEQ pair selects the correct branch without a jump.
TEXT p256SubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0

	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	// Speculatively add p back (ADDQ $-1 adds the all-ones low word).
	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	// ZF set iff no borrow occurred; keep the raw difference then.
	ANDQ $1, mul0

	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
/* ---------------------------------------*/
// p256MulInternal: Montgomery multiplication with the register
// convention above: (acc4..acc7) = (acc4..acc7) * (t0..t3) * 2^-256 mod p.
// t0..t3 are preserved; acc0..acc3, hlp, AX, DX are clobbered.
TEXT p256MulInternal(SB),NOSPLIT,$0
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Hand-encoded MOVQ $0, BP: written as raw bytes so the assembler
	// cannot rewrite it into a flag-clobbering form — the ADCQ chain
	// below consumes the carry left by the reduction above.
	BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
/* ---------------------------------------*/
// p256SqrInternal: Montgomery square with the register convention above:
// (acc4..acc7) = (acc4..acc7)^2 * 2^-256 mod p.
// Clobbers acc0..acc3, t0..t3, hlp, AX, DX.
TEXT p256SqrInternal(SB),NOSPLIT,$0

	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products (diagonal squares)
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Hand-encoded MOVQ $0, BP — preserves the carry flag consumed by
	// the ADCQ chain below (see p256MulInternal).
	BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
/* ---------------------------------------*/
// (acc4..acc7) = 2*(acc4..acc7) mod p, result also left in t0..t3.
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
// (t0..t3) = (acc4..acc7) + (t0..t3) mod p.
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
// Load/store helpers for the register calling convention.
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2;
MOVQ src(8*3), t3 1701 #define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) 1702 #define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) 1703 #define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 1704 #define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 1705 /* ---------------------------------------*/ 1706 #define x1in(off) (32*0 + off)(SP) 1707 #define y1in(off) (32*1 + off)(SP) 1708 #define z1in(off) (32*2 + off)(SP) 1709 #define x2in(off) (32*3 + off)(SP) 1710 #define y2in(off) (32*4 + off)(SP) 1711 #define xout(off) (32*5 + off)(SP) 1712 #define yout(off) (32*6 + off)(SP) 1713 #define zout(off) (32*7 + off)(SP) 1714 #define s2(off) (32*8 + off)(SP) 1715 #define z1sqr(off) (32*9 + off)(SP) 1716 #define h(off) (32*10 + off)(SP) 1717 #define r(off) (32*11 + off)(SP) 1718 #define hsqr(off) (32*12 + off)(SP) 1719 #define rsqr(off) (32*13 + off)(SP) 1720 #define hcub(off) (32*14 + off)(SP) 1721 #define rptr (32*15)(SP) 1722 #define sel_save (32*15 + 8)(SP) 1723 #define zero_save (32*15 + 8 + 4)(SP) 1724 1725 // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) 1726 TEXT ·p256PointAddAffineAsm(SB),0,$512-96 1727 // Move input to stack in order to free registers 1728 MOVQ res+0(FP), AX 1729 MOVQ in1+24(FP), BX 1730 MOVQ in2+48(FP), CX 1731 MOVQ sign+72(FP), DX 1732 MOVQ sel+80(FP), t1 1733 MOVQ zero+88(FP), t2 1734 1735 MOVOU (16*0)(BX), X0 1736 MOVOU (16*1)(BX), X1 1737 MOVOU (16*2)(BX), X2 1738 MOVOU (16*3)(BX), X3 1739 MOVOU (16*4)(BX), X4 1740 MOVOU (16*5)(BX), X5 1741 1742 MOVOU X0, x1in(16*0) 1743 MOVOU X1, x1in(16*1) 1744 MOVOU X2, y1in(16*0) 1745 MOVOU X3, y1in(16*1) 1746 MOVOU X4, z1in(16*0) 1747 MOVOU X5, z1in(16*1) 1748 1749 MOVOU (16*0)(CX), X0 1750 MOVOU (16*1)(CX), X1 1751 1752 MOVOU X0, x2in(16*0) 1753 MOVOU X1, x2in(16*1) 1754 // Store pointer to result 1755 MOVQ mul0, rptr 1756 MOVL t1, sel_save 1757 MOVL t2, 
zero_save 1758 // Negate y2in based on sign 1759 MOVQ (16*2 + 8*0)(CX), acc4 1760 MOVQ (16*2 + 8*1)(CX), acc5 1761 MOVQ (16*2 + 8*2)(CX), acc6 1762 MOVQ (16*2 + 8*3)(CX), acc7 1763 MOVQ $-1, acc0 1764 MOVQ p256const0<>(SB), acc1 1765 MOVQ $0, acc2 1766 MOVQ p256const1<>(SB), acc3 1767 XORQ mul0, mul0 1768 // Speculatively subtract 1769 SUBQ acc4, acc0 1770 SBBQ acc5, acc1 1771 SBBQ acc6, acc2 1772 SBBQ acc7, acc3 1773 SBBQ $0, mul0 1774 MOVQ acc0, t0 1775 MOVQ acc1, t1 1776 MOVQ acc2, t2 1777 MOVQ acc3, t3 1778 // Add in case the operand was > p256 1779 ADDQ $-1, acc0 1780 ADCQ p256const0<>(SB), acc1 1781 ADCQ $0, acc2 1782 ADCQ p256const1<>(SB), acc3 1783 ADCQ $0, mul0 1784 CMOVQNE t0, acc0 1785 CMOVQNE t1, acc1 1786 CMOVQNE t2, acc2 1787 CMOVQNE t3, acc3 1788 // If condition is 0, keep original value 1789 TESTQ DX, DX 1790 CMOVQEQ acc4, acc0 1791 CMOVQEQ acc5, acc1 1792 CMOVQEQ acc6, acc2 1793 CMOVQEQ acc7, acc3 1794 // Store result 1795 MOVQ acc0, y2in(8*0) 1796 MOVQ acc1, y2in(8*1) 1797 MOVQ acc2, y2in(8*2) 1798 MOVQ acc3, y2in(8*3) 1799 // Begin point add 1800 LDacc (z1in) 1801 CALL p256SqrInternal(SB) // z1ˆ2 1802 ST (z1sqr) 1803 1804 LDt (x2in) 1805 CALL p256MulInternal(SB) // x2 * z1ˆ2 1806 1807 LDt (x1in) 1808 CALL p256SubInternal(SB) // h = u2 - u1 1809 ST (h) 1810 1811 LDt (z1in) 1812 CALL p256MulInternal(SB) // z3 = h * z1 1813 ST (zout) 1814 1815 LDacc (z1sqr) 1816 CALL p256MulInternal(SB) // z1ˆ3 1817 1818 LDt (y2in) 1819 CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 1820 ST (s2) 1821 1822 LDt (y1in) 1823 CALL p256SubInternal(SB) // r = s2 - s1 1824 ST (r) 1825 1826 CALL p256SqrInternal(SB) // rsqr = rˆ2 1827 ST (rsqr) 1828 1829 LDacc (h) 1830 CALL p256SqrInternal(SB) // hsqr = hˆ2 1831 ST (hsqr) 1832 1833 LDt (h) 1834 CALL p256MulInternal(SB) // hcub = hˆ3 1835 ST (hcub) 1836 1837 LDt (y1in) 1838 CALL p256MulInternal(SB) // y1 * hˆ3 1839 ST (s2) 1840 1841 LDacc (x1in) 1842 LDt (hsqr) 1843 CALL p256MulInternal(SB) // u1 * hˆ2 1844 ST (h) 1845 1846 
p256MulBy2Inline // u1 * hˆ2 * 2, inline 1847 LDacc (rsqr) 1848 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 1849 1850 LDt (hcub) 1851 CALL p256SubInternal(SB) 1852 ST (xout) 1853 1854 MOVQ acc4, t0 1855 MOVQ acc5, t1 1856 MOVQ acc6, t2 1857 MOVQ acc7, t3 1858 LDacc (h) 1859 CALL p256SubInternal(SB) 1860 1861 LDt (r) 1862 CALL p256MulInternal(SB) 1863 1864 LDt (s2) 1865 CALL p256SubInternal(SB) 1866 ST (yout) 1867 // Load stored values from stack 1868 MOVQ rptr, AX 1869 MOVL sel_save, BX 1870 MOVL zero_save, CX 1871 // The result is not valid if (sel == 0), conditional choose 1872 MOVOU xout(16*0), X0 1873 MOVOU xout(16*1), X1 1874 MOVOU yout(16*0), X2 1875 MOVOU yout(16*1), X3 1876 MOVOU zout(16*0), X4 1877 MOVOU zout(16*1), X5 1878 1879 MOVL BX, X6 1880 MOVL CX, X7 1881 1882 PXOR X8, X8 1883 PCMPEQL X9, X9 1884 1885 PSHUFD $0, X6, X6 1886 PSHUFD $0, X7, X7 1887 1888 PCMPEQL X8, X6 1889 PCMPEQL X8, X7 1890 1891 MOVOU X6, X15 1892 PANDN X9, X15 1893 1894 MOVOU x1in(16*0), X9 1895 MOVOU x1in(16*1), X10 1896 MOVOU y1in(16*0), X11 1897 MOVOU y1in(16*1), X12 1898 MOVOU z1in(16*0), X13 1899 MOVOU z1in(16*1), X14 1900 1901 PAND X15, X0 1902 PAND X15, X1 1903 PAND X15, X2 1904 PAND X15, X3 1905 PAND X15, X4 1906 PAND X15, X5 1907 1908 PAND X6, X9 1909 PAND X6, X10 1910 PAND X6, X11 1911 PAND X6, X12 1912 PAND X6, X13 1913 PAND X6, X14 1914 1915 PXOR X9, X0 1916 PXOR X10, X1 1917 PXOR X11, X2 1918 PXOR X12, X3 1919 PXOR X13, X4 1920 PXOR X14, X5 1921 // Similarly if zero == 0 1922 PCMPEQL X9, X9 1923 MOVOU X7, X15 1924 PANDN X9, X15 1925 1926 MOVOU x2in(16*0), X9 1927 MOVOU x2in(16*1), X10 1928 MOVOU y2in(16*0), X11 1929 MOVOU y2in(16*1), X12 1930 MOVOU p256one<>+0x00(SB), X13 1931 MOVOU p256one<>+0x10(SB), X14 1932 1933 PAND X15, X0 1934 PAND X15, X1 1935 PAND X15, X2 1936 PAND X15, X3 1937 PAND X15, X4 1938 PAND X15, X5 1939 1940 PAND X7, X9 1941 PAND X7, X10 1942 PAND X7, X11 1943 PAND X7, X12 1944 PAND X7, X13 1945 PAND X7, X14 1946 1947 PXOR X9, X0 1948 PXOR X10, X1 
1949 PXOR X11, X2 1950 PXOR X12, X3 1951 PXOR X13, X4 1952 PXOR X14, X5 1953 // Finally output the result 1954 MOVOU X0, (16*0)(AX) 1955 MOVOU X1, (16*1)(AX) 1956 MOVOU X2, (16*2)(AX) 1957 MOVOU X3, (16*3)(AX) 1958 MOVOU X4, (16*4)(AX) 1959 MOVOU X5, (16*5)(AX) 1960 MOVQ $0, rptr 1961 1962 RET 1963 #undef x1in 1964 #undef y1in 1965 #undef z1in 1966 #undef x2in 1967 #undef y2in 1968 #undef xout 1969 #undef yout 1970 #undef zout 1971 #undef s2 1972 #undef z1sqr 1973 #undef h 1974 #undef r 1975 #undef hsqr 1976 #undef rsqr 1977 #undef hcub 1978 #undef rptr 1979 #undef sel_save 1980 #undef zero_save 1981 1982 // p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero 1983 // otherwise. It writes to [acc4..acc7], t0 and t1. 1984 TEXT p256IsZero(SB),NOSPLIT,$0 1985 // AX contains a flag that is set if the input is zero. 1986 XORQ AX, AX 1987 MOVQ $1, t1 1988 1989 // Check whether [acc4..acc7] are all zero. 1990 MOVQ acc4, t0 1991 ORQ acc5, t0 1992 ORQ acc6, t0 1993 ORQ acc7, t0 1994 1995 // Set the zero flag if so. (CMOV of a constant to a register doesn't 1996 // appear to be supported in Go. Thus t1 = 1.) 1997 CMOVQEQ t1, AX 1998 1999 // XOR [acc4..acc7] with P and compare with zero again. 2000 XORQ $-1, acc4 2001 XORQ p256const0<>(SB), acc5 2002 XORQ p256const1<>(SB), acc7 2003 ORQ acc5, acc4 2004 ORQ acc6, acc4 2005 ORQ acc7, acc4 2006 2007 // Set the zero flag if so. 
2008 CMOVQEQ t1, AX 2009 RET 2010 2011 /* ---------------------------------------*/ 2012 #define x1in(off) (32*0 + off)(SP) 2013 #define y1in(off) (32*1 + off)(SP) 2014 #define z1in(off) (32*2 + off)(SP) 2015 #define x2in(off) (32*3 + off)(SP) 2016 #define y2in(off) (32*4 + off)(SP) 2017 #define z2in(off) (32*5 + off)(SP) 2018 2019 #define xout(off) (32*6 + off)(SP) 2020 #define yout(off) (32*7 + off)(SP) 2021 #define zout(off) (32*8 + off)(SP) 2022 2023 #define u1(off) (32*9 + off)(SP) 2024 #define u2(off) (32*10 + off)(SP) 2025 #define s1(off) (32*11 + off)(SP) 2026 #define s2(off) (32*12 + off)(SP) 2027 #define z1sqr(off) (32*13 + off)(SP) 2028 #define z2sqr(off) (32*14 + off)(SP) 2029 #define h(off) (32*15 + off)(SP) 2030 #define r(off) (32*16 + off)(SP) 2031 #define hsqr(off) (32*17 + off)(SP) 2032 #define rsqr(off) (32*18 + off)(SP) 2033 #define hcub(off) (32*19 + off)(SP) 2034 #define rptr (32*20)(SP) 2035 #define points_eq (32*20+8)(SP) 2036 2037 //func p256PointAddAsm(res, in1, in2 []uint64) int 2038 TEXT ·p256PointAddAsm(SB),0,$680-80 2039 // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl 2040 // Move input to stack in order to free registers 2041 MOVQ res+0(FP), AX 2042 MOVQ in1+24(FP), BX 2043 MOVQ in2+48(FP), CX 2044 2045 MOVOU (16*0)(BX), X0 2046 MOVOU (16*1)(BX), X1 2047 MOVOU (16*2)(BX), X2 2048 MOVOU (16*3)(BX), X3 2049 MOVOU (16*4)(BX), X4 2050 MOVOU (16*5)(BX), X5 2051 2052 MOVOU X0, x1in(16*0) 2053 MOVOU X1, x1in(16*1) 2054 MOVOU X2, y1in(16*0) 2055 MOVOU X3, y1in(16*1) 2056 MOVOU X4, z1in(16*0) 2057 MOVOU X5, z1in(16*1) 2058 2059 MOVOU (16*0)(CX), X0 2060 MOVOU (16*1)(CX), X1 2061 MOVOU (16*2)(CX), X2 2062 MOVOU (16*3)(CX), X3 2063 MOVOU (16*4)(CX), X4 2064 MOVOU (16*5)(CX), X5 2065 2066 MOVOU X0, x2in(16*0) 2067 MOVOU X1, x2in(16*1) 2068 MOVOU X2, y2in(16*0) 2069 MOVOU X3, y2in(16*1) 2070 MOVOU X4, z2in(16*0) 2071 MOVOU X5, z2in(16*1) 2072 // Store pointer to result 2073 MOVQ AX, rptr 2074 // Begin point 
add 2075 LDacc (z2in) 2076 CALL p256SqrInternal(SB) // z2ˆ2 2077 ST (z2sqr) 2078 LDt (z2in) 2079 CALL p256MulInternal(SB) // z2ˆ3 2080 LDt (y1in) 2081 CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 2082 ST (s1) 2083 2084 LDacc (z1in) 2085 CALL p256SqrInternal(SB) // z1ˆ2 2086 ST (z1sqr) 2087 LDt (z1in) 2088 CALL p256MulInternal(SB) // z1ˆ3 2089 LDt (y2in) 2090 CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 2091 ST (s2) 2092 2093 LDt (s1) 2094 CALL p256SubInternal(SB) // r = s2 - s1 2095 ST (r) 2096 CALL p256IsZero(SB) 2097 MOVQ AX, points_eq 2098 2099 LDacc (z2sqr) 2100 LDt (x1in) 2101 CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 2102 ST (u1) 2103 LDacc (z1sqr) 2104 LDt (x2in) 2105 CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 2106 ST (u2) 2107 2108 LDt (u1) 2109 CALL p256SubInternal(SB) // h = u2 - u1 2110 ST (h) 2111 CALL p256IsZero(SB) 2112 ANDQ points_eq, AX 2113 MOVQ AX, points_eq 2114 2115 LDacc (r) 2116 CALL p256SqrInternal(SB) // rsqr = rˆ2 2117 ST (rsqr) 2118 2119 LDacc (h) 2120 CALL p256SqrInternal(SB) // hsqr = hˆ2 2121 ST (hsqr) 2122 2123 LDt (h) 2124 CALL p256MulInternal(SB) // hcub = hˆ3 2125 ST (hcub) 2126 2127 LDt (s1) 2128 CALL p256MulInternal(SB) 2129 ST (s2) 2130 2131 LDacc (z1in) 2132 LDt (z2in) 2133 CALL p256MulInternal(SB) // z1 * z2 2134 LDt (h) 2135 CALL p256MulInternal(SB) // z1 * z2 * h 2136 ST (zout) 2137 2138 LDacc (hsqr) 2139 LDt (u1) 2140 CALL p256MulInternal(SB) // hˆ2 * u1 2141 ST (u2) 2142 2143 p256MulBy2Inline // u1 * hˆ2 * 2, inline 2144 LDacc (rsqr) 2145 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 2146 2147 LDt (hcub) 2148 CALL p256SubInternal(SB) 2149 ST (xout) 2150 2151 MOVQ acc4, t0 2152 MOVQ acc5, t1 2153 MOVQ acc6, t2 2154 MOVQ acc7, t3 2155 LDacc (u2) 2156 CALL p256SubInternal(SB) 2157 2158 LDt (r) 2159 CALL p256MulInternal(SB) 2160 2161 LDt (s2) 2162 CALL p256SubInternal(SB) 2163 ST (yout) 2164 2165 MOVOU xout(16*0), X0 2166 MOVOU xout(16*1), X1 2167 MOVOU yout(16*0), X2 2168 MOVOU yout(16*1), X3 2169 MOVOU zout(16*0), X4 2170 MOVOU 
zout(16*1), X5 2171 // Finally output the result 2172 MOVQ rptr, AX 2173 MOVQ $0, rptr 2174 MOVOU X0, (16*0)(AX) 2175 MOVOU X1, (16*1)(AX) 2176 MOVOU X2, (16*2)(AX) 2177 MOVOU X3, (16*3)(AX) 2178 MOVOU X4, (16*4)(AX) 2179 MOVOU X5, (16*5)(AX) 2180 2181 MOVQ points_eq, AX 2182 MOVQ AX, ret+72(FP) 2183 2184 RET 2185 #undef x1in 2186 #undef y1in 2187 #undef z1in 2188 #undef x2in 2189 #undef y2in 2190 #undef z2in 2191 #undef xout 2192 #undef yout 2193 #undef zout 2194 #undef s1 2195 #undef s2 2196 #undef u1 2197 #undef u2 2198 #undef z1sqr 2199 #undef z2sqr 2200 #undef h 2201 #undef r 2202 #undef hsqr 2203 #undef rsqr 2204 #undef hcub 2205 #undef rptr 2206 /* ---------------------------------------*/ 2207 #define x(off) (32*0 + off)(SP) 2208 #define y(off) (32*1 + off)(SP) 2209 #define z(off) (32*2 + off)(SP) 2210 2211 #define s(off) (32*3 + off)(SP) 2212 #define m(off) (32*4 + off)(SP) 2213 #define zsqr(off) (32*5 + off)(SP) 2214 #define tmp(off) (32*6 + off)(SP) 2215 #define rptr (32*7)(SP) 2216 2217 //func p256PointDoubleAsm(res, in []uint64) 2218 TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48 2219 // Move input to stack in order to free registers 2220 MOVQ res+0(FP), AX 2221 MOVQ in+24(FP), BX 2222 2223 MOVOU (16*0)(BX), X0 2224 MOVOU (16*1)(BX), X1 2225 MOVOU (16*2)(BX), X2 2226 MOVOU (16*3)(BX), X3 2227 MOVOU (16*4)(BX), X4 2228 MOVOU (16*5)(BX), X5 2229 2230 MOVOU X0, x(16*0) 2231 MOVOU X1, x(16*1) 2232 MOVOU X2, y(16*0) 2233 MOVOU X3, y(16*1) 2234 MOVOU X4, z(16*0) 2235 MOVOU X5, z(16*1) 2236 // Store pointer to result 2237 MOVQ AX, rptr 2238 // Begin point double 2239 LDacc (z) 2240 CALL p256SqrInternal(SB) 2241 ST (zsqr) 2242 2243 LDt (x) 2244 p256AddInline 2245 STt (m) 2246 2247 LDacc (z) 2248 LDt (y) 2249 CALL p256MulInternal(SB) 2250 p256MulBy2Inline 2251 MOVQ rptr, AX 2252 // Store z 2253 MOVQ t0, (16*4 + 8*0)(AX) 2254 MOVQ t1, (16*4 + 8*1)(AX) 2255 MOVQ t2, (16*4 + 8*2)(AX) 2256 MOVQ t3, (16*4 + 8*3)(AX) 2257 2258 LDacc (x) 2259 LDt (zsqr) 2260 CALL 
p256SubInternal(SB) 2261 LDt (m) 2262 CALL p256MulInternal(SB) 2263 ST (m) 2264 // Multiply by 3 2265 p256MulBy2Inline 2266 LDacc (m) 2267 p256AddInline 2268 STt (m) 2269 //////////////////////// 2270 LDacc (y) 2271 p256MulBy2Inline 2272 t2acc 2273 CALL p256SqrInternal(SB) 2274 ST (s) 2275 CALL p256SqrInternal(SB) 2276 // Divide by 2 2277 XORQ mul0, mul0 2278 MOVQ acc4, t0 2279 MOVQ acc5, t1 2280 MOVQ acc6, t2 2281 MOVQ acc7, t3 2282 2283 ADDQ $-1, acc4 2284 ADCQ p256const0<>(SB), acc5 2285 ADCQ $0, acc6 2286 ADCQ p256const1<>(SB), acc7 2287 ADCQ $0, mul0 2288 TESTQ $1, t0 2289 2290 CMOVQEQ t0, acc4 2291 CMOVQEQ t1, acc5 2292 CMOVQEQ t2, acc6 2293 CMOVQEQ t3, acc7 2294 ANDQ t0, mul0 2295 2296 SHRQ $1, acc4:acc5 2297 SHRQ $1, acc5:acc6 2298 SHRQ $1, acc6:acc7 2299 SHRQ $1, acc7:mul0 2300 ST (y) 2301 ///////////////////////// 2302 LDacc (x) 2303 LDt (s) 2304 CALL p256MulInternal(SB) 2305 ST (s) 2306 p256MulBy2Inline 2307 STt (tmp) 2308 2309 LDacc (m) 2310 CALL p256SqrInternal(SB) 2311 LDt (tmp) 2312 CALL p256SubInternal(SB) 2313 2314 MOVQ rptr, AX 2315 // Store x 2316 MOVQ acc4, (16*0 + 8*0)(AX) 2317 MOVQ acc5, (16*0 + 8*1)(AX) 2318 MOVQ acc6, (16*0 + 8*2)(AX) 2319 MOVQ acc7, (16*0 + 8*3)(AX) 2320 2321 acc2t 2322 LDacc (s) 2323 CALL p256SubInternal(SB) 2324 2325 LDt (m) 2326 CALL p256MulInternal(SB) 2327 2328 LDt (y) 2329 CALL p256SubInternal(SB) 2330 MOVQ rptr, AX 2331 // Store y 2332 MOVQ acc4, (16*2 + 8*0)(AX) 2333 MOVQ acc5, (16*2 + 8*1)(AX) 2334 MOVQ acc6, (16*2 + 8*2)(AX) 2335 MOVQ acc7, (16*2 + 8*3)(AX) 2336 /////////////////////// 2337 MOVQ $0, rptr 2338 2339 RET 2340 /* ---------------------------------------*/ 2341