// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12

	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+8(FP), t0
	// acc = poly
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	// Load the original value
	MOVQ (8*0)(res_ptr), acc5
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1
	// Speculatively subtract
	SUBQ acc5, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3
	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc5, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3
	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX

sqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc0
	ADCQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2
	// Last reduction step
	XORQ t0, t0
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc0
	ADCQ t1, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3
	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256
	SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
	SBBQ $0, acc2
	SBBQ p256const1<>(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE sqrLoop

	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	XORQ acc0, acc0
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	XORQ acc1, acc1
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5
	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	XORQ acc0, acc0
	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	XORQ acc1, acc1
	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1

	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	PXOR X15, X15    // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15   // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $16, AX

	MOVOU X15, X13

loop_select:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET
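// Illustrative sketch, not part of the assembly: the loop above is a
// constant-time table lookup. Every one of the 16 entries is read and a
// compare-generated mask keeps exactly the requested one, so the memory
// access pattern does not depend on idx. A rough Go equivalent, with a
// hypothetical helper name and the point flattened to 12 little-endian
// 64-bit limbs (x, y, z):
//
//	import "crypto/subtle"
//
//	func selectPointSketch(table *[16][12]uint64, idx int) (out [12]uint64) {
//		for i := 0; i < 16; i++ {
//			// eq is 1 only for the requested entry; idx == 0 selects nothing.
//			eq := subtle.ConstantTimeEq(int32(i+1), int32(idx))
//			mask := -uint64(eq) // all zeros or all ones, without branching
//			for j := range out {
//				out[j] |= table[i][j] & mask
//			}
//		}
//		return out
//	}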
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	PXOR X15, X15    // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15   // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $16, AX

	MOVOU X15, X13

loop_select_base:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256
	SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
	SBBQ p256ord<>+0x10(SB), acc0
	SBBQ p256ord<>+0x18(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX

ordSqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1

	MOVQ t0, t1
	ADCQ DX, acc2
	ADCQ $0, t1
	SUBQ t0, acc2
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc0
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc3
	ADCQ $0, acc0
	SUBQ AX, acc3
	SBBQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2

	MOVQ t0, t1
	ADCQ DX, acc3
	ADCQ $0, t1
	SUBQ t0, acc3
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc1
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc0
	ADCQ $0, acc1
	SUBQ AX, acc0
	SBBQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3

	MOVQ t0, t1
	ADCQ DX, acc0
	ADCQ $0, t1
	SUBQ t0, acc0
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc2
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc1
	ADCQ $0, acc2
	SUBQ AX, acc1
	SBBQ DX, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ t0, t1
	ADCQ DX, acc1
	ADCQ $0, t1
	SUBQ t0, acc1
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc3
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc2
	ADCQ $0, acc3
	SUBQ AX, acc2
	SBBQ DX, acc3
	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256
	SUBQ p256ord<>+0x00(SB), acc0
	SBBQ p256ord<>+0x08(SB), acc1
	SBBQ p256ord<>+0x10(SB), acc2
	SBBQ p256ord<>+0x18(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE ordSqrLoop

	RET
/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
TEXT p256SubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0

	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ANDQ $1, mul0

	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
/* ---------------------------------------*/
TEXT p256MulInternal(SB),NOSPLIT,$8
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
/* ---------------------------------------*/
TEXT p256SqrInternal(SB),NOSPLIT,$8

	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
/* ---------------------------------------*/
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)

// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), t1
	MOVQ zero+40(FP), t2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	// Store pointer to result
	MOVQ mul0, rptr
	MOVL t1, sel_save
	MOVL t2, zero_save
	// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4
	MOVQ (16*2 + 8*1)(CX), acc5
	MOVQ (16*2 + 8*2)(CX), acc6
	MOVQ (16*2 + 8*3)(CX), acc7
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	XORQ mul0, mul0
	// Speculatively subtract
	SUBQ acc4, acc0
	SBBQ acc5, acc1
	SBBQ acc6, acc2
	SBBQ acc7, acc3
	SBBQ $0, mul0
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	// Add in case the operand was > p256
	ADDQ $-1, acc0
	ADCQ p256const0<>(SB), acc1
	ADCQ $0, acc2
	ADCQ p256const1<>(SB), acc3
	ADCQ $0, mul0
	CMOVQNE t0, acc0
	CMOVQNE t1, acc1
	CMOVQNE t2, acc2
	CMOVQNE t3, acc3
	// If condition is 0, keep original value
	TESTQ DX, DX
	CMOVQEQ acc4, acc0
	CMOVQEQ acc5, acc1
	CMOVQEQ acc6, acc2
	CMOVQEQ acc7, acc3
	// Store result
	MOVQ acc0, y2in(8*0)
	MOVQ acc1, y2in(8*1)
	MOVQ acc2, y2in(8*2)
	MOVQ acc3, y2in(8*3)
	// Begin point add
	LDacc (z1in)
	CALL p256SqrInternal(SB) // z1ˆ2
	ST (z1sqr)

	LDt (x2in)
	CALL p256MulInternal(SB) // x2 * z1ˆ2

	LDt (x1in)
	CALL p256SubInternal(SB) // h = u2 - u1
	ST (h)

	LDt (z1in)
	CALL p256MulInternal(SB) // z3 = h * z1
	ST (zout)

	LDacc (z1sqr)
	CALL p256MulInternal(SB) // z1ˆ3

	LDt (y2in)
	CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3
	ST (s2)

	LDt (y1in)
	CALL p256SubInternal(SB) // r = s2 - s1
	ST (r)

	CALL p256SqrInternal(SB) // rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB) // hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB) // hcub = hˆ3
	ST (hcub)

	LDt (y1in)
	CALL p256MulInternal(SB) // y1 * hˆ3
	ST (s2)

	LDacc (x1in)
	LDt (hsqr)
	CALL p256MulInternal(SB) // u1 * hˆ2
	ST (h)

	p256MulBy2Inline // u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (h)
	CALL p256SubInternal(SB)

	LDt (r)
	CALL p256MulInternal(SB)

	LDt (s2)
	CALL p256SubInternal(SB)
	ST (yout)
	// Load stored values from stack
	MOVQ rptr, AX
	MOVL sel_save, BX
	MOVL zero_save, CX
	// The result is not valid if (sel == 0), conditional choose
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	MOVL BX, X6
	MOVL CX, X7

	PXOR X8, X8
	PCMPEQL X9, X9

	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	PCMPEQL X8, X6
	PCMPEQL X8, X7

	MOVOU X6, X15
	PANDN X9, X15

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	MOVQ $0, rptr

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save

// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
TEXT p256IsZero(SB),NOSPLIT,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	XORQ $-1, acc4
	XORQ p256const0<>(SB), acc5
	XORQ p256const1<>(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET
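// Illustrative sketch, not part of the assembly: p256IsZero accepts either
// representation of zero modulo p, the all-zero limbs or the limbs of p
// itself. A rough Go equivalent, with hypothetical helper names:
//
//	func p256IsZeroSketch(v [4]uint64) int {
//		// p = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs
//		p := [4]uint64{0xffffffffffffffff, 0x00000000ffffffff, 0, 0xffffffff00000001}
//		isZero := ctAllZero(v[0], v[1], v[2], v[3])
//		isP := ctAllZero(v[0]^p[0], v[1]^p[1], v[2]^p[2], v[3]^p[3])
//		return isZero | isP
//	}
//
//	// ctAllZero returns 1 if every word is zero, without branching on the data.
//	func ctAllZero(w ...uint64) int {
//		var x uint64
//		for _, v := range w {
//			x |= v
//		}
//		return int(((x | -x) >> 63) ^ 1) // top bit of x|-x is set iff x != 0
//	}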

/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)

// func p256PointAddAsm(res, in1, in2 *P256Point) int
TEXT ·p256PointAddAsm(SB),0,$680-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point add
	LDacc (z2in)
	CALL p256SqrInternal(SB) // z2ˆ2
	ST (z2sqr)
	LDt (z2in)
	CALL p256MulInternal(SB) // z2ˆ3
	LDt (y1in)
	CALL p256MulInternal(SB) // s1 = z2ˆ3*y1
	ST (s1)

	LDacc (z1in)
	CALL p256SqrInternal(SB) // z1ˆ2
	ST (z1sqr)
	LDt (z1in)
	CALL p256MulInternal(SB) // z1ˆ3
	LDt (y2in)
	CALL p256MulInternal(SB) // s2 = z1ˆ3*y2
	ST (s2)

	LDt (s1)
	CALL p256SubInternal(SB) // r = s2 - s1
	ST (r)
	CALL p256IsZero(SB)
	MOVQ AX, points_eq

	LDacc (z2sqr)
	LDt (x1in)
	CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2
	ST (u1)
	LDacc (z1sqr)
	LDt (x2in)
	CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2
	ST (u2)

	LDt (u1)
	CALL p256SubInternal(SB) // h = u2 - u1
	ST (h)
	CALL p256IsZero(SB)
	ANDQ points_eq, AX
	MOVQ AX, points_eq

	LDacc (r)
	CALL p256SqrInternal(SB) // rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB) // hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB) // hcub = hˆ3
	ST (hcub)

	LDt (s1)
	CALL p256MulInternal(SB)
	ST (s2)

	LDacc (z1in)
	LDt (z2in)
	CALL p256MulInternal(SB) // z1 * z2
	LDt (h)
	CALL p256MulInternal(SB) // z1 * z2 * h
	ST (zout)

	LDacc (hsqr)
	LDt (u1)
	CALL p256MulInternal(SB) // hˆ2 * u1
	ST (u2)

	p256MulBy2Inline // u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (u2)
	CALL p256SubInternal(SB)

	LDt (r)
	CALL p256MulInternal(SB)

	LDt (s2)
	CALL p256SubInternal(SB)
	ST (yout)

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

// func p256PointDoubleAsm(res, in *P256Point)
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x(16*0)
	MOVOU X1, x(16*1)
	MOVOU X2, y(16*0)
	MOVOU X3, y(16*1)
	MOVOU X4, z(16*0)
	MOVOU X5, z(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	LDacc (z)
	CALL p256SqrInternal(SB)
	ST (zsqr)

	LDt (x)
	p256AddInline
	STt (m)

	LDacc (z)
	LDt (y)
	CALL p256MulInternal(SB)
	p256MulBy2Inline
	MOVQ rptr, AX
	// Store z
	MOVQ t0, (16*4 + 8*0)(AX)
	MOVQ t1, (16*4 + 8*1)(AX)
	MOVQ t2, (16*4 + 8*2)(AX)
	MOVQ t3, (16*4 + 8*3)(AX)

	LDacc (x)
	LDt (zsqr)
	CALL p256SubInternal(SB)
	LDt (m)
	CALL p256MulInternal(SB)
	ST (m)
	// Multiply by 3
	p256MulBy2Inline
	LDacc (m)
	p256AddInline
	STt (m)
	////////////////////////
	LDacc (y)
	p256MulBy2Inline
	t2acc
	CALL p256SqrInternal(SB)
	ST (s)
	CALL p256SqrInternal(SB)
	// Divide by 2
	XORQ mul0, mul0
	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3

	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ADCQ $0, mul0
	TESTQ $1, t0

	CMOVQEQ t0, acc4
	CMOVQEQ t1, acc5
	CMOVQEQ t2, acc6
	CMOVQEQ t3, acc7
	ANDQ t0, mul0

	SHRQ $1, acc5, acc4
	SHRQ $1, acc6, acc5
	SHRQ $1, acc7, acc6
	SHRQ $1, mul0, acc7
	ST (y)
	/////////////////////////
	LDacc (x)
	LDt (s)
	CALL p256MulInternal(SB)
	ST (s)
	p256MulBy2Inline
	STt (tmp)

	LDacc (m)
	CALL p256SqrInternal(SB)
	LDt (tmp)
	CALL p256SubInternal(SB)

	MOVQ rptr, AX
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	acc2t
	LDacc (s)
	CALL p256SubInternal(SB)

	LDt (m)
	CALL p256MulInternal(SB)

	LDt (y)
	CALL p256SubInternal(SB)
	MOVQ rptr, AX
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	///////////////////////
	MOVQ $0, rptr

	RET
/* ---------------------------------------*/
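// Illustrative sketch, not part of the assembly: the four "reduction step"
// blocks in p256Mul, p256Sqr and the internal helpers each perform one word
// of Montgomery reduction modulo p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
// following the Gueron-Krasnov approach cited at the top of the file. Because
// the lowest limb of p is 2^64-1, the Montgomery factor for a step is simply
// the lowest accumulator limb, and adding that multiple of p only needs a
// shift pair plus one multiplication by the top limb of p. A rough Go
// equivalent of a single step, with a hypothetical helper name:
//
//	import "math/bits"
//
//	func montReduceStepSketch(acc [4]uint64) [4]uint64 {
//		m := acc[0]                                 // Montgomery factor: p[0] == 2^64-1
//		hi, lo := bits.Mul64(m, 0xffffffff00000001) // m * p[3]
//		var out [4]uint64
//		var c uint64
//		out[0], c = bits.Add64(acc[1], m<<32, 0) // m*p[1] plus the carry out of limb 0
//		out[1], c = bits.Add64(acc[2], m>>32, c) // upper half of m*2^32 moves one limb up
//		out[2], c = bits.Add64(acc[3], lo, c)    // p[2] == 0, so only m*p[3] lands here
//		out[3], _ = bits.Add64(hi, 0, c)         // new top limb; limb 0 has been absorbed
//		return out
//	}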