// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register aliases used by the field-arithmetic routines below
// (redefined after the scalar routines for the point routines).
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

// p256const0/p256const1 are limbs 1 and 3 of the P-256 prime p
// (limbs 0 and 2 are $-1 and $0 and are used as immediates in the
// "Subtract p256" sequences below).
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// p256ordK0 is the Montgomery reduction multiplier for the group order
// (presumably -ord^-1 mod 2^64 — used to derive the per-limb factor t0
// in the ord reduction steps; confirm against upstream).
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// p256ord is the order of the P-256 base point, little-endian 64-bit limbs.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one: not referenced in this part of the file; presumably 1 in the
// Montgomery domain (R mod p) — verify against callers.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
// Same layout as p256BigToLittle; all four conversions share one body.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
// Converts between 32 big-endian bytes and four little-endian 64-bit
// limbs by byte-swapping each quadword and reversing the limb order.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	// Limb order is reversed as well, so the whole 32-byte value flips.
	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
// Constant-time (branchless) select: res = b if cond == 0, else res = a.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12

	PXOR X13, X13
	PSHUFD $0, X12, X12
	// X12 = all-ones lanes when cond == 0, zero otherwise.
	PCMPEQL X13, X12

	// X0..X5 = ~mask & a (96 bytes of the point, 6 x 16-byte chunks).
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	// mask & b
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// res = (~mask & a) | (mask & b)
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
// Conditionally negates val modulo p in place: if cond != 0, val = p - val;
// otherwise val is left unchanged. Branch-free via CMOV.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+8(FP), t0
	// acc = poly
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	// Load the original value
	MOVQ (8*0)(res_ptr), acc5
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1
	// Speculatively subtract
	SUBQ acc5, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3
	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc5, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3
	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
// Montgomery squaring repeated n times: res = in^(2^n) in the Montgomery
// domain. After the first iteration res feeds back as the input
// (x_ptr is pointed at res at the bottom of the loop).
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX

sqrLoop:

	// Cross products first, so they can be doubled in one pass.
	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products (the squares on the diagonal)
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// Montgomery reduction: fold one limb per step using the sparse
	// structure of p (only p256const1 needs a real multiply).
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc0
	ADCQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2
	// Last reduction step
	// t0 is zeroed here (XORQ also clears CF before the ADDQ chain).
	XORQ t0, t0
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc0
	ADCQ t1, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3
	// Add bits [511:256] of the sqr result
	// (MOVQ does not touch flags, so the carry from above survives.)
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256; keep the subtracted value unless it borrowed.
	SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
	SBBQ $0, acc2
	SBBQ p256const1<>(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	// Square res in place on subsequent iterations.
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE sqrLoop

	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
// Montgomery multiplication in the field: schoolbook 256x256 multiply
// interleaved with per-limb Montgomery reduction steps.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	XORQ acc0, acc0
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	XORQ acc1, acc1
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256; restore the pre-subtraction copy if it borrowed.
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
// Converts out of the Montgomery domain: four reduction stages
// (one per limb) followed by a final conditional subtraction of p.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5
	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	XORQ acc0, acc0
	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	XORQ acc1, acc1
	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1

	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	// Conditional final subtraction of p.
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
// Constant-time table lookup: scans all 16 entries (96 bytes each) and
// accumulates the one whose 1-based position equals idx. idx == 0
// matches nothing, leaving res zeroed.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVQ idx+16(FP),AX
	MOVQ table+8(FP),DI
	MOVQ res+0(FP),DX

	PXOR X15, X15 // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15 // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $16, AX

	MOVOU X15, X13

loop_select:

	// X12 = all-ones iff the running counter equals idx.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Constant-time lookup over 32 affine entries (64 bytes each), scanning
// two entries per iteration of the 16-pass loop.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVQ idx+16(FP),AX
	MOVQ table+8(FP),DI
	MOVQ res+0(FP),DX

	PXOR X15, X15 // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15 // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $16, AX

	MOVOU X15, X13

loop_select_base:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	// Mask for the first of the two entries read this iteration.
	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	// Advance the counter and build the mask for the second entry.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
// Montgomery multiplication modulo the group order. Reduction uses the
// p256ordK0 multiplier to derive each step's factor t0, then folds in
// t0 * ord limb by limb.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract the group order; keep the difference unless it borrowed.
	SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
	SBBQ p256ord<>+0x10(SB), acc0
	SBBQ p256ord<>+0x18(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
// Montgomery squaring modulo the group order, repeated n times
// (res feeds back as input after the first iteration).
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX

ordSqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products (the squares on the diagonal)
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1

	// ord[2] = 2^64-1 and ord[3] = 2^64 - 2^32, so t0*ord[2] and
	// t0*ord[3] are folded in via shifts/subtractions instead of MULQ.
	MOVQ t0, t1
	ADCQ DX, acc2
	ADCQ $0, t1
	SUBQ t0, acc2
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc0
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc3
	ADCQ $0, acc0
	SUBQ AX, acc3
	SBBQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2

	MOVQ t0, t1
	ADCQ DX, acc3
	ADCQ $0, t1
	SUBQ t0, acc3
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc1
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc0
	ADCQ $0, acc1
	SUBQ AX, acc0
	SBBQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3

	MOVQ t0, t1
	ADCQ DX, acc0
	ADCQ $0, t1
	SUBQ t0, acc0
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc2
	SHLQ $32, AX
	SHRQ $32, DX
	ADDQ t1, acc1
	ADCQ $0, acc2
	SUBQ AX, acc1
	SBBQ DX, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	// NOTE(review): unlike the three reduction steps above, this step has
	// an extra ADCQ $0, DX / MOVQ DX, t1 pair here, and the MOVQ is dead
	// (t1 is overwritten by MOVQ t0, t1 just below). Confirm against the
	// upstream Go p256_asm_amd64.s before changing anything.
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ t0, t1
	ADCQ DX, acc1
	ADCQ $0, t1
	SUBQ t0, acc1
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc3
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc2
	ADCQ $0, acc3
	SUBQ AX, acc2
	SBBQ DX, acc3
	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract the group order p256ord; restore on borrow.
	SUBQ p256ord<>+0x00(SB), acc0
	SBBQ p256ord<>+0x08(SB), acc1
	SBBQ p256ord<>+0x10(SB), acc2
	SBBQ p256ord<>+0x18(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	// Square res in place on subsequent iterations.
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE ordSqrLoop

	RET
/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
// Register aliases for the point-arithmetic internals below.
// Operands live in acc4..acc7 and t0..t3; mul0/mul1 are the MULQ
// input/output registers, hlp is a scratch register (BP).
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
// p256SubInternal computes (acc4..acc7) = (acc4..acc7) - (t0..t3) mod p.
// If the raw subtraction borrowed, p is added back; the ANDQ sets ZF so
// the CMOVQEQs keep the un-adjusted value when there was no borrow.
TEXT p256SubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0

	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	// Speculatively add p back.
	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ANDQ $1, mul0

	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
/* ---------------------------------------*/
// p256MulInternal computes the Montgomery product of (acc4..acc7) and
// (t0..t3); the result is left in acc4..acc7. Clobbers acc0..acc3,
// mul0, mul1 and hlp.
TEXT p256MulInternal(SB),NOSPLIT,$8
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Zero hlp (BP) without touching flags: the carry from the
	// reduction above must survive into the ADCQ chain below.
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
/* ---------------------------------------*/
// p256SqrInternal computes the Montgomery square of (acc4..acc7);
// the result is left in acc4..acc7. Clobbers acc0..acc3, t0..t3,
// mul0, mul1 and hlp.
TEXT p256SqrInternal(SB),NOSPLIT,$8

	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products (the squares on the diagonal)
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Zero hlp (BP) without touching flags so the carry survives.
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
/* ---------------------------------------*/
// p256MulBy2Inline: (t0..t3) = 2 * (acc4..acc7) mod p, with the same
// conditional-subtraction pattern as the functions above.
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/* ---------------------------------------*/
// p256AddInline: (t0..t3) = (acc4..acc7) + (t0..t3) mod p.
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
/*
 ---------------------------------------*/
// Load/store helpers: move a 256-bit (4x64-bit) field element between a
// stack slot and the working register sets [acc4..acc7] / [t0..t3].
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
// Stack frame layout for p256PointAddAffineAsm: 32-byte slots for the
// inputs, outputs, and intermediates of the mixed (Jacobian + affine)
// addition, followed by the saved result pointer and the sel/zero flags.
#define x1in(off)  (32*0 + off)(SP)
#define y1in(off)  (32*1 + off)(SP)
#define z1in(off)  (32*2 + off)(SP)
#define x2in(off)  (32*3 + off)(SP)
#define y2in(off)  (32*4 + off)(SP)
#define xout(off)  (32*5 + off)(SP)
#define yout(off)  (32*6 + off)(SP)
#define zout(off)  (32*7 + off)(SP)
#define s2(off)    (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off)     (32*10 + off)(SP)
#define r(off)     (32*11 + off)(SP)
#define hsqr(off)  (32*12 + off)(SP)
#define rsqr(off)  (32*13 + off)(SP)
#define hcub(off)  (32*14 + off)(SP)
#define rptr       (32*15)(SP)
#define sel_save   (32*15 + 8)(SP)
#define zero_save  (32*15 + 8 + 4)(SP)

// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Mixed Jacobian + affine point addition, constant-time. y2 is negated when
// sign != 0; if sel == 0 the result is replaced by in1, and if zero == 0 by
// (x2in, y2in, one) — both via branch-free SSE masking at the end.
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), t1
	MOVQ zero+40(FP), t2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	// Store pointer to result (mul0 aliases AX, which still holds res —
	// see the register #defines earlier in the file).
	MOVQ mul0, rptr
	MOVL t1, sel_save
	MOVL t2, zero_save
	// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4
	MOVQ (16*2 + 8*1)(CX), acc5
	MOVQ (16*2 + 8*2)(CX), acc6
	MOVQ (16*2 + 8*3)(CX), acc7
	// Load p256 limb-wise into [acc0..acc3] to compute p - y2.
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	XORQ mul0, mul0
	// Speculatively subtract
	SUBQ acc4, acc0
	SBBQ acc5, acc1
	SBBQ acc6, acc2
	SBBQ acc7, acc3
	SBBQ $0, mul0
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	// Add in case the operand was > p256
	ADDQ $-1, acc0
	ADCQ p256const0<>(SB), acc1
	ADCQ $0, acc2
	ADCQ p256const1<>(SB), acc3
	ADCQ $0, mul0
	CMOVQNE t0, acc0
	CMOVQNE t1, acc1
	CMOVQNE t2, acc2
	CMOVQNE t3, acc3
	// If condition is 0, keep original value
	TESTQ DX, DX
	CMOVQEQ acc4, acc0
	CMOVQEQ acc5, acc1
	CMOVQEQ acc6, acc2
	CMOVQEQ acc7, acc3
	// Store result
	MOVQ acc0, y2in(8*0)
	MOVQ acc1, y2in(8*1)
	MOVQ acc2, y2in(8*2)
	MOVQ acc3, y2in(8*3)
	// Begin point add
	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)

	LDt (x2in)
	CALL p256MulInternal(SB)	// x2 * z1ˆ2

	LDt (x1in)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)

	LDt (z1in)
	CALL p256MulInternal(SB)	// z3 = h * z1
	ST (zout)

	LDacc (z1sqr)
	CALL p256MulInternal(SB)	// z1ˆ3

	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
	ST (s2)

	LDt (y1in)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)

	CALL p256SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (y1in)
	CALL p256MulInternal(SB)	// y1 * hˆ3
	ST (s2)

	LDacc (x1in)
	LDt (hsqr)
	CALL p256MulInternal(SB)	// u1 * hˆ2
	ST (h)

	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (h)
	CALL p256SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r*(u1*hˆ2 - x3) - y1*hˆ3
	ST (yout)
	// Load stored values from stack
	MOVQ rptr, AX
	MOVL sel_save, BX
	MOVL zero_save, CX
	// The result is not valid if (sel == 0), conditional choose
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	MOVL BX, X6
	MOVL CX, X7

	PXOR X8, X8
	PCMPEQL X9, X9	// X9 = all-ones

	PSHUFD $0, X6, X6	// broadcast sel to all lanes
	PSHUFD $0, X7, X7	// broadcast zero flag to all lanes

	PCMPEQL X8, X6	// X6 = mask: all-ones iff sel == 0
	PCMPEQL X8, X7	// X7 = mask: all-ones iff zero == 0

	MOVOU X6, X15
	PANDN X9, X15	// X15 = ~X6 (mask: sel != 0)

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	// Branch-free select: result = (computed & sel!=0) | (in1 & sel==0).
	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0: select (x2, y2, 1) instead.
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15	// X15 = ~X7 (mask: zero != 0)

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	MOVQ $0, rptr	// scrub the saved pointer from the stack

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save

// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
TEXT p256IsZero(SB),NOSPLIT,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again, so that a value
	// equal to p (the non-canonical representation of 0) is also detected.
	// Limb 2 of p is 0, so acc6 needs no XOR.
	XORQ $-1, acc4
	XORQ p256const0<>(SB), acc5
	XORQ p256const1<>(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	// (tail of p256IsZero) Set AX = 1 if the OR above was zero, i.e. the
	// input equalled p.
	CMOVQEQ t1, AX
	RET

/* ---------------------------------------*/
// Stack frame layout for p256PointAddAsm: 32-byte slots for both Jacobian
// inputs, the outputs, and the intermediates of the add-2007-bl formulas,
// plus the saved result pointer and the points-equal flag.
#define x1in(off)  (32*0 + off)(SP)
#define y1in(off)  (32*1 + off)(SP)
#define z1in(off)  (32*2 + off)(SP)
#define x2in(off)  (32*3 + off)(SP)
#define y2in(off)  (32*4 + off)(SP)
#define z2in(off)  (32*5 + off)(SP)

#define xout(off)  (32*6 + off)(SP)
#define yout(off)  (32*7 + off)(SP)
#define zout(off)  (32*8 + off)(SP)

#define u1(off)    (32*9 + off)(SP)
#define u2(off)    (32*10 + off)(SP)
#define s1(off)    (32*11 + off)(SP)
#define s2(off)    (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off)     (32*15 + off)(SP)
#define r(off)     (32*16 + off)(SP)
#define hsqr(off)  (32*17 + off)(SP)
#define rsqr(off)  (32*18 + off)(SP)
#define hcub(off)  (32*19 + off)(SP)
#define rptr       (32*20)(SP)
#define points_eq  (32*20+8)(SP)

//func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Full Jacobian point addition, constant-time. Returns 1 (via points_eq)
// when the inputs have equal x- and y-coordinates after scaling (r == 0 and
// h == 0), in which case the caller must use doubling instead; the formulas
// below are not valid for that case.
TEXT ·p256PointAddAsm(SB),0,$680-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point add
	LDacc (z2in)
	CALL p256SqrInternal(SB)	// z2ˆ2
	ST (z2sqr)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z2ˆ3
	LDt (y1in)
	CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
	ST (s1)

	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)
	LDt (z1in)
	CALL p256MulInternal(SB)	// z1ˆ3
	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
	ST (s2)

	LDt (s1)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)
	CALL p256IsZero(SB)	// AX = 1 iff r == 0 (equal y after scaling)
	MOVQ AX, points_eq

	LDacc (z2sqr)
	LDt (x1in)
	CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
	ST (u1)
	LDacc (z1sqr)
	LDt (x2in)
	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
	ST (u2)

	LDt (u1)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)
	CALL p256IsZero(SB)	// AX = 1 iff h == 0 (equal x after scaling)
	ANDQ points_eq, AX	// points_eq = (r == 0) && (h == 0)
	MOVQ AX, points_eq

	LDacc (r)
	CALL p256SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (s1)
	CALL p256MulInternal(SB)	// s1 * hˆ3
	ST (s2)

	LDacc (z1in)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z1 * z2
	LDt (h)
	CALL p256MulInternal(SB)	// z1 * z2 * h
	ST (zout)

	LDacc (hsqr)
	LDt (u1)
	CALL p256MulInternal(SB)	// hˆ2 * u1
	ST (u2)

	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (u2)
	CALL p256SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r*(u1*hˆ2 - x3) - s1*hˆ3
	ST (yout)

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr	// scrub the saved pointer from the stack
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	// Return the points-equal flag so the caller can fall back to doubling.
	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
// Stack frame layout for p256PointDoubleAsm.
#define x(off)    (32*0 + off)(SP)
#define y(off)    (32*1 + off)(SP)
#define z(off)    (32*2 + off)(SP)

#define s(off)    (32*3 + off)(SP)
#define m(off)    (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off)  (32*6 + off)(SP)
#define rptr      (32*7)(SP)

//func p256PointDoubleAsm(res, in *P256Point)
//
// Jacobian point doubling, constant-time, using the a = -3 shortcut
// m = 3*(x - zˆ2)*(x + zˆ2).
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x(16*0)
	MOVOU X1, x(16*1)
	MOVOU X2, y(16*0)
	MOVOU X3, y(16*1)
	MOVOU X4, z(16*0)
	MOVOU X5, z(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	LDacc (z)
	CALL p256SqrInternal(SB)	// zˆ2
	ST (zsqr)

	LDt (x)
	p256AddInline	// t = x + zˆ2
	STt (m)

	LDacc (z)
	LDt (y)
	CALL p256MulInternal(SB)	// y * z
	p256MulBy2Inline	// z3 = 2*y*z
	MOVQ rptr, AX
	// Store z
	MOVQ t0, (16*4 + 8*0)(AX)
	MOVQ t1, (16*4 + 8*1)(AX)
	MOVQ t2, (16*4 + 8*2)(AX)
	MOVQ t3, (16*4 + 8*3)(AX)

	LDacc (x)
	LDt (zsqr)
	CALL p256SubInternal(SB)	// x - zˆ2
	LDt (m)
	CALL p256MulInternal(SB)	// (x - zˆ2) * (x + zˆ2)
	ST (m)
	// Multiply by 3
	p256MulBy2Inline
	LDacc (m)
	p256AddInline	// m = 3*(x - zˆ2)*(x + zˆ2)
	STt (m)
	////////////////////////
	LDacc (y)
	p256MulBy2Inline	// 2*y
	t2acc
	CALL p256SqrInternal(SB)	// (2*y)ˆ2
	ST (s)	// s = 4*yˆ2 (times x below)
	CALL p256SqrInternal(SB)	// 16*yˆ4
	// Divide by 2: add p if the value is odd so the low bit clears, then
	// shift the 257-bit value right by one — all branch-free.
	XORQ mul0, mul0
	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3

	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ADCQ $0, mul0
	TESTQ $1, t0

	// Keep the original (even) value if the low bit was clear.
	CMOVQEQ t0, acc4
	CMOVQEQ t1, acc5
	CMOVQEQ t2, acc6
	CMOVQEQ t3, acc7
	ANDQ t0, mul0	// top carry only survives for the odd case

	// 256-bit right shift by 1 through the limbs and the carry word.
	SHRQ $1, acc5, acc4
	SHRQ $1, acc6, acc5
	SHRQ $1, acc7, acc6
	SHRQ $1, mul0, acc7
	ST (y)	// y-work = 8*yˆ4
	/////////////////////////
	LDacc (x)
	LDt (s)
	CALL p256MulInternal(SB)	// s = x * 4*yˆ2
	ST (s)
	p256MulBy2Inline	// 2*s
	STt (tmp)

	LDacc (m)
	CALL p256SqrInternal(SB)	// mˆ2
	LDt (tmp)
	CALL p256SubInternal(SB)	// x3 = mˆ2 - 2*s

	MOVQ rptr, AX
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	acc2t
	LDacc (s)
	CALL p256SubInternal(SB)	// s - x3

	LDt (m)
	CALL p256MulInternal(SB)	// m * (s - x3)

	LDt (y)
	CALL p256SubInternal(SB)	// y3 = m*(s - x3) - 8*yˆ4
	MOVQ rptr, AX
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	///////////////////////
	MOVQ $0, rptr	// scrub the saved pointer from the stack

	RET
/* ---------------------------------------*/