// Source listing: github.com/fisco-bcos/crypto@v0.0.0-20200202032121-bd8ab0b5d4f1/elliptic/p256_asm_arm64.s
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Pointer registers used by the exported entry points.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc7: 8-limb (512-bit) accumulator, least significant limb first.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0..t3: scratch limbs.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// const0/const1: hold p256const0/p256const1 in the field routines and
// the two low limbs of p256ord in the p256Ord* routines.
#define const0 R15
#define const1 R16

#define hlp0 R17
// hlp1 shares R0 with res_ptr: routines that use hlp1 reload the result
// pointer from the frame before storing.
#define hlp1 res_ptr

// x0..x3 / y0..y3: the two 256-bit operands of the *Internal routines.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 alias t2/t3; they carry the two high limbs of p256ord
// in the p256Ord* routines.
#define const2 t2
#define const3 t3

// p256const0 and p256const1 are limbs 1 and 3 of the field prime as used
// by p256NegCond ("acc = poly"): limb 0 is all ones and limb 2 is zero.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// NOTE(review): presumably -1/ord mod 2^64, the per-limb Montgomery
// factor consumed by p256OrdSqr/p256OrdMul - confirm.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// p256ord: 256-bit modulus used by p256OrdSqr/p256OrdMul, little-endian limbs.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one: value stored as z3 by p256PointAddAffineAsm when select bit 1
// is clear ("z3 = 1").
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
// Shares its body with p256BigToLittle: the transform (byte-reverse each
// 64-bit word and reverse the word order) is its own inverse.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
// Reads 32 bytes from in, byte-reverses each 64-bit word and stores the
// four words to res in reversed order.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	REV	acc0, acc0
	REV	acc1, acc1
	REV	acc2, acc2
	REV	acc3, acc3

	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0 res=b, else res=a
// Copies 12 limbs (96 bytes) in two batches of six; the flags set by the
// single CMP stay live across both batches (LDP/CSEL/STP do not write flags).
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	a+24(FP), a_ptr
	MOVD	b+48(FP), b_ptr
	MOVD	cond+72(FP), R3

	CMP	$0, R3
	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors
	LDP	0*16(a_ptr), (R4, R5)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	2*16(a_ptr), (R8, R9)
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	LDP	3*16(a_ptr), (R4, R5)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	5*16(a_ptr), (R8, R9)
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val []uint64, cond int)
// In place: if cond != 0, val = poly - val; otherwise val is rewritten
// with its original contents. The subtraction is always computed.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD	val+0(FP), a_ptr
	MOVD	cond+24(FP), hlp0
	MOVD	a_ptr, res_ptr
	// acc = poly
	MOVD	$-1, acc0
	MOVD	p256const0<>(SB), acc1
	MOVD	$0, acc2
	MOVD	p256const1<>(SB), acc3
	// Load the original value
	LDP	0*16(a_ptr), (t0, t1)
	LDP	1*16(a_ptr), (t2, t3)
	// Speculatively subtract
	SUBS	t0, acc0
	SBCS	t1, acc1
	SBCS	t2, acc2
	SBC	t3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, t0, acc0, acc0
	CSEL	EQ, t1, acc1, acc1
	CSEL	EQ, t2, acc2, acc2
	CSEL	EQ, t3, acc3, acc3
	// Store result
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in []uint64, n int)
// Applies p256SqrInternal n times to in and stores the result to res.
// The counter is decremented before it is tested, so the body always
// runs at least once. NOTE(review): n == 0 would wrap the counter -
// callers presumably always pass n >= 1; confirm.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr
	MOVD	n+48(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

sqrLoop:
		SUB	$1, b_ptr
		CALL	p256SqrInternal<>(SB)
		// Feed the result back as the next input.
		MOVD	y0, x0
		MOVD	y1, x1
		MOVD	y2, x2
		MOVD	y3, x3
		CBNZ	b_ptr, sqrLoop

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 []uint64)
// res = p256MulInternal(in1, in2).
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	CALL	p256MulInternal<>(SB)

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
// Runs the four reduction steps of the Montgomery multiplication on in
// (no multiplication by a second operand), then conditionally subtracts
// the prime once.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3

	// t = acc - p; carry set (no borrow) means acc >= p, so keep t.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3

	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point, table []uint64, idx int)
//
// The loop below reads 16 consecutive 96-byte entries (12 limbs each)
// and keeps entry k-1 when idx == k, for k = 1..16. Every entry is
// loaded regardless of idx. With idx == 0 nothing matches and point is
// written as all zero limbs.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD	idx+48(FP), const0
	MOVD	table+24(FP), b_ptr
	MOVD	point+0(FP), res_ptr

	// Result registers start at zero.
	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3
	EOR	t0, t0, t0
	EOR	t1, t1, t1
	EOR	t2, t2, t2
	EOR	t3, t3, t3

	MOVD	$0, const1

loop_select:
		ADD	$1, const1
		CMP	const0, const1
		// LDP.P post-increments b_ptr; the flags from the CMP above
		// are reused by all twelve CSELs of this iteration.
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(b_ptr), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(b_ptr), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, t0, t0
		CSEL	EQ, acc1, t1, t1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, t2, t2
		CSEL	EQ, acc3, t3, t3

		CMP	$16, const1
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	STP	(t0, t1), 4*16(res_ptr)
	STP	(t2, t3), 5*16(res_ptr)
	RET
/* ---------------------------------------*/
// Constant time point access to base point table.
// func p256SelectBase(point, table []uint64, idx int)
//
// Reads 32 consecutive 64-byte entries (8 limbs each) and keeps entry
// k-1 when idx == k, for k = 1..32. Every entry is loaded regardless of
// idx; idx == 0 yields all zero limbs.
TEXT ·p256SelectBase(SB),NOSPLIT,$0
	MOVD	idx+48(FP), t0
	MOVD	table+24(FP), t1
	MOVD	point+0(FP), res_ptr

	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3

	MOVD	$0, t2

loop_select:
		ADD	$1, t2
		CMP	t0, t2
		LDP.P	16(t1), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(t1), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(t1), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(t1), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3

		CMP	$32, t2
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in []uint64, n int)
//
// Squares in n times with reduction by p256ord and stores the result to
// res. hlp1 (R0) holds p256ordK0 for the whole loop, so the result
// pointer is only loaded after the loop. const2/const3 occupy t2/t3.
// The counter is decremented before it is tested, so the body always
// runs at least once.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+24(FP), a_ptr
	MOVD	n+48(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
	SUB	$1, b_ptr

	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADC	t1, acc7, acc7
	// First reduction step
	MUL	acc0, hlp1, hlp0

	// NOTE(review): this low product uses hlp1 (K0), not hlp0. The sum
	// written to acc0 is discarded (acc0 is overwritten below); only the
	// carry out of the ADDS is consumed. Looks deliberate - confirm
	// before "fixing". The same pattern repeats in every reduction step.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// Unlike the first three steps, the carry is folded into acc7 here;
	// acc7 is added into the same limb as hlp0 just below.
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the high half of the square to the reduced low half.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// y = acc - ord; carry set (no borrow) selects the subtracted value.
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3
	SBCS	$0, acc4, acc4

	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	CBNZ	b_ptr, ordSqrLoop

	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 []uint64)
//
// res = in1 * in2 with reduction by p256ord, interleaving one row of the
// product with one reduction step. Products accumulate in acc4..acc7
// while the reduced limbs rotate through acc0..acc3; the two lanes are
// summed at the end. y0/y1 are reused as scratch once consumed.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	MUL	acc0, hlp1, hlp0

	// NOTE(review): low product uses hlp1 (K0), not hlp0; only the carry
	// of the following ADDS is consumed (acc0 is overwritten below).
	// Looks deliberate - confirm before changing.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Sum the product lane into the reduction lane.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - ord. const2/const3 alias t2/t3: each is read as the
	// subtrahend by the same instruction that overwrites it.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// p256SubInternal: x = y - x, adding the prime back when the subtraction
// borrows.
//   In:       x0..x3, y0..y3, const0, const1
//   Out:      x0..x3
//   Clobbers: acc0..acc7, t0, flags
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	SBC	$0, ZR, t0

	// acc4..acc7 = acc0..acc3 + p, computed unconditionally.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADC	const1, acc3, acc7

	// Bit 0 of t0 is set exactly when the subtraction borrowed.
	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
/* ---------------------------------------*/
// p256SqrInternal: y = x * x with the same four-step reduction as
// p256MulInternal and one conditional subtraction of the prime.
//   In:       x0..x3, const0, const1
//   Out:      y0..y3 (x0..x3 are left unchanged)
//   Clobbers: acc0..acc7, t0..t3, flags
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p (five limbs); carry set means no borrow, keep t.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulInternal: y = x * y, interleaving one row of the product with
// one reduction step, followed by one conditional subtraction of the
// prime.
//   In:       x0..x3, y0..y3, const0, const1
//   Out:      y0..y3 (x0..x3 are left unchanged)
//   Clobbers: acc0..acc7, t0..t3, hlp0, flags
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, t2

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, t3

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	t2, acc3
	ADCS	t3, acc4
	ADC	hlp0, acc5
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, t2

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, t3

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	t2, acc4
	ADCS	t3, acc5
	ADC	hlp0, acc6
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, t2

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, t3

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	t2, acc5
	ADCS	t3, acc6
	ADC	hlp0, acc7
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p (five limbs); carry set means no borrow, keep t.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulBy2Inline: x = 2*y, with one conditional subtraction of the
// prime (the unsubtracted sum is kept when the subtraction borrows).
// Clobbers t0..t3, hlp0 and the flags; y0..y3 are left unchanged.
#define p256MulBy2Inline       \
	ADDS	y0, y0, x0;    \
	ADCS	y1, y1, x1;    \
	ADCS	y2, y2, x2;    \
	ADCS	y3, y3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;
/* ---------------------------------------*/
// Operand addressing: a point is three consecutive 32-byte coordinates
// (x at +0, y at +32, z at +64) behind a_ptr, b_ptr or res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store all four limbs of x or y through one of the macros above.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// Stack slots (32 bytes each, starting 8 bytes above RSP) for the point
// addition routines.
#define y2in(off)  (32*0 + 8 + off)(RSP)
#define s2(off)    (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off)	   (32*3 + 8 + off)(RSP)
#define r(off)	   (32*4 + 8 + off)(RSP)
#define hsqr(off)  (32*5 + 8 + off)(RSP)
#define rsqr(off)  (32*6 + 8 + off)(RSP)
#define hcub(off)  (32*7 + 8 + off)(RSP)

#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
//
// Adds in2 (x and y only; its y is negated first when sign != 0) to in1
// (x, y, z) and writes x, y, z to res. sel and zero are normalised to
// 0/1 and packed into hlp1 as (zero << 1) ^ sel; each output coordinate
// is then overridden with CSEL according to bits 0 and 1 of hlp1, as
// described by the "iff select[...]" comments below. hlp1 lives in R0,
// so res is reloaded from the frame into t0 before each store. The frame
// holds eight 32-byte temporaries (y2in .. hcub).
TEXT ·p256PointAddAffineAsm(SB),0,$264-96
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr
	MOVD	sign+72(FP), hlp0
	MOVD	sel+80(FP), hlp1
	MOVD	zero+88(FP), t2

	// Normalise zero and sel to 0/1.
	MOVD	$1, t0
	CMP	$0, t2
	CSEL	EQ, ZR, t0, t2
	CMP	$0, hlp1
	CSEL	EQ, ZR, t0, hlp1

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1
	EOR	t2<<1, hlp1

	// Negate y2in based on sign
	LDP	2*16(b_ptr), (y0, y1)
	LDP	3*16(b_ptr), (y2, y3)
	MOVD	$-1, acc0

	// acc0..acc3 = p - y2, t0 tracks the borrow.
	SUBS	y0, acc0, acc0
	SBCS	y1, const0, acc1
	SBCS	y2, ZR, acc2
	SBCS	y3, const1, acc3
	SBC	$0, ZR, t0

	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADCS	const1, acc3, acc7
	ADC	$0, t0, t0

	CMP	$0, t0
	CSEL	EQ, acc4, acc0, acc0
	CSEL	EQ, acc5, acc1, acc1
	CSEL	EQ, acc6, acc2, acc2
	CSEL	EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, y0, acc0, y0
	CSEL	EQ, y1, acc1, y1
	CSEL	EQ, y2, acc2, y2
	CSEL	EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	p256MulInternal<>(SB)    // x2 * z1^2

	LDx(x1in)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	p256MulInternal<>(SB)    // z3 = h * z1

	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
	LDP	5*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDP	p256one<>+0x00(SB), (acc0, acc1)
	LDP	p256one<>+0x10(SB), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDx(z1in)
	MOVD	res+0(FP), t0
	STP	(y0, y1), 4*16(t0)
	STP	(y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL	p256MulInternal<>(SB)    // z1 ^ 3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy	(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // y1 * h^3
	STy(s2)

	LDP	hsqr(0*8), (x0, x1)
	LDP	hsqr(2*8), (x2, x3)
	LDP	0*16(a_ptr), (y0, y1)
	LDP	1*16(a_ptr), (y2, y3)
	CALL	p256MulInternal<>(SB)    // u1 * h^2
	STP	(y0, y1), h(0*8)
	STP	(y2, y3), h(2*8)

	p256MulBy2Inline               // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	0*16(b_ptr), (acc0, acc1)
	LDP	1*16(b_ptr), (acc2, acc3)
	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 0*16(t0)
	STP	(x2, x3), 1*16(t0)

	LDP	h(0*8), (y0, y1)
	LDP	h(2*8), (y2, y3)
	CALL	p256SubInternal<>(SB)

	LDP	r(0*8), (y0, y1)
	LDP	r(2*8), (y2, y3)
	CALL	p256MulInternal<>(SB)

	LDP	s2(0*8), (x0, x1)
	LDP	s2(2*8), (x2, x3)
	CALL	p256SubInternal<>(SB)
	LDP	2*16(a_ptr), (acc0, acc1)
	LDP	3*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	y2in(0*8), (acc0, acc1)
	LDP	y2in(2*8), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 2*16(t0)
	STP	(x2, x3), 3*16(t0)

	RET

// p256AddInline: x = x + y, with one conditional subtraction of the
// prime (the unsubtracted sum is kept when the subtraction borrows).
// Clobbers t0..t3, hlp0 and the flags; y0..y3 are left unchanged.
#define p256AddInline          \
	ADDS	y0, x0, x0;    \
	ADCS	y1, x1, x1;    \
	ADCS	y2, x2, x2;    \
	ADCS	y3, x3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;

// Stack slots used by p256PointDoubleAsm.
#define s(off)	(32*0 + 8 + off)(RSP)
#define m(off)	(32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off)  (32*3 + 8 + off)(RSP)

//func p256PointDoubleAsm(res, in []uint64)
//
// Doubles the point in (x, y, z) into res. z3 and y3 are written to res
// before the later reads of in's x coordinate; in's y and z are read
// only before the corresponding output is stored. The frame holds four
// 32-byte temporaries (s, m, zsqr, tmp).
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point double
	LDP	4*16(a_ptr), (x0, x1)
	LDP	5*16(a_ptr), (x2, x3)
	CALL	p256SqrInternal<>(SB)    // zsqr = z1^2
	STP	(y0, y1), zsqr(0*8)
	STP	(y2, y3), zsqr(2*8)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	p256AddInline                  // m = x1 + zsqr
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL	p256MulInternal<>(SB)
	p256MulBy2Inline               // z3 = 2 * y1 * z1
	STx(z3out)

	LDy(x1in)
	LDx(zsqr)
	CALL	p256SubInternal<>(SB)    // x1 - zsqr
	LDy(m)
	CALL	p256MulInternal<>(SB)    // (x1 - zsqr) * (x1 + zsqr)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)

	LDy(y1in)
	p256MulBy2Inline               // 2 * y1
	CALL	p256SqrInternal<>(SB)    // s = (2 * y1)^2
	STy(s)
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CALL	p256SqrInternal<>(SB)    // (2 * y1)^4

	// Divide by 2
	// If y is odd, add p first so that the shift right is exact;
	// hlp0 holds the carry out of that addition.
	ADDS	$-1, y0, t0
	ADCS	const0, y1, t1
	ADCS	$0, y2, t2
	ADCS	const1, y3, t3
	ADC	$0, ZR, hlp0

	ANDS	$1, y0, ZR
	CSEL	EQ, y0, t0, t0
	CSEL	EQ, y1, t1, t1
	CSEL	EQ, y2, t2, t2
	CSEL	EQ, y3, t3, t3
	// Drop the carry when y was even (bit 0 of y0 clear).
	AND	y0, hlp0, hlp0

	EXTR	$1, t0, t1, y0
	EXTR	$1, t1, t2, y1
	EXTR	$1, t2, t3, y2
	EXTR	$1, t3, hlp0, y3
	STy(y3out)

	LDx(x1in)
	LDy(s)
	CALL	p256MulInternal<>(SB)    // s = x1 * (2 * y1)^2
	STy(s)
	p256MulBy2Inline
	STx(tmp)                       // tmp = 2 * s

	LDx(m)
	CALL	p256SqrInternal<>(SB)    // m^2
	LDx(tmp)
	CALL	p256SubInternal<>(SB)    // x3 = m^2 - 2 * s

	STx(x3out)

	LDy(s)
	CALL	p256SubInternal<>(SB)    // s - x3

	LDy(m)
	CALL	p256MulInternal<>(SB)    // m * (s - x3)

	LDx(y3out)
	CALL	p256SubInternal<>(SB)    // y3 = m * (s - x3) - stored half
	STx(y3out)
	RET
/* ---------------------------------------*/
// For p256PointAddAsm the second operand's y comes straight from b_ptr
// and the outputs are addressed through b_ptr, which is reloaded with
// the result pointer once in2 is no longer needed.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
//func p256PointAddAsm(res, in1, in2 []uint64) int
//
// Adds in1 and in2 (each x, y, z) into res. The return value is 1 when
// both r = s2 - s1 and h = u2 - u1 are zero or equal to the prime (all
// limbs compared), and 0 otherwise. hlp1 (R0) carries that flag across
// the remaining calls, none of which clobber R0.
TEXT ·p256PointAddAsm(SB),0,$392-80
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point add
	LDx(z2in)
	CALL	p256SqrInternal<>(SB)    // z2^2
	STy(z2sqr)

	CALL	p256MulInternal<>(SB)    // z2^3

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	CALL	p256MulInternal<>(SB)    // z1^3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2

	LDx(s1)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp1

	// Also treat r == p as zero: XOR each limb with the matching limb
	// of p (limb 2 of p is zero, so x2 is used as is).
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp0

	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp0, hlp0

	// hlp1 = (r is zero) AND (h is zero).
	AND	hlp0, hlp1, hlp1

	LDx(r)
	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	p256MulInternal<>(SB)    // s1 * h^3, kept in slot s2
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL	p256MulInternal<>(SB)    // z1 * z2
	LDx(h)
	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
	// in2 is not read past this point: b_ptr now addresses res.
	MOVD	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	p256MulInternal<>(SB)    // h^2 * u1
	STy(u2)

	p256MulBy2Inline               // u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)
	STx(x3out)

	LDy(u2)
	CALL	p256SubInternal<>(SB)

	LDy(r)
	CALL	p256MulInternal<>(SB)

	LDx(s2)
	CALL	p256SubInternal<>(SB)
	STx(y3out)

	MOVD	hlp1, R0
	MOVD	R0, ret+72(FP)

	RET