// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register roles. Every routine in this file uses these names instead of
// raw register numbers.
//
// Pointer arguments.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc7 hold the (up to) 512-bit intermediate product, least
// significant word first.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// Scratch registers.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// Modulus words that are kept live across the internal routines.
#define const0 R15
#define const1 R16

// Helper registers. hlp1 shares R0 with res_ptr, so any routine that uses
// hlp1 has to (re)load the result pointer from its frame before storing.
#define hlp0 R17
#define hlp1 res_ptr

// x0..x3 and y0..y3 are the two 256-bit operands (and results) of the
// internal routines, least significant word first.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 share registers with t2/t3. They are only loaded by the
// p256Ord* routines, which therefore restrict their scratch use to t0/t1.
#define const2 t2
#define const3 t3

// p256const0 and p256const1 are words 1 and 3 of the value labelled "poly"
// in p256NegCond (words 0 and 2 are materialised there as $-1 and $0).
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// Multiplier applied to the low accumulator word in every reduction step of
// p256OrdSqr / p256OrdMul.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// Modulus used by p256OrdSqr / p256OrdMul, least significant word first.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// Value stored as z3 by p256PointAddAffineAsm when bit 1 of its selector is
// clear (the original comment there reads "z3 = 1").
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// NOTE(review): the literal flag 8 is presumably RODATA from textflag.h - confirm.
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// Tail-jumps to p256BigToLittle: reversing 32 bytes is its own inverse, so
// all four conversion entry points share one body.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//
// Tail-jumps to p256BigToLittle.
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
//
// Tail-jumps to p256BigToLittle.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Reverses the byte order of the 32-byte value at in and writes it to res:
// each 64-bit word is byte-swapped with REV and the four words are stored
// in the opposite order.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	REV	acc0, acc0
	REV	acc1, acc1
	REV	acc2, acc2
	REV	acc3, acc3

	// Word order is swapped as well: in[0] ends up in res[3], and so on.
	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
// If cond == 0 res=b, else res=a
//
// Copies 96 bytes (two batches of 48). Both inputs are always loaded in full
// and the choice is made with CSEL, so there is no branch on cond.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	a+8(FP), a_ptr
	MOVD	b+16(FP), b_ptr
	MOVD	cond+24(FP), R3

	// The flags set here are consumed by every CSEL below; the LDP/STP
	// instructions in between do not modify them.
	CMP	$0, R3
	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors
	LDP	0*16(a_ptr), (R4, R5)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	2*16(a_ptr), (R8, R9)
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	// Second batch of 48 bytes.
	LDP	3*16(a_ptr), (R4, R5)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	5*16(a_ptr), (R8, R9)
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
//
// In place: if cond != 0, *val is replaced by poly - *val; if cond == 0 it is
// rewritten unchanged. The subtraction is always performed and the result is
// selected with CSEL.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD	val+0(FP), a_ptr
	MOVD	cond+8(FP), hlp0
	MOVD	a_ptr, res_ptr
	// acc = poly
	MOVD	$-1, acc0
	MOVD	p256const0<>(SB), acc1
	MOVD	$0, acc2
	MOVD	p256const1<>(SB), acc3
	// Load the original value
	LDP	0*16(a_ptr), (t0, t1)
	LDP	1*16(a_ptr), (t2, t3)
	// Speculatively subtract
	SUBS	t0, acc0
	SBCS	t1, acc1
	SBCS	t2, acc2
	SBC	t3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, t0, acc0, acc0
	CSEL	EQ, t1, acc1, acc1
	CSEL	EQ, t2, acc2, acc2
	CSEL	EQ, t3, acc3, acc3
	// Store result
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
//
// Applies p256SqrInternal n times to *in and stores the result in *res.
// n must be at least 1: the counter is decremented before it is tested, so
// n == 0 would not terminate the loop immediately.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

sqrLoop:
		SUB	$1, b_ptr
		CALL	p256SqrInternal<>(SB)
		// Feed the result (y) back in as the next input (x).
		MOVD	y0, x0
		MOVD	y1, x1
		MOVD	y2, x2
		MOVD	y3, x3
		CBNZ	b_ptr, sqrLoop

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
//
// Loads in1 into x and in2 into y, calls p256MulInternal and stores the
// product (returned in y) to res.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	CALL	p256MulInternal<>(SB)

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
//
// Runs the same four reduction steps as p256MulInternal on *in without any
// multiplication by a second operand, followed by one conditional
// subtraction of poly, and stores the result in *res.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3

	// t = acc - poly; carry set (CS) means no borrow, i.e. acc >= poly.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3

	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
//
// Scans all 16 entries (96 bytes each) of table and copies entry number idx,
// counted from 1, into res. The result registers start at zero, so an idx
// that matches no entry (e.g. 0) yields an all-zero result. Every entry is
// read regardless of idx; the choice is made with CSEL.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD	idx+16(FP), const0
	MOVD	table+8(FP), b_ptr
	MOVD	res+0(FP), res_ptr

	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3
	EOR	t0, t0, t0
	EOR	t1, t1, t1
	EOR	t2, t2, t2
	EOR	t3, t3, t3

	// const1 is the 1-based index of the entry being visited.
	MOVD	$0, const1

loop_select:
		ADD	$1, const1
		// EQ holds for the whole iteration: LDP.P and CSEL leave the flags alone.
		CMP	const0, const1
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(b_ptr), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(b_ptr), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, t0, t0
		CSEL	EQ, acc1, t1, t1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, t2, t2
		CSEL	EQ, acc3, t3, t3

		CMP	$16, const1
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	STP	(t0, t1), 4*16(res_ptr)
	STP	(t2, t3), 5*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Same scheme as p256Select, for 32 entries of 64 bytes each: entry number
// idx (counted from 1) is copied to res, and an idx matching no entry leaves
// the result all zero.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVD	idx+16(FP), t0
	MOVD	table+8(FP), t1
	MOVD	res+0(FP), res_ptr

	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3

	// t2 is the 1-based index of the entry being visited.
	MOVD	$0, t2

loop_select:
		ADD	$1, t2
		CMP	t0, t2
		LDP.P	16(t1), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(t1), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(t1), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(t1), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3

		CMP	$32, t2
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// Squares *in n times with reduction against p256ord and stores the result
// in *res. n must be at least 1 (the counter is decremented before it is
// tested).
//
// Register notes:
//   hlp1 holds p256ordK0 and aliases res_ptr, so res is loaded only at the end.
//   const0..const3 hold p256ord; const2/const3 alias t2/t3, so only t0/t1
//   (plus y0 and hlp0) are used as scratch inside the loop.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
		SUB	$1, b_ptr

		// x[1:] * x[0]
		MUL	x0, x1, acc1
		UMULH	x0, x1, acc2

		MUL	x0, x2, t0
		ADDS	t0, acc2, acc2
		UMULH	x0, x2, acc3

		MUL	x0, x3, t0
		ADCS	t0, acc3, acc3
		UMULH	x0, x3, acc4
		ADC	$0, acc4, acc4
		// x[2:] * x[1]
		MUL	x1, x2, t0
		ADDS	t0, acc3
		UMULH	x1, x2, t1
		ADCS	t1, acc4
		ADC	$0, ZR, acc5

		MUL	x1, x3, t0
		ADDS	t0, acc4
		UMULH	x1, x3, t1
		ADC	t1, acc5
		// x[3] * x[2]
		MUL	x2, x3, t0
		ADDS	t0, acc5
		UMULH	x2, x3, acc6
		ADC	$0, acc6

		MOVD	$0, acc7
		// *2
		ADDS	acc1, acc1
		ADCS	acc2, acc2
		ADCS	acc3, acc3
		ADCS	acc4, acc4
		ADCS	acc5, acc5
		ADCS	acc6, acc6
		ADC	$0, acc7
		// Missing products
		MUL	x0, x0, acc0
		UMULH	x0, x0, t0
		ADDS	t0, acc1, acc1

		MUL	x1, x1, t0
		ADCS	t0, acc2, acc2
		UMULH	x1, x1, t1
		ADCS	t1, acc3, acc3

		MUL	x2, x2, t0
		ADCS	t0, acc4, acc4
		UMULH	x2, x2, t1
		ADCS	t1, acc5, acc5

		MUL	x3, x3, t0
		ADCS	t0, acc6, acc6
		UMULH	x3, x3, t1
		ADC	t1, acc7, acc7
		// First reduction step
		MUL	acc0, hlp1, hlp0

		// NOTE(review): this MUL uses hlp1 (K0), not hlp0. The sum written to
		// acc0 is discarded (acc0 is overwritten by UMULH below); only the carry
		// is kept. Presumably const0*K0 == -1 mod 2^64 makes that carry the same
		// as for const0*hlp0 - confirm before touching. Same in every step.
		MUL	const0, hlp1, t0
		ADDS	t0, acc0, acc0
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc1, acc1
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc2, acc2
		UMULH	const2, hlp0, acc0

		MUL	const3, hlp0, t0
		ADCS	t0, acc3, acc3

		UMULH	const3, hlp0, hlp0
		ADC	$0, hlp0

		ADDS	t1, acc1, acc1
		ADCS	y0, acc2, acc2
		ADCS	acc0, acc3, acc3
		ADC	$0, hlp0, acc0
		// Second reduction step
		MUL	acc1, hlp1, hlp0

		MUL	const0, hlp1, t0
		ADDS	t0, acc1, acc1
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc2, acc2
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc3, acc3
		UMULH	const2, hlp0, acc1

		MUL	const3, hlp0, t0
		ADCS	t0, acc0, acc0

		UMULH	const3, hlp0, hlp0
		ADC	$0, hlp0

		ADDS	t1, acc2, acc2
		ADCS	y0, acc3, acc3
		ADCS	acc1, acc0, acc0
		ADC	$0, hlp0, acc1
		// Third reduction step
		MUL	acc2, hlp1, hlp0

		MUL	const0, hlp1, t0
		ADDS	t0, acc2, acc2
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc3, acc3
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc0, acc0
		UMULH	const2, hlp0, acc2

		MUL	const3, hlp0, t0
		ADCS	t0, acc1, acc1

		UMULH	const3, hlp0, hlp0
		ADC	$0, hlp0

		ADDS	t1, acc3, acc3
		ADCS	y0, acc0, acc0
		ADCS	acc2, acc1, acc1
		ADC	$0, hlp0, acc2

		// Last reduction step
		MUL	acc3, hlp1, hlp0

		MUL	const0, hlp1, t0
		ADDS	t0, acc3, acc3
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc0, acc0
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc1, acc1
		UMULH	const2, hlp0, acc3

		MUL	const3, hlp0, t0
		ADCS	t0, acc2, acc2

		UMULH	const3, hlp0, hlp0
		// The carry goes into acc7 here rather than hlp0; acc7 is added into
		// acc3 below, the same word that receives hlp0.
		ADC	$0, acc7

		ADDS	t1, acc0, acc0
		ADCS	y0, acc1, acc1
		ADCS	acc3, acc2, acc2
		ADC	$0, hlp0, acc3

		// Add the upper half of the square (acc4..acc7).
		ADDS	acc4, acc0, acc0
		ADCS	acc5, acc1, acc1
		ADCS	acc6, acc2, acc2
		ADCS	acc7, acc3, acc3
		ADC	$0, ZR, acc4

		// y = acc - ord; carry set (CS) after the chain means no borrow.
		SUBS	const0, acc0, y0
		SBCS	const1, acc1, y1
		SBCS	const2, acc2, y2
		SBCS	const3, acc3, y3
		SBCS	$0, acc4, acc4

		// The selected value becomes x, the input of the next iteration.
		CSEL	CS, y0, acc0, x0
		CSEL	CS, y1, acc1, x1
		CSEL	CS, y2, acc2, x2
		CSEL	CS, y3, acc3, x3

		CBNZ	b_ptr, ordSqrLoop

	// hlp1 is no longer needed, so R0 can finally hold the result pointer.
	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Multiplies *in1 by *in2 with reduction against p256ord and stores the
// result in *res. Each "y[i] * x" partial product is followed by one
// reduction step.
//
// Register notes: as in p256OrdSqr, hlp1 holds p256ordK0 and aliases
// res_ptr, and const2/const3 alias t2/t3 until the final subtraction, where
// t0..t3 are reused for the difference (const2/const3 are dead after the
// SBCS instructions that read them).
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	MUL	acc0, hlp1, hlp0

	// NOTE(review): this MUL uses hlp1 (K0), not hlp0; only the carry of the
	// following ADDS is kept (acc0 is overwritten by UMULH below). Presumably
	// const0*K0 == -1 mod 2^64 makes that carry equivalent - confirm.
	// y0 is free to be used as scratch from here on: y[0] has been consumed.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half of the product (acc4..acc7).
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - ord. t2/t3 are const2/const3: each is read as the
	// subtrahend by the same instruction that overwrites it.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	// hlp1 is no longer needed, so R0 can finally hold the result pointer.
	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// p256SubInternal: x = y - x, with poly added back when the subtraction
// borrows.
//
// In:       x0..x3, y0..y3, const0, const1
// Out:      x0..x3
// Clobbers: acc0..acc7, t0, flags
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	// t0 = 0 - 0 - borrow: all ones if the subtraction borrowed, else zero.
	SBC	$0, ZR, t0

	// acc4..acc7 = difference + poly, computed unconditionally.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADC	const1, acc3, acc7

	// EQ (bit 0 of t0 clear) means no borrow: keep the plain difference.
	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
/* ---------------------------------------*/
// p256SqrInternal: y = x squared, followed by four reduction steps and one
// conditional subtraction of poly.
//
// In:       x0..x3, const0, const1
// Out:      y0..y3 (x0..x3 are left unchanged)
// Clobbers: acc0..acc7, t0..t3, flags
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - poly (five words); CS after the chain means no borrow.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulInternal: y = x * y, with one reduction step after each
// "y[i] * x" partial product and one conditional subtraction of poly.
//
// In:       x0..x3, y0..y3, const0, const1
// Out:      y0..y3 (x0..x3 are left unchanged)
// Clobbers: acc0..acc7, t0..t3, hlp0, flags
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, t2

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, t3

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	t2, acc3
	ADCS	t3, acc4
	ADC	hlp0, acc5
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, t2

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, t3

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	t2, acc4
	ADCS	t3, acc5
	ADC	hlp0, acc6
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, t2

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, t3

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	t2, acc5
	ADCS	t3, acc6
	ADC	hlp0, acc7
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - poly (five words); CS after the chain means no borrow.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulBy2Inline: x = y + y, followed by one conditional subtraction of
// poly (the sum is kept when the five-word subtraction borrows, CC).
// Reads y0..y3, const0, const1. Writes x0..x3. Clobbers t0..t3, hlp0, flags.
#define p256MulBy2Inline       \
	ADDS	y0, y0, x0;    \
	ADCS	y1, y1, x1;    \
	ADCS	y2, y2, x2;    \
	ADCS	y3, y3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;
/* ---------------------------------------*/
// Field accessors for the point arguments: x at offset 0, y at 32, z at 64
// from the respective pointer register.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store a 256-bit value between one of the accessors above (or a stack
// slot below) and the x or y register group.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// 32-byte stack slots, starting 8 bytes above RSP.
#define y2in(off)  (32*0 + 8 + off)(RSP)
#define s2(off)    (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off)	   (32*3 + 8 + off)(RSP)
#define r(off)	   (32*4 + 8 + off)(RSP)
#define hsqr(off)  (32*5 + 8 + off)(RSP)
#define rsqr(off)  (32*6 + 8 + off)(RSP)
#define hcub(off)  (32*7 + 8 + off)(RSP)

#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Adds the affine point in2 to in1 and writes the sum to res.
//   sign != 0: the y coordinate of in2 is negated first.
//   sel  == 0: each output coordinate is replaced by the matching one of in1.
//   zero == 0: the output is replaced by in2 (x2, the possibly negated y2,
//              and p256one as z). This selection is applied after the sel
//              one, so it wins when both are zero.
// All selections are done with CSEL.
//
// hlp1 holds the two selector bits (bit 0: sel != 0, bit 1: zero != 0) and
// aliases res_ptr, so res is reloaded from the frame into t0 before each
// store.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr
	MOVD	sign+24(FP), hlp0
	MOVD	sel+32(FP), hlp1
	MOVD	zero+40(FP), t2

	// Normalise sel and zero to 0/1 and pack them: hlp1 = sel | zero<<1.
	MOVD	$1, t0
	CMP	$0, t2
	CSEL	EQ, ZR, t0, t2
	CMP	$0, hlp1
	CSEL	EQ, ZR, t0, hlp1

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1
	EOR	t2<<1, hlp1

	// Negate y2in based on sign
	LDP	2*16(b_ptr), (y0, y1)
	LDP	3*16(b_ptr), (y2, y3)
	MOVD	$-1, acc0

	// acc0..acc3 = poly - y2, t0 = borrow mask.
	SUBS	y0, acc0, acc0
	SBCS	y1, const0, acc1
	SBCS	y2, ZR, acc2
	SBCS	y3, const1, acc3
	SBC	$0, ZR, t0

	// acc4..acc7 = (poly - y2) + poly, used when the subtraction borrowed.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADCS	const1, acc3, acc7
	ADC	$0, t0, t0

	CMP	$0, t0
	CSEL	EQ, acc4, acc0, acc0
	CSEL	EQ, acc5, acc1, acc1
	CSEL	EQ, acc6, acc2, acc2
	CSEL	EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, y0, acc0, y0
	CSEL	EQ, y1, acc1, y1
	CSEL	EQ, y2, acc2, y2
	CSEL	EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	p256MulInternal<>(SB)    // x2 * z1^2

	LDx(x1in)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	p256MulInternal<>(SB)    // z3 = h * z1

	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
	LDP	5*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDP	p256one<>+0x00(SB), (acc0, acc1)
	LDP	p256one<>+0x10(SB), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDx(z1in)
	MOVD	res+0(FP), t0
	STP	(y0, y1), 4*16(t0)
	STP	(y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL	p256MulInternal<>(SB)    // z1 ^ 3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy	(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // y1 * h^3
	STy(s2)

	LDP	hsqr(0*8), (x0, x1)
	LDP	hsqr(2*8), (x2, x3)
	LDP	0*16(a_ptr), (y0, y1)
	LDP	1*16(a_ptr), (y2, y3)
	CALL	p256MulInternal<>(SB)    // u1 * h^2
	// The h slot is reused for u1 * h^2 from here on.
	STP	(y0, y1), h(0*8)
	STP	(y2, y3), h(2*8)

	p256MulBy2Inline               // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	0*16(b_ptr), (acc0, acc1)
	LDP	1*16(b_ptr), (acc2, acc3)
	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 0*16(t0)
	STP	(x2, x3), 1*16(t0)

	LDP	h(0*8), (y0, y1)
	LDP	h(2*8), (y2, y3)
	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3 (with x3 as selected above)

	LDP	r(0*8), (y0, y1)
	LDP	r(2*8), (y2, y3)
	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	LDP	s2(0*8), (x0, x1)
	LDP	s2(2*8), (x2, x3)
	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - y1 * h^3
	LDP	2*16(a_ptr), (acc0, acc1)
	LDP	3*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	y2in(0*8), (acc0, acc1)
	LDP	y2in(2*8), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 2*16(t0)
	STP	(x2, x3), 3*16(t0)

	RET

// p256AddInline: x = x + y, followed by one conditional subtraction of poly
// (the sum is kept when the five-word subtraction borrows, CC).
// Reads x0..x3, y0..y3, const0, const1. Writes x0..x3.
// Clobbers t0..t3, hlp0, flags.
#define p256AddInline          \
	ADDS	y0, x0, x0;    \
	ADCS	y1, x1, x1;    \
	ADCS	y2, x2, x2;    \
	ADCS	y3, x3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;

// Stack slots used by p256PointDoubleAsm; they overlay the first four slots
// defined for the point-addition routines.
#define s(off)	(32*0 + 8 + off)(RSP)
#define m(off)	(32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off)  (32*3 + 8 + off)(RSP)

//func p256PointDoubleAsm(res, in *P256Point)
//
// Doubles the point at in and writes the result to res. None of the
// routines called here use hlp1, so res_ptr (R0) stays valid throughout.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point double
	LDP	4*16(a_ptr), (x0, x1)
	LDP	5*16(a_ptr), (x2, x3)
	CALL	p256SqrInternal<>(SB)    // zsqr = z1^2
	STP	(y0, y1), zsqr(0*8)
	STP	(y2, y3), zsqr(2*8)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	p256AddInline                    // m = x1 + z1^2
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL	p256MulInternal<>(SB)    // z1 * y1
	p256MulBy2Inline                 // z3 = 2 * z1 * y1
	STx(z3out)

	LDy(x1in)
	LDx(zsqr)
	CALL	p256SubInternal<>(SB)    // x1 - z1^2
	LDy(m)
	CALL	p256MulInternal<>(SB)    // (x1 - z1^2) * (x1 + z1^2)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)

	LDy(y1in)
	p256MulBy2Inline                 // 2 * y1
	CALL	p256SqrInternal<>(SB)    // s = (2 * y1)^2
	STy(s)
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CALL	p256SqrInternal<>(SB)    // (2 * y1)^4

	// Divide by 2
	// t = y + poly, with the carry out in hlp0.
	ADDS	$-1, y0, t0
	ADCS	const0, y1, t1
	ADCS	$0, y2, t2
	ADCS	const1, y3, t3
	ADC	$0, ZR, hlp0

	// If y is even use y itself, otherwise y + poly; the carry word only
	// counts in the odd case (hlp0 is 0 or 1, so AND with y0 keeps it iff
	// bit 0 of y0 is set).
	ANDS	$1, y0, ZR
	CSEL	EQ, y0, t0, t0
	CSEL	EQ, y1, t1, t1
	CSEL	EQ, y2, t2, t2
	CSEL	EQ, y3, t3, t3
	AND	y0, hlp0, hlp0

	// Shift the five-word value right by one bit.
	EXTR	$1, t0, t1, y0
	EXTR	$1, t1, t2, y1
	EXTR	$1, t2, t3, y2
	EXTR	$1, t3, hlp0, y3
	STy(y3out)

	LDx(x1in)
	LDy(s)
	CALL	p256MulInternal<>(SB)    // s = x1 * (2 * y1)^2
	STy(s)
	p256MulBy2Inline                 // tmp = 2 * s
	STx(tmp)

	LDx(m)
	CALL	p256SqrInternal<>(SB)    // m^2
	LDx(tmp)
	CALL	p256SubInternal<>(SB)    // x3 = m^2 - 2 * s

	STx(x3out)

	LDy(s)
	CALL	p256SubInternal<>(SB)    // s - x3

	LDy(m)
	CALL	p256MulInternal<>(SB)    // m * (s - x3)

	LDx(y3out)
	CALL	p256SubInternal<>(SB)    // y3 = m * (s - x3) - (2 * y1)^4 / 2
	STx(y3out)
	RET
/* ---------------------------------------*/
// For p256PointAddAsm, y2in is read from the second input (not from the
// stack) and the outputs are addressed through b_ptr: res_ptr aliases hlp1,
// which that routine uses for its return flag.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Adds in1 and in2, writes the sum to res and returns 1 if both
// r = s2 - s1 and h = u2 - u1 are zero (either as 0 or as poly),
// otherwise 0.
TEXT ·p256PointAddAsm(SB),0,$392-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point add
	LDx(z2in)
	CALL	p256SqrInternal<>(SB)    // z2^2
	STy(z2sqr)

	CALL	p256MulInternal<>(SB)    // z2^3

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	CALL	p256MulInternal<>(SB)    // z1^3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2

	LDx(s1)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp1

	// Also accept r == poly: XOR each word with the matching word of poly
	// (word 2 of poly is zero, so x2 is used as is).
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp0

	// Same "== poly" test as above, this time for h.
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp0, hlp0

	// hlp1 = (r is zero) AND (h is zero); returned at the end.
	AND	hlp0, hlp1, hlp1

	LDx(r)
	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	p256MulInternal<>(SB)    // s1 * h^3, kept in the s2 slot
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL	p256MulInternal<>(SB)    // z1 * z2
	LDx(h)
	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
	// in2 is not read after this point, so b_ptr becomes the output pointer.
	MOVD	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	p256MulInternal<>(SB)    // h^2 * u1
	STy(u2)

	p256MulBy2Inline               // u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
	STx(x3out)

	LDy(u2)
	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3

	LDy(r)
	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	LDx(s2)
	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
	STx(y3out)

	MOVD	hlp1, R0
	MOVD	R0, ret+24(FP)

	RET