// Source: gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm2/p256_asm_arm64.s
//
// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
//
// Syntax: Go assembler (Plan 9 operand order, destination last), ARM64.
// 256-bit values are held as four 64-bit limbs, least-significant limb first.

#include "textflag.h"

// Pointer registers. Note that hlp1 (below) aliases res_ptr, so routines that
// use hlp1 reload the result pointer from the frame before storing.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// Accumulator limbs used by the multiply/square/reduce routines.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// Scratch registers. t2/t3 double as const2/const3 (see below), so they are
// only free in routines that do not keep a modulus loaded.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
#define const0 R15
#define const1 R16

#define hlp0 R17
#define hlp1 res_ptr

// Operand registers of the internal routines.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

#define const2 t2
#define const3 t3

// p256p: the field modulus p, loaded into const0..const3 by the field routines.
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
// p256ordK0: per-limb multiplier used by the reduction steps of p256OrdSqr/p256OrdMul.
DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
// p256ord: the modulus used by p256OrdSqr/p256OrdMul.
DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
// p256one: 2^256 mod p, i.e. the value 1 in the Montgomery domain.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
GLOBL p256p<>(SB), RODATA, $32
GLOBL p256ordK0<>(SB), RODATA, $8
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256one<>(SB), RODATA, $32

/* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
//
// Shares the implementation of p256BigToLittle: both arguments are slices, so
// the frame layout is identical, and the byte/limb reversal is its own inverse.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
//
// Reads 32 bytes from in, byte-reverses each 64-bit word (REV) and writes the
// four words to res in reverse word order.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	REV	acc0, acc0
	REV	acc1, acc1
	REV	acc2, acc2
	REV	acc3, acc3

	// Swap word order while storing: last input word becomes first output word.
	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0 res=b, else res=a
//
// Copies 12 words (96 bytes). Both a and b are always read in full; the choice
// is made with CSEL, not a branch.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	a+24(FP), a_ptr
	MOVD	b+48(FP), b_ptr
	MOVD	cond+72(FP), R3

	// Flags set here stay live for every CSEL below; LDP/STP do not touch them.
	CMP	$0, R3
	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors
	// Words 0..5
	LDP	0*16(a_ptr), (R4, R5)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	2*16(a_ptr), (R8, R9)
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	// Words 6..11
	LDP	3*16(a_ptr), (R4, R5)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	5*16(a_ptr), (R8, R9)
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val []uint64, cond int)
//
// In place: if cond != 0, val = p - val; if cond == 0, val is rewritten
// unchanged. The subtraction is always performed and the result selected
// with CSEL.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD	val+0(FP), a_ptr
	MOVD	cond+24(FP), hlp0
	MOVD	a_ptr, res_ptr
	// acc = poly
	LDP	p256p<>+0x00(SB), (acc0, acc1)
	LDP	p256p<>+0x10(SB), (acc2, acc3)

	// Load the original value
	LDP	0*16(a_ptr), (t0, t1)
	LDP	1*16(a_ptr), (t2, t3)
	// Speculatively subtract
	SUBS	t0, acc0
	SBCS	t1, acc1
	SBCS	t2, acc2
	SBC	t3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, t0, acc0, acc0
	CSEL	EQ, t1, acc1, acc1
	CSEL	EQ, t2, acc2, acc2
	CSEL	EQ, t3, acc3, acc3
	// Store result
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in []uint64, n int)
//
// Applies sm2P256SqrInternal n times starting from in and stores the final
// value in res.
// NOTE: the counter is decremented before it is tested, so n must be >= 1;
// n == 0 would wrap around instead of skipping the loop.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr
	MOVD	n+48(FP), b_ptr

	LDP	p256p<>+0x00(SB), (const0, const1)
	LDP	p256p<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

sqrLoop:
		SUB	$1, b_ptr
		CALL	sm2P256SqrInternal<>(SB)
		// Result arrives in y0..y3; feed it back as the next input.
		MOVD	y0, x0
		MOVD	y1, x1
		MOVD	y2, x2
		MOVD	y3, x3
		CBNZ	b_ptr, sqrLoop

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 []uint64)
//
// res = sm2P256MulInternal(in1, in2): loads in1 into x0..x3, in2 into y0..y3
// and stores the y0..y3 result.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr

	LDP	p256p<>+0x00(SB), (const0, const1)
	LDP	p256p<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	CALL	sm2P256MulInternal<>(SB)

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
//
// Runs the four reduction steps on in (no multiplication), then subtracts p
// once if the result is >= p, and stores the four limbs to res.
//
// Each reduction step folds the lowest live limb a into the limbs above it:
// add a one limb up and four limbs up, and subtract (a << 32) / (a >> 32)
// across the same four limbs. y0/y1 are used as scratch for the shifted halves.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr

	LDP	p256p<>+0x00(SB), (const0, const1)
	LDP	p256p<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	LSL	$32, acc0, y0
	LSR	$32, acc0, y1

	ADDS	acc0, acc1, acc1
	ADCS	$0, acc2, acc2
	ADCS	$0, acc3, acc3
	ADC	$0, acc0, acc0

	SUBS	y0, acc1
	SBCS	y1, acc2
	SBCS	y0, acc3
	SBC	y1, acc0
	// Second reduction step
	LSL	$32, acc1, y0
	LSR	$32, acc1, y1

	ADDS	acc1, acc2, acc2
	ADCS	$0, acc3, acc3
	ADCS	$0, acc0, acc0
	ADC	$0, acc1, acc1

	SUBS	y0, acc2
	SBCS	y1, acc3
	SBCS	y0, acc0
	SBC	y1, acc1
	// Third reduction step
	LSL	$32, acc2, y0
	LSR	$32, acc2, y1

	ADDS	acc2, acc3, acc3
	ADCS	$0, acc0, acc0
	ADCS	$0, acc1, acc1
	ADC	$0, acc2, acc2

	SUBS	y0, acc3
	SBCS	y1, acc0
	SBCS	y0, acc1
	SBC	y1, acc2
	// Last reduction step
	LSL	$32, acc3, y0
	LSR	$32, acc3, y1

	ADDS	acc3, acc0, acc0
	ADCS	$0, acc1, acc1
	ADCS	$0, acc2, acc2
	ADC	$0, acc3, acc3

	SUBS	y0, acc0
	SBCS	y1, acc1
	SBCS	y0, acc2
	SBC	y1, acc3

	// Trial subtraction of p. const2/const3 alias t2/t3, so the last two
	// SBCS overwrite the modulus limbs with the difference; they are not
	// needed afterwards.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3

	// Carry set means no borrow (value >= p): take the reduced value.
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
//
// The loop below scans 16 consecutive 96-byte entries (12 words each) and the
// counter runs 1..16; the entry whose counter equals idx is kept via CSEL.
// Every entry is loaded regardless of idx. If no counter value matches
// (e.g. idx == 0) the all-zero initial value is stored to point.
// func p256Select(point, table []uint64, idx int)
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD	idx+48(FP), const0
	MOVD	table+24(FP), b_ptr
	MOVD	point+0(FP), res_ptr

	// Result accumulator (12 words) starts at zero.
	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3
	EOR	t0, t0, t0
	EOR	t1, t1, t1
	EOR	t2, t2, t2
	EOR	t3, t3, t3

	// const1 = 1-based index of the entry being examined.
	MOVD	$0, const1

loop_select:
		ADD	$1, const1
		// Flags from this compare feed all twelve CSELs of the iteration;
		// the post-indexed loads in between do not modify flags.
		CMP	const0, const1
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(b_ptr), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(b_ptr), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, t0, t0
		CSEL	EQ, acc1, t1, t1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, t2, t2
		CSEL	EQ, acc3, t3, t3

		CMP	$16, const1
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	STP	(t0, t1), 4*16(res_ptr)
	STP	(t2, t3), 5*16(res_ptr)
	RET
/* ---------------------------------------*/
// Constant time point access to base point table.
//
// Scans 32 consecutive 64-byte entries (8 words each) starting at the table
// string's data pointer; the counter runs 1..32 and the entry whose counter
// equals idx is kept via CSEL. Every entry is loaded regardless of idx. Only
// the first 8 words of point are written; if no counter value matches
// (e.g. idx == 0) they are written as zero.
// func p256SelectBase(point *[12]uint64, table string, idx int)
TEXT ·p256SelectBase(SB),NOSPLIT,$0
	MOVD	idx+24(FP), t0
	MOVD	table_base+8(FP), t1
	MOVD	point+0(FP), res_ptr

	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3

	// t2 = 1-based index of the entry being examined.
	MOVD	$0, t2

loop_select:
		ADD	$1, t2
		CMP	t0, t2
		LDP.P	16(t1), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(t1), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(t1), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(t1), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3

		CMP	$32, t2
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in []uint64, n int)
//
// Repeats n times: x = x*x followed by four word-wise reduction steps with
// modulus p256ord and multiplier p256ordK0, then one conditional subtraction
// of the modulus. The final x is stored to res.
//
// hlp1 aliases res_ptr (R0) and holds p256ordK0 during the computation, which
// is why res is only loaded from the frame just before the final store.
// NOTE: the counter is decremented before it is tested, so n must be >= 1.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+24(FP), a_ptr
	MOVD	n+48(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
		SUB	$1, b_ptr

		// x[1:] * x[0]
		MUL	x0, x1, acc1
		UMULH	x0, x1, acc2

		MUL	x0, x2, t0
		ADDS	t0, acc2, acc2
		UMULH	x0, x2, acc3

		MUL	x0, x3, t0
		ADCS	t0, acc3, acc3
		UMULH	x0, x3, acc4
		ADC	$0, acc4, acc4
		// x[2:] * x[1]
		MUL	x1, x2, t0
		ADDS	t0, acc3
		UMULH	x1, x2, t1
		ADCS	t1, acc4
		ADC	$0, ZR, acc5

		MUL	x1, x3, t0
		ADDS	t0, acc4
		UMULH	x1, x3, t1
		ADC	t1, acc5
		// x[3] * x[2]
		MUL	x2, x3, t0
		ADDS	t0, acc5
		UMULH	x2, x3, acc6
		ADC	$0, acc6

		MOVD	$0, acc7
		// *2
		ADDS	acc1, acc1
		ADCS	acc2, acc2
		ADCS	acc3, acc3
		ADCS	acc4, acc4
		ADCS	acc5, acc5
		ADCS	acc6, acc6
		ADC	$0, acc7
		// Missing products
		MUL	x0, x0, acc0
		UMULH	x0, x0, t0
		ADDS	t0, acc1, acc1

		MUL	x1, x1, t0
		ADCS	t0, acc2, acc2
		UMULH	x1, x1, t1
		ADCS	t1, acc3, acc3

		MUL	x2, x2, t0
		ADCS	t0, acc4, acc4
		UMULH	x2, x2, t1
		ADCS	t1, acc5, acc5

		MUL	x3, x3, t0
		ADCS	t0, acc6, acc6
		UMULH	x3, x3, t1
		ADC	t1, acc7, acc7
		// First reduction step
		// hlp0 = acc0 * K0 is the multiple of the modulus to add.
		MUL	acc0, hlp1, hlp0

		// NOTE(review): this MUL uses hlp1 (K0), not hlp0. Only the carry
		// out of the following ADDS is consumed (acc0 is overwritten a few
		// lines below) - confirm against upstream before changing.
		MUL	const0, hlp1, t0
		ADDS	t0, acc0, acc0
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc1, acc1
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc2, acc2
		UMULH	const2, hlp0, acc0

		MUL	const3, hlp0, t0
		ADCS	t0, acc3, acc3

		UMULH	const3, hlp0, hlp0
		ADC	$0, hlp0

		ADDS	t1, acc1, acc1
		ADCS	y0, acc2, acc2
		ADCS	acc0, acc3, acc3
		ADC	$0, hlp0, acc0
		// Second reduction step
		MUL	acc1, hlp1, hlp0

		MUL	const0, hlp1, t0
		ADDS	t0, acc1, acc1
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc2, acc2
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc3, acc3
		UMULH	const2, hlp0, acc1

		MUL	const3, hlp0, t0
		ADCS	t0, acc0, acc0

		UMULH	const3, hlp0, hlp0
		ADC	$0, hlp0

		ADDS	t1, acc2, acc2
		ADCS	y0, acc3, acc3
		ADCS	acc1, acc0, acc0
		ADC	$0, hlp0, acc1
		// Third reduction step
		MUL	acc2, hlp1, hlp0

		MUL	const0, hlp1, t0
		ADDS	t0, acc2, acc2
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc3, acc3
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc0, acc0
		UMULH	const2, hlp0, acc2

		MUL	const3, hlp0, t0
		ADCS	t0, acc1, acc1

		UMULH	const3, hlp0, hlp0
		ADC	$0, hlp0

		ADDS	t1, acc3, acc3
		ADCS	y0, acc0, acc0
		ADCS	acc2, acc1, acc1
		ADC	$0, hlp0, acc2

		// Last reduction step
		MUL	acc3, hlp1, hlp0

		MUL	const0, hlp1, t0
		ADDS	t0, acc3, acc3
		UMULH	const0, hlp0, t1

		MUL	const1, hlp0, t0
		ADCS	t0, acc0, acc0
		UMULH	const1, hlp0, y0

		MUL	const2, hlp0, t0
		ADCS	t0, acc1, acc1
		UMULH	const2, hlp0, acc3

		MUL	const3, hlp0, t0
		ADCS	t0, acc2, acc2

		UMULH	const3, hlp0, hlp0
		// In this last step the carry is folded into acc7 (top limb of the
		// square), which is added in just below.
		ADC	$0, acc7

		ADDS	t1, acc0, acc0
		ADCS	y0, acc1, acc1
		ADCS	acc3, acc2, acc2
		ADC	$0, hlp0, acc3

		// Add the upper half of the square.
		ADDS	acc4, acc0, acc0
		ADCS	acc5, acc1, acc1
		ADCS	acc6, acc2, acc2
		ADCS	acc7, acc3, acc3
		ADC	$0, ZR, acc4

		// Trial subtraction of the modulus, including the carry limb acc4.
		SUBS	const0, acc0, y0
		SBCS	const1, acc1, y1
		SBCS	const2, acc2, y2
		SBCS	const3, acc3, y3
		SBCS	$0, acc4, acc4

		// No borrow (CS): keep the reduced value. Result becomes the next x.
		CSEL	CS, y0, acc0, x0
		CSEL	CS, y1, acc1, x1
		CSEL	CS, y2, acc2, x2
		CSEL	CS, y3, acc3, x3

		CBNZ	b_ptr, ordSqrLoop

	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 []uint64)
//
// res = in1 * in2 with word-wise reduction interleaved after each partial
// product (modulus p256ord, multiplier p256ordK0), followed by one conditional
// subtraction of the modulus.
//
// hlp1 aliases res_ptr (R0) and holds p256ordK0, so res is loaded from the
// frame only before the final store. y0 and y1 are reused as scratch once
// their limb has been consumed.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	MUL	acc0, hlp1, hlp0

	// NOTE(review): multiplies by hlp1 (K0), not hlp0; only the carry of the
	// following ADDS is consumed (acc0 is overwritten below) - confirm
	// against upstream before changing.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half of the product.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// Trial subtraction of the modulus. const2/const3 alias t2/t3, so the
	// modulus limbs are overwritten here; they are not needed afterwards.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// sm2P256Subinternal: x = (y - x) mod p.
//
// In:    x0..x3, y0..y3, const0..const3 = p
// Out:   x0..x3
// Clobb: acc0..acc7, t0, flags. y0..y3 are preserved.
TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	// t0 = 0 - borrow: all ones if the subtraction borrowed, else zero.
	SBC	$0, ZR, t0

	// acc4..7 = difference + p, the candidate used when a borrow occurred.
	ADDS	const0, acc0, acc4
	ADCS	const1, acc1, acc5
	ADCS	const2, acc2, acc6
	ADC	const3, acc3, acc7

	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
/* ---------------------------------------*/
// sm2P256SqrInternal: y = x*x followed by four reduction steps and one
// conditional subtraction of p.
//
// In:    x0..x3, const0..const3 = p
// Out:   y0..y3
// Clobb: acc0..acc7, t0, t1, flags. x0..x3 are preserved. y0/y1 serve as
//        scratch for the shifted limb halves during reduction.
TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// First reduction step
	LSL	$32, acc0, y0
	LSR	$32, acc0, y1

	ADDS	acc0, acc1, acc1
	ADCS	$0, acc2, acc2
	ADCS	$0, acc3, acc3
	ADC	$0, acc0, acc0

	SUBS	y0, acc1
	SBCS	y1, acc2
	SBCS	y0, acc3
	SBC	y1, acc0
	// Second reduction step
	LSL	$32, acc1, y0
	LSR	$32, acc1, y1

	ADDS	acc1, acc2, acc2
	ADCS	$0, acc3, acc3
	ADCS	$0, acc0, acc0
	ADC	$0, acc1, acc1

	SUBS	y0, acc2
	SBCS	y1, acc3
	SBCS	y0, acc0
	SBC	y1, acc1
	// Third reduction step
	LSL	$32, acc2, y0
	LSR	$32, acc2, y1

	ADDS	acc2, acc3, acc3
	ADCS	$0, acc0, acc0
	ADCS	$0, acc1, acc1
	ADC	$0, acc2, acc2

	SUBS	y0, acc3
	SBCS	y1, acc0
	SBCS	y0, acc1
	SBC	y1, acc2
	// Last reduction step
	LSL	$32, acc3, y0
	LSR	$32, acc3, y1

	ADDS	acc3, acc0, acc0
	ADCS	$0, acc1, acc1
	ADCS	$0, acc2, acc2
	ADC	$0, acc3, acc3

	SUBS	y0, acc0
	SBCS	y1, acc1
	SBCS	y0, acc2
	SBC	y1, acc3

	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// Trial subtraction of p. acc5/acc6 are used as temporaries instead of
	// t2/t3 because those alias const2/const3, which must stay intact for
	// the caller.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, acc5
	SBCS	const3, acc3, acc6
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, acc5, acc2, y2
	CSEL	CS, acc6, acc3, y3
	RET
/* ---------------------------------------*/
// sm2P256MulInternal: y = x*y with a reduction step interleaved after each
// partial product, followed by one conditional subtraction of p.
//
// In:    x0..x3, y0..y3, const0..const3 = p
// Out:   y0..y3
// Clobb: acc0..acc7, t0, t1, hlp0, flags. x0..x3 are preserved. y0/y1 are
//        reused as scratch once their limb has been consumed.
TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	LSL	$32, acc0, t0
	LSR	$32, acc0, t1

	ADDS	acc0, acc1, acc1
	ADCS	$0, acc2, acc2
	ADCS	$0, acc3, acc3
	ADC	$0, acc0, acc0

	SUBS	t0, acc1
	SBCS	t1, acc2
	SBCS	t0, acc3
	SBC	t1, acc0

	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, y0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, acc6

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	y0, acc3
	ADCS	acc6, acc4
	ADC	hlp0, acc5
	// Second reduction step
	LSL	$32, acc1, t0
	LSR	$32, acc1, t1

	ADDS	acc1, acc2, acc2
	ADCS	$0, acc3, acc3
	ADCS	$0, acc0, acc0
	ADC	$0, acc1, acc1

	SUBS	t0, acc2
	SBCS	t1, acc3
	SBCS	t0, acc0
	SBC	t1, acc1

	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, y0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y1

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	y0, acc4
	ADCS	y1, acc5
	ADC	hlp0, acc6
	// Third reduction step
	LSL	$32, acc2, t0
	LSR	$32, acc2, t1

	ADDS	acc2, acc3, acc3
	ADCS	$0, acc0, acc0
	ADCS	$0, acc1, acc1
	ADC	$0, acc2, acc2

	SUBS	t0, acc3
	SBCS	t1, acc0
	SBCS	t0, acc1
	SBC	t1, acc2

	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, y0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y1

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	y0, acc5
	ADCS	y1, acc6
	ADC	hlp0, acc7
	// Last reduction step
	LSL	$32, acc3, t0
	LSR	$32, acc3, t1

	ADDS	acc3, acc0, acc0
	ADCS	$0, acc1, acc1
	ADCS	$0, acc2, acc2
	ADC	$0, acc3, acc3

	SUBS	t0, acc0
	SBCS	t1, acc1
	SBCS	t0, acc2
	SBC	t1, acc3

	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// Trial subtraction of p (acc5/acc6 as temporaries so const2/const3,
	// which alias t2/t3, survive the call).
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, acc5
	SBCS	const3, acc3, acc6
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, acc5, acc2, y2
	CSEL	CS, acc6, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulBy2Inline: x = 2*y mod p.
// Clobbers hlp0, t0, t1, acc5, acc6 and flags; y0..y3 are preserved.
#define p256MulBy2Inline       \
	ADDS	y0, y0, x0;    \
	ADCS	y1, y1, x1;    \
	ADCS	y2, y2, x2;    \
	ADCS	y3, y3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	const0, x0, t0;   \
	SBCS	const1, x1, t1;\
	SBCS	const2, x2, acc5;   \
	SBCS	const3, x3, acc6;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, acc5, x2;\
	CSEL	CC, x3, acc6, x3;
/* ---------------------------------------*/
// Operand accessors: each point is three consecutive 32-byte coordinates
// (x at +0, y at +32, z at +64) behind a_ptr / b_ptr / res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store a 4-limb value to/from the x or y operand registers; src is one
// of the accessor macros above or a stack-slot macro below.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// Stack slots (32 bytes each, starting at 8(RSP)) for point addition.
#define y2in(off)  (32*0 + 8 + off)(RSP)
#define s2(off)    (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off)	   (32*3 + 8 + off)(RSP)
#define r(off)	   (32*4 + 8 + off)(RSP)
#define hsqr(off)  (32*5 + 8 + off)(RSP)
#define rsqr(off)  (32*6 + 8 + off)(RSP)
#define hcub(off)  (32*7 + 8 + off)(RSP)

#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
//
// Adds the point in2 (only its x and y words are read) to the point in1
// (x, y, z) and writes x3, y3, z3 to res.
//   sign != 0 : in2's y is replaced by p - y before use.
//   sel  == 0 : the result is in1 unchanged          (select[0] == 0).
//   zero == 0 : the result is (in2.x, +/-in2.y, 1)   (select[1] == 0);
//               this selection is applied last, so it wins if both are 0.
// All selections are made with CSEL on the full computation.
//
// hlp1 aliases res_ptr (R0) and carries the two select bits for the whole
// routine (bit 0 = sel != 0, bit 1 = zero != 0), so res is reloaded from the
// frame into t0 for every store.
TEXT ·p256PointAddAffineAsm(SB),0,$264-96
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr
	MOVD	sign+72(FP), hlp0
	MOVD	sel+80(FP), hlp1
	MOVD	zero+88(FP), t1

	// Normalise sel and zero to 0/1 and pack them into hlp1.
	MOVD	$1, t0
	CMP	$0, t1
	CSEL	EQ, ZR, t0, t1
	CMP	$0, hlp1
	CSEL	EQ, ZR, t0, hlp1

	LDP	p256p<>+0x00(SB), (const0, const1)
	LDP	p256p<>+0x10(SB), (const2, const3)
	EOR	t1<<1, hlp1

	// Negate y2in based on sign
	LDP	2*16(b_ptr), (y0, y1)
	LDP	3*16(b_ptr), (y2, y3)

	// acc0..3 = p - y2
	SUBS	y0, const0, acc0
	SBCS	y1, const1, acc1
	SBCS	y2, const2, acc2
	SBCS	y3, const3, acc3
	SBC	$0, ZR, t0

	ADDS	const0, acc0, acc4
	ADCS	const1, acc1, acc5
	ADCS	const2, acc2, acc6
	ADCS	const3, acc3, acc7
	ADC	$0, t0, t0

	CMP	$0, t0
	CSEL	EQ, acc4, acc0, acc0
	CSEL	EQ, acc5, acc1, acc1
	CSEL	EQ, acc6, acc2, acc2
	CSEL	EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, y0, acc0, y0
	CSEL	EQ, y1, acc1, y1
	CSEL	EQ, y2, acc2, y2
	CSEL	EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL	sm2P256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	sm2P256MulInternal<>(SB)    // x2 * z1^2

	LDx(x1in)
	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	sm2P256MulInternal<>(SB)    // z3 = h * z1

	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
	LDP	5*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDP	p256one<>+0x00(SB), (acc0, acc1)
	LDP	p256one<>+0x10(SB), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDx(z1in)
	MOVD	res+0(FP), t0
	STP	(y0, y1), 4*16(t0)
	STP	(y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL	sm2P256MulInternal<>(SB)    // z1 ^ 3

	LDx(y2in)
	CALL	sm2P256MulInternal<>(SB)    // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
	STx(r)

	CALL	sm2P256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	sm2P256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	// x still holds h and y holds h^2 (the internal routines preserve x).
	CALL	sm2P256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL	sm2P256MulInternal<>(SB)    // y1 * h^3
	STy(s2)

	LDP	hsqr(0*8), (x0, x1)
	LDP	hsqr(2*8), (x2, x3)
	LDP	0*16(a_ptr), (y0, y1)
	LDP	1*16(a_ptr), (y2, y3)
	CALL	sm2P256MulInternal<>(SB)    // u1 * h^2
	// The h slot is reused to hold u1 * h^2 from here on.
	STP	(y0, y1), h(0*8)
	STP	(y2, y3), h(2*8)

	p256MulBy2Inline               // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	sm2P256Subinternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	sm2P256Subinternal<>(SB)

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	0*16(b_ptr), (acc0, acc1)
	LDP	1*16(b_ptr), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, x3 = x2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 0*16(t0)
	STP	(x2, x3), 1*16(t0)

	LDP	h(0*8), (y0, y1)
	LDP	h(2*8), (y2, y3)
	CALL	sm2P256Subinternal<>(SB)

	LDP	r(0*8), (y0, y1)
	LDP	r(2*8), (y2, y3)
	CALL	sm2P256MulInternal<>(SB)

	LDP	s2(0*8), (x0, x1)
	LDP	s2(2*8), (x2, x3)
	CALL	sm2P256Subinternal<>(SB)
	LDP	2*16(a_ptr), (acc0, acc1)
	LDP	3*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	y2in(0*8), (acc0, acc1)
	LDP	y2in(2*8), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 2*16(t0)
	STP	(x2, x3), 3*16(t0)

	RET

// p256AddInline: x = (x + y) mod p.
// Clobbers hlp0, t0, t1, acc5, acc6 and flags; y0..y3 are preserved.
#define p256AddInline          \
	ADDS	y0, x0, x0;    \
	ADCS	y1, x1, x1;    \
	ADCS	y2, x2, x2;    \
	ADCS	y3, x3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	const0, x0, t0;   \
	SBCS	const1, x1, t1;\
	SBCS	const2, x2, acc5;   \
	SBCS	const3, x3, acc6;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, acc5, x2;\
	CSEL	CC, x3, acc6, x3;

// Stack slots (32 bytes each, starting at 8(RSP)) for point doubling.
#define s(off)	(32*0 + 8 + off)(RSP)
#define m(off)	(32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off)  (32*3 + 8 + off)(RSP)

// p256PointDoubleAsm doubles the point in (x, y, z as three consecutive
// 4-limb values) and writes x3, y3, z3 to res.
//
// Uses four 32-byte stack slots (s, m, zsqr, tmp). res_ptr stays live in R0
// for the whole routine: none of the helpers called here write R0.
// The y3 area of res is used as scratch for y^4 * 8 before the final y3 is
// stored, and z3/x3 are stored before all reads of in have finished.
// NOTE(review): that ordering looks unsafe if res and in overlap - confirm
// how callers use it.
//func p256PointDoubleAsm(res, in []uint64)
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
	MOVD	res+0(FP), res_ptr
	MOVD	in+24(FP), a_ptr

	LDP	p256p<>+0x00(SB), (const0, const1)
	LDP	p256p<>+0x10(SB), (const2, const3)

	// Begin point double
	LDP	4*16(a_ptr), (x0, x1)
	LDP	5*16(a_ptr), (x2, x3)
	CALL	sm2P256SqrInternal<>(SB)    // y = z^2
	STP	(y0, y1), zsqr(0*8)
	STP	(y2, y3), zsqr(2*8)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	p256AddInline                       // x = x1 + z^2
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL	sm2P256MulInternal<>(SB)    // y = z1 * y1
	p256MulBy2Inline                    // x = 2 * z1 * y1
	STx(z3out)

	LDy(x1in)
	LDx(zsqr)
	CALL	sm2P256Subinternal<>(SB)    // x = x1 - z^2
	LDy(m)
	CALL	sm2P256MulInternal<>(SB)    // y = (x1 - z^2) * (x1 + z^2)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)                              // m = 3 * (x1 - z^2) * (x1 + z^2)

	LDy(y1in)
	p256MulBy2Inline                    // x = 2 * y1
	CALL	sm2P256SqrInternal<>(SB)    // y = 4 * y1^2
	STy(s)
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CALL	sm2P256SqrInternal<>(SB)    // y = 16 * y1^4

	// Divide by 2
	// If y is odd, add p first so the value becomes even; hlp0 keeps the
	// carry out of that addition as bit 256 for the right shift.
	ADDS	const0, y0, t0
	ADCS	const1, y1, t1
	ADCS	const2, y2, acc5
	ADCS	const3, y3, acc6
	ADC	$0, ZR, hlp0

	ANDS	$1, y0, ZR
	CSEL	EQ, y0, t0, t0
	CSEL	EQ, y1, t1, t1
	CSEL	EQ, y2, acc5, acc5
	CSEL	EQ, y3, acc6, acc6
	// hlp0 is 0 or 1, so AND with y0 keeps the carry only when y was odd.
	AND	y0, hlp0, hlp0

	// 257-bit right shift by one across hlp0:acc6:acc5:t1:t0.
	EXTR	$1, t0, t1, y0
	EXTR	$1, t1, acc5, y1
	EXTR	$1, acc5, acc6, y2
	EXTR	$1, acc6, hlp0, y3
	STy(y3out)                          // scratch: 8 * y1^4

	LDx(x1in)
	LDy(s)
	CALL	sm2P256MulInternal<>(SB)    // y = 4 * x1 * y1^2
	STy(s)
	p256MulBy2Inline                    // x = 8 * x1 * y1^2
	STx(tmp)

	LDx(m)
	CALL	sm2P256SqrInternal<>(SB)    // y = m^2
	LDx(tmp)
	CALL	sm2P256Subinternal<>(SB)    // x = m^2 - 2*s

	STx(x3out)

	LDy(s)
	CALL	sm2P256Subinternal<>(SB)    // x = s - x3

	LDy(m)
	CALL	sm2P256MulInternal<>(SB)    // y = m * (s - x3)

	LDx(y3out)
	CALL	sm2P256Subinternal<>(SB)    // x = m * (s - x3) - 8 * y1^4
	STx(y3out)
	RET
/* ---------------------------------------*/
#undef y2in
#undef x3out
#undef y3out
#undef z3out
// For the point-add routine the second input and the outputs are addressed
// through b_ptr: b_ptr holds in2 while the inputs are read and is re-pointed
// at res before the first output store.
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 []uint64) int
//
// Point addition: reads points from in1 and in2 and writes the result to res.
// Frame: 392 bytes of locals, 80 bytes of arguments and result
// (three slice headers plus the int result at ret+72(FP)).
//
// Return value: 1 when both r = s2 - s1 and h = u2 - u1 are zero mod p
// (each is tested against both 0 and p), otherwise 0.
//
// NOTE(review): the remaining slot macros (x1in, y1in, z1in, x2in, z2in,
// z1sqr, z2sqr, s1, s2, u1, u2, r, h, rsqr, hsqr, hcub), the LDx/LDy/STx/STy
// macros, p256MulBy2Inline and the sm2P256*Internal routines are defined
// outside this section. The step comments assume, from their use here:
//   sm2P256SqrInternal: y = x^2
//   sm2P256MulInternal: y = x * y
//   sm2P256Subinternal: x = y - x
//   p256MulBy2Inline:   x = 2 * y
// Confirm against their definitions.
TEXT ·p256PointAddAsm(SB),0,$392-80
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVD	in1+24(FP), a_ptr
	MOVD	in2+48(FP), b_ptr

	// const0..const3 = the four 64-bit limbs of p256p<>
	LDP	p256p<>+0x00(SB), (const0, const1)
	LDP	p256p<>+0x10(SB), (const2, const3)

	// Begin point add
	LDx(z2in)
	CALL	sm2P256SqrInternal<>(SB)	// z2^2
	STy(z2sqr)

	CALL	sm2P256MulInternal<>(SB)	// z2^3

	LDx(y1in)
	CALL	sm2P256MulInternal<>(SB)	// s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL	sm2P256SqrInternal<>(SB)	// z1^2
	STy(z1sqr)

	CALL	sm2P256MulInternal<>(SB)	// z1^3

	LDx(y2in)
	CALL	sm2P256MulInternal<>(SB)	// s2 = z1^3*y2

	LDx(s1)
	CALL	sm2P256Subinternal<>(SB)	// r = s2 - s1
	STx(r)

	// hlp1 = 1 if all four limbs of r are zero, else 0.
	MOVD	$1, acc1
	ORR	x0, x1, acc2	// Check if zero mod p256
	ORR	x2, x3, acc3
	ORR	acc3, acc2, acc2
	CMP	$0, acc2
	CSEL	EQ, acc1, ZR, hlp1

	// r == p also counts as zero: XOR each limb with the matching limb of p.
	EOR	const0, x0, acc2
	EOR	const1, x1, acc3
	EOR	const2, x2, acc4
	EOR	const3, x3, acc5

	// If every XOR result is zero set hlp1 = 1, otherwise keep hlp1.
	ORR	acc2, acc3, acc2
	ORR	acc4, acc5, acc3
	ORR	acc3, acc2, acc2
	CMP	$0, acc2
	CSEL	EQ, acc1, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL	sm2P256MulInternal<>(SB)	// u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	sm2P256MulInternal<>(SB)	// u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	sm2P256Subinternal<>(SB)	// h = u2 - u1
	STx(h)

	// hlp0 = 1 if all four limbs of h are zero, else 0.
	MOVD	$1, acc1
	ORR	x0, x1, acc2	// Check if zero mod p256
	ORR	x2, x3, acc3
	ORR	acc3, acc2, acc2
	CMP	$0, acc2
	CSEL	EQ, acc1, ZR, hlp0

	// h == p also counts as zero.
	EOR	const0, x0, acc2
	EOR	const1, x1, acc3
	EOR	const2, x2, acc4
	EOR	const3, x3, acc5

	ORR	acc2, acc3, acc2
	ORR	acc4, acc5, acc3
	ORR	acc3, acc2, acc2
	CMP	$0, acc2
	CSEL	EQ, acc1, hlp0, hlp0

	// hlp1 = (r is zero) AND (h is zero); this becomes the return value.
	AND	hlp0, hlp1, hlp1

	LDx(r)
	CALL	sm2P256SqrInternal<>(SB)	// rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	sm2P256SqrInternal<>(SB)	// hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	sm2P256MulInternal<>(SB)	// hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	sm2P256MulInternal<>(SB)	// s1 * h^3, kept in the s2 slot
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL	sm2P256MulInternal<>(SB)	// z1 * z2
	LDx(h)
	CALL	sm2P256MulInternal<>(SB)	// z1 * z2 * h
	// From here on b_ptr addresses res, so x3out/y3out/z3out write the result.
	MOVD	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	sm2P256MulInternal<>(SB)	// h^2 * u1
	STy(u2)					// kept in the u2 slot

	p256MulBy2Inline			// u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	sm2P256Subinternal<>(SB)	// r^2 - u1 * h^2 * 2

	// Move the difference into y so that h^3 can be subtracted from it.
	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	sm2P256Subinternal<>(SB)	// x3 = r^2 - 2*u1*h^2 - h^3
	STx(x3out)

	LDy(u2)
	CALL	sm2P256Subinternal<>(SB)	// u1*h^2 - x3

	LDy(r)
	CALL	sm2P256MulInternal<>(SB)	// r * (u1*h^2 - x3)

	LDx(s2)
	CALL	sm2P256Subinternal<>(SB)	// y3 = r*(u1*h^2 - x3) - s1*h^3
	STx(y3out)

	MOVD	hlp1, R0
	MOVD	R0, ret+72(FP)

	RET