// github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_arm64.s (about)
// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
//go:build !purego

#include "textflag.h"

// Register roles. All names below are plain aliases for ARM64 general
// purpose registers; several of them deliberately share a register.

// Pointer arguments (result, first input, second input / loop counter).
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc3: low half of the accumulator.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

// acc4..acc7: high half of the accumulator.
#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0..t3: scratch. NOTE: t2/t3 are also const2/const3 (see below), so they
// are only free as scratch where const2/const3 are not live.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// const0/const1: low two limbs of the modulus currently in use.
#define const0 R15
#define const1 R16

// hlp0: helper scratch. hlp1 shares R0 with res_ptr, so a routine that uses
// hlp1 must load res_ptr only after it is done with hlp1.
#define hlp0 R17
#define hlp1 res_ptr

// x0..x3 and y0..y3: the two 256-bit operands of the internal routines,
// least significant limb first.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3: high two limbs of the modulus; same registers as t2/t3.
#define const2 t2
#define const3 t3

// p256p: the field prime, four little-endian 64-bit limbs
// (equals 2^256 - 2^224 - 2^96 + 2^64 - 1).
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
// p256ordK0: 64-bit factor used by the p256Ord* routines to derive the
// per-step reduction multiplier (hlp0 = acc * p256ordK0).
// NOTE(review): presumably -ord^-1 mod 2^64 — confirm against the generator.
DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
// p256ord: the modulus used by the p256Ord* routines, four little-endian limbs.
DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
// p256one: equals 2^256 - p256p, i.e. the value 1 in Montgomery form with
// R = 2^256.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
GLOBL p256p<>(SB), RODATA, $32
GLOBL p256ordK0<>(SB), RODATA, $8
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256one<>(SB), RODATA, $32

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT
·p256OrdLittleToBig(SB),NOSPLIT,$0
	// Frameless tail jump: p256BigToLittle reads this function's own
	// (res, in) arguments. The byte reversal is its own inverse, so one
	// routine serves both directions and both element types.
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
// Tail-jumps to p256BigToLittle (same argument layout).
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
// Tail-jumps to p256BigToLittle (same argument layout).
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
// Writes the 32 input bytes to res in reverse order (res[i] = in[31-i]).
// Uses V0-V3.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), a_ptr

	VLD1 (a_ptr), [V0.B16, V1.B16]

	// VEXT $8 swaps the two 8-byte halves of each vector and VREV64 reverses
	// the bytes inside each half: together a full 16-byte reversal.
	VEXT $8, V0.B16, V0.B16, V3.B16
	VEXT $8, V1.B16, V1.B16, V2.B16
	VREV64 V2.B16, V2.B16
	VREV64 V3.B16, V3.B16

	// Store the (reversed) second input vector first to finish the swap.
	VST1 [V2.B16, V3.B16], (res_ptr)

	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *SM2P256Point, cond int)
// If cond == 0 res=b, else res=a
// Branch-free 96-byte select, done as two 48-byte halves. The comparison
// is made on 32-bit lanes, so only the low 32 bits of cond are examined.
// Uses R3 and V0-V8.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD a+8(FP), a_ptr
	MOVD b+16(FP), b_ptr
	MOVD cond+24(FP), R3

	// V2 = all-ones where cond == 0, all-zeros otherwise.
	VEOR V0.B16, V0.B16, V0.B16
	VDUP R3, V1.S4
	VCMEQ V0.S4, V1.S4, V2.S4

	// First 48 bytes: start from a, overwrite with b where the mask is set.
	VLD1.P (48)(a_ptr), [V3.B16, V4.B16, V5.B16]
	VLD1.P (48)(b_ptr), [V6.B16, V7.B16, V8.B16]
	VBIT V2.B16, V6.B16, V3.B16
	VBIT V2.B16, V7.B16, V4.B16
	VBIT V2.B16, V8.B16, V5.B16
	VST1.P [V3.B16, V4.B16, V5.B16], (48)(res_ptr)

	// Remaining 48 bytes, same selection.
	VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16]
	VLD1 (b_ptr), [V6.B16, V7.B16, V8.B16]
	VBIT V2.B16, V6.B16, V3.B16
	VBIT V2.B16, V7.B16, V4.B16
	VBIT V2.B16, V8.B16, V5.B16
	VST1 [V3.B16, V4.B16, V5.B16], (res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
// In place: if cond != 0, val = p256p - val; otherwise val is rewritten
// unchanged. The subtraction is always computed and the result chosen with
// CSEL, so there is no branch on cond. No further reduction is applied to
// the difference (val == 0 therefore yields p256p itself).
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD val+0(FP), a_ptr
	MOVD cond+8(FP), hlp0
	MOVD a_ptr, res_ptr
	// acc = poly
	LDP p256p<>+0x00(SB), (acc0, acc1)
	LDP p256p<>+0x10(SB), (acc2, acc3)

	// Load the original value
	LDP 0*16(a_ptr), (t0, t1)
	LDP 1*16(a_ptr), (t2, t3)
	// Speculatively subtract
	SUBS t0, acc0
	SBCS t1, acc1
	SBCS t2, acc2
	SBC t3, acc3
	// If condition is 0, keep original value
	CMP $0, hlp0
	CSEL EQ, t0, acc0, acc0
	CSEL EQ, t1, acc1, acc1
	CSEL EQ, t2, acc2, acc2
	CSEL EQ, t3, acc3, acc3
	// Store result
	STP (acc0, acc1), 0*16(res_ptr)
	STP (acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
// Applies sm2P256SqrInternal n times starting from in and stores the last
// result in res. The counter is decremented and the body executed before
// it is tested, so n is expected to be >= 1.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), a_ptr
	MOVD n+16(FP), b_ptr

	// The internal routine expects the modulus in const0..const3.
	LDP p256p<>+0x00(SB), (const0, const1)
	LDP p256p<>+0x10(SB), (const2, const3)

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)

sqrLoop:
		SUB $1, b_ptr
		CALL sm2P256SqrInternal<>(SB)
		// Feed the result back as the next input.
		MOVD y0, x0
		MOVD y1, x1
		MOVD y2, x2
		MOVD y3, x3
		CBNZ b_ptr, sqrLoop

	STP (y0, y1), 0*16(res_ptr)
	STP (y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
// res = sm2P256MulInternal(in1, in2): loads both operands into x/y, the
// modulus into const0..const3, and stores the y result.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	LDP p256p<>+0x00(SB), (const0, const1)
	LDP p256p<>+0x10(SB), (const2, const3)

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)

	LDP 0*16(b_ptr), (y0, y1)
	LDP 1*16(b_ptr), (y2, y3)

	CALL sm2P256MulInternal<>(SB)

	STP (y0, y1), 0*16(res_ptr)
	STP (y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
// Runs the four word-wise reduction steps on in without any multiplication
// (i.e. a Montgomery multiplication by 1), then subtracts p256p once if the
// result is >= p256p.
//
// Each step removes the current lowest limb a by adding a * p256p and
// dropping the (now zero) low limb. Because
// p256p = 2^256 - 2^224 - 2^96 + 2^64 - 1, a * p256p is built from a and
// its 32-bit shifts only. The four accumulator registers rotate roles
// from step to step.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), a_ptr
	LDP p256p<>+0x00(SB), (const0, const1)
	LDP p256p<>+0x10(SB), (const2, const3)

	LDP 0*16(a_ptr), (acc0, acc1)
	LDP 1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	// (y1:y0) = acc0 split at bit 32, i.e. the two limbs of acc0 * 2^32.
	LSL $32, acc0, y0
	LSR $32, acc0, y1

	SUBS y0, acc1
	SBCS y1, acc2
	SBCS y0, acc3
	SBC y1, acc0, y0

	ADDS acc0, acc1, acc1
	ADCS $0, acc2, acc2
	ADCS $0, acc3, acc3
	ADC $0, y0, acc0

	// Second reduction step
	LSL $32, acc1, y0
	LSR $32, acc1, y1

	SUBS y0, acc2
	SBCS y1, acc3
	SBCS y0, acc0
	SBC y1, acc1, y0

	ADDS acc1, acc2, acc2
	ADCS $0, acc3, acc3
	ADCS $0, acc0, acc0
	ADC $0, y0, acc1

	// Third reduction step
	LSL $32, acc2, y0
	LSR $32, acc2, y1

	SUBS y0, acc3
	SBCS y1, acc0
	SBCS y0, acc1
	SBC y1, acc2, y0

	ADDS acc2, acc3, acc3
	ADCS $0, acc0, acc0
	ADCS $0, acc1, acc1
	ADC $0, y0, acc2

	// Last reduction step
	LSL $32, acc3, y0
	LSR $32, acc3, y1

	SUBS y0, acc0
	SBCS y1, acc1
	SBCS y0, acc2
	SBC y1, acc3, y0

	ADDS acc3, acc0, acc0
	ADCS $0, acc1, acc1
	ADCS $0, acc2, acc2
	ADC $0, y0, acc3

	// Trial subtraction of the modulus. const2/const3 live in t2/t3, so
	// the last two lines overwrite the modulus limbs with the difference;
	// they are not needed afterwards.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, t2
	SBCS const3, acc3, t3

	// Carry set means no borrow: keep the subtracted value.
	CSEL CS, t0, acc0, acc0
	CSEL CS, t1, acc1, acc1
	CSEL CS, t2, acc2, acc2
	CSEL CS, t3, acc3, acc3

	STP (acc0, acc1), 0*16(res_ptr)
	STP (acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
// Copies the 96-byte table entry number idx (entries are numbered from 1)
// into res. Every one of the limit entries is loaded and merged under a
// mask, so the memory access pattern does not depend on idx. If idx
// matches no entry (for example idx == 0) res is all zeros. The compare is
// on 32-bit lanes (low 32 bits of idx and of the counter).
// The loop body runs before the counter is compared with limit, so limit
// is expected to be >= 1. Uses V0-V14.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD limit+24(FP), a_ptr
	MOVD idx+16(FP), const0
	MOVD table+8(FP), b_ptr
	MOVD res+0(FP), res_ptr

	VDUP const0, V0.S4

	// V2..V7 accumulate the selected entry; start at zero.
	VEOR V2.B16, V2.B16, V2.B16
	VEOR V3.B16, V3.B16, V3.B16
	VEOR V4.B16, V4.B16, V4.B16
	VEOR V5.B16, V5.B16, V5.B16
	VEOR V6.B16, V6.B16, V6.B16
	VEOR V7.B16, V7.B16, V7.B16

	MOVD $0, const1

loop_select:
		ADD $1, const1
		VDUP const1, V1.S4
		// V14 = all-ones when the current entry number equals idx.
		VCMEQ V0.S4, V1.S4, V14.S4
		VLD1.P (48)(b_ptr), [V8.B16, V9.B16, V10.B16]
		VLD1.P (48)(b_ptr), [V11.B16, V12.B16, V13.B16]
		VBIT V14.B16, V8.B16, V2.B16
		VBIT V14.B16, V9.B16, V3.B16
		VBIT V14.B16, V10.B16, V4.B16
		VBIT V14.B16, V11.B16, V5.B16
		VBIT V14.B16, V12.B16, V6.B16
		VBIT V14.B16, V13.B16, V7.B16

		CMP a_ptr, const1
		BNE loop_select
	VST1.P [V2.B16, V3.B16, V4.B16], (48)(res_ptr)
	VST1 [V5.B16, V6.B16, V7.B16], (res_ptr)
	RET
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Same masked scan as p256Select, for 64-byte entries and a fixed count of
// 32 entries (numbered from 1). res is all zeros when idx matches no
// entry. Uses V0-V10.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVD idx+16(FP), t0
	MOVD table+8(FP), t1
	MOVD res+0(FP), res_ptr

	VDUP t0, V0.S4

	VEOR V2.B16, V2.B16, V2.B16
	VEOR V3.B16, V3.B16, V3.B16
	VEOR V4.B16, V4.B16, V4.B16
	VEOR V5.B16, V5.B16, V5.B16

	MOVD $0, t2

loop_select:
		ADD $1, t2
		VDUP t2, V1.S4
		VCMEQ V0.S4, V1.S4, V10.S4
		VLD1.P (64)(t1), [V6.B16, V7.B16, V8.B16, V9.B16]
		VBIT V10.B16, V6.B16, V2.B16
		VBIT V10.B16, V7.B16, V3.B16
		VBIT V10.B16, V8.B16, V4.B16
		VBIT V10.B16, V9.B16, V5.B16

		CMP $32, t2
		BNE loop_select

	VST1 [V2.B16, V3.B16, V4.B16, V5.B16], (res_ptr)
	RET

/* ---------------------------------------*/
//func p256OrdReduce(s *p256OrdElement)
// In place: s = s - p256ord if s >= p256ord, otherwise s is rewritten
// unchanged. The subtraction is always performed and the result chosen
// with CSEL.
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
	MOVD s+0(FP), res_ptr

	LDP p256ord<>+0x00(SB), (const0, const1)
	LDP p256ord<>+0x10(SB), (const2, const3)

	LDP 0*16(res_ptr), (acc0, acc1)
	LDP 1*16(res_ptr), (acc2, acc3)
	// acc4 = 0 acts as a fifth (top) limb for the borrow chain.
	EOR acc4, acc4, acc4

	SUBS const0, acc0, y0
	SBCS const1, acc1, y1
	SBCS const2, acc2, y2
	SBCS const3, acc3, y3
	SBCS $0, acc4, acc4

	// Carry set (no borrow) means s >= p256ord: take the difference.
	CSEL CS, y0, acc0, x0
	CSEL CS, y1, acc1, x1
	CSEL CS, y2, acc2, x2
	CSEL CS, y3, acc3, x3

	STP (x0, x1), 0*16(res_ptr)
	STP (x2, x3), 1*16(res_ptr)

	RET

/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
TEXT
·p256OrdSqr(SB),NOSPLIT,$0
	// Performs n successive squarings modulo p256ord, each followed by a
	// four-step word-wise reduction driven by p256ordK0 and one conditional
	// subtraction of p256ord. The result of the last iteration is stored to
	// res.
	// NOTE(review): presumably operands are in Montgomery form with
	// R = 2^256 and already reduced below p256ord — confirm with callers.
	//
	// Register use inside the loop:
	//   x3..x0         current value (input of the squaring, output of the round)
	//   acc7..acc0     512-bit square; acc0..acc3 then rotate as the running
	//                  reduced value
	//   hlp1           p256ordK0. hlp1 shares R0 with res_ptr, which is why
	//                  res is only loaded after the loop
	//   hlp0           per-step reduction multiplier, then the top product limb
	//   const3..const0 p256ord (const2/const3 occupy t2/t3)
	//   t0, t1, y0     scratch
	// The loop body runs before the counter is tested, so n is expected to
	// be >= 1.
	MOVD in+8(FP), a_ptr
	MOVD n+16(FP), b_ptr

	MOVD p256ordK0<>(SB), hlp1

	LDP p256ord<>+0x00(SB), (const0, const1)
	LDP p256ord<>+0x10(SB), (const2, const3)

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)

ordSqrLoop:
		SUB $1, b_ptr

		// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1..acc6.
		// x[1:] * x[0]
		MUL x0, x1, acc1
		UMULH x0, x1, acc2

		MUL x0, x2, t0
		ADDS t0, acc2, acc2
		UMULH x0, x2, acc3

		MUL x0, x3, t0
		ADCS t0, acc3, acc3
		UMULH x0, x3, acc4
		ADC $0, acc4, acc4
		// x[2:] * x[1]
		MUL x1, x2, t0
		ADDS t0, acc3
		UMULH x1, x2, t1
		ADCS t1, acc4
		ADC $0, ZR, acc5

		MUL x1, x3, t0
		ADDS t0, acc4
		UMULH x1, x3, t1
		ADC t1, acc5
		// x[3] * x[2]
		MUL x2, x3, t0
		ADDS t0, acc5
		UMULH x2, x3, acc6
		ADC $0, acc6

		MOVD $0, acc7
		// *2
		// Each off-diagonal product occurs twice in the square.
		ADDS acc1, acc1
		ADCS acc2, acc2
		ADCS acc3, acc3
		ADCS acc4, acc4
		ADCS acc5, acc5
		ADCS acc6, acc6
		ADC $0, acc7
		// Missing products
		// Add the diagonal terms x[i]^2.
		MUL x0, x0, acc0
		UMULH x0, x0, t0
		ADDS t0, acc1, acc1

		MUL x1, x1, t0
		ADCS t0, acc2, acc2
		UMULH x1, x1, t1
		ADCS t1, acc3, acc3

		MUL x2, x2, t0
		ADCS t0, acc4, acc4
		UMULH x2, x2, t1
		ADCS t1, acc5, acc5

		MUL x3, x3, t0
		ADCS t0, acc6, acc6
		UMULH x3, x3, t1
		ADC t1, acc7, acc7
		// First reduction step
		// hlp0 = acc0 * K0; adding hlp0 * p256ord clears the lowest limb.
		// Low product halves are added first, then the high halves one limb
		// up. acc0 is reused for the high half of const2*hlp0 and finally
		// becomes the new top limb.
		MUL acc0, hlp1, hlp0

		MUL const0, hlp0, t0
		ADDS t0, acc0, acc0
		UMULH const0, hlp0, t1

		MUL const1, hlp0, t0
		ADCS t0, acc1, acc1
		UMULH const1, hlp0, y0

		MUL const2, hlp0, t0
		ADCS t0, acc2, acc2
		UMULH const2, hlp0, acc0

		MUL const3, hlp0, t0
		ADCS t0, acc3, acc3

		UMULH const3, hlp0, hlp0
		// Fold the carry of the low-half chain into the top product limb.
		ADC $0, hlp0

		ADDS t1, acc1, acc1
		ADCS y0, acc2, acc2
		ADCS acc0, acc3, acc3
		ADC $0, hlp0, acc0
		// Second reduction step
		MUL acc1, hlp1, hlp0

		MUL const0, hlp0, t0
		ADDS t0, acc1, acc1
		UMULH const0, hlp0, t1

		MUL const1, hlp0, t0
		ADCS t0, acc2, acc2
		UMULH const1, hlp0, y0

		MUL const2, hlp0, t0
		ADCS t0, acc3, acc3
		UMULH const2, hlp0, acc1

		MUL const3, hlp0, t0
		ADCS t0, acc0, acc0

		UMULH const3, hlp0, hlp0
		ADC $0, hlp0

		ADDS t1, acc2, acc2
		ADCS y0, acc3, acc3
		ADCS acc1, acc0, acc0
		ADC $0, hlp0, acc1
		// Third reduction step
		MUL acc2, hlp1, hlp0

		MUL const0, hlp0, t0
		ADDS t0, acc2, acc2
		UMULH const0, hlp0, t1

		MUL const1, hlp0, t0
		ADCS t0, acc3, acc3
		UMULH const1, hlp0, y0

		MUL const2, hlp0, t0
		ADCS t0, acc0, acc0
		UMULH const2, hlp0, acc2

		MUL const3, hlp0, t0
		ADCS t0, acc1, acc1

		UMULH const3, hlp0, hlp0
		ADC $0, hlp0

		ADDS t1, acc3, acc3
		ADCS y0, acc0, acc0
		ADCS acc2, acc1, acc1
		ADC $0, hlp0, acc2

		// Last reduction step
		MUL acc3, hlp1, hlp0

		MUL const0, hlp0, t0
		ADDS t0, acc3, acc3
		UMULH const0, hlp0, t1

		MUL const1, hlp0, t0
		ADCS t0, acc0, acc0
		UMULH const1, hlp0, y0

		MUL const2, hlp0, t0
		ADCS t0, acc1, acc1
		UMULH const2, hlp0, acc3

		MUL const3, hlp0, t0
		ADCS t0, acc2, acc2

		UMULH const3, hlp0, hlp0
		// Unlike the first three steps, the carry is folded into acc7 here
		// rather than hlp0. Both hold limb 7 and are summed below, so the
		// total is the same provided acc7 is not all-ones.
		ADC $0, acc7

		ADDS t1, acc0, acc0
		ADCS y0, acc1, acc1
		ADCS acc3, acc2, acc2
		ADC $0, hlp0, acc3

		// Add the high half of the square; acc4 receives the carry-out.
		ADDS acc4, acc0, acc0
		ADCS acc5, acc1, acc1
		ADCS acc6, acc2, acc2
		ADCS acc7, acc3, acc3
		ADC $0, ZR, acc4

		// Trial subtraction of p256ord over five limbs.
		SUBS const0, acc0, y0
		SBCS const1, acc1, y1
		SBCS const2, acc2, y2
		SBCS const3, acc3, y3
		SBCS $0, acc4, acc4

		// Carry set (no borrow): keep the difference. The result lands in
		// x3..x0, ready for the next iteration.
		CSEL CS, y0, acc0, x0
		CSEL CS, y1, acc1, x1
		CSEL CS, y2, acc2, x2
		CSEL CS, y3, acc3, x3

		CBNZ b_ptr, ordSqrLoop

	// hlp1 (K0) is no longer needed; R0 can now hold the result pointer.
	MOVD res+0(FP), res_ptr
	STP (x0, x1), 0*16(res_ptr)
	STP (x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
TEXT
·p256OrdMul(SB),NOSPLIT,$0
	// res = in1 * in2 modulo p256ord, with multiplication and reduction
	// interleaved: after each partial product y[i] * x one reduction step
	// (multiplier hlp0 = low limb * p256ordK0) removes the lowest limb. One
	// conditional subtraction of p256ord finishes the computation.
	// NOTE(review): presumably operands are in Montgomery form with
	// R = 2^256 — confirm with callers.
	//
	// Two accumulator chains are kept and summed at the end: acc0..acc3
	// (rotating, one limb retired per step) and acc4..acc7 (high limbs of
	// the partial products plus the reduction carries). hlp1 holds
	// p256ordK0 and shares R0 with res_ptr, so res is loaded last. y0 and y1
	// are reused as scratch once their operand limb has been consumed.
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD p256ordK0<>(SB), hlp1
	LDP p256ord<>+0x00(SB), (const0, const1)
	LDP p256ord<>+0x10(SB), (const2, const3)

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)
	LDP 0*16(b_ptr), (y0, y1)
	LDP 1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL y0, x0, acc0
	UMULH y0, x0, acc1

	MUL y0, x1, t0
	ADDS t0, acc1
	UMULH y0, x1, acc2

	MUL y0, x2, t0
	ADCS t0, acc2
	UMULH y0, x2, acc3

	MUL y0, x3, t0
	ADCS t0, acc3
	UMULH y0, x3, acc4
	ADC $0, acc4
	// First reduction step
	MUL acc0, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3

	UMULH const3, hlp0, hlp0
	// Carry of the low-half chain goes to the high chain (limb 4).
	ADC $0, acc4

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0
	// y[1] * x
	MUL y1, x0, t0
	ADDS t0, acc1
	UMULH y1, x0, t1

	MUL y1, x1, t0
	ADCS t0, acc2
	UMULH y1, x1, hlp0

	MUL y1, x2, t0
	ADCS t0, acc3
	UMULH y1, x2, y0

	MUL y1, x3, t0
	ADCS t0, acc4
	UMULH y1, x3, y1
	ADC $0, ZR, acc5

	ADDS t1, acc2
	ADCS hlp0, acc3
	ADCS y0, acc4
	ADC y1, acc5
	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0

	UMULH const3, hlp0, hlp0
	ADC $0, acc5

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1
	// y[2] * x
	MUL y2, x0, t0
	ADDS t0, acc2
	UMULH y2, x0, t1

	MUL y2, x1, t0
	ADCS t0, acc3
	UMULH y2, x1, hlp0

	MUL y2, x2, t0
	ADCS t0, acc4
	UMULH y2, x2, y0

	MUL y2, x3, t0
	ADCS t0, acc5
	UMULH y2, x3, y1
	ADC $0, ZR, acc6

	ADDS t1, acc3
	ADCS hlp0, acc4
	ADCS y0, acc5
	ADC y1, acc6
	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1

	UMULH const3, hlp0, hlp0
	ADC $0, acc6

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2
	// y[3] * x
	MUL y3, x0, t0
	ADDS t0, acc3
	UMULH y3, x0, t1

	MUL y3, x1, t0
	ADCS t0, acc4
	UMULH y3, x1, hlp0

	MUL y3, x2, t0
	ADCS t0, acc5
	UMULH y3, x2, y0

	MUL y3, x3, t0
	ADCS t0, acc6
	UMULH y3, x3, y1
	ADC $0, ZR, acc7

	ADDS t1, acc4
	ADCS hlp0, acc5
	ADCS y0, acc6
	ADC y1, acc7
	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2

	UMULH const3, hlp0, hlp0
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3

	// Sum the two chains; acc4 receives the carry-out.
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Trial subtraction of p256ord. const2/const3 live in t2/t3, so the
	// modulus limbs are overwritten by the difference; they are not needed
	// afterwards.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, t2
	SBCS const3, acc3, t3
	SBCS $0, acc4, acc4

	// Carry set (no borrow): keep the difference.
	CSEL CS, t0, acc0, acc0
	CSEL CS, t1, acc1, acc1
	CSEL CS, t2, acc2, acc2
	CSEL CS, t3, acc3, acc3

	// hlp1 (K0) is no longer needed; R0 can now hold the result pointer.
	MOVD res+0(FP), res_ptr
	STP (acc0, acc1), 0*16(res_ptr)
	STP (acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
// Subtraction modulo the value held in const0..const3: the modulus is added
// back when the raw subtraction borrows. Both candidates are always
// computed and one is chosen with CSEL.
// In:      x3..x0, y3..y0, const3..const0
// Out:     x3..x0 (y3..y0 are left unchanged)
// Clobbers acc0..acc7, t0, flags
TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
	SUBS x0, y0, acc0
	SBCS x1, y1, acc1
	SBCS x2, y2, acc2
	SBCS x3, y3, acc3
	// t0 = 0 if there was no borrow, all-ones if there was.
	SBC $0, ZR, t0

	ADDS const0, acc0, acc4
	ADCS const1, acc1, acc5
	ADCS const2, acc2, acc6
	ADC const3, acc3, acc7

	// Borrow bit clear (EQ): plain difference; otherwise difference + modulus.
	ANDS $1, t0
	CSEL EQ, acc0, acc4, x0
	CSEL EQ, acc1, acc5, x1
	CSEL EQ, acc2, acc6, x2
	CSEL EQ, acc3, acc7, x3

	RET

/* ---------------------------------------*/
// (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
// Squaring followed by four shift-based reduction steps and one conditional
// subtraction of the value in const0..const3. The reduction steps are
// hard-wired to the shape of p256p (same steps as p256FromMont), so
// const0..const3 are expected to hold p256p.
// In:      x3..x0, const3..const0
// Out:     y3..y0 (x3..x0 are left unchanged)
// Clobbers acc0..acc7, t0, t1, flags
TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL x0, x1, acc1
	UMULH x0, x1, acc2

	MUL x0, x2, t0
	ADDS t0, acc2, acc2
	UMULH x0, x2, acc3

	MUL x0, x3, t0
	ADCS t0, acc3, acc3
	UMULH x0, x3, acc4
	ADC $0, acc4, acc4
	// x[2:] * x[1]
	MUL x1, x2, t0
	ADDS t0, acc3
	UMULH x1, x2, t1
	ADCS t1, acc4
	ADC $0, ZR, acc5

	MUL x1, x3, t0
	ADDS t0, acc4
	UMULH x1, x3, t1
	ADC t1, acc5
	// x[3] * x[2]
	MUL x2, x3, t0
	ADDS t0, acc5
	UMULH x2, x3, acc6
	ADC $0, acc6

	MOVD $0, acc7
	// *2
	ADDS acc1, acc1
	ADCS acc2, acc2
	ADCS acc3, acc3
	ADCS acc4, acc4
	ADCS acc5, acc5
	ADCS acc6, acc6
	ADC $0, acc7
	// Missing products
	MUL x0, x0, acc0
	UMULH x0, x0, t0
	ADDS t0, acc1, acc1

	MUL x1, x1, t0
	ADCS t0, acc2, acc2
	UMULH x1, x1, t1
	ADCS t1, acc3, acc3

	MUL x2, x2, t0
	ADCS t0, acc4, acc4
	UMULH x2, x2, t1
	ADCS t1, acc5, acc5

	MUL x3, x3, t0
	ADCS t0, acc6, acc6
	UMULH x3, x3, t1
	ADCS t1, acc7, acc7

	// First reduction step
	// y0/y1 are free here and serve as scratch for the 32-bit shifts.
	LSL $32, acc0, y0
	LSR $32, acc0, y1

	SUBS y0, acc1
	SBCS y1, acc2
	SBCS y0, acc3
	SBC y1, acc0, y0

	ADDS acc0, acc1, acc1
	ADCS $0, acc2, acc2
	ADCS $0, acc3, acc3
	ADC $0, y0, acc0

	// Second reduction step
	LSL $32, acc1, y0
	LSR $32, acc1, y1

	SUBS y0, acc2
	SBCS y1, acc3
	SBCS y0, acc0
	SBC y1, acc1, y0

	ADDS acc1, acc2, acc2
	ADCS $0, acc3, acc3
	ADCS $0, acc0, acc0
	ADC $0, y0, acc1

	// Third reduction step
	LSL $32, acc2, y0
	LSR $32, acc2, y1

	SUBS y0, acc3
	SBCS y1, acc0
	SBCS y0, acc1
	SBC y1, acc2, y0

	ADDS acc2, acc3, acc3
	ADCS $0, acc0, acc0
	ADCS $0, acc1, acc1
	ADC $0, y0, acc2

	// Last reduction step
	LSL $32, acc3, y0
	LSR $32, acc3, y1

	SUBS y0, acc0
	SBCS y1, acc1
	SBCS y0, acc2
	SBC y1, acc3, y0

	ADDS acc3, acc0, acc0
	ADCS $0, acc1, acc1
	ADCS $0, acc2, acc2
	ADC $0, y0, acc3

	// Add bits [511:256] of the sqr result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Trial subtraction. acc5/acc6 take the upper difference limbs because
	// t2/t3 hold const2/const3, which must survive for the caller.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc5
	SBCS const3, acc3, acc6
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc5, acc2, y2
	CSEL CS, acc6, acc3, y3
	RET
/* ---------------------------------------*/
// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
// Multiplication interleaved with the same shift-based reduction steps as
// sm2P256SqrInternal, followed by one conditional subtraction of the value
// in const0..const3 (expected to hold p256p).
// In:      x3..x0, y3..y0, const3..const0
// Out:     y3..y0 (x3..x0 are left unchanged)
// Clobbers acc0..acc7, t0, t1, hlp0, flags
// y0/y1 (and acc6 in the y[1] round) double as scratch for high product
// halves once the corresponding operand limb has been consumed.
TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL y0, x0, acc0
	UMULH y0, x0, acc1

	MUL y0, x1, t0
	ADDS t0, acc1
	UMULH y0, x1, acc2

	MUL y0, x2, t0
	ADCS t0, acc2
	UMULH y0, x2, acc3

	MUL y0, x3, t0
	ADCS t0, acc3
	UMULH y0, x3, acc4
	ADC $0, acc4
	// First reduction step
	LSL $32, acc0, t0
	LSR $32, acc0, t1

	SUBS t0, acc1
	SBCS t1, acc2
	SBCS t0, acc3
	SBC t1, acc0, t0

	ADDS acc0, acc1, acc1
	ADCS $0, acc2, acc2
	ADCS $0, acc3, acc3
	ADC $0, t0, acc0

	// y[1] * x
	MUL y1, x0, t0
	ADDS t0, acc1
	UMULH y1, x0, t1

	MUL y1, x1, t0
	ADCS t0, acc2
	UMULH y1, x1, y0

	MUL y1, x2, t0
	ADCS t0, acc3
	UMULH y1, x2, acc6

	MUL y1, x3, t0
	ADCS t0, acc4
	UMULH y1, x3, hlp0
	ADC $0, ZR, acc5

	ADDS t1, acc2
	ADCS y0, acc3
	ADCS acc6, acc4
	ADC hlp0, acc5
	// Second reduction step
	LSL $32, acc1, t0
	LSR $32, acc1, t1

	SUBS t0, acc2
	SBCS t1, acc3
	SBCS t0, acc0
	SBC t1, acc1, t0

	ADDS acc1, acc2, acc2
	ADCS $0, acc3, acc3
	ADCS $0, acc0, acc0
	ADC $0, t0, acc1

	// y[2] * x
	MUL y2, x0, t0
	ADDS t0, acc2
	UMULH y2, x0, t1

	MUL y2, x1, t0
	ADCS t0, acc3
	UMULH y2, x1, y0

	MUL y2, x2, t0
	ADCS t0, acc4
	UMULH y2, x2, y1

	MUL y2, x3, t0
	ADCS t0, acc5
	UMULH y2, x3, hlp0
	ADC $0, ZR, acc6

	ADDS t1, acc3
	ADCS y0, acc4
	ADCS y1, acc5
	ADC hlp0, acc6
	// Third reduction step
	LSL $32, acc2, t0
	LSR $32, acc2, t1

	SUBS t0, acc3
	SBCS t1, acc0
	SBCS t0, acc1
	SBC t1, acc2, t0

	ADDS acc2, acc3, acc3
	ADCS $0, acc0, acc0
	ADCS $0, acc1, acc1
	ADC $0, t0, acc2

	// y[3] * x
	MUL y3, x0, t0
	ADDS t0, acc3
	UMULH y3, x0, t1

	MUL y3, x1, t0
	ADCS t0, acc4
	UMULH y3, x1, y0

	MUL y3, x2, t0
	ADCS t0, acc5
	UMULH y3, x2, y1

	MUL y3, x3, t0
	ADCS t0, acc6
	UMULH y3, x3, hlp0
	ADC $0, ZR, acc7

	ADDS t1, acc4
	ADCS y0, acc5
	ADCS y1, acc6
	ADC hlp0, acc7
	// Last reduction step
	LSL $32, acc3, t0
	LSR $32, acc3, t1

	SUBS t0, acc0
	SBCS t1, acc1
	SBCS t0, acc2
	SBC t1, acc3, t0

	ADDS acc3, acc0, acc0
	ADCS $0, acc1, acc1
	ADCS $0, acc2, acc2
	ADC $0, t0, acc3

	// Add bits [511:256] of the mul result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Trial subtraction; acc5/acc6 are used instead of t2/t3 so that
	// const2/const3 stay intact for the caller.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc5
	SBCS const3, acc3, acc6
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc5, acc2, y2
	CSEL CS, acc6, acc3, y3
	RET
/* ---------------------------------------*/
// (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
// Doubling modulo const0..const3: the sum is formed with its carry in hlp0,
// the modulus is subtracted over five limbs, and the unsubtracted sum is
// kept only if that subtraction borrows (CC).
// Clobbers t0, t1, acc5, acc6, hlp0, flags. y3..y0 are left unchanged.
#define p256MulBy2Inline \
	ADDS y0, y0, x0; \
	ADCS y1, y1, x1; \
	ADCS y2, y2, x2; \
	ADCS y3, y3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS const0, x0, t0; \
	SBCS const1, x1, t1;\
	SBCS const2, x2, acc5; \
	SBCS const3, x3, acc6;\
	SBCS $0, hlp0, hlp0;\
	CSEL CC, x0, t0, x0;\
	CSEL CC, x1, t1, x1;\
	CSEL CC, x2, acc5, x2;\
	CSEL CC, x3, acc6, x3;
/* ---------------------------------------*/
// Addressing helpers. Coordinates are 32 bytes apart: x at +0, y at +32,
// z at +64 from the point pointer. "1" is the point at a_ptr, "2" the point
// at b_ptr, "3" the output point at res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store a whole 256-bit operand; src is one of the addressing macros.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// 32-byte stack slots for the point-addition temporaries, starting at
// 8(RSP).
#define y2in(off) (32*0 + 8 + off)(RSP)
#define s2(off) (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off) (32*3 + 8 + off)(RSP)
#define r(off) (32*4 + 8 + off)(RSP)
#define hsqr(off) (32*5 + 8 + off)(RSP)
#define rsqr(off) (32*6 + 8 + off)(RSP)
#define hcub(off) (32*7 + 8 + off)(RSP)

#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
// Mixed addition res = in1 + (in2.x, +/- in2.y), followed by two masked
// overrides applied in this order:
//   zero == 0: res = (in2.x, the possibly negated in2.y, p256one)
//   sel  == 0: res = in1
// sign != 0 selects the negated y. The masks compare 32-bit lanes, so only
// the low 32 bits of sel and zero are examined. The arithmetic is always
// executed; sign, sel and zero only drive CSEL/VBIT selections.
//
// Frame: 264 bytes = eight 32-byte slots (y2in .. hcub) starting at 8(RSP).
// Vector registers: V0/V1 = x3, V2/V3 = y3, V4/V5 = z3, V12 = zero,
// V13 = mask(sel == 0), V14 = mask(zero == 0), V6-V11 = override data.
// hlp1 shares R0 with res_ptr, so res is fetched into t0 only at the end.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr
	MOVD sign+24(FP), hlp0
	MOVD sel+32(FP), hlp1
	MOVD zero+40(FP), t1

	VEOR V12.B16, V12.B16, V12.B16
	VDUP hlp1, V13.S4
	VCMEQ V12.S4, V13.S4, V13.S4
	VDUP t1, V14.S4
	VCMEQ V12.S4, V14.S4, V14.S4

	LDP p256p<>+0x00(SB), (const0, const1)
	LDP p256p<>+0x10(SB), (const2, const3)

	// Negate y2in based on sign
	LDP 2*16(b_ptr), (y0, y1)
	LDP 3*16(b_ptr), (y2, y3)

	// acc3..acc0 = p - y2; t0 = 0 without borrow, all-ones with borrow.
	SUBS y0, const0, acc0
	SBCS y1, const1, acc1
	SBCS y2, const2, acc2
	SBCS y3, const3, acc3
	SBC $0, ZR, t0

	// acc7..acc4 = (p - y2) + p; the carry is folded into t0.
	ADDS const0, acc0, acc4
	ADCS const1, acc1, acc5
	ADCS const2, acc2, acc6
	ADCS const3, acc3, acc7
	ADC $0, t0, t0

	// Take the "+ p" variant when the combined borrow/carry word is zero.
	CMP $0, t0
	CSEL EQ, acc4, acc0, acc0
	CSEL EQ, acc5, acc1, acc1
	CSEL EQ, acc6, acc2, acc2
	CSEL EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP $0, hlp0
	CSEL EQ, y0, acc0, y0
	CSEL EQ, y1, acc1, y1
	CSEL EQ, y2, acc2, y2
	CSEL EQ, y3, acc3, y3
	// Store result
	STy(y2in)

	// Begin point add
	LDx(z1in)
	CALL sm2P256SqrInternal<>(SB) // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL sm2P256MulInternal<>(SB) // x2 * z1^2

	LDx(x1in)
	CALL sm2P256Subinternal<>(SB) // h = u2 - u1
	STx(h)

	MOVD x0, y0
	MOVD x1, y1
	MOVD x2, y2
	MOVD x3, y3
	LDx(z1in)
	CALL sm2P256MulInternal<>(SB) // z3 = h * z1
	VMOV y0, V4.D[0] // save z3
	VMOV y1, V4.D[1]
	VMOV y2, V5.D[0]
	VMOV y3, V5.D[1]

	// x still holds z1 (the multiply leaves x unchanged).
	LDy(z1sqr)
	CALL sm2P256MulInternal<>(SB) // z1 ^ 3

	LDx(y2in)
	CALL sm2P256MulInternal<>(SB) // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL sm2P256Subinternal<>(SB) // r = s2 - s1
	STx(r)

	CALL sm2P256SqrInternal<>(SB) // rsqr = r^2
	STy (rsqr)

	LDx(h)
	CALL sm2P256SqrInternal<>(SB) // hsqr = h^2
	STy(hsqr)

	// x = h, y = h^2 at this point.
	CALL sm2P256MulInternal<>(SB) // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL sm2P256MulInternal<>(SB) // y1 * h^3
	// The s2 slot is reused for y1 * h^3.
	STy(s2)

	LDP hsqr(0*8), (x0, x1)
	LDP hsqr(2*8), (x2, x3)
	LDP 0*16(a_ptr), (y0, y1)
	LDP 1*16(a_ptr), (y2, y3)
	CALL sm2P256MulInternal<>(SB) // u1 * h^2
	// The h slot is reused for u1 * h^2.
	STP (y0, y1), h(0*8)
	STP (y2, y3), h(2*8)

	p256MulBy2Inline // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL sm2P256Subinternal<>(SB) // r^2 - u1 * h^2 * 2

	MOVD x0, y0
	MOVD x1, y1
	MOVD x2, y2
	MOVD x3, y3
	LDx(hcub)
	CALL sm2P256Subinternal<>(SB) // x3 = r^2 - 2 * u1 * h^2 - h^3
	VMOV x0, V0.D[0] // save x3
	VMOV x1, V0.D[1]
	VMOV x2, V1.D[0]
	VMOV x3, V1.D[1]

	LDP h(0*8), (y0, y1)
	LDP h(2*8), (y2, y3)
	CALL sm2P256Subinternal<>(SB) // u1 * h^2 - x3

	LDP r(0*8), (y0, y1)
	LDP r(2*8), (y2, y3)
	CALL sm2P256MulInternal<>(SB) // r * (u1 * h^2 - x3)

	LDP s2(0*8), (x0, x1)
	LDP s2(2*8), (x2, x3)
	CALL sm2P256Subinternal<>(SB) // y3 = r * (u1 * h^2 - x3) - y1 * h^3
	VMOV x0, V2.D[0] // save y3
	VMOV x1, V2.D[1]
	VMOV x2, V3.D[0]
	VMOV x3, V3.D[1]

	// If zero is 0, sets res = in2
	VLD1 (b_ptr), [V6.B16, V7.B16]
	// 8(RSP) is the y2in slot (the possibly negated y2).
	ADD $8, RSP, hlp1
	VLD1 (hlp1), [V8.B16, V9.B16]
	MOVD $p256one<>(SB), hlp1
	VLD1 (hlp1), [V10.B16, V11.B16]
	VBIT V14.B16, V6.B16, V0.B16
	VBIT V14.B16, V7.B16, V1.B16
	VBIT V14.B16, V8.B16, V2.B16
	VBIT V14.B16, V9.B16, V3.B16
	VBIT V14.B16, V10.B16, V4.B16
	VBIT V14.B16, V11.B16, V5.B16

	// If sel is 0, sets res = in1.
	VLD1.P (48)(a_ptr), [V6.B16, V7.B16, V8.B16]
	VLD1 (a_ptr), [V9.B16, V10.B16, V11.B16]
	VBIT V13.B16, V6.B16, V0.B16
	VBIT V13.B16, V7.B16, V1.B16
	VBIT V13.B16, V8.B16, V2.B16
	VBIT V13.B16, V9.B16, V3.B16
	VBIT V13.B16, V10.B16, V4.B16
	VBIT V13.B16, V11.B16, V5.B16

	MOVD res+0(FP), t0
	VST1.P [V0.B16, V1.B16, V2.B16], (48)(t0)
	VST1 [V3.B16, V4.B16, V5.B16], (t0)
	RET

// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
// Addition modulo const0..const3, same carry/trial-subtraction scheme as
// p256MulBy2Inline.
// Clobbers t0, t1, acc5, acc6, hlp0, flags. y3..y0 are left unchanged.
#define p256AddInline \
	ADDS y0, x0, x0; \
	ADCS y1, x1, x1; \
	ADCS y2, x2, x2; \
	ADCS y3, x3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS const0, x0, t0; \
	SBCS const1, x1, t1;\
	SBCS const2, x2, acc5; \
	SBCS const3, x3, acc6;\
	SBCS $0, hlp0, hlp0;\
	CSEL CC, x0, t0, x0;\
	CSEL CC, x1, t1, x1;\
	CSEL CC, x2, acc5, x2;\
	CSEL CC, x3, acc6, x3;

// 32-byte stack slots for the point-doubling temporaries, starting at
// 8(RSP). They overlay the first four point-addition slots.
#define s(off) (32*0 + 8 + off)(RSP)
#define m(off) (32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off) (32*3 + 8 + off)(RSP)

//func p256PointDoubleAsm(res, in *SM2P256Point)
// res = 2 * in, computed as:
//   m    = 3 * (x - z^2) * (x + z^2)
//   z3   = 2 * y * z
//   s    = x * (2y)^2
//   x3   = m^2 - 2s
//   y3   = m * (s - x3) - (2y)^4 / 2
// Frame: 136 bytes = four 32-byte slots (s, m, zsqr, tmp) at 8(RSP).
// Each coordinate of in is read for the last time before the same
// coordinate of res is written.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), a_ptr

	LDP p256p<>+0x00(SB), (const0, const1)
	LDP p256p<>+0x10(SB), (const2, const3)

	// Begin point double
	LDP 4*16(a_ptr), (x0, x1) // load z
	LDP 5*16(a_ptr), (x2, x3)
	CALL sm2P256SqrInternal<>(SB)
	STP (y0, y1), zsqr(0*8) // store z^2
	STP (y2, y3), zsqr(2*8)

	LDP 0*16(a_ptr), (x0, x1) // load x
	LDP 1*16(a_ptr), (x2, x3)
	p256AddInline // x + z^2
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL sm2P256MulInternal<>(SB) // y * z
	p256MulBy2Inline
	STx(z3out) // z3 = 2 * y * z

	LDy(x1in)
	LDx(zsqr)
	CALL sm2P256Subinternal<>(SB) // x - z^2
	LDy(m)
	CALL sm2P256MulInternal<>(SB) // (x - z^2) * (x + z^2)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)

	LDy(y1in)
	p256MulBy2Inline // 2y
	CALL sm2P256SqrInternal<>(SB) // (2y)^2
	STy(s)
	MOVD y0, x0
	MOVD y1, x1
	MOVD y2, x2
	MOVD y3, x3
	CALL sm2P256SqrInternal<>(SB) // (2y)^4

	// Divide by 2
	// An odd value is first made even by adding the modulus; hlp0 keeps
	// the carry of that addition as bit 256.
	ADDS const0, y0, t0
	ADCS const1, y1, t1
	ADCS const2, y2, acc5
	ADCS const3, y3, acc6
	ADC $0, ZR, hlp0

	// Even input (EQ): use the value as is. The AND clears hlp0 in that case.
	ANDS $1, y0, ZR
	CSEL EQ, y0, t0, t0
	CSEL EQ, y1, t1, t1
	CSEL EQ, y2, acc5, acc5
	CSEL EQ, y3, acc6, acc6
	AND y0, hlp0, hlp0

	// 257-bit right shift by one.
	EXTR $1, t0, t1, y0
	EXTR $1, t1, acc5, y1
	EXTR $1, acc5, acc6, y2
	EXTR $1, acc6, hlp0, y3
	// y3out temporarily holds (2y)^4 / 2.
	STy(y3out)

	LDx(x1in)
	LDy(s)
	CALL sm2P256MulInternal<>(SB) // s = x * (2y)^2
	STy(s)
	p256MulBy2Inline // 2s
	STx(tmp)

	LDx(m)
	CALL sm2P256SqrInternal<>(SB) // m^2
	LDx(tmp)
	CALL sm2P256Subinternal<>(SB) // x3 = m^2 - 2s

	STx(x3out)

	LDy(s)
	CALL sm2P256Subinternal<>(SB) // s - x3

	LDy(m)
	CALL sm2P256MulInternal<>(SB) // m * (s - x3)

	LDx(y3out)
	CALL sm2P256Subinternal<>(SB) // y3 = m * (s - x3) - (2y)^4 / 2
	STx(y3out)
	RET

// One doubling round performed in place on the point at res_ptr: the same
// sequence as p256PointDoubleAsm, but reading x/y/z through the
// x3out/y3out/z3out macros. Expects const0..const3 and res_ptr to be set
// up, and uses the s/m/zsqr/tmp stack slots.
#define p256PointDoubleRound() \
	LDx(z3out) \ // load z
	CALL sm2P256SqrInternal<>(SB) \
	STP (y0, y1), zsqr(0*8) \ // store z^2
	STP (y2, y3), zsqr(2*8) \
	\
	LDx(x3out) \// load x
	p256AddInline \
	STx(m) \
	\
	LDx(z3out) \ // load z
	LDy(y3out) \ // load y
	CALL sm2P256MulInternal<>(SB) \
	p256MulBy2Inline \
	STx(z3out) \ // store result z
	\
	LDy(x3out) \ // load x
	LDx(zsqr) \
	CALL sm2P256Subinternal<>(SB) \
	LDy(m) \
	CALL sm2P256MulInternal<>(SB) \
	\
	\// Multiply by 3
	p256MulBy2Inline \
	p256AddInline \
	STx(m) \
	\
	LDy(y3out) \ // load y
	p256MulBy2Inline \
	CALL sm2P256SqrInternal<>(SB) \
	STy(s) \
	MOVD y0, x0 \
	MOVD y1, x1 \
	MOVD y2, x2 \
	MOVD y3, x3 \
	CALL sm2P256SqrInternal<>(SB) \
	\
	\// Divide by 2
	ADDS const0, y0, t0 \
	ADCS const1, y1, t1 \
	ADCS const2, y2, acc5 \
	ADCS const3, y3, acc6 \
	ADC $0, ZR, hlp0 \
	\
	ANDS $1, y0, ZR \
	CSEL EQ, y0, t0, t0 \
	CSEL EQ, y1, t1, t1 \
	CSEL EQ, y2, acc5, acc5 \
	CSEL EQ, y3, acc6, acc6 \
	AND y0, hlp0, hlp0 \
	\
	EXTR $1, t0, t1, y0 \
	EXTR $1, t1, acc5, y1 \
	EXTR $1, acc5, acc6, y2 \
	EXTR $1, acc6, hlp0, y3 \
	STy(y3out) \
	\
	LDx(x3out) \ // load x
	LDy(s) \
	CALL sm2P256MulInternal<>(SB) \
	STy(s) \
	p256MulBy2Inline \
	STx(tmp) \
	\
	LDx(m) \
	CALL sm2P256SqrInternal<>(SB) \
	LDx(tmp) \
	CALL sm2P256Subinternal<>(SB) \
	\
	STx(x3out) \
	\
	LDy(s) \
	CALL sm2P256Subinternal<>(SB) \
	\
	LDy(m) \
	CALL sm2P256MulInternal<>(SB) \
	\
	LDx(y3out) \
	CALL sm2P256Subinternal<>(SB) \
	STx(y3out) \

//func p256PointDouble6TimesAsm(res, in *SM2P256Point)
TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$136-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), a_ptr

	LDP p256p<>+0x00(SB), (const0, const1)
	LDP p256p<>+0x10(SB), (const2, const3)

	// Begin point double round 1
	LDP 4*16(a_ptr), (x0, x1) // load z
	LDP 5*16(a_ptr), (x2, x3)
	CALL sm2P256SqrInternal<>(SB)
	STP (y0, y1), zsqr(0*8) // store z^2
	STP (y2, y3), zsqr(2*8)

	LDP 0*16(a_ptr), (x0, x1) // load x
	LDP 1*16(a_ptr), (x2, x3)
	p256AddInline
	STx(m)

	LDx(z1in) // load z
	LDy(y1in) // load y
	CALL sm2P256MulInternal<>(SB)
	p256MulBy2Inline
	STx(z3out) // store result z

	LDy(x1in) // load x
	LDx(zsqr)
	CALL sm2P256Subinternal<>(SB)
	LDy(m)
	CALL sm2P256MulInternal<>(SB)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)

	LDy(y1in) // load y
	p256MulBy2Inline
	CALL sm2P256SqrInternal<>(SB)
	STy(s)
	MOVD y0, x0
	MOVD y1, x1
	MOVD y2, x2
	MOVD y3, x3
	CALL sm2P256SqrInternal<>(SB)

	// Divide
by 2 1560 ADDS const0, y0, t0 1561 ADCS const1, y1, t1 1562 ADCS const2, y2, acc5 1563 ADCS const3, y3, acc6 1564 ADC $0, ZR, hlp0 1565 1566 ANDS $1, y0, ZR 1567 CSEL EQ, y0, t0, t0 1568 CSEL EQ, y1, t1, t1 1569 CSEL EQ, y2, acc5, acc5 1570 CSEL EQ, y3, acc6, acc6 1571 AND y0, hlp0, hlp0 1572 1573 EXTR $1, t0, t1, y0 1574 EXTR $1, t1, acc5, y1 1575 EXTR $1, acc5, acc6, y2 1576 EXTR $1, acc6, hlp0, y3 1577 STy(y3out) 1578 1579 LDx(x1in) // load x 1580 LDy(s) 1581 CALL sm2P256MulInternal<>(SB) 1582 STy(s) 1583 p256MulBy2Inline 1584 STx(tmp) 1585 1586 LDx(m) 1587 CALL sm2P256SqrInternal<>(SB) 1588 LDx(tmp) 1589 CALL sm2P256Subinternal<>(SB) 1590 1591 STx(x3out) 1592 1593 LDy(s) 1594 CALL sm2P256Subinternal<>(SB) 1595 1596 LDy(m) 1597 CALL sm2P256MulInternal<>(SB) 1598 1599 LDx(y3out) 1600 CALL sm2P256Subinternal<>(SB) 1601 STx(y3out) 1602 1603 // Begin point double rounds 2 - 6 1604 p256PointDoubleRound() 1605 p256PointDoubleRound() 1606 p256PointDoubleRound() 1607 p256PointDoubleRound() 1608 p256PointDoubleRound() 1609 1610 RET 1611 1612 /* ---------------------------------------*/ 1613 #undef y2in 1614 #undef x3out 1615 #undef y3out 1616 #undef z3out 1617 #define y2in(off) (off + 32)(b_ptr) 1618 #define x3out(off) (off)(b_ptr) 1619 #define y3out(off) (off + 32)(b_ptr) 1620 #define z3out(off) (off + 64)(b_ptr) 1621 // func p256PointAddAsm(res, in1, in2 *SM2P256Point) int 1622 TEXT ·p256PointAddAsm(SB),0,$392-32 1623 // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl 1624 // Move input to stack in order to free registers 1625 MOVD in1+8(FP), a_ptr 1626 MOVD in2+16(FP), b_ptr 1627 1628 LDP p256p<>+0x00(SB), (const0, const1) 1629 LDP p256p<>+0x10(SB), (const2, const3) 1630 1631 // Begin point add 1632 LDx(z2in) 1633 CALL sm2P256SqrInternal<>(SB) // z2^2 1634 STy(z2sqr) 1635 1636 CALL sm2P256MulInternal<>(SB) // z2^3 1637 1638 LDx(y1in) 1639 CALL sm2P256MulInternal<>(SB) // s1 = z2ˆ3*y1 1640 STy(s1) 1641 1642 LDx(z1in) 1643 CALL 
sm2P256SqrInternal<>(SB) // z1^2 1644 STy(z1sqr) 1645 1646 CALL sm2P256MulInternal<>(SB) // z1^3 1647 1648 LDx(y2in) 1649 CALL sm2P256MulInternal<>(SB) // s2 = z1ˆ3*y2 1650 1651 LDx(s1) 1652 CALL sm2P256Subinternal<>(SB) // r = s2 - s1 1653 STx(r) 1654 1655 MOVD $1, acc1 1656 ORR x0, x1, acc2 // Check if zero mod p256 1657 ORR x2, x3, acc3 1658 ORR acc3, acc2, acc2 1659 CMP $0, acc2 1660 CSEL EQ, acc1, ZR, hlp1 1661 1662 EOR const0, x0, acc2 1663 EOR const1, x1, acc3 1664 EOR const2, x2, acc4 1665 EOR const3, x3, acc5 1666 1667 ORR acc2, acc3, acc2 1668 ORR acc4, acc5, acc3 1669 ORR acc3, acc2, acc2 1670 CMP $0, acc2 1671 CSEL EQ, acc1, hlp1, hlp1 1672 1673 LDx(z2sqr) 1674 LDy(x1in) 1675 CALL sm2P256MulInternal<>(SB) // u1 = x1 * z2ˆ2 1676 STy(u1) 1677 1678 LDx(z1sqr) 1679 LDy(x2in) 1680 CALL sm2P256MulInternal<>(SB) // u2 = x2 * z1ˆ2 1681 STy(u2) 1682 1683 LDx(u1) 1684 CALL sm2P256Subinternal<>(SB) // h = u2 - u1 1685 STx(h) 1686 1687 MOVD $1, acc1 1688 ORR x0, x1, acc2 // Check if zero mod p256 1689 ORR x2, x3, acc3 1690 ORR acc3, acc2, acc2 1691 CMP $0, acc2 1692 CSEL EQ, acc1, ZR, hlp0 1693 1694 EOR const0, x0, acc2 1695 EOR const1, x1, acc3 1696 EOR const2, x2, acc4 1697 EOR const3, x3, acc5 1698 1699 ORR acc2, acc3, acc2 1700 ORR acc4, acc5, acc3 1701 ORR acc3, acc2, acc2 1702 CMP $0, acc2 1703 CSEL EQ, acc1, hlp0, hlp0 1704 1705 AND hlp0, hlp1, hlp1 1706 1707 LDx(r) 1708 CALL sm2P256SqrInternal<>(SB) // rsqr = rˆ2 1709 STy(rsqr) 1710 1711 LDx(h) 1712 CALL sm2P256SqrInternal<>(SB) // hsqr = hˆ2 1713 STy(hsqr) 1714 1715 LDx(h) 1716 CALL sm2P256MulInternal<>(SB) // hcub = hˆ3 1717 STy(hcub) 1718 1719 LDx(s1) 1720 CALL sm2P256MulInternal<>(SB) 1721 STy(s2) 1722 1723 LDx(z1in) 1724 LDy(z2in) 1725 CALL sm2P256MulInternal<>(SB) // z1 * z2 1726 LDx(h) 1727 CALL sm2P256MulInternal<>(SB) // z1 * z2 * h 1728 MOVD res+0(FP), b_ptr 1729 STy(z3out) 1730 1731 LDx(hsqr) 1732 LDy(u1) 1733 CALL sm2P256MulInternal<>(SB) // hˆ2 * u1 1734 STy(u2) 1735 1736 p256MulBy2Inline // u1 
* hˆ2 * 2, inline 1737 LDy(rsqr) 1738 CALL sm2P256Subinternal<>(SB) // rˆ2 - u1 * hˆ2 * 2 1739 1740 MOVD x0, y0 1741 MOVD x1, y1 1742 MOVD x2, y2 1743 MOVD x3, y3 1744 LDx(hcub) 1745 CALL sm2P256Subinternal<>(SB) 1746 STx(x3out) 1747 1748 LDy(u2) 1749 CALL sm2P256Subinternal<>(SB) 1750 1751 LDy(r) 1752 CALL sm2P256MulInternal<>(SB) 1753 1754 LDx(s2) 1755 CALL sm2P256Subinternal<>(SB) 1756 STx(y3out) 1757 1758 MOVD hlp1, R0 1759 MOVD R0, ret+24(FP) 1760 1761 RET