// github.com/cloudflare/circl@v1.5.0/dh/sidh/internal/p503/arith_arm64.s

// +build arm64,!purego

#include "textflag.h"

TEXT ·cmovP503(SB), NOSPLIT, $0-17
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MOVB	choice+16(FP), R2

	// Set flags
	// If choice is not 0 or 1, this implementation will copy y to x completely
	CMP	$0, R2

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 0(R0)

	LDP	16(R0), (R3, R4)
	LDP	16(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 16(R0)

	LDP	32(R0), (R3, R4)
	LDP	32(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 32(R0)

	LDP	48(R0), (R3, R4)
	LDP	48(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 48(R0)

	RET

TEXT ·cswapP503(SB), NOSPLIT, $0-17
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MOVB	choice+16(FP), R2

	// Set flags
	// If choice is not 0 or 1, this implementation will swap completely
	CMP	$0, R2

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 0(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 0(R1)

	LDP	16(R0), (R3, R4)
	LDP	16(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 16(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 16(R1)

	LDP	32(R0), (R3, R4)
	LDP	32(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 32(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 32(R1)

	LDP	48(R0), (R3, R4)
	LDP	48(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 48(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 48(R1)

	RET

TEXT ·addP503(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Load first summand into R3-R10
	// Add first summand and second summand and store result in R3-R10
	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R11, R12)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R13, R14)
	ADDS	R11, R3
	ADCS	R12, R4
	ADCS	R13, R5
	ADCS	R14, R6

	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R11, R12)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R13, R14)
	ADCS	R11, R7
	ADCS	R12, R8
	ADCS	R13, R9
	ADC	R14, R10

	// Subtract 2 * p503 in R11-R17 from the result in R3-R10
	LDP	·P503x2+0(SB), (R11, R12)
	LDP	·P503x2+24(SB), (R13, R14)
	SUBS	R11, R3
	SBCS	R12, R4
	LDP	·P503x2+40(SB), (R15, R16)
	SBCS	R12, R5
	SBCS	R13, R6
	MOVD	·P503x2+56(SB), R17
	SBCS	R14, R7
	SBCS	R15, R8
	SBCS	R16, R9
	SBCS	R17, R10
	SBC	ZR, ZR, R19

	// If x + y - 2 * p503 < 0, R19 is all-ones and 2 * p503 should be added
	AND	R19, R11
	AND	R19, R12
	AND	R19, R13
	AND	R19, R14
	AND	R19, R15
	AND	R19, R16
	AND	R19, R17

	ADDS	R11, R3
	ADCS	R12, R4
	STP	(R3, R4), 0(R2)
	ADCS	R12, R5
	ADCS	R13, R6
	STP	(R5, R6), 16(R2)
	ADCS	R14, R7
	ADCS	R15, R8
	STP	(R7, R8), 32(R2)
	ADCS	R16, R9
	ADC	R17, R10
	STP	(R9, R10), 48(R2)

	RET

TEXT ·subP503(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Load x into R3-R10
	// Subtract y from x and store result in R3-R10
	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R11, R12)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R13, R14)
	SUBS	R11, R3
	SBCS	R12, R4
	SBCS	R13, R5
	SBCS	R14, R6
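	// Continue the borrow chain through the upper four limbs of x and y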
	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R11, R12)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R13, R14)
	SBCS	R11, R7
	SBCS	R12, R8
	SBCS	R13, R9
	SBCS	R14, R10
	SBC	ZR, ZR, R19

	// If x - y < 0, R19 is all-ones and 2 * p503 should be added
	LDP	·P503x2+0(SB), (R11, R12)
	LDP	·P503x2+24(SB), (R13, R14)
	AND	R19, R11
	AND	R19, R12
	LDP	·P503x2+40(SB), (R15, R16)
	AND	R19, R13
	AND	R19, R14
	MOVD	·P503x2+56(SB), R17
	AND	R19, R15
	AND	R19, R16
	AND	R19, R17

	ADDS	R11, R3
	ADCS	R12, R4
	STP	(R3, R4), 0(R2)
	ADCS	R12, R5
	ADCS	R13, R6
	STP	(R5, R6), 16(R2)
	ADCS	R14, R7
	ADCS	R15, R8
	STP	(R7, R8), 32(R2)
	ADCS	R16, R9
	ADC	R17, R10
	STP	(R9, R10), 48(R2)

	RET

TEXT ·adlP503(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R11, R12)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R13, R14)
	ADDS	R11, R3
	ADCS	R12, R4
	STP	(R3, R4), 0(R2)
	ADCS	R13, R5
	ADCS	R14, R6
	STP	(R5, R6), 16(R2)

	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R11, R12)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R13, R14)
	ADCS	R11, R7
	ADCS	R12, R8
	STP	(R7, R8), 32(R2)
	ADCS	R13, R9
	ADCS	R14, R10
	STP	(R9, R10), 48(R2)

	LDP	64(R0), (R3, R4)
	LDP	64(R1), (R11, R12)
	LDP	80(R0), (R5, R6)
	LDP	80(R1), (R13, R14)
	ADCS	R11, R3
	ADCS	R12, R4
	STP	(R3, R4), 64(R2)
	ADCS	R13, R5
	ADCS	R14, R6
	STP	(R5, R6), 80(R2)

	LDP	96(R0), (R7, R8)
	LDP	96(R1), (R11, R12)
	LDP	112(R0), (R9, R10)
	LDP	112(R1), (R13, R14)
	ADCS	R11, R7
	ADCS	R12, R8
	STP	(R7, R8), 96(R2)
	ADCS	R13, R9
	ADC	R14, R10
	STP	(R9, R10), 112(R2)

	RET

TEXT ·sulP503(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R11, R12)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R13, R14)
	SUBS	R11, R3
	SBCS	R12, R4
	STP	(R3, R4), 0(R2)
	SBCS	R13, R5
	SBCS	R14, R6
	STP	(R5, R6), 16(R2)

	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R11, R12)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R13, R14)
	SBCS	R11, R7
	SBCS	R12, R8
	STP	(R7, R8), 32(R2)
	SBCS	R13, R9
	SBCS	R14, R10
	STP	(R9, R10), 48(R2)

	LDP	64(R0), (R3, R4)
	LDP	64(R1), (R11, R12)
	LDP	80(R0), (R5, R6)
	LDP	80(R1), (R13, R14)
	SBCS	R11, R3
	SBCS	R12, R4
	SBCS	R13, R5
	SBCS	R14, R6

	LDP	96(R0), (R7, R8)
	LDP	96(R1), (R11, R12)
	LDP	112(R0), (R9, R10)
	LDP	112(R1), (R13, R14)
	SBCS	R11, R7
	SBCS	R12, R8
	SBCS	R13, R9
	SBCS	R14, R10
	SBC	ZR, ZR, R15

	// If x - y < 0, R15 is all-ones and p503 should be added
	LDP	·P503+16(SB), (R16, R17)
	LDP	·P503+32(SB), (R19, R20)
	AND	R15, R16
	AND	R15, R17
	LDP	·P503+48(SB), (R21, R22)
	AND	R15, R19
	AND	R15, R20
	AND	R15, R21
	AND	R15, R22

	ADDS	R16, R3
	ADCS	R16, R4
	STP	(R3, R4), 64(R2)
	ADCS	R16, R5
	ADCS	R17, R6
	STP	(R5, R6), 80(R2)
	ADCS	R19, R7
	ADCS	R20, R8
	STP	(R7, R8), 96(R2)
	ADCS	R21, R9
	ADC	R22, R10
	STP	(R9, R10), 112(R2)

	RET

// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
// Z0 is not actually touched
// Result of (X0-X1) * (Y0-Y1) will be in Z0-Z3
// Inputs get overwritten, except for X1
#define mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, Z2, Z3, T0) \
	MUL	X1, Y0, X0	\
	UMULH	X1, Y0, Y0	\
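	\ // fold the high half of X0*Y0 (Z3) into the X0*Y1 partial product (Z1,Z2)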
	ADDS	Z3, Z1		\
	ADC	ZR, Z2		\
	\
	MUL	Y1, X1, T0	\
	UMULH	Y1, X1, Y1	\
	ADDS	X0, Z1		\
	ADCS	Y0, Z2		\
	ADC	ZR, ZR, Z3	\
	\
	ADDS	T0, Z2		\
	ADC	Y1, Z3

// Expects that X points to (X0-X1)
// Result of (X0-X3) * (Y0-Y3) will be in Z0-Z7
// Inputs get overwritten, except X2-X3 and Y2-Y3
#define mul256x256karatsuba(X, X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1)\
	ADDS	X2, X0		\ // xH + xL, destroys xL
	ADCS	X3, X1		\
	ADCS	ZR, ZR, T0	\
	\
	ADDS	Y2, Y0, Z6	\ // yH + yL
	ADCS	Y3, Y1, T1	\
	ADC	ZR, ZR, Z7	\
	\
	SUB	T0, ZR, Z2	\
	SUB	Z7, ZR, Z3	\
	AND	Z7, T0		\ // combined carry
	\
	AND	Z2, Z6, Z0	\ // masked(yH + yL)
	AND	Z2, T1, Z1	\
	\
	AND	Z3, X0, Z4	\ // masked(xH + xL)
	AND	Z3, X1, Z5	\
	\
	MUL	Z6, X0, Z2	\
	MUL	T1, X0, Z3	\
	\
	ADDS	Z4, Z0		\
	UMULH	T1, X0, Z4	\
	ADCS	Z5, Z1		\
	UMULH	Z6, X0, Z5	\
	ADC	ZR, T0		\
	\ // (xH + xL) * (yH + yL)
	mul128x128comba(X0, X1, Z6, T1, Z2, Z3, Z4, Z5, Z7)\
	\
	LDP	0+X, (X0, X1)	\
	\
	ADDS	Z0, Z4		\
	UMULH	Y0, X0, Z7	\
	UMULH	Y1, X0, T1	\
	ADCS	Z1, Z5		\
	MUL	Y0, X0, Z0	\
	MUL	Y1, X0, Z1	\
	ADC	ZR, T0		\
	\ // xL * yL
	mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, T1, Z7, Z6)\
	\
	MUL	Y2, X2, X0	\
	UMULH	Y2, X2, Y0	\
	SUBS	Z0, Z2		\ // (xH + xL) * (yH + yL) - xL * yL
	SBCS	Z1, Z3		\
	SBCS	T1, Z4		\
	MUL	Y3, X2, X1	\
	UMULH	Y3, X2, Z6	\
	SBCS	Z7, Z5		\
	SBCS	ZR, T0		\
	\ // xH * yH
	mul128x128comba(X2, X3, Y2, Y3, X0, X1, Z6, Y0, Y1)\
	\
	SUBS	X0, Z2		\ // (xH + xL) * (yH + yL) - xL * yL - xH * yH
	SBCS	X1, Z3		\
	SBCS	Z6, Z4		\
	SBCS	Y0, Z5		\
	SBCS	ZR, T0		\
	\
	ADDS	T1, Z2		\ // (xH * yH) * 2^256 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^128 + xL * yL
	ADCS	Z7, Z3		\
	ADCS	X0, Z4		\
	ADCS	X1, Z5		\
	ADCS	T0, Z6		\
	ADC	Y0, ZR, Z7

// This implements two-level Karatsuba with a 128x128 Comba multiplier
// at the bottom
TEXT ·mulP503(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Load xL in R3-R6, xH in R7-R10
	// (xH + xL) in R25-R29
	LDP	0(R0), (R3, R4)
	LDP	32(R0), (R7, R8)
	ADDS	R3, R7, R25
	ADCS	R4, R8, R26
	LDP	16(R0), (R5, R6)
	LDP	48(R0), (R9, R10)
	ADCS	R5, R9, R27
	ADCS	R6, R10, R29
	ADC	ZR, ZR, R7

	// Load yL in R11-R14, yH in R15-R19
	// (yH + yL) in R11-R14, destroys yL
	LDP	0(R1), (R11, R12)
	LDP	32(R1), (R15, R16)
	ADDS	R15, R11
	ADCS	R16, R12
	LDP	16(R1), (R13, R14)
	LDP	48(R1), (R17, R19)
	ADCS	R17, R13
	ADCS	R19, R14
	ADC	ZR, ZR, R8

	// Compute masks and combined carry
	SUB	R7, ZR, R9
	SUB	R8, ZR, R10
	AND	R8, R7

	// masked(yH + yL)
	AND	R9, R11, R15
	AND	R9, R12, R16
	AND	R9, R13, R17
	AND	R9, R14, R19

	// masked(xH + xL)
	AND	R10, R25, R20
	AND	R10, R26, R21
	AND	R10, R27, R22
	AND	R10, R29, R23

	// masked(xH + xL) + masked(yH + yL) in R15-R19
	ADDS	R20, R15
	ADCS	R21, R16
	ADCS	R22, R17
	ADCS	R23, R19
	ADC	ZR, R7

	// Use z as temporary storage
	STP	(R25, R26), 0(R2)

	// (xH + xL) * (yH + yL)
	mul256x256karatsuba(0(R2), R25, R26, R27, R29, R11, R12, R13, R14, R8, R9, R10, R20, R21, R22, R23, R24, R0, R1)

	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	ADDS	R21, R15
	ADCS	R22, R16
	ADCS	R23, R17
	ADCS	R24, R19
	ADC	ZR, R7
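	// (xH + xL) * (yH + yL) is now in R8-R10,R20 (low) and R15-R19 (high), with the top bits in R7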
	// Load yL in R11-R14
	LDP	0(R1), (R11, R12)
	LDP	16(R1), (R13, R14)

	// xL * yL
	mul256x256karatsuba(0(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R1, R2)

	MOVD	z+0(FP), R2
	MOVD	y+16(FP), R1

	// (xH + xL) * (yH + yL) - xL * yL
	SUBS	R21, R8
	SBCS	R22, R9
	STP	(R21, R22), 0(R2)
	SBCS	R23, R10
	SBCS	R24, R20
	STP	(R23, R24), 16(R2)
	SBCS	R25, R15
	SBCS	R26, R16
	SBCS	R27, R17
	SBCS	R29, R19
	SBC	ZR, R7

	// Load xH in R3-R6, yH in R11-R14
	LDP	32(R0), (R3, R4)
	LDP	48(R0), (R5, R6)
	LDP	32(R1), (R11, R12)
	LDP	48(R1), (R13, R14)

	ADDS	R25, R8
	ADCS	R26, R9
	ADCS	R27, R10
	ADCS	R29, R20
	ADC	ZR, ZR, R1

	MOVD	R20, 32(R2)

	// xH * yH
	mul256x256karatsuba(32(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R2, R20)
	NEG	R1, R1

	MOVD	z+0(FP), R2
	MOVD	32(R2), R20

	// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R8-R10,R20,R15-R19
	// Store lower half in z, that's done
	SUBS	R21, R8
	SBCS	R22, R9
	STP	(R8, R9), 32(R2)
	SBCS	R23, R10
	SBCS	R24, R20
	STP	(R10, R20), 48(R2)
	SBCS	R25, R15
	SBCS	R26, R16
	SBCS	R27, R17
	SBCS	R29, R19
	SBC	ZR, R7

	// (xH * yH) * 2^512 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^256 + xL * yL
	// Store remaining limbs in z
	ADDS	$1, R1
	ADCS	R21, R15
	ADCS	R22, R16
	STP	(R15, R16), 64(R2)
	ADCS	R23, R17
	ADCS	R24, R19
	STP	(R17, R19), 80(R2)
	ADCS	R7, R25
	ADCS	ZR, R26
	STP	(R25, R26), 96(R2)
	ADCS	ZR, R27
	ADC	ZR, R29
	STP	(R27, R29), 112(R2)

	RET

// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
// Z0 is not actually touched
// Result of (X0-X1) * (Y0-Y3) will be in Z0-Z5
// Inputs remain intact
#define mul128x256comba(X0, X1, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3)\
	MUL	X1, Y0, T0	\
	UMULH	X1, Y0, T1	\
	ADDS	Z3, Z1		\
	ADC	ZR, Z2		\
	\
	MUL	X0, Y2, T2	\
	UMULH	X0, Y2, T3	\
	ADDS	T0, Z1		\
	ADCS	T1, Z2		\
	ADC	ZR, ZR, Z3	\
	\
	MUL	X1, Y1, T0	\
	UMULH	X1, Y1, T1	\
	ADDS	T2, Z2		\
	ADCS	T3, Z3		\
	ADC	ZR, ZR, Z4	\
	\
	MUL	X0, Y3, T2	\
	UMULH	X0, Y3, T3	\
	ADDS	T0, Z2		\
	ADCS	T1, Z3		\
	ADC	ZR, Z4		\
	\
	MUL	X1, Y2, T0	\
	UMULH	X1, Y2, T1	\
	ADDS	T2, Z3		\
	ADCS	T3, Z4		\
	ADC	ZR, ZR, Z5	\
	\
	MUL	X1, Y3, T2	\
	UMULH	X1, Y3, T3	\
	ADDS	T0, Z3		\
	ADCS	T1, Z4		\
	ADC	ZR, Z5		\
	ADDS	T2, Z4		\
	ADC	T3, Z5

// This implements the shifted 2^(B*w) Montgomery reduction from
// https://eprint.iacr.org/2016/986.pdf with B = 4, w = 64
TEXT ·rdcP503(SB), NOSPLIT, $0-16
	MOVD	x+8(FP), R0

	// Load x0-x1
	LDP	0(R0), (R2, R3)

	// Load the prime constant in R25-R29
	LDP	·P503p1s8+32(SB), (R25, R26)
	LDP	·P503p1s8+48(SB), (R27, R29)

	// [x0,x1] * p503p1s8 to R4-R9
	MUL	R2, R25, R4	// x0 * p503p1s8[0]
	UMULH	R2, R25, R7
	MUL	R2, R26, R5	// x0 * p503p1s8[1]
	UMULH	R2, R26, R6

	mul128x256comba(R2, R3, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13)

	LDP	16(R0), (R3, R11)	// x2
	LDP	32(R0), (R12, R13)
	LDP	48(R0), (R14, R15)

	// Left-shift result in R4-R9 by 56 to R4-R10
	ORR	R9>>8, ZR, R10
	LSL	$56, R9
	ORR	R8>>8, R9
	LSL	$56, R8
	ORR	R7>>8, R8
	LSL	$56, R7
	ORR	R6>>8, R7
	LSL	$56, R6
	ORR	R5>>8, R6
	LSL	$56, R5
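	// Fold the top 56 bits of R4 into R5 and finish the left-shift by 56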
	ORR	R4>>8, R5
	LSL	$56, R4

	ADDS	R4, R11		// x3
	ADCS	R5, R12		// x4
	ADCS	R6, R13
	ADCS	R7, R14
	ADCS	R8, R15
	LDP	64(R0), (R16, R17)
	LDP	80(R0), (R19, R20)
	MUL	R3, R25, R4	// x2 * p503p1s8[0]
	UMULH	R3, R25, R7
	ADCS	R9, R16
	ADCS	R10, R17
	ADCS	ZR, R19
	ADCS	ZR, R20
	LDP	96(R0), (R21, R22)
	LDP	112(R0), (R23, R24)
	MUL	R3, R26, R5	// x2 * p503p1s8[1]
	UMULH	R3, R26, R6
	ADCS	ZR, R21
	ADCS	ZR, R22
	ADCS	ZR, R23
	ADC	ZR, R24

	// [x2,x3] * p503p1s8 to R4-R9
	mul128x256comba(R3, R11, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2)

	ORR	R9>>8, ZR, R10
	LSL	$56, R9
	ORR	R8>>8, R9
	LSL	$56, R8
	ORR	R7>>8, R8
	LSL	$56, R7
	ORR	R6>>8, R7
	LSL	$56, R6
	ORR	R5>>8, R6
	LSL	$56, R5
	ORR	R4>>8, R5
	LSL	$56, R4

	ADDS	R4, R13		// x5
	ADCS	R5, R14		// x6
	ADCS	R6, R15
	ADCS	R7, R16
	MUL	R12, R25, R4	// x4 * p503p1s8[0]
	UMULH	R12, R25, R7
	ADCS	R8, R17
	ADCS	R9, R19
	ADCS	R10, R20
	ADCS	ZR, R21
	MUL	R12, R26, R5	// x4 * p503p1s8[1]
	UMULH	R12, R26, R6
	ADCS	ZR, R22
	ADCS	ZR, R23
	ADC	ZR, R24

	// [x4,x5] * p503p1s8 to R4-R9
	mul128x256comba(R12, R13, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2)

	ORR	R9>>8, ZR, R10
	LSL	$56, R9
	ORR	R8>>8, R9
	LSL	$56, R8
	ORR	R7>>8, R8
	LSL	$56, R7
	ORR	R6>>8, R7
	LSL	$56, R6
	ORR	R5>>8, R6
	LSL	$56, R5
	ORR	R4>>8, R5
	LSL	$56, R4

	ADDS	R4, R15		// x7
	ADCS	R5, R16		// x8
	ADCS	R6, R17
	ADCS	R7, R19
	MUL	R14, R25, R4	// x6 * p503p1s8[0]
	UMULH	R14, R25, R7
	ADCS	R8, R20
	ADCS	R9, R21
	ADCS	R10, R22
	MUL	R14, R26, R5	// x6 * p503p1s8[1]
	UMULH	R14, R26, R6
	ADCS	ZR, R23
	ADC	ZR, R24

	// [x6,x7] * p503p1s8 to R4-R9
	mul128x256comba(R14, R15, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2)

	ORR	R9>>8, ZR, R10
	LSL	$56, R9
	ORR	R8>>8, R9
	LSL	$56, R8
	ORR	R7>>8, R8
	LSL	$56, R7
	ORR	R6>>8, R7
	LSL	$56, R6
	ORR	R5>>8, R6
	LSL	$56, R5
	ORR	R4>>8, R5
	LSL	$56, R4

	MOVD	z+0(FP), R0
	ADDS	R4, R17
	ADCS	R5, R19
	STP	(R16, R17), 0(R0)	// Store final result to z
	ADCS	R6, R20
	ADCS	R7, R21
	STP	(R19, R20), 16(R0)
	ADCS	R8, R22
	ADCS	R9, R23
	STP	(R21, R22), 32(R0)
	ADC	R10, R24
	STP	(R23, R24), 48(R0)

	RET

TEXT ·modP503(SB), NOSPLIT, $0-8
	MOVD	x+0(FP), R0

	// Keep x in R1-R8, p503 in R9-R14, subtract to R1-R8
	LDP	·P503+16(SB), (R9, R10)
	LDP	0(R0), (R1, R2)
	LDP	16(R0), (R3, R4)
	SUBS	R9, R1
	SBCS	R9, R2

	LDP	32(R0), (R5, R6)
	LDP	·P503+32(SB), (R11, R12)
	SBCS	R9, R3
	SBCS	R10, R4

	LDP	48(R0), (R7, R8)
	LDP	·P503+48(SB), (R13, R14)
	SBCS	R11, R5
	SBCS	R12, R6

	SBCS	R13, R7
	SBCS	R14, R8
	SBC	ZR, ZR, R15

	// Mask with the borrow and add p503
	AND	R15, R9
	AND	R15, R10
	AND	R15, R11
	AND	R15, R12
	AND	R15, R13
	AND	R15, R14

	ADDS	R9, R1
	ADCS	R9, R2
	STP	(R1, R2), 0(R0)
	ADCS	R9, R3
	ADCS	R10, R4
	STP	(R3, R4), 16(R0)
	ADCS	R11, R5
	ADCS	R12, R6
	STP	(R5, R6), 32(R0)
	ADCS	R13, R7
	ADCS	R14, R8
	STP	(R7, R8), 48(R0)

	RET