// +build arm64,!purego

#include "textflag.h"

// cmovP751 conditionally copies 12 64-bit limbs (96 bytes) from y into x.
//   x+0(FP):       pointer to the destination limbs (read and written)
//   y+8(FP):       pointer to the source limbs (read only)
//   choice+16(FP): selector byte
// Every limb is loaded, selected with CSEL and stored regardless of choice;
// there is no branch on the selector.
TEXT ·cmovP751(SB), NOSPLIT, $0-17
	MOVD x+0(FP), R0
	MOVD y+8(FP), R1
	MOVB choice+16(FP), R2

	// Set flags
	// EQ (choice == 0) keeps x; any non-zero choice copies y completely
	CMP $0, R2

	LDP 0(R0), (R3, R4)
	LDP 0(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 0(R0)

	LDP 16(R0), (R3, R4)
	LDP 16(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 16(R0)

	LDP 32(R0), (R3, R4)
	LDP 32(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 32(R0)

	LDP 48(R0), (R3, R4)
	LDP 48(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 48(R0)

	LDP 64(R0), (R3, R4)
	LDP 64(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 64(R0)

	LDP 80(R0), (R3, R4)
	LDP 80(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 80(R0)

	RET

// cswapP751 conditionally exchanges the 12 64-bit limbs at x and y.
//   x+0(FP), y+8(FP): pointers to the two operands (both read and written)
//   choice+16(FP):    selector byte
// Both operands are always loaded and stored; only the CSEL conditions
// depend on the selector.
TEXT ·cswapP751(SB), NOSPLIT, $0-17
	MOVD x+0(FP), R0
	MOVD y+8(FP), R1
	MOVB choice+16(FP), R2

	// Set flags
	// If choice is not 0 or 1, this implementation will swap completely
	CMP $0, R2

	LDP 0(R0), (R3, R4)
	LDP 0(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 0(R0)
	CSEL NE, R3, R5, R9
	CSEL NE, R4, R6, R10
	STP (R9, R10), 0(R1)

	LDP 16(R0), (R3, R4)
	LDP 16(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 16(R0)
	CSEL NE, R3, R5, R9
	CSEL NE, R4, R6, R10
	STP (R9, R10), 16(R1)

	LDP 32(R0), (R3, R4)
	LDP 32(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 32(R0)
	CSEL NE, R3, R5, R9
	CSEL NE, R4, R6, R10
	STP (R9, R10), 32(R1)

	LDP 48(R0), (R3, R4)
	LDP 48(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 48(R0)
	CSEL NE, R3, R5, R9
	CSEL NE, R4, R6, R10
	STP (R9, R10), 48(R1)

	LDP 64(R0), (R3, R4)
	LDP 64(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 64(R0)
	CSEL NE, R3, R5, R9
	CSEL NE, R4, R6, R10
	STP (R9, R10), 64(R1)

	LDP 80(R0), (R3, R4)
	LDP 80(R1), (R5, R6)
	CSEL EQ, R3, R5, R7
	CSEL EQ, R4, R6, R8
	STP (R7, R8), 80(R0)
	CSEL NE, R3, R5, R9
	CSEL NE, R4, R6, R10
	STP (R9, R10), 80(R1)

	RET

// addP751 computes z = x + y over 12 limbs, subtracts the constant ·P751x2
// and, if that subtraction borrows, adds ·P751x2 back under a mask.
//   z+0(FP): result pointer, x+8(FP) / y+16(FP): operand pointers
// Limbs 1-4 of ·P751x2 are taken from a single register (R16); only offsets
// 0, 40, 56, 72 and 88 of the constant are loaded.
TEXT ·addP751(SB), NOSPLIT, $0-24
	MOVD z+0(FP), R2
	MOVD x+8(FP), R0
	MOVD y+16(FP), R1

	// Load first summand into R3-R14
	// Add first summand and second summand and store result in R3-R14
	LDP 0(R0), (R3, R4)
	LDP 0(R1), (R15, R16)
	LDP 16(R0), (R5, R6)
	LDP 16(R1), (R17, R19)
	ADDS R15, R3
	ADCS R16, R4
	ADCS R17, R5
	ADCS R19, R6

	LDP 32(R0), (R7, R8)
	LDP 32(R1), (R15, R16)
	LDP 48(R0), (R9, R10)
	LDP 48(R1), (R17, R19)
	ADCS R15, R7
	ADCS R16, R8
	ADCS R17, R9
	ADCS R19, R10

	LDP 64(R0), (R11, R12)
	LDP 64(R1), (R15, R16)
	LDP 80(R0), (R13, R14)
	LDP 80(R1), (R17, R19)
	ADCS R15, R11
	ADCS R16, R12
	ADCS R17, R13
	ADC R19, R14

	// Subtract 2 * p751 in R15-R24 from the result in R3-R14
	LDP ·P751x2+0(SB), (R15, R16)
	SUBS R15, R3
	SBCS R16, R4
	LDP ·P751x2+40(SB), (R17, R19)
	SBCS R16, R5
	SBCS R16, R6
	SBCS R16, R7
	LDP ·P751x2+56(SB), (R20, R21)
	SBCS R17, R8
	SBCS R19, R9
	LDP ·P751x2+72(SB), (R22, R23)
	SBCS R20, R10
	SBCS R21, R11
	MOVD ·P751x2+88(SB), R24
	SBCS R22, R12
	SBCS R23, R13
	SBCS R24, R14
	// R25 = 0 - 0 - borrow: all ones if the subtraction borrowed, else zero
	SBC ZR, ZR, R25

	// If x + y - 2 * p751 < 0, R25 is an all-ones mask and 2 * p751 is added back
	AND R25, R15
	AND R25, R16
	AND R25, R17
	AND R25, R19
	AND R25, R20
	AND R25, R21
	AND R25, R22
	AND R25, R23
	AND R25, R24

	ADDS R15, R3
	ADCS R16, R4
	STP (R3, R4), 0(R2)
	ADCS R16, R5
	ADCS R16, R6
	STP (R5, R6), 16(R2)
	ADCS R16, R7
	ADCS R17, R8
	STP (R7, R8), 32(R2)
	ADCS R19, R9
	ADCS R20, R10
	STP (R9, R10), 48(R2)
	ADCS R21, R11
	ADCS R22, R12
	STP (R11, R12), 64(R2)
	ADCS R23, R13
	ADC R24, R14
	STP (R13, R14), 80(R2)

	RET

// subP751 computes z = x - y over 12 limbs and, if the subtraction borrows,
// adds the constant ·P751x2 under a mask.
//   z+0(FP): result pointer, x+8(FP) / y+16(FP): operand pointers
// The constant loads and ANDs are interleaved with the carry chain; LDP, MOVD
// and AND do not modify the flags.
TEXT ·subP751(SB), NOSPLIT, $0-24
	MOVD z+0(FP), R2
	MOVD x+8(FP), R0
	MOVD y+16(FP), R1

	// Load x into R3-R14
	// Subtract y from x and store result in R3-R14
	LDP 0(R0), (R3, R4)
	LDP 0(R1), (R15, R16)
	LDP 16(R0), (R5, R6)
	LDP 16(R1), (R17, R19)
	SUBS R15, R3
	SBCS R16, R4
	SBCS R17, R5
	SBCS R19, R6

	LDP 32(R0), (R7, R8)
	LDP 32(R1), (R15, R16)
	LDP 48(R0), (R9, R10)
	LDP 48(R1), (R17, R19)
	SBCS R15, R7
	SBCS R16, R8
	SBCS R17, R9
	SBCS R19, R10

	LDP 64(R0), (R11, R12)
	LDP 64(R1), (R15, R16)
	LDP 80(R0), (R13, R14)
	LDP 80(R1), (R17, R19)
	SBCS R15, R11
	SBCS R16, R12
	SBCS R17, R13
	SBCS R19, R14
	// R15 = all ones if x - y borrowed, else zero
	SBC ZR, ZR, R15

	// If x - y < 0, R15 is an all-ones mask and 2 * p751 is added
	LDP ·P751x2+0(SB), (R16, R17)
	AND R15, R16
	AND R15, R17
	LDP ·P751x2+40(SB), (R19, R20)
	AND R15, R19
	AND R15, R20

	ADDS R16, R3
	ADCS R17, R4
	STP (R3, R4), 0(R2)
	ADCS R17, R5
	ADCS R17, R6
	STP (R5, R6), 16(R2)
	ADCS R17, R7
	ADCS R19, R8
	STP (R7, R8), 32(R2)
	ADCS R20, R9

	LDP ·P751x2+56(SB), (R16, R17)
	AND R15, R16
	AND R15, R17
	LDP ·P751x2+72(SB), (R19, R20)
	AND R15, R19
	AND R15, R20

	ADCS R16, R10
	STP (R9, R10), 48(R2)
	ADCS R17, R11
	ADCS R19, R12
	STP (R11, R12), 64(R2)
	ADCS R20, R13

	MOVD ·P751x2+88(SB), R16
	AND R15, R16
	ADC R16, R14
	STP (R13, R14), 80(R2)

	RET

// adlP751 computes z = x + y over 24 limbs (192 bytes) as one carry chain.
//   z+0(FP): result pointer, x+8(FP) / y+16(FP): operand pointers
// No reduction is performed and the carry out of the top limb is discarded.
TEXT ·adlP751(SB), NOSPLIT, $0-24
	MOVD z+0(FP), R2
	MOVD x+8(FP), R0
	MOVD y+16(FP), R1

	LDP 0(R0), (R3, R4)
	LDP 0(R1), (R15, R16)
	LDP 16(R0), (R5, R6)
	LDP 16(R1), (R17, R19)
	ADDS R15, R3
	ADCS R16, R4
	STP (R3, R4), 0(R2)
	ADCS R17, R5
	ADCS R19, R6
	STP (R5, R6), 16(R2)

	LDP 32(R0), (R7, R8)
	LDP 32(R1), (R15, R16)
	LDP 48(R0), (R9, R10)
	LDP 48(R1), (R17, R19)
	ADCS R15, R7
	ADCS R16, R8
	STP (R7, R8), 32(R2)
	ADCS R17, R9
	ADCS R19, R10
	STP (R9, R10), 48(R2)

	LDP 64(R0), (R11, R12)
	LDP 64(R1), (R15, R16)
	LDP 80(R0), (R13, R14)
	LDP 80(R1), (R17, R19)
	ADCS R15, R11
	ADCS R16, R12
	STP (R11, R12), 64(R2)
	ADCS R17, R13
	ADCS R19, R14
	STP (R13, R14), 80(R2)

	LDP 96(R0), (R3, R4)
	LDP 96(R1), (R15, R16)
	LDP 112(R0), (R5, R6)
	LDP 112(R1), (R17, R19)
	ADCS R15, R3
	ADCS R16, R4
	STP (R3, R4), 96(R2)
	ADCS R17, R5
	ADCS R19, R6
	STP (R5, R6), 112(R2)

	LDP 128(R0), (R7, R8)
	LDP 128(R1), (R15, R16)
	LDP 144(R0), (R9, R10)
	LDP 144(R1), (R17, R19)
	ADCS R15, R7
	ADCS R16, R8
	STP (R7, R8), 128(R2)
	ADCS R17, R9
	ADCS R19, R10
	STP (R9, R10), 144(R2)

	LDP 160(R0), (R11, R12)
	LDP 160(R1), (R15, R16)
	LDP 176(R0), (R13, R14)
	LDP 176(R1), (R17, R19)
	ADCS R15, R11
	ADCS R16, R12
	STP (R11, R12), 160(R2)
	ADCS R17, R13
	ADC R19, R14
	STP (R13, R14), 176(R2)

	RET

// sulP751 computes z = x - y over 24 limbs (192 bytes). The low 12 limbs are
// stored as they are produced; the high 12 limbs stay in R3-R14 until the
// final borrow is known, and the constant ·P751 is then added to them under
// a mask before they are stored.
//   z+0(FP): result pointer, x+8(FP) / y+16(FP): operand pointers
// Limbs 0-4 of ·P751 are taken from a single register (R20).
TEXT ·sulP751(SB), NOSPLIT, $0-24
	MOVD z+0(FP), R2
	MOVD x+8(FP), R0
	MOVD y+16(FP), R1

	LDP 0(R0), (R3, R4)
	LDP 0(R1), (R15, R16)
	LDP 16(R0), (R5, R6)
	LDP 16(R1), (R17, R19)
	SUBS R15, R3
	SBCS R16, R4
	STP (R3, R4), 0(R2)
	SBCS R17, R5
	SBCS R19, R6
	STP (R5, R6), 16(R2)

	LDP 32(R0), (R7, R8)
	LDP 32(R1), (R15, R16)
	LDP 48(R0), (R9, R10)
	LDP 48(R1), (R17, R19)
	SBCS R15, R7
	SBCS R16, R8
	STP (R7, R8), 32(R2)
	SBCS R17, R9
	SBCS R19, R10
	STP (R9, R10), 48(R2)

	LDP 64(R0), (R11, R12)
	LDP 64(R1), (R15, R16)
	LDP 80(R0), (R13, R14)
	LDP 80(R1), (R17, R19)
	SBCS R15, R11
	SBCS R16, R12
	STP (R11, R12), 64(R2)
	SBCS R17, R13
	SBCS R19, R14
	STP (R13, R14), 80(R2)

	LDP 96(R0), (R3, R4)
	LDP 96(R1), (R15, R16)
	LDP 112(R0), (R5, R6)
	LDP 112(R1), (R17, R19)
	SBCS R15, R3
	SBCS R16, R4
	SBCS R17, R5
	SBCS R19, R6

	LDP 128(R0), (R7, R8)
	LDP 128(R1), (R15, R16)
	LDP 144(R0), (R9, R10)
	LDP 144(R1), (R17, R19)
	SBCS R15, R7
	SBCS R16, R8
	SBCS R17, R9
	SBCS R19, R10

	LDP 160(R0), (R11, R12)
	LDP 160(R1), (R15, R16)
	LDP 176(R0), (R13, R14)
	LDP 176(R1), (R17, R19)
	SBCS R15, R11
	SBCS R16, R12
	SBCS R17, R13
	SBCS R19, R14
	// R15 = all ones if x - y borrowed, else zero
	SBC ZR, ZR, R15

	// If x - y < 0, R15 is an all-ones mask and p751 is added to the high half
	MOVD ·P751+0(SB), R20
	AND R15, R20
	LDP ·P751+40(SB), (R16, R17)
	ADDS R20, R3
	ADCS R20, R4
	STP (R3, R4), 96(R2)
	ADCS R20, R5
	ADCS R20, R6
	STP (R5, R6), 112(R2)
	ADCS R20, R7

	LDP ·P751+56(SB), (R19, R20)
	AND R15, R16
	AND R15, R17
	ADCS R16, R8
	STP (R7, R8), 128(R2)
	ADCS R17, R9

	LDP ·P751+72(SB), (R16, R17)
	AND R15, R19
	AND R15, R20
	ADCS R19, R10
	STP (R9, R10), 144(R2)
	ADCS R20, R11

	MOVD ·P751+88(SB), R19
	AND R15, R16
	AND R15, R17
	ADCS R16, R12
	STP (R11, R12), 160(R2)
	ADCS R17, R13

	AND R15, R19
	ADC R19, R14
	STP (R13, R14), 176(R2)

	RET

// 3-limb by 3-limb column-wise (comba) multiplication.
// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
// Z0 is not actually touched
// Result of (X0-X2) * (Y0-Y2) will be in Z0-Z5
// Inputs remain intact; T0-T3 are scratch
#define mul192x192comba(X0, X1, X2, Y0, Y1, Y2, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3) \
	MUL X1, Y0, T2 \
	UMULH X1, Y0, T3 \
	\
	ADDS Z3, Z1 \
	ADCS ZR, Z2 \
	ADC ZR, ZR, Z3 \
	\
	MUL X0, Y2, T0 \
	UMULH X0, Y2, T1 \
	\
	ADDS T2, Z1 \
	ADCS T3, Z2 \
	ADC ZR, Z3 \
	\
	MUL X1, Y1, T2 \
	UMULH X1, Y1, T3 \
	\
	ADDS T0, Z2 \
	ADCS T1, Z3 \
	ADC ZR, ZR, Z4 \
	\
	MUL X2, Y0, T0 \
	UMULH X2, Y0, T1 \
	\
	ADDS T2, Z2 \
	ADCS T3, Z3 \
	ADC ZR, Z4 \
	\
	MUL X1, Y2, T2 \
	UMULH X1, Y2, T3 \
	\
	ADDS T0, Z2 \
	ADCS T1, Z3 \
	ADC ZR, Z4 \
	\
	MUL X2, Y1, T0 \
	UMULH X2, Y1, T1 \
	\
	ADDS T2, Z3 \
	ADCS T3, Z4 \
	ADC ZR, ZR, Z5 \
	\
	MUL X2, Y2, T2 \
	UMULH X2, Y2, T3 \
	\
	ADDS T0, Z3 \
	ADCS T1, Z4 \
	ADC ZR, Z5 \
	\
	ADDS T2, Z4 \
	ADC T3, Z5

// 6-limb by 6-limb multiplication built from three mul192x192comba calls.
// X and Y are memory operands holding copies of the upper limbs: the macro
// reloads (X3-X5) from 0+X, 8+X, 16+X and (Y3-Y5) from 0+Y, 8+Y, 16+Y after
// the registers have been overwritten. Z is a memory operand used both as
// scratch and for the three lowest result limbs.
// Result of (X0-X5) * (Y0-Y5) will be in (0(Z), 8(Z), 16(Z), T0-T8)
// Inputs get overwritten
#define mul384x384karatsuba(X, Y, Z, X0, X1, X2, X3, X4, X5, Y0, Y1, Y2, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10)\
	ADDS X0, X3 \ // xH + xL, destroys xH
	ADCS X1, X4 \
	ADCS X2, X5 \
	ADC ZR, ZR, T10 \
	\
	ADDS Y0, Y3 \ // yH + yL, destroys yH
	ADCS Y1, Y4 \
	ADCS Y2, Y5 \
	ADC ZR, ZR, T6 \
	\
	SUB T10, ZR, T7 \
	SUB T6, ZR, T8 \
	AND T6, T10 \ // combined carry
	\
	AND T7, Y3, T0 \ // masked(yH + yL)
	AND T7, Y4, T1 \
	AND T7, Y5, T2 \
	\
	AND T8, X3, T3 \ // masked(xH + xL)
	AND T8, X4, T4 \
	AND T8, X5, T5 \
	\
	ADDS T3, T0 \
	ADCS T4, T1 \
	STP (T0, T1), 0+Z \
	\
	MUL X3, Y3, T0 \
	MUL X3, Y4, T1 \
	\
	ADCS T5, T2 \
	MOVD T2, 16+Z \
	\
	UMULH X3, Y4, T2 \
	UMULH X3, Y3, T3 \
	\
	ADC ZR, T10 \
	\ // (xH + xL) * (yH + yL)
	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
	\
	MUL X0, Y0, X3 \
	LDP 0+Z, (T6, T7) \
	MOVD 16+Z, T8 \
	\
	UMULH X0, Y0, Y3 \
	ADDS T6, T3 \
	ADCS T7, T4 \
	MUL X0, Y1, X4 \
	ADCS T8, T5 \
	ADC ZR, T10 \
	UMULH X0, Y1, X5 \
	\ // xL * yL
	mul192x192comba(X0, X1, X2, Y0, Y1, Y2, X3, X4, X5, Y3, Y4, Y5, T6, T7, T8, T9)\
	\
	STP (X3, X4), 0+Z \
	MOVD X5, 16+Z \
	\
	SUBS X3, T0 \ // (xH + xL) * (yH + yL) - xL * yL
	SBCS X4, T1 \
	LDP 0+X, (X3, X4) \
	SBCS X5, T2 \
	MOVD 16+X, X5 \
	SBCS Y3, T3 \
	SBCS Y4, T4 \
	SBCS Y5, T5 \
	SBC ZR, T10 \
	\
	ADDS Y3, T0 \ // ((xH + xL) * (yH + yL) - xL * yL) * 2^192 + xL * yL
	ADCS Y4, T1 \
	LDP 0+Y, (Y3, Y4) \
	MUL X3, Y3, X0 \
	ADCS Y5, T2 \
	UMULH X3, Y3, Y0 \
	MOVD 16+Y, Y5 \
	MUL X3, Y4, X1 \
	ADCS ZR, T3 \
	UMULH X3, Y4, X2 \
	ADCS ZR, T4 \
	ADCS ZR, T5 \
	ADC ZR, T10 \
	\ // xH * yH, overwrite xLow, yLow
	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, X0, X1, X2, Y0, Y1, Y2, T6, T7, T8, T9)\
	\
	SUBS X0, T0 \ // ((xH + xL) * (yH + yL) - xL * yL - xH * yH)
	SBCS X1, T1 \
	SBCS X2, T2 \
	SBCS Y0, T3 \
	SBCS Y1, T4 \
	SBCS Y2, T5 \
	SBC ZR, T10 \
	\
	ADDS X0, T3 \
	ADCS X1, T4 \
	ADCS X2, T5 \
	ADCS T10, Y0, T6 \
	ADCS ZR, Y1, T7 \
	ADC ZR, Y2, T8


// mulP751 computes the full 24-limb product z = x * y of two 12-limb operands
// using one level of Karatsuba on 6-limb halves (three mul384x384karatsuba
// expansions). z is also used as scratch space while the product is built.
//   z+0(FP): result pointer (192 bytes written)
//   x+8(FP), y+16(FP): operand pointers (96 bytes read each)
// A 16-byte frame is declared (nothing is stored in it by this code) because
// R29, Go's frame pointer on arm64, is used below as a long-lived carry
// register: with a non-zero frame the assembler's prologue/epilogue saves and
// restores R29, so the caller's frame pointer is intact after RET.
TEXT ·mulP751(SB), NOSPLIT, $16-24
	MOVD z+0(FP), R2
	MOVD x+8(FP), R0
	MOVD y+16(FP), R1

	// Load xL in R3-R8, xH in R9-R14
	// (xH + xL) in R3-R8, destroys xH
	LDP 0(R0), (R3, R4)
	LDP 48(R0), (R9, R10)
	ADDS R9, R3
	ADCS R10, R4
	LDP 16(R0), (R5, R6)
	LDP 64(R0), (R11, R12)
	ADCS R11, R5
	ADCS R12, R6
	LDP 32(R0), (R7, R8)
	LDP 80(R0), (R13, R14)
	ADCS R13, R7
	ADCS R14, R8
	ADC ZR, ZR, R22

	// Load yL in R9-R14, yH in R15-21
	// (yH + yL) in R9-R14, destroys yH
	LDP 0(R1), (R9, R10)
	LDP 48(R1), (R15, R16)
	ADDS R15, R9
	ADCS R16, R10
	LDP 16(R1), (R11, R12)
	LDP 64(R1), (R17, R19)
	ADCS R17, R11
	ADCS R19, R12
	LDP 32(R1), (R13, R14)
	LDP 80(R1), (R20, R21)
	ADCS R20, R13
	ADCS R21, R14
	ADC ZR, ZR, R23

	// Compute masks and combined carry
	SUB R22, ZR, R24
	SUB R23, ZR, R25
	AND R23, R22

	// Store the upper three limbs of (xH + xL) at 0(z) and of (yH + yL) at
	// 24(z) so mul384x384karatsuba can retrieve them from memory
	// It doesn't have enough registers
	// Meanwhile computed masked(xH + xL) in R15-R21
	STP (R6, R7), 0(R2)
	AND R25, R3, R15
	AND R25, R4, R16
	STP (R8, R12), 16(R2)
	AND R25, R5, R17
	AND R25, R6, R19
	STP (R13, R14), 32(R2)
	AND R25, R7, R20
	AND R25, R8, R21

	// Masked(xH + xL) + masked(yH + yL) in R15-R21
	// Store intermediate values in z
	AND R24, R9, R25
	AND R24, R10, R26
	ADDS R25, R15
	ADCS R26, R16
	STP (R15, R16), 96(R2)
	AND R24, R11, R25
	AND R24, R12, R26
	ADCS R25, R17
	ADCS R26, R19
	STP (R17, R19), 112(R2)
	AND R24, R13, R25
	AND R24, R14, R26
	ADCS R25, R20
	ADCS R26, R21
	STP (R20, R21), 128(R2)
	// Store carry in R29 so it can remain there
	ADC ZR, R22, R29

	// (xH + xL) * (yH + yL)
	mul384x384karatsuba(0(R2), 24(R2), 48(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	// Load masked(xH + xL) + masked(yH + yL) and add that to its top half
	// Store the result back in z
	// Each LDP below reads its slot before the following STP overwrites it
	STP (R15, R16), 72(R2)
	LDP 96(R2), (R3, R4)
	ADDS R3, R19
	STP (R17, R19), 88(R2)
	ADCS R4, R20
	LDP 112(R2), (R5, R6)
	ADCS R5, R21
	STP (R20, R21), 104(R2)
	ADCS R6, R22
	LDP 128(R2), (R7, R8)
	ADCS R7, R23
	STP (R22, R23), 120(R2)
	ADCS R8, R24
	MOVD R24, 136(R2)
	ADC ZR, R29

	// Load xL, yL
	LDP 0(R0), (R3, R4)
	LDP 16(R0), (R5, R6)
	LDP 32(R0), (R7, R8)
	LDP 0(R1), (R9, R10)
	LDP 16(R1), (R11, R12)
	LDP 32(R1), (R13, R14)

	// xL * yL
	mul384x384karatsuba(24(R0), 24(R1), 0(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	// (xH + xL) * (yH + yL) - xL * yL in R3-R14
	LDP 0(R2), (R12, R13)
	LDP 48(R2), (R3, R4)
	SUBS R12, R3
	LDP 64(R2), (R5, R6)
	MOVD 16(R2), R14
	SBCS R13, R4
	SBCS R14, R5
	LDP 80(R2), (R7, R8)
	SBCS R15, R6
	SBCS R16, R7
	LDP 96(R2), (R9, R10)
	SBCS R17, R8
	SBCS R19, R9
	LDP 112(R2), (R11, R12)
	SBCS R20, R10
	SBCS R21, R11
	LDP 128(R2), (R13, R14)
	SBCS R22, R12
	SBCS R23, R13
	SBCS R24, R14
	SBC ZR, R29

	STP (R15, R16), 24(R2)
	MOVD R17, 40(R2)

	// ((xH + xL) * (yH + yL) - xL * yL) * 2^384 + xL * yL and store back in z
	ADDS R19, R3
	ADCS R20, R4
	STP (R3, R4), 48(R2)
	ADCS R21, R5
	ADCS R22, R6
	STP (R5, R6), 64(R2)
	ADCS R23, R7
	ADCS R24, R8
	STP (R7, R8), 80(R2)
	ADCS ZR, R9
	ADCS ZR, R10
	STP (R9, R10), 96(R2)
	ADCS ZR, R11
	ADCS ZR, R12
	STP (R11, R12), 112(R2)
	ADCS ZR, R13
	ADCS ZR, R14
	STP (R13, R14), 128(R2)
	ADC ZR, R29

	// Load xH, yH
	LDP 48(R0), (R3, R4)
	LDP 64(R0), (R5, R6)
	LDP 80(R0), (R7, R8)
	LDP 48(R1), (R9, R10)
	LDP 64(R1), (R11, R12)
	LDP 80(R1), (R13, R14)

	// xH * yH
	mul384x384karatsuba(72(R0), 72(R1), 144(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	LDP 144(R2), (R12, R13)
	MOVD 160(R2), R14

	// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R3-R14
	// Store lower half in z, that's done
	LDP 48(R2), (R3, R4)
	SUBS R12, R3
	LDP 64(R2), (R5, R6)
	SBCS R13, R4
	SBCS R14, R5
	LDP 80(R2), (R7, R8)
	SBCS R15, R6
	SBCS R16, R7
	LDP 96(R2), (R9, R10)
	SBCS R17, R8
	SBCS R19, R9
	LDP 112(R2), (R11, R12)
	SBCS R20, R10
	SBCS R21, R11
	LDP 128(R2), (R13, R14)
	SBCS R22, R12
	SBCS R23, R13
	STP (R3, R4), 48(R2)
	SBCS R24, R14
	STP (R5, R6), 64(R2)
	SBC ZR, R29
	STP (R7, R8), 80(R2)

	// (xH * yH) * 2^768 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^384 + xL * yL
	// Store remaining limbs in z
	LDP 144(R2), (R3, R4)
	MOVD 160(R2), R5

	ADDS R3, R9
	ADCS R4, R10
	STP (R9, R10), 96(R2)
	ADCS R5, R11
	ADCS R15, R12
	STP (R11, R12), 112(R2)
	ADCS R16, R13
	ADCS R17, R14
	STP (R13, R14), 128(R2)

	ADCS R29, R19
	ADCS ZR, R20
	STP (R19, R20), 144(R2)
	ADCS ZR, R21
	ADCS ZR, R22
	STP (R21, R22), 160(R2)
	ADCS ZR, R23
	ADC ZR, R24
	STP (R23, R24), 176(R2)

	RET

// rdcP751 reduces the 24-limb value at x to the 12-limb value stored at z.
//   z+0(FP): result pointer (96 bytes written)
//   x+8(FP): input pointer (192 bytes read)
// Each "xN iteration" accumulates one column of products between result
// limbs and limbs 5-11 of the constant ·P751p1 (offsets 40-88; lower limbs of
// the constant are never loaded), then adds input limb xN. The three-word
// accumulator rotates through R24, R25, R26 from column to column.
// NOTE(review): this has the shape of a Montgomery reduction - confirm against
// the Go reference implementation.
// A 16-byte frame is declared (nothing is stored in it by this code) because
// R29, Go's frame pointer on arm64, holds a limb of ·P751p1 below: with a
// non-zero frame the assembler's prologue/epilogue saves and restores R29.
TEXT ·rdcP751(SB), NOSPLIT, $16-16
	MOVD z+0(FP), R0
	MOVD x+8(FP), R1

	// Load p751+1 in R14-R17, R29, R19-R20, spread over arithmetic
	LDP ·P751p1+40(SB), (R14, R15)
	// z0-z11 will be R2-R13
	// Load x0-x4 to z0-z4 and x5, spread over arithmetic
	LDP 0(R1), (R2, R3)

	// x5 iteration
	MUL R2, R14, R22
	LDP 32(R1), (R6, R21)
	UMULH R2, R14, R23
	ADDS R21, R22, R7 // Set z5
	ADC ZR, R23, R25

	// x6 iteration
	MUL R2, R15, R22
	MOVD 48(R1), R21
	UMULH R2, R15, R23
	ADDS R22, R25
	ADC R23, ZR, R26

	MUL R3, R14, R22
	LDP ·P751p1+56(SB), (R16, R17)
	UMULH R3, R14, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, ZR, R24

	ADDS R21, R25, R8 // Set z6
	ADCS ZR, R26
	ADC ZR, R24

	// x7 iteration
	MUL R2, R16, R22
	MOVD 56(R1), R21
	UMULH R2, R16, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, ZR, R25

	MUL R3, R15, R22
	LDP 16(R1), (R4, R5)
	UMULH R3, R15, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R4, R14, R22
	LDP ·P751p1+72(SB), (R29, R19)
	UMULH R4, R14, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	ADDS R21, R26, R9 // Set z7
	ADCS ZR, R24
	ADC ZR, R25

	// x8 iteration
	MUL R2, R17, R22
	MOVD 64(R1), R21
	UMULH R2, R17, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, ZR, R26

	MUL R3, R16, R22
	MOVD ·P751p1+88(SB), R20
	UMULH R3, R16, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R4, R15, R22
	UMULH R4, R15, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R5, R14, R22
	UMULH R5, R14, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	ADDS R24, R21, R10 // Set z8
	ADCS ZR, R25
	ADC ZR, R26

	// x9 iteration
	MUL R2, R29, R22
	MOVD 72(R1), R21
	UMULH R2, R29, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, ZR, R24

	MUL R3, R17, R22
	UMULH R3, R17, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R4, R16, R22
	UMULH R4, R16, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R5, R15, R22
	UMULH R5, R15, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R6, R14, R22
	UMULH R6, R14, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	ADDS R21, R25, R11 // Set z9
	ADCS ZR, R26
	ADC ZR, R24

	// x10 iteration
	MUL R2, R19, R22
	MOVD 80(R1), R21
	UMULH R2, R19, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, ZR, R25

	MUL R3, R29, R22
	UMULH R3, R29, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R4, R17, R22
	UMULH R4, R17, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R5, R16, R22
	UMULH R5, R16, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R6, R15, R22
	UMULH R6, R15, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R7, R14, R22
	UMULH R7, R14, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	ADDS R21, R26, R12 // Set z10
	ADCS ZR, R24
	ADC ZR, R25

	// x11 iteration
	MUL R2, R20, R22
	MOVD 88(R1), R21
	UMULH R2, R20, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, ZR, R26

	MUL R3, R19, R22
	UMULH R3, R19, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R4, R29, R22
	UMULH R4, R29, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R5, R17, R22
	UMULH R5, R17, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R6, R16, R22
	UMULH R6, R16, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R7, R15, R22
	UMULH R7, R15, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R8, R14, R22
	UMULH R8, R14, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	ADDS R21, R24, R13 // Set z11
	ADCS ZR, R25
	ADC ZR, R26

	// x12 iteration
	MUL R3, R20, R22
	MOVD 96(R1), R21
	UMULH R3, R20, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, ZR, R24

	MUL R4, R19, R22
	UMULH R4, R19, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R5, R29, R22
	UMULH R5, R29, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R6, R17, R22
	UMULH R6, R17, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R7, R16, R22
	UMULH R7, R16, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R8, R15, R22
	UMULH R8, R15, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R9, R14, R22
	UMULH R9, R14, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	ADDS R21, R25, R2 // Set z0
	ADCS ZR, R26
	ADC ZR, R24

	// x13 iteration
	MUL R4, R20, R22
	MOVD 104(R1), R21
	UMULH R4, R20, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, ZR, R25

	MUL R5, R19, R22
	UMULH R5, R19, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R6, R29, R22
	UMULH R6, R29, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R7, R17, R22
	UMULH R7, R17, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R8, R16, R22
	UMULH R8, R16, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R9, R15, R22
	UMULH R9, R15, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R10, R14, R22
	UMULH R10, R14, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	ADDS R21, R26, R3 // Set z1
	STP (R2, R3), 0(R0)
	ADCS ZR, R24
	ADC ZR, R25

	// x14 iteration
	MUL R5, R20, R22
	MOVD 112(R1), R21
	UMULH R5, R20, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, ZR, R26

	MUL R6, R19, R22
	UMULH R6, R19, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R7, R29, R22
	UMULH R7, R29, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R8, R17, R22
	UMULH R8, R17, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R9, R16, R22
	UMULH R9, R16, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R10, R15, R22
	UMULH R10, R15, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R11, R14, R22
	UMULH R11, R14, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	ADDS R21, R24, R4 // Set z2
	ADCS ZR, R25
	ADC ZR, R26

	// x15 iteration
	MUL R6, R20, R22
	MOVD 120(R1), R21
	UMULH R6, R20, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, ZR, R24

	MUL R7, R19, R22
	UMULH R7, R19, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R8, R29, R22
	UMULH R8, R29, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R9, R17, R22
	UMULH R9, R17, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R10, R16, R22
	UMULH R10, R16, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R11, R15, R22
	UMULH R11, R15, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R12, R14, R22
	UMULH R12, R14, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	ADDS R21, R25, R5 // Set z3
	STP (R4, R5), 16(R0)
	ADCS ZR, R26
	ADC ZR, R24

	// x16 iteration
	MUL R7, R20, R22
	MOVD 128(R1), R21
	UMULH R7, R20, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, ZR, R25

	MUL R8, R19, R22
	UMULH R8, R19, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R9, R29, R22
	UMULH R9, R29, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R10, R17, R22
	UMULH R10, R17, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R11, R16, R22
	UMULH R11, R16, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R12, R15, R22
	UMULH R12, R15, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R13, R14, R22
	UMULH R13, R14, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	ADDS R21, R26, R6 // Set z4
	ADCS ZR, R24
	ADC ZR, R25

	// x17 iteration
	MUL R8, R20, R22
	MOVD 136(R1), R21
	UMULH R8, R20, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, ZR, R26

	MUL R9, R19, R22
	UMULH R9, R19, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R10, R29, R22
	UMULH R10, R29, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R11, R17, R22
	UMULH R11, R17, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R12, R16, R22
	UMULH R12, R16, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R13, R15, R22
	UMULH R13, R15, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	ADDS R21, R24, R7 // Set z5
	STP (R6, R7), 32(R0)
	ADCS ZR, R25
	ADC ZR, R26

	// x18 iteration
	MUL R9, R20, R22
	MOVD 144(R1), R21
	UMULH R9, R20, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, ZR, R24

	MUL R10, R19, R22
	UMULH R10, R19, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R11, R29, R22
	UMULH R11, R29, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R12, R17, R22
	UMULH R12, R17, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	MUL R13, R16, R22
	UMULH R13, R16, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	ADDS R21, R25, R8 // Set z6
	ADCS ZR, R26
	ADC ZR, R24

	// x19 iteration
	MUL R10, R20, R22
	MOVD 152(R1), R21
	UMULH R10, R20, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, ZR, R25

	MUL R11, R19, R22
	UMULH R11, R19, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R12, R29, R22
	UMULH R12, R29, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	MUL R13, R17, R22
	UMULH R13, R17, R23
	ADDS R22, R26
	ADCS R23, R24
	ADC ZR, R25

	ADDS R21, R26, R9 // Set z7
	STP (R8, R9), 48(R0)
	ADCS ZR, R24
	ADC ZR, R25

	// x20 iteration
	MUL R11, R20, R22
	MOVD 160(R1), R21
	UMULH R11, R20, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, ZR, R26

	MUL R12, R19, R22
	UMULH R12, R19, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	MUL R13, R29, R22
	UMULH R13, R29, R23
	ADDS R22, R24
	ADCS R23, R25
	ADC ZR, R26

	ADDS R21, R24, R10 // Set z8
	ADCS ZR, R25
	ADC ZR, R26

	// x21 iteration
	MUL R12, R20, R22
	MOVD 168(R1), R21
	UMULH R12, R20, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, ZR, R24

	MUL R13, R19, R22
	UMULH R13, R19, R23
	ADDS R22, R25
	ADCS R23, R26
	ADC ZR, R24

	ADDS R21, R25, R11 // Set z9
	STP (R10, R11), 64(R0)
	ADCS ZR, R26
	ADC ZR, R24

	// x22 iteration
	MUL R13, R20, R22
	MOVD 176(R1), R21
	UMULH R13, R20, R23
	ADDS R22, R26
	ADC R23, R24
	ADDS R21, R26, R12 // Set z10

	// x23 is added to the remaining accumulator word; no product is left
	MOVD 184(R1), R21
	ADC R21, R24, R13 // Set z11
	STP (R12, R13), 80(R0)

	RET

// modP751 subtracts the constant ·P751 from the 12-limb value at x in place
// and, if the subtraction borrows, adds ·P751 back under a mask.
//   x+0(FP): pointer to the value (read and written)
// Limbs 0-4 of ·P751 are taken from a single register (R13).
TEXT ·modP751(SB), NOSPLIT, $0-8
	MOVD x+0(FP), R0

	// Keep x in R1-R12, p751 in R13-R21, subtract to R1-R12
	MOVD ·P751+0(SB), R13
	LDP 0(R0), (R1, R2)
	LDP 16(R0), (R3, R4)
	SUBS R13, R1
	SBCS R13, R2

	LDP 32(R0), (R5, R6)
	LDP ·P751+40(SB), (R14, R15)
	SBCS R13, R3
	SBCS R13, R4

	LDP 48(R0), (R7, R8)
	LDP ·P751+56(SB), (R16, R17)
	SBCS R13, R5
	SBCS R14, R6

	LDP 64(R0), (R9, R10)
	LDP ·P751+72(SB), (R19, R20)
	SBCS R15, R7
	SBCS R16, R8

	LDP 80(R0), (R11, R12)
	MOVD ·P751+88(SB), R21
	SBCS R17, R9
	SBCS R19, R10

	SBCS R20, R11
	SBCS R21, R12
	// R22 = all ones if x - p751 borrowed, else zero
	SBC ZR, ZR, R22

	// Mask with the borrow and add p751
	AND R22, R13
	AND R22, R14
	AND R22, R15
	AND R22, R16
	AND R22, R17
	AND R22, R19
	AND R22, R20
	AND R22, R21

	ADDS R13, R1
	ADCS R13, R2
	STP (R1, R2), 0(R0)
	ADCS R13, R3
	ADCS R13, R4
	STP (R3, R4), 16(R0)
	ADCS R13, R5
	ADCS R14, R6
	STP (R5, R6), 32(R0)
	ADCS R15, R7
	ADCS R16, R8
	STP (R7, R8), 48(R0)
	ADCS R17, R9
	ADCS R19, R10
	STP (R9, R10), 64(R0)
	ADCS R20, R11
	ADC R21, R12
	STP (R11, R12), 80(R0)

	RET