dnl  PowerPC-64 mpn_mul_basecase.

dnl  Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C POWER3/PPC630		6-18
C POWER4/PPC970		 8
C POWER5		 8
C POWER6		24

C INPUT PARAMETERS
C C-equivalent: void mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
C                                      mp_srcptr vp, mp_size_t vn)
C Schoolbook multiply: rp[0..un+vn-1] = up[0..un-1] * vp[0..vn-1].
C The code assumes vn >= 1 (vn is decremented with addic. and tested, never
C pre-checked); un >= vn is the usual GMP mpn convention -- not checked here,
C confirm against callers.
define(`rp', `r3')
define(`up', `r4')
define(`un', `r5')
define(`vp', `r6')
define(`vn', `r7')

C Register roles inside the un > 2 path:
C   v0 (r25)        current multiplier limb vp[j]
C   outer_rp (r22)  rp position for the current outer pass
C   outer_up (r23)  saved up, restored each outer pass
C   r26, r27        pipelined up[] limbs
C   r12             high-limb carry chained between unrolled groups
C   ctr             inner-loop count = floor((un+1)/4); un keeps that value
C                   after the srdi so each outer pass can reload ctr from it
define(`v0', `r25')
define(`outer_rp', `r22')
define(`outer_up', `r23')

ASM_START()
PROLOGUE(mpn_mul_basecase)

C Special code for un <= 2, for efficiency of these important cases,
C and since it simplifies the default code.
	cmpdi	cr0, un, 2
	bgt	cr0, L(un_gt2)
	cmpdi	cr6, vn, 1	C cr6 := (vn == 1), used after L(2x)
	ld	r7, 0(vp)
	ld	r5, 0(up)
	mulld	r8, r5, r7	C weight 0
	mulhdu	r9, r5, r7	C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)	C un == 2?
	std	r9, 8(rp)	C un == 1 (and hence vn == 1): 1x1 done
	blr
	ALIGN(16)
C un == 2: finish up[0..1] * vp[0], then handle vp[1] if vn == 2.
L(2x):	ld	r0, 8(up)
	mulld	r8, r0, r7	C weight 1
	mulhdu	r10, r0, r7	C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)	C vn == 2?
	std	r9, 8(rp)	C 2x1 product complete
	std	r10, 16(rp)
	blr
	ALIGN(16)
C 2x2 product: accumulate the cross terms of vp[1].
L(2x2):	ld	r6, 8(vp)
	nop
	mulld	r8, r5, r6	C weight 1
	mulhdu	r11, r5, r6	C weight 2
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	mulld	r12, r0, r6	C weight 2
	mulhdu	r0, r0, r6	C weight 3
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr

C General case, un > 2.  Save callee-saved r22-r31 below the stack pointer
C without moving r1 (relies on the PowerPC64 ELF ABI's protected area under
C r1 -- NOTE(review): leaf function, no calls made, so this is safe there).
L(un_gt2):
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)

	mr	outer_rp, rp
	mr	outer_up, up

	ld	v0, 0(vp)		C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)

C Dispatch on un mod 4 to one of four unrolled entry points:
C   r0 == 0 -> L(b0), 1 -> L(b1), 2 -> L(b2), fall through -> L(b3)
	rldicl.	r0, un, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	un, un, 1	C compute count...
	srdi	un, un, 2	C ...for ctr
	mtctr	un		C copy inner loop count into ctr
	beq	cr0, L(b0)
	blt	cr6, L(b1)
	beq	cr6, L(b2)


C un mod 4 == 3.  First pass (rp := up * v0, nothing added in).
C The addic clears CA so the adde chain in the loop starts carry-free.
	ALIGN(16)
L(b3):	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addic	r0, r0, 0	C clear CA
	std	r0, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_3)

C First-pass inner loop, 4 limbs per iteration; CA carries the running
C high-limb between groups via r12/r31/r8/r10.
	ALIGN(16)
L(lo_m_3):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_3)

C First-pass wind-down: the final two limbs, then store the top carry limb.
	ALIGN(16)
L(end_m_3):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addic.	vn, vn, -1	C sets cr0; CA is dead at this point
	beq	L(ret)

C Outer loop for un mod 4 == 3: rp slides up one limb per v limb, and the
C inner loop now adds into the existing rp contents (addmul).
	ALIGN(16)
L(outer_lo_3):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 8
	mr	up, outer_up
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)
	ld	r28, 0(rp)
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addc	r0, r0, r28	C starts the rp-accumulation carry chain
	std	r0, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_3)

C Accumulating inner loop: products chained through r12, then added to the
C previously stored rp limbs (r28-r31).  Trailing comments note which
C registers die at each step.
	ALIGN(16)	C registers dying
L(lo_3):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_3)		C

C Accumulating wind-down for the mod-3 path.
	ALIGN(16)
L(end_3):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)

	addic.	vn, vn, -1	C next v limb, if any
	bne	L(outer_lo_3)
	b	L(ret)


C un mod 4 == 0.  Peel two limbs up front so the shared 4-way loop lines up.
	ALIGN(16)
L(b0):	ld	r27, 8(up)
	addi	up, up, 8
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	std	r0, 0(rp)
	std	r24, 8(rp)
	addi	rp, rp, 8
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_0)

C First-pass inner loop (same shape as L(lo_m_3)).
	ALIGN(16)
L(lo_m_0):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_0)

	ALIGN(16)
L(end_m_0):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1	C sets cr0; CA not consumed before next addc
	std	r8, 24(rp)
	nop
	beq	L(ret)

C Outer loop for un mod 4 == 0: two limbs peeled before entering L(lo_0).
	ALIGN(16)
L(outer_lo_0):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 16
	addi	up, outer_up, 8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -8(up)
	ld	r27, 0(up)
	ld	r28, -8(rp)
	ld	r29, 0(rp)
	nop
	nop
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	addc	r0, r0, r28
	std	r0, -8(rp)
	adde	r24, r24, r29
	std	r24, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_0)

	ALIGN(16)	C registers dying
L(lo_0):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_0)		C

	ALIGN(16)
L(end_0):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addic.	vn, vn, -1	C safe here: following addc re-seeds CA
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_0)
	b	L(ret)


C un mod 4 == 1.  Peel three limbs up front.
	ALIGN(16)
L(b1):	ld	r27, 8(up)
	nop
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 16(up)
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	std	r0, 0(rp)
	std	r24, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 16
	addi	rp, rp, 16
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_1)

	ALIGN(16)
L(lo_m_1):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_1)

	ALIGN(16)
L(end_m_1):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1	C sets cr0; CA dead here
	std	r8, 24(rp)
	nop
	beq	L(ret)

C Outer loop for un mod 4 == 1: three limbs peeled before L(lo_1).
	ALIGN(16)
L(outer_lo_1):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 24
	addi	up, outer_up, 16
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -16(up)
	ld	r27, -8(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 0(up)
	ld	r28, -16(rp)
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, -8(rp)
	ld	r30, 0(rp)
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	addc	r0, r0, r28
	std	r0, -16(rp)
	adde	r24, r24, r29
	std	r24, -8(rp)
	adde	r9, r9, r30
	std	r9, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_1)

	ALIGN(16)	C registers dying
L(lo_1):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_1)		C

	ALIGN(16)
L(end_1):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addic.	vn, vn, -1	C safe here: following addc re-seeds CA
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_1)
	b	L(ret)


C un mod 4 == 2.  No peel: bias up/rp back one limb so the first loop
C iteration's 8(..)-based offsets cover the two leading limbs; zero r12
C and clear CA before entering the adde chain.  ctr >= 1 here since un >= 6.
	ALIGN(16)
L(b2):	ld	r27, 8(up)
	addi	up, up, -8
	addi	rp, rp, -8
	li	r12, 0
	addic	r12, r12, 0	C clear CA

	ALIGN(16)
L(lo_m_2):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)

	addi	rp, rp, 32
	bdnz	L(lo_m_2)

	ALIGN(16)
L(end_m_2):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1	C sets cr0; CA dead here
	std	r8, 24(rp)
	nop
	beq	L(ret)

C Outer loop for un mod 4 == 2: same -8 bias trick, then straight into
C the accumulating loop.
	ALIGN(16)
L(outer_lo_2):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 0
	addi	up, outer_up, -8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 8(up)
	ld	r27, 16(up)
	li	r12, 0
	addic	r12, r12, 0	C clear CA

	ALIGN(16)	C registers dying
L(lo_2):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_2)		C

	ALIGN(16)
L(end_2):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addic.	vn, vn, -1	C safe here: following addc re-seeds CA
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_2)
	b	L(ret)


C Common exit: restore callee-saved registers and return.
L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	blr
EPILOGUE()