github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/p6/mul_basecase.asm (about) 1 dnl PowerPC-64 mpn_mul_basecase. 2 3 dnl Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C POWER3/PPC630 ? 35 C POWER4/PPC970 ? 36 C POWER5 ? 37 C POWER6 12.25 38 39 C TODO 40 C * Reduce register usage. At least 4 register less can be used. 41 C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling 42 C would bring us to 9 c/l. 43 C * The bdz insns for b1 and b2 will never branch, 44 C * Align things better, perhaps by moving things like pointer updates from 45 C before to after loops. 
C INPUT PARAMETERS
C   rp = result pointer (receives un + vn limbs)
C   up = first operand pointer,  un = its size in limbs
C   vp = second operand pointer, vn = its size in limbs
C NOTE(review): the standard mpn_mul_basecase contract (un >= vn >= 1,
C rp not overlapping up/vp) is assumed here — confirm against gmp-impl.h.
define(`rp', `r3')
define(`up', `r4')
define(`un', `r5')
define(`vp', `r6')
define(`vn', `r7')

C Register roles in the general (un > 2) path:
C   v0       = current limb of vp being multiplied in
C   outer_rp = base rp position of the current outer (vp) iteration
C   outer_up = saved up, so each outer iteration can rewind it
C   r12      = high limb carried from one 4-way unrolled group to the
C              next; the CA flag chains the adds inside a group
define(`v0', `r25')
define(`outer_rp', `r22')
define(`outer_up', `r23')

ASM_START()
PROLOGUE(mpn_mul_basecase)

C Special code for un <= 2, for efficiency of these important cases,
C and since it simplifies the default code.
	cmpdi	cr0, un, 2
	bgt	cr0, L(un_gt2)
	cmpdi	cr6, vn, 1
	ld	r7, 0(vp)
	ld	r5, 0(up)
	mulld	r8, r5, r7	C weight 0
	mulhdu	r9, r5, r7	C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)	C un == 2
	std	r9, 8(rp)	C un == 1: 1x1 -> 2 limbs, done
	blr
	ALIGN(16)
C un == 2: add in up[1] * vp[0]; then 2x2 if vn == 2.
L(2x):	ld	r0, 8(up)
	mulld	r8, r0, r7	C weight 1
	mulhdu	r10, r0, r7	C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)
	std	r9, 8(rp)	C 2x1 -> 3 limbs, done
	std	r10, 16(rp)
	blr
	ALIGN(16)
C 2x2 -> 4 limbs: accumulate the four cross products by weight.
L(2x2):	ld	r6, 8(vp)
	nop
	mulld	r8, r5, r6	C weight 1
	mulhdu	r11, r5, r6	C weight 2
	mulld	r12, r0, r6	C weight 2
	mulhdu	r0, r0, r6	C weight 3
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr

C General case.  Save callee-saved r20-r31 below the stack pointer
C without allocating a frame (NOTE(review): relies on the PPC64 ELF
C ABI red zone / protected area — confirm for the target ABI).
L(un_gt2):
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)
	std	r21, -88(r1)
	std	r20, -96(r1)

	mr	outer_rp, rp
	mr	outer_up, up

	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)

C Dispatch on un mod 4 to a peeled prologue feeding the 4-way loops.
	rldicl.	r0, un, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	un, un, 4	C compute count...
	srdi	un, un, 2	C ...for ctr
	mtctr	un		C copy inner loop count into ctr
	beq	cr0, L(b0)	C un mod 4 == 0
	blt	cr6, L(b1)	C un mod 4 == 1
	beq	cr6, L(b2)	C un mod 4 == 2
				C fall through: un mod 4 == 3


	ALIGN(16)
C un mod 4 == 3: first pass peels three limbs (mul only, nothing to
C add in yet), then runs the 4-way mul loop L(lo_m_3).
L(b3):
	ld	r27, 8(up)
	ld	r20, 16(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r10, r20, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10	C r12 = carry limb into the loop
	std	r0, 0(rp)
	std	r24, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 16
	addi	rp, rp, 16
	bdz	L(end_m_3)

	ALIGN(32)
C First-pass loop: rp[] = up[] * v0, 4 limbs/iteration.
L(lo_m_3):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12	C fold in carry limb from last group
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C carry limb for next group
	bdnz	L(lo_m_3)

	ALIGN(16)
L(end_m_3):
	addze	r12, r12	C final carry limb of this pass
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	beq	L(ret)

	ALIGN(16)
C Subsequent passes: rp[] += up[] * v0 (addmul), 3 peeled limbs first.
L(outer_lo_3):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 24
	addi	up, outer_up, 16
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -16(up)
	ld	r27, -8(up)
	ld	r20, 0(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r10, r20, v0
	ld	r28, -16(rp)
	ld	r29, -8(rp)
	ld	r30, 0(rp)
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	addc	r0, r0, r28	C add existing rp limbs
	std	r0, -16(rp)
	adde	r24, r24, r29
	std	r24, -8(rp)
	adde	r9, r9, r30
	std	r9, 0(rp)
	bdz	L(end_3)

	ALIGN(32)		C registers dying
C Addmul loop: 4 limbs/iteration; two carry chains (product chain via
C r12 + CA, then add-into-rp chain via CA).
L(lo_3):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_3)		C

	ALIGN(16)
L(end_3):
	addze	r12, r12
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	bne	L(outer_lo_3)
	b	L(ret)


	ALIGN(16)
C un mod 4 == 1: one peeled limb; addic clears CA for the adde chain.
L(b1):
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addic	r0, r0, 0	C clear CA (r0 + 0 never carries)
	std	r0, 0(rp)
	bdz	L(end_m_1)

	ALIGN(16)
C First-pass loop: rp[] = up[] * v0, 4 limbs/iteration.
L(lo_m_1):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12	C fold in carry limb from last group
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C carry limb for next group
	bdnz	L(lo_m_1)

	ALIGN(16)
L(end_m_1):
	addze	r12, r12
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	beq	L(ret)

	ALIGN(16)
C Subsequent passes for un mod 4 == 1.
L(outer_lo_1):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 8
	mr	up, outer_up
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)
	ld	r28, 0(rp)
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addc	r0, r0, r28	C add existing rp limb
	std	r0, 0(rp)
	bdz	L(end_1)

	ALIGN(32)		C registers dying
C Addmul loop, identical body to L(lo_3).
L(lo_1):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_1)		C

	ALIGN(16)
L(end_1):
	addze	r12, r12
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	bne	L(outer_lo_1)
	b	L(ret)


	ALIGN(16)
C un mod 4 == 0: no peeled limbs; bias pointers back one limb so the
C loop's 8(..) addressing lines up, and clear CA and r12.
L(b0):
	addi	up, up, -8
	addi	rp, rp, -8
	li	r12, 0
	addic	r12, r12, 0	C clear CA
	bdz	L(end_m_0)

	ALIGN(16)
C First-pass loop: rp[] = up[] * v0, 4 limbs/iteration.
L(lo_m_0):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12	C fold in carry limb from last group
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C carry limb for next group
	bdnz	L(lo_m_0)

	ALIGN(16)
L(end_m_0):
	addze	r12, r12
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	beq	L(ret)

	ALIGN(16)
C Subsequent passes for un mod 4 == 0.
L(outer_lo_0):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 0
	addi	up, outer_up, -8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	li	r12, 0
	addic	r12, r12, 0	C clear CA
	bdz	L(end_0)

	ALIGN(32)		C registers dying
C Addmul loop, identical body to L(lo_3).
L(lo_0):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_0)		C

	ALIGN(16)
L(end_0):
	addze	r12, r12
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	bne	L(outer_lo_0)
	b	L(ret)


	ALIGN(16)
C un mod 4 == 2: two peeled limbs.
L(b2):	ld	r27, 8(up)
	addi	up, up, 8
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8		C r12 = carry limb into the loop
	std	r0, 0(rp)
	std	r24, 8(rp)
	addi	rp, rp, 8
	bdz	L(end_m_2)

	ALIGN(16)
C First-pass loop: rp[] = up[] * v0, 4 limbs/iteration.
L(lo_m_2):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12	C fold in carry limb from last group
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C carry limb for next group
	bdnz	L(lo_m_2)

	ALIGN(16)
L(end_m_2):
	addze	r12, r12
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	beq	L(ret)

	ALIGN(16)
C Subsequent passes for un mod 4 == 2.
L(outer_lo_2):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 16
	addi	up, outer_up, 8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -8(up)
	ld	r27, 0(up)
	ld	r28, -8(rp)
	ld	r29, 0(rp)
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	addc	r0, r0, r28	C add existing rp limbs
	std	r0, -8(rp)
	adde	r24, r24, r29
	std	r24, 0(rp)
	bdz	L(end_2)

	ALIGN(16)		C registers dying
C Addmul loop, identical body to L(lo_3).
L(lo_2):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_2)		C

	ALIGN(16)
L(end_2):
	addze	r12, r12
	addic.	vn, vn, -1	C decrement vn, set cr0
	std	r12, 8(rp)
	bne	L(outer_lo_2)
C	b	L(ret)		C fall through

C Restore callee-saved registers and return.
L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	ld	r21, -88(r1)
	ld	r20, -96(r1)
	blr
EPILOGUE()