dnl  PowerPC-64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C POWER3/PPC630        6-18
C POWER4/PPC970        8
C POWER5               8
C POWER6              16.25
C POWER7               3.77

C NOTES
C  * This is very crude, cleanup!
C  * Try to reduce the number of needed live registers.
C  * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4.  The
C    cost will be more live registers.
C  * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
C    size a lot and speed things up perhaps 25%.
C  * Use computed goto in order to compress the code.
C  * Implement a larger final corner.
C  * Schedule callee-saves register saves into other insns.  This could save
C    about 5 cycles/call.  (We cannot analogously optimise the restores, since
C    the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
C  * Should the alternating std/adde sequences be split?  Some pipelines handle
C    adde poorly, and might sequentialise all these instructions.
C  * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
C    adjacent integer multiply insns.  Except for the multiply insns, the code
C    was not carefully optimised for POWER6 or any other CPU.
C  * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
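C The overall scheme: for U = u[n-1]*B^(n-1) + ... + u[1]*B + u[0],
C
C   U^2 = 2 * sum(u[i]*u[j]*B^(i+j), i < j)  +  sum(u[i]^2 * B^(2i))
C
C The mul_1/addmul_1-style passes below (L(b0)..L(b3), L(outer_loop) and
C its entry points) accumulate the half triangle of cross products
C u[i]*u[j], i < j, starting at rp+1; the sqr_diag_addlsh1 part
C (L(xb0)..L(xb3) feeding L(top)) then doubles that area while adding in
C the diagonal squares u[i]^2.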

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`rp_outer', `r25')
define(`up_outer', `r21')
define(`rp_saved', `r22')
define(`up_saved', `r23')
define(`n_saved',  `r24')

ASM_START()
PROLOGUE(mpn_sqr_basecase)
        cmpdi   cr0, n, 2
        bge     cr0, L(ge2)
        ld      r5, 0(up)       C n = 1
        nop
        mulld   r8, r5, r5      C weight 0
        mulhdu  r9, r5, r5      C weight 1
        std     r8, 0(rp)
        std     r9, 8(rp)
        blr
        ALIGN(16)
L(ge2): bgt     cr0, L(gt2)
        ld      r0, 0(up)       C n = 2
        nop
        mulld   r8, r0, r0      C u0 * u0
        mulhdu  r9, r0, r0      C u0 * u0
        ld      r6, 8(up)
        mulld   r10, r6, r6     C u1 * u1
        mulhdu  r11, r6, r6     C u1 * u1
        mulld   r4, r6, r0      C u1 * u0
        mulhdu  r5, r6, r0      C u1 * u0
        addc    r4, r4, r4      C 2*u1*u0, low
        adde    r5, r5, r5      C 2*u1*u0, high
        addze   r11, r11
        addc    r9, r9, r4
        adde    r10, r10, r5
        addze   r11, r11
        std     r8, 0(rp)
        std     r9, 8(rp)
        std     r10, 16(rp)
        std     r11, 24(rp)
        blr

        ALIGN(16)
L(gt2): std     r31, -8(r1)
        std     r30, -16(r1)
        std     r29, -24(r1)
        std     r28, -32(r1)
        std     r27, -40(r1)
        std     r26, -48(r1)
        std     r25, -56(r1)
        std     r24, -64(r1)
        std     r23, -72(r1)
        std     r22, -80(r1)
        std     r21, -88(r1)

        mr      rp_saved, rp
        mr      up_saved, up
        mr      n_saved, n
        mr      rp_outer, rp
        mr      up_outer, up

        rldicl. r0, n, 0,62     C r0 = n & 3, set cr0
        cmpdi   cr6, r0, 2
        addic   r7, n, 2        C compute count...
        srdi    r7, r7, 2       C ...for ctr
        mtctr   r7              C copy count into ctr
        beq-    cr0, L(b0)
        blt-    cr6, L(b1)
        beq-    cr6, L(b2)

C First pass: mul_1 product up[1..n-1] * up[0], stored at rp+1, with
C 4-way unrolled loops; L(b0)..L(b3) are the feed-ins for n mod 4 = 0..3.
L(b3):  ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        addi    up, up, 24
        li      r12, 0          C carry limb
        bdz     L(em3)

        ALIGN(16)
L(tm3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm3)

L(em3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop)

L(b0):  ld      r6, 0(up)
        ld      r27, 8(up)
        mulld   r7, r27, r6
        mulhdu  r12, r27, r6
        std     r7, 8(rp)
        addi    rp, rp, 8
        ld      r9, 16(up)
        ld      r27, 24(up)
        addi    up, up, 32
        bdz     L(em0)

        ALIGN(16)
L(tm0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm0)

L(em0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop_ent_2)

L(b1):  ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r12, r27, r6
        addc    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addi    rp, rp, 16
        ld      r9, 24(up)
        ld      r27, 32(up)
        addi    up, up, 40
        bdz     L(em1)

        ALIGN(16)
L(tm1): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm1)

L(em1): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop_ent_3)

L(b2):  addi    r7, r7, -1      C FIXME
        mtctr   r7              C FIXME
        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 24(up)
        mulld   r11, r9, r6
        mulhdu  r10, r9, r6
        addc    r7, r7, r26
        adde    r11, r11, r8
        addze   r12, r10
        std     r0, 8(rp)
        std     r7, 16(rp)
        std     r11, 24(rp)
        addi    rp, rp, 24
        ld      r9, 32(up)
        ld      r27, 40(up)
        addi    up, up, 48
        bdz     L(em2)

        ALIGN(16)
L(tm2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm2)

L(em2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop_ent_0)


C Outer loops: addmul_1-style passes, one per remaining up limb, adding
C that limb's products into rp; entered at L(outer_loop) or at one of the
C L(outer_loop_ent_*) points depending on n mod 4.
L(outer_loop):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        bdz     L(outer_end)

        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 24(up)
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        ld      r30, 16(rp)
        mulld   r11, r9, r6
        mulhdu  r10, r9, r6
        addc    r7, r7, r26
        adde    r11, r11, r8
        addze   r12, r10
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        adde    r11, r11, r30
        std     r11, 16(rp)
        addi    rp, rp, 24
        ld      r9, 32(up)
        ld      r27, 40(up)
        addi    up, up, 48
        bdz     L(ea1)

        ALIGN(16)
L(ta1): mulld   r0, r9, r6
        mulhdu  r26, r9, r6     C 9
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6     C 27
        ld      r9, 0(up)
        ld      r28, 0(rp)
        ld      r27, 8(up)
        ld      r29, 8(rp)
        adde    r0, r0, r12     C 0 12
        adde    r7, r7, r26     C 5 7
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6     C 9
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6    C 27
        ld      r9, 16(up)
        ld      r30, 16(rp)
        ld      r27, 24(up)
        ld      r31, 24(rp)
        adde    r26, r26, r8    C 8 5
        adde    r11, r11, r10   C 10 11
        addze   r12, r12        C 12
        addc    r0, r0, r28     C 0 28
        std     r0, 0(rp)       C 0
        adde    r7, r7, r29     C 7 29
        std     r7, 8(rp)       C 7
        adde    r26, r26, r30   C 5 30
        std     r26, 16(rp)     C 5
        adde    r11, r11, r31   C 11 31
        std     r11, 24(rp)     C 11
        addi    up, up, 32
        addi    rp, rp, 32
        bdnz    L(ta1)

L(ea1): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        adde    r0, r0, r12
        adde    r7, r7, r26
        addze   r8, r8
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addze   r8, r8
        std     r8, 16(rp)

L(outer_loop_ent_0):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        addc    r0, r0, r28
        adde    r7, r7, r26
        addze   r12, r8
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addi    rp, rp, 16
        ld      r9, 24(up)
        ld      r27, 32(up)
        addi    up, up, 40
        bdz     L(ea0)

        ALIGN(16)
L(ta0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6     C 9
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6     C 27
        ld      r9, 0(up)
        ld      r28, 0(rp)
        ld      r27, 8(up)
        ld      r29, 8(rp)
        adde    r0, r0, r12     C 0 12
        adde    r7, r7, r26     C 5 7
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6     C 9
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6    C 27
        ld      r9, 16(up)
        ld      r30, 16(rp)
        ld      r27, 24(up)
        ld      r31, 24(rp)
        adde    r26, r26, r8    C 8 5
        adde    r11, r11, r10   C 10 11
        addze   r12, r12        C 12
        addc    r0, r0, r28     C 0 28
        std     r0, 0(rp)       C 0
        adde    r7, r7, r29     C 7 29
        std     r7, 8(rp)       C 7
        adde    r26, r26, r30   C 5 30
        std     r26, 16(rp)     C 5
        adde    r11, r11, r31   C 11 31
        std     r11, 24(rp)     C 11
        addi    up, up, 32
        addi    rp, rp, 32
        bdnz    L(ta0)

L(ea0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        adde    r0, r0, r12
        adde    r7, r7, r26
        addze   r8, r8
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addze   r8, r8
        std     r8, 16(rp)

L(outer_loop_ent_3):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r28, 0(rp)
        mulld   r0, r9, r6
        mulhdu  r12, r9, r6
        addc    r0, r0, r28
        std     r0, 0(rp)
        addi    rp, rp, 8
        ld      r9, 16(up)
        ld      r27, 24(up)
        addi    up, up, 32
        bdz     L(ea3)

        ALIGN(16)
L(ta3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6     C 9
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6     C 27
        ld      r9, 0(up)
        ld      r28, 0(rp)
        ld      r27, 8(up)
        ld      r29, 8(rp)
        adde    r0, r0, r12     C 0 12
        adde    r7, r7, r26     C 5 7
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6     C 9
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6    C 27
        ld      r9, 16(up)
        ld      r30, 16(rp)
        ld      r27, 24(up)
        ld      r31, 24(rp)
        adde    r26, r26, r8    C 8 5
        adde    r11, r11, r10   C 10 11
        addze   r12, r12        C 12
        addc    r0, r0, r28     C 0 28
        std     r0, 0(rp)       C 0
        adde    r7, r7, r29     C 7 29
        std     r7, 8(rp)       C 7
        adde    r26, r26, r30   C 5 30
        std     r26, 16(rp)     C 5
        adde    r11, r11, r31   C 11 31
        std     r11, 24(rp)     C 11
        addi    up, up, 32
        addi    rp, rp, 32
        bdnz    L(ta3)

L(ea3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        adde    r0, r0, r12
        adde    r7, r7, r26
        addze   r8, r8
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addze   r8, r8
        std     r8, 16(rp)


L(outer_loop_ent_2):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        addic   r0, r0, 0       C clear CA
        li      r12, 0          C cy_limb = 0
        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        bdz     L(ea2)
        addi    up, up, 24

        ALIGN(16)
L(ta2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6     C 9
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6     C 27
        ld      r9, 0(up)
        ld      r28, 0(rp)
        ld      r27, 8(up)
        ld      r29, 8(rp)
        adde    r0, r0, r12     C 0 12
        adde    r7, r7, r26     C 5 7
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6     C 9
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6    C 27
        ld      r9, 16(up)
        ld      r30, 16(rp)
        ld      r27, 24(up)
        ld      r31, 24(rp)
        adde    r26, r26, r8    C 8 5
        adde    r11, r11, r10   C 10 11
        addze   r12, r12        C 12
        addc    r0, r0, r28     C 0 28
        std     r0, 0(rp)       C 0
        adde    r7, r7, r29     C 7 29
        std     r7, 8(rp)       C 7
        adde    r26, r26, r30   C 5 30
        std     r26, 16(rp)     C 5
        adde    r11, r11, r31   C 11 31
        std     r11, 24(rp)     C 11
        addi    up, up, 32
        addi    rp, rp, 32
        bdnz    L(ta2)

L(ea2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        adde    r0, r0, r12
        adde    r7, r7, r26
        addze   r8, r8
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addze   r8, r8
        std     r8, 16(rp)

        b       L(outer_loop)

C Final corner: add the last single cross product.
L(outer_end):
        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r11, 0(rp)
        mulld   r0, r9, r6
        mulhdu  r8, r9, r6
        addc    r0, r0, r11
        std     r0, 0(rp)
        addze   r8, r8
        std     r8, 8(rp)

C Finish: sqr_diag_addlsh1.  Double the accumulated cross products and
C add the diagonal squares u[i]^2, feeding in at L(xb0)..L(xb3)
C according to n mod 4.
define(`rp', `rp_saved')
define(`up', `r5')
define(`n', `r6')
define(`climb', `r0')

        addi    r4, rp_saved, 8
        mr      r5, up_saved
        mr      r6, n_saved

        rldicl. r0, n, 0,62     C r0 = n & 3, set cr0
        cmpdi   cr6, r0, 2
        addi    n, n, 2         C compute count...
        srdi    n, n, 2         C ...for ctr
        mtctr   n               C put loop count into ctr
        beq     cr0, L(xb0)
        blt     cr6, L(xb1)
        beq     cr6, L(xb2)

L(xb3): ld      r6, 0(up)
        ld      r7, 8(up)
        ld      r12, 16(up)
        addi    up, up, 24
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        mulld   r28, r12, r12
        mulhdu  r29, r12, r12
        ld      r10, 8(rp)
        ld      r11, 16(rp)
        ld      r6, 24(rp)
        ld      r7, 32(rp)
        addc    r10, r10, r10
        adde    r11, r11, r11
        adde    r6, r6, r6
        adde    r7, r7, r7
        addze   climb, r29
        addc    r10, r10, r25
        adde    r11, r11, r26
        adde    r6, r6, r27
        adde    r7, r7, r28
        std     r24, 0(rp)
        std     r10, 8(rp)
        std     r11, 16(rp)
        std     r6, 24(rp)
        std     r7, 32(rp)
        addi    rp, rp, 40
        bdnz    L(top)
        b       L(end)

L(xb2): ld      r6, 0(up)
        ld      r7, 8(up)
        addi    up, up, 16
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        ld      r10, 8(rp)
        ld      r11, 16(rp)
        addc    r10, r10, r10
        adde    r11, r11, r11
        addze   climb, r27
        addc    r10, r10, r25
        adde    r11, r11, r26
        std     r24, 0(rp)
        std     r10, 8(rp)
        std     r11, 16(rp)
        addi    rp, rp, 24
        bdnz    L(top)
        b       L(end)

L(xb0): ld      r6, 0(up)
        ld      r7, 8(up)
        ld      r12, 16(up)
        ld      r23, 24(up)
        addi    up, up, 32
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        mulld   r28, r12, r12
        mulhdu  r29, r12, r12
        mulld   r30, r23, r23
        mulhdu  r31, r23, r23
        ld      r10, 8(rp)
        ld      r11, 16(rp)
        ld      r6, 24(rp)
        ld      r7, 32(rp)
        ld      r12, 40(rp)
        ld      r23, 48(rp)
        addc    r10, r10, r10
        adde    r11, r11, r11
        adde    r6, r6, r6
        adde    r7, r7, r7
        adde    r12, r12, r12
        adde    r23, r23, r23
        addze   climb, r31
        std     r24, 0(rp)
        addc    r10, r10, r25
        std     r10, 8(rp)
        adde    r11, r11, r26
        std     r11, 16(rp)
        adde    r6, r6, r27
        std     r6, 24(rp)
        adde    r7, r7, r28
        std     r7, 32(rp)
        adde    r12, r12, r29
        std     r12, 40(rp)
        adde    r23, r23, r30
        std     r23, 48(rp)
        addi    rp, rp, 56
        bdnz    L(top)
        b       L(end)

L(xb1): ld      r6, 0(up)
        addi    up, up, 8
        mulld   r24, r6, r6
        mulhdu  climb, r6, r6
        std     r24, 0(rp)
        addic   rp, rp, 8       C clear carry as side-effect

        ALIGN(32)
L(top): ld      r6, 0(up)
        ld      r7, 8(up)
        ld      r12, 16(up)
        ld      r23, 24(up)
        addi    up, up, 32
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        mulld   r28, r12, r12
        mulhdu  r29, r12, r12
        mulld   r30, r23, r23
        mulhdu  r31, r23, r23
        ld      r8, 0(rp)
        ld      r9, 8(rp)
        adde    r8, r8, r8
        adde    r9, r9, r9
        ld      r10, 16(rp)
        ld      r11, 24(rp)
        adde    r10, r10, r10
        adde    r11, r11, r11
        ld      r6, 32(rp)
        ld      r7, 40(rp)
        adde    r6, r6, r6
        adde    r7, r7, r7
        ld      r12, 48(rp)
        ld      r23, 56(rp)
        adde    r12, r12, r12
        adde    r23, r23, r23
        addze   r31, r31
        addc    r8, r8, climb
        std     r8, 0(rp)
        adde    r9, r9, r24
        std     r9, 8(rp)
        adde    r10, r10, r25
        std     r10, 16(rp)
        adde    r11, r11, r26
        std     r11, 24(rp)
        adde    r6, r6, r27
        std     r6, 32(rp)
        adde    r7, r7, r28
        std     r7, 40(rp)
        adde    r12, r12, r29
        std     r12, 48(rp)
        adde    r23, r23, r30
        std     r23, 56(rp)
        mr      climb, r31
        addi    rp, rp, 64
        bdnz    L(top)

L(end): addze   climb, climb
        std     climb, 0(rp)

        ld      r31, -8(r1)
        ld      r30, -16(r1)
        ld      r29, -24(r1)
        ld      r28, -32(r1)
        ld      r27, -40(r1)
        ld      r26, -48(r1)
        ld      r25, -56(r1)
        ld      r24, -64(r1)
        ld      r23, -72(r1)
        ld      r22, -80(r1)
        ld      r21, -88(r1)
        blr
EPILOGUE()
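
dnl  For reference, a rough C model of the value computed above (an
dnl  illustrative sketch only, not part of the build; the function and type
dnl  names here are ours).  The asm is organised differently for speed: it
dnl  accumulates the whole cross-product triangle first and only afterwards
dnl  doubles it while adding the squares, in one sqr_diag_addlsh1 pass.
dnl
dnl    #include <stdint.h>
dnl    typedef uint64_t limb;
dnl    typedef unsigned __int128 dlimb;
dnl
dnl    static void add_at (limb *rp, int k, dlimb v)  /* {rp+k,...} += v */
dnl    {
dnl      while (v != 0)
dnl        {
dnl          dlimb s = (dlimb) rp[k] + (limb) v;
dnl          rp[k++] = (limb) s;
dnl          v = (v >> 64) + (s >> 64);    /* propagate carries upward */
dnl        }
dnl    }
dnl
dnl    void ref_sqr (limb *rp, const limb *up, int n)  /* rp[0..2n-1] = U^2 */
dnl    {
dnl      for (int i = 0; i < 2 * n; i++)
dnl        rp[i] = 0;
dnl      for (int i = 0; i < n; i++)
dnl        {
dnl          add_at (rp, 2 * i, (dlimb) up[i] * up[i]);  /* diagonal square */
dnl          for (int j = i + 1; j < n; j++)
dnl            {
dnl              dlimb p = (dlimb) up[i] * up[j];  /* cross product, counted */
dnl              add_at (rp, i + j, p);            /* twice since u_i*u_j    */
dnl              add_at (rp, i + j, p);            /* = u_j*u_i              */
dnl            }
dnl        }
dnl    }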