dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/p6/aorsmul_1.asm

dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   mpn_addmul_1    mpn_submul_1
C                   cycles/limb     cycles/limb
C POWER3/PPC630          ?               ?
C POWER4/PPC970          ?               ?
C POWER5                 ?               ?
C POWER6               12.25           12.8
C POWER7                 ?               ?

C TODO
C  * Reduce register usage.
C  * Schedule function entry code.
C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
C    would bring us to 9 c/l.
C  * Handle n = 1 and perhaps n = 2 separately, without saving any registers.

C INPUT PARAMETERS
C   rp (r3)   limb vector that is read, updated in place and written back
C   up (r4)   source limb vector, each limb is multiplied by v0
C   n  (r5)   limb count.  Must be at least 1: with n = 0 the code takes the
C             b0 path with CTR = 0 and the bdnz loop does not stop after the
C             intended number of passes.
C   v0 (r6)   multiplier limb
C RETURN VALUE
C   r3        most significant limb of the result that did not fit in n limbs
C             (carry-out for addmul, borrow-out for submul)
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')
define(`v0', `r6')

C OPERATION SELECTION
C   ADDSUB    first add/subtract of a chain: ignores incoming CA, sets CA
C   ADDSUBC   continuing add/subtract: consumes incoming CA, sets CA
C   AM(x)     emits x only when building addmul
C   SM(x)     emits x only when building submul
C   CLRRSC(r) zero r and put CA in the neutral state for the chain that
C             follows (CA = 0 for addmul, CA = 1 meaning no borrow for submul).
C             The addmul form reads r0 as an addend, so it only yields zero
C             where r0 = 0, which holds on the single path that uses it.
ifdef(`OPERATION_addmul_1',`
  define(ADDSUBC,	adde)
  define(ADDSUB,	addc)
  define(func,		mpn_addmul_1)
  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
  define(AM,		`$1')
  define(SM,		`')
  define(CLRRSC,	`addic	$1, r0, 0')
')
ifdef(`OPERATION_submul_1',`
  define(ADDSUBC,	subfe)
  define(ADDSUB,	subfc)
  define(func,		mpn_submul_1)
  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
  define(AM,		`')
  define(SM,		`$1')
  define(CLRRSC,	`subfc	$1, r0, r0')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C REGISTER ROLES INSIDE THE FUNCTION
C   r0, r5, r9, r11   low product limbs (r5 is free once n has gone to CTR)
C   r10, r8, r7, r27  source limbs, then the matching high product limbs
C   r28 - r31         limbs loaded from rp
C   r12               high limb carried from one product chain into the next
C   CTR               number of loop passes, ceil(n / 4)
C   CA                carry (addmul) or inverted borrow (submul) of the rp chain
C
C r27 - r31 are saved below r1 without moving the stack pointer, so the
C function relies on the ABI allowing a leaf routine to use that area.

ASM_START()
PROLOGUE(func)
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)

	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	n, n, 3		C compute count...
	srdi	n, n, 2		C ...for ctr
	mtctr	n		C copy loop count into ctr
	beq	cr0, L(b0)	C n mod 4 = 0
	blt	cr6, L(b1)	C n mod 4 = 1
	beq	cr6, L(b2)	C n mod 4 = 2, otherwise fall through for 3

C n mod 4 = 3: form three products, then join the loop tail at l3.
L(b3):	ld	r8, 0(up)
	ld	r7, 8(up)
	ld	r27, 16(up)
	addi	up, up, 16	C leave up and rp on the last limb...
	addi	rp, rp, 16	C ...handled by this partial pass
	mulld	r5, r8, v0
	mulhdu	r8, r8, v0
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r29, -16(rp)
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	addc	r9, r9, r8	C fold each high limb into the next low limb
	adde	r11, r11, r7
	addze	r12, r27	C r12 = top high limb plus carry
	ADDSUB	r5, r5, r29	C start the rp chain
	b	L(l3)

C n mod 4 = 2: form two products, then join the loop tail at l2.
L(b2):	ld	r7, 0(up)
	ld	r27, 8(up)
	addi	up, up, 8
	addi	rp, rp, 8
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	addc	r11, r11, r7
	addze	r12, r27
	ADDSUB	r9, r9, r30
	b	L(l2)

C n mod 4 = 1: form one product, then join the loop tail at l1.
L(b1):	ld	r27, 0(up)
	ld	r31, 0(rp)
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ADDSUB	r11, r11, r31
	b	L(l1)

C n mod 4 = 0: bias the pointers by one limb so the loop offsets apply, and
C start with a zero high limb and a neutral CA.  r0 = 0 on this path.
L(b0):	addi	up, up, -8
	addi	rp, rp, -8
	CLRRSC(	r12)		C clear r12 and clr/set cy

C Main loop, four limbs per pass.  On entry up and rp point at the last limb
C already handled, r12 holds the pending high limb and CA holds the state of
C the rp chain.  For submul CA is an inverted borrow, so it is complemented
C first to let the adde chain below add the borrow into the products.
	ALIGN(32)
L(top):
SM(`	subfe	r11, r0, r0')		C complement...
SM(`	addic	r11, r11, 1')		C ...carry flag
	ld	r10, 8(up)
	ld	r8, 16(up)
	ld	r7, 24(up)
	ld	r27, 32(up)
	addi	up, up, 32
	addi	rp, rp, 32
	mulld	r0, r10, v0
	mulhdu	r10, r10, v0
	mulld	r5, r8, v0
	mulhdu	r8, r8, v0
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r28, -24(rp)
	adde	r0, r0, r12	C add pending high limb and carry/borrow
	ld	r29, -16(rp)
	adde	r5, r5, r10
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	adde	r9, r9, r8
	adde	r11, r11, r7
	addze	r12, r27	C r12 = high limb pending for the next pass
	ADDSUB	r0, r0, r28	C start the rp chain
	std	r0, -24(rp)
	ADDSUBC	r5, r5, r29
L(l3):	std	r5, -16(rp)
	ADDSUBC	r9, r9, r30
L(l2):	std	r9, -8(rp)
	ADDSUBC	r11, r11, r31
L(l1):	std	r11, 0(rp)
	bdnz	L(top)

C Return value: pending high limb plus the final carry (addmul) or plus the
C final borrow (submul).  Register restores are interleaved with it.
	AM(`	addze	r3, r12')	C r3 = r12 + CA
SM(`	subfe	r11, r0, r0')		C r11 = CA - 1, that is 0 or -1
	ld	r31, -8(r1)
SM(`	subf	r3, r11, r12')		C r3 = r12 - r11 = r12 + borrow
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	blr
EPILOGUE()