dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                       cycles/limb
C                       norm    unorm   frac
C POWER3/PPC630        16-34   16-34    ~11    outdated figures
C POWER4/PPC970         28      28       19
C POWER5                29      29      ~19
C POWER6                49      59      ~42
C POWER7               24.5    23      ~14

C INPUT PARAMETERS
C qp = r3
C fn = r4
C up = r5
C un = r6
C d  = r7

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C TODO
C  * Add preinv entry point.
53 54 55 ASM_START() 56 57 EXTERN_FUNC(mpn_invert_limb) 58 59 PROLOGUE(mpn_divrem_1,toc) 60 61 mfcr r12 62 add. r10, r6, r4 63 std r25, -56(r1) 64 mr r25, r4 65 mflr r0 66 std r26, -48(r1) 67 mr r26, r5 68 std r28, -32(r1) 69 mr r28, r6 70 std r29, -24(r1) 71 mr r29, r3 72 li r3, 0 73 std r30, -16(r1) 74 mr r30, r7 75 std r31, -8(r1) 76 li r31, 0 77 std r27, -40(r1) 78 std r0, 16(r1) 79 stw r12, 8(r1) 80 stdu r1, -176(r1) 81 beq- cr0, L(1) 82 cmpdi cr7, r7, 0 83 sldi r0, r10, 3 84 add r11, r0, r29 85 addi r29, r11, -8 86 blt- cr7, L(162) 87 cmpdi cr4, r6, 0 88 beq+ cr4, L(71) 89 L(163): 90 sldi r9, r6, 3 91 add r9, r9, r5 92 ld r7, -8(r9) 93 cmpld cr7, r7, r30 94 bge- cr7, L(71) 95 cmpdi cr7, r10, 1 96 li r0, 0 97 mr r31, r7 98 std r0, -8(r11) 99 addi r29, r29, -8 100 mr r3, r7 101 beq- cr7, L(1) 102 addi r28, r6, -1 103 cmpdi cr4, r28, 0 104 L(71): 105 cntlzd r27, r30 106 sld r30, r30, r27 107 sld r31, r31, r27 108 mr r3, r30 109 CALL( mpn_invert_limb) 110 beq- cr4, L(110) 111 sldi r9, r28, 3 112 addic. 
r6, r28, -2 113 add r9, r9, r26 114 subfic r5, r27, 64 115 ld r8, -8(r9) 116 srd r0, r8, r5 117 or r31, r31, r0 118 sld r7, r8, r27 119 blt- cr0, L(154) 120 addi r28, r28, -1 121 mtctr r28 122 sldi r6, r6, 3 123 ALIGN(16) 124 L(uloop): 125 ldx r8, r26, r6 126 nop 127 mulld r0, r31, r3 128 mulhdu r10, r31, r3 129 addi r11, r31, 1 130 srd r9, r8, r5 131 addi r6, r6, -8 132 or r9, r7, r9 133 addc r0, r0, r9 134 adde r10, r10, r11 135 mulld r31, r10, r30 136 subf r31, r31, r9 137 subfc r0, r31, r0 C r <= ql 138 subfe r0, r0, r0 C r0 = -(r <= ql) 139 and r9, r30, r0 140 add r31, r31, r9 141 add r10, r0, r10 C qh -= (r >= ql) 142 cmpld cr7, r31, r30 143 bge- cr7, L(164) 144 L(123): 145 std r10, 0(r29) 146 addi r29, r29, -8 147 sld r7, r8, r27 148 bdnz L(uloop) 149 L(154): 150 addi r11, r31, 1 151 nop 152 mulld r0, r31, r3 153 mulhdu r8, r31, r3 154 addc r0, r0, r7 155 adde r8, r8, r11 156 mulld r31, r8, r30 157 subf r31, r31, r7 158 subfc r0, r0, r31 C r >= ql 159 subfe r0, r0, r0 C r0 = -(r >= ql) 160 not r7, r0 161 add r8, r7, r8 C qh -= (r >= ql) 162 andc r0, r30, r0 163 add r31, r31, r0 164 cmpld cr7, r31, r30 165 bge- cr7, L(165) 166 L(134): 167 std r8, 0(r29) 168 addi r29, r29, -8 169 L(110): 170 addic. 
r0, r25, -1 171 blt- cr0, L(156) 172 mtctr r25 173 neg r9, r30 174 ALIGN(16) 175 L(ufloop): 176 addi r11, r31, 1 177 nop 178 mulld r0, r3, r31 179 mulhdu r10, r3, r31 180 add r10, r10, r11 181 mulld r31, r9, r10 182 ifelse(0,1,` 183 subfc r0, r0, r31 184 subfe r0, r0, r0 C r0 = -(r >= ql) 185 not r7, r0 186 add r10, r7, r10 C qh -= (r >= ql) 187 andc r0, r30, r0 188 add r31, r31, r0 189 ',` 190 cmpld cr7, r31, r0 191 blt cr7, L(29) 192 add r31, r30, r31 193 addi r10, r10, -1 194 L(29): 195 ') 196 std r10, 0(r29) 197 addi r29, r29, -8 198 bdnz L(ufloop) 199 L(156): 200 srd r3, r31, r27 201 L(1): 202 addi r1, r1, 176 203 ld r0, 16(r1) 204 lwz r12, 8(r1) 205 mtlr r0 206 ld r25, -56(r1) 207 ld r26, -48(r1) 208 mtcrf 8, r12 209 ld r27, -40(r1) 210 ld r28, -32(r1) 211 ld r29, -24(r1) 212 ld r30, -16(r1) 213 ld r31, -8(r1) 214 blr 215 L(162): 216 cmpdi cr7, r6, 0 217 beq- cr7, L(8) 218 sldi r9, r6, 3 219 addi r29, r29, -8 220 add r9, r9, r5 221 addi r28, r6, -1 222 ld r31, -8(r9) 223 subfc r9, r7, r31 224 li r9, 0 225 adde r9, r9, r9 226 neg r0, r9 227 std r9, -8(r11) 228 and r0, r0, r7 229 subf r31, r0, r31 230 L(8): 231 mr r3, r30 232 CALL( mpn_invert_limb) 233 li r27, 0 234 addic. 
r6, r28, -1 235 blt- cr0, L(110) 236 mtctr r28 237 sldi r6, r6, 3 238 ALIGN(16) 239 L(nloop): 240 addi r11, r31, 1 241 ldx r8, r26, r6 242 mulld r0, r31, r3 243 mulhdu r10, r31, r3 244 addi r6, r6, -8 245 addc r0, r0, r8 246 adde r10, r10, r11 247 mulld r31, r10, r30 248 subf r31, r31, r8 C r = nl - qh * d 249 subfc r0, r31, r0 C r <= ql 250 subfe r0, r0, r0 C r0 = -(r <= ql) 251 and r9, r30, r0 252 add r31, r31, r9 253 add r10, r0, r10 C qh -= (r >= ql) 254 cmpld cr7, r31, r30 255 bge- cr7, L(167) 256 L(51): 257 std r10, 0(r29) 258 addi r29, r29, -8 259 bdnz L(nloop) 260 b L(110) 261 262 L(164): 263 subf r31, r30, r31 264 addi r10, r10, 1 265 b L(123) 266 L(167): 267 subf r31, r30, r31 268 addi r10, r10, 1 269 b L(51) 270 L(165): 271 subf r31, r30, r31 272 addi r8, r8, 1 273 b L(134) 274 EPILOGUE()