github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/bdiv_dbm1c.asm (about) 1 dnl Alpha mpn_bdiv_dbm1c. 2 3 dnl Copyright 2008 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C EV4: 42 35 C EV5: 18 36 C EV6: 3 37 38 C TODO 39 C * Try less unrolling, 2-way should give the same performance. 40 C * Optimize feed-in and wind-down code, for speed, and perhaps further for 41 C code size. 42 C * This runs optimally given the algorithm, r8 is on a 3 operation recurrency 43 C path. We have not tried very hard to find a better algorithm. Perhaps 44 C it would be a good task for the GNU superoptimizer. 
C INPUT PARAMETERS
C The instructions below use raw register names; these defines document the
C argument registers.  cy is the fifth argument and is read from r20 by the
C first instruction of the function (mov r20, r8).  It was previously defined
C as r19, which is the register holding bd.
define(`rp', `r16')
define(`up', `r17')
define(`n',  `r18')
define(`bd', `r19')
define(`cy', `r20')

C mpn_bdiv_dbm1c (rp, up, n, bd, cy)
C
C Per source limb u, with h the running value kept in r8 (h starts as cy):
C	p0 = low  64 bits of u * bd		(mulq)
C	p1 = high 64 bits of u * bd		(umulh)
C	borrow = (h < p0), unsigned		(cmpult)
C	h = h - p0, h is stored to the next rp limb
C	h = h - (p1 + borrow)
C The final h is returned in r0.
C
C Register roles, as used by the code below:
C	r16	destination pointer, biased by the feed-in code so that the
C		fixed store offsets used further down hit the right limbs
C	r17	source pointer
C	r18	limb counter, holds n-4 after entry and is lowered by 4 per
C		block of four limbs; the main loop runs while it is > 0
C	r19	multiplier bd
C	r8	running value h
C	r0-r3	source limbs fetched ahead for the main loop
C	r24	first source limb
C	r4-r7	low product halves
C	r20-r23	high product halves, r20 is reused once cy has been copied
C	r28	n mod 4 during dispatch, afterwards borrow + high product half
C
C NOTE(review): the first limb is loaded unconditionally, so the code appears
C to require n >= 1 -- confirm against the callers.

ASM_START()
PROLOGUE(mpn_bdiv_dbm1c)
	mov	r20, r8			C h = cy, frees r20 for products

	ldq	r24, 0(r17)		C first source limb
	and	r18, 3, r28		C r28 = n mod 4 selects the feed-in path
	lda	r18, -4(r18)		C r18 = n - 4
	beq	r28, L(b0)
	cmpeq	r28, 1, r21
	bne	r21, L(b1)
	cmpeq	r28, 2, r21
	bne	r21, L(b2)


C n mod 4 = 3
L(b3):	ldq	r2, 8(r17)
	ldq	r3, 16(r17)
	bgt	r18, L(gt3)

	C exactly three limbs, finish in the last three wind-down steps
	mulq	r24, r19, r5		C U1
	umulh	r24, r19, r21		C U1
	mulq	r2, r19, r6		C U1
	umulh	r2, r19, r22		C U1
	mulq	r3, r19, r7		C U1
	umulh	r3, r19, r23		C U1
	lda	r16, -32(r16)		C cj3 stores at 32(r16)
	br	L(cj3)

L(gt3):	ldq	r0, 24(r17)
	mulq	r24, r19, r5		C U1
	umulh	r24, r19, r21		C U1
	ldq	r1, 32(r17)
	mulq	r2, r19, r6		C U1
	umulh	r2, r19, r22		C U1
	ldq	r2, 40(r17)
	mulq	r3, r19, r7		C U1
	umulh	r3, r19, r23		C U1
	ldq	r3, 48(r17)
	lda	r18, -4(r18)
	lda	r17, 56(r17)
	mulq	r0, r19, r4		C U1
	bgt	r18, L(L3)

	br	L(cj7)


C n mod 4 = 2
L(b2):	ldq	r3, 8(r17)
	bgt	r18, L(gt2)

	C exactly two limbs
	mulq	r24, r19, r6		C U1
	umulh	r24, r19, r22		C U1
	mulq	r3, r19, r7		C U1
	umulh	r3, r19, r23		C U1
	lda	r16, -40(r16)		C cj2 stores at 40(r16)
	br	L(cj2)

L(gt2):	ldq	r0, 16(r17)
	ldq	r1, 24(r17)
	mulq	r24, r19, r6		C U1
	umulh	r24, r19, r22		C U1
	ldq	r2, 32(r17)
	mulq	r3, r19, r7		C U1
	umulh	r3, r19, r23		C U1
	ldq	r3, 40(r17)
	lda	r18, -4(r18)
	lda	r17, 48(r17)
	mulq	r0, r19, r4		C U1
	umulh	r0, r19, r20		C U1
	lda	r16, -8(r16)		C L2 and cj6 store at 8(r16)
	bgt	r18, L(gt6)

	mulq	r1, r19, r5		C U1
	br	L(cj6)

L(gt6):	ldq	r0, 0(r17)
	mulq	r1, r19, r5		C U1
	br	L(L2)


C n mod 4 = 1
L(b1):	bgt	r18, L(gt1)

	C exactly one limb
	mulq	r24, r19, r7		C U1
	umulh	r24, r19, r23		C U1
	lda	r16, -48(r16)		C cj1 stores at 48(r16)
	br	L(cj1)

L(gt1):	ldq	r0, 8(r17)
	ldq	r1, 16(r17)
	ldq	r2, 24(r17)
	mulq	r24, r19, r7		C U1
	umulh	r24, r19, r23		C U1
	ldq	r3, 32(r17)
	lda	r18, -4(r18)
	lda	r17, 40(r17)
	mulq	r0, r19, r4		C U1
	umulh	r0, r19, r20		C U1
	lda	r16, -16(r16)		C L1 and cj5 store at 16(r16)
	bgt	r18, L(gt5)

	mulq	r1, r19, r5		C U1
	umulh	r1, r19, r21		C U1
	mulq	r2, r19, r6		C U1
	br	L(cj5)

L(gt5):	ldq	r0, 0(r17)
	mulq	r1, r19, r5		C U1
	umulh	r1, r19, r21		C U1
	ldq	r1, 8(r17)
	mulq	r2, r19, r6		C U1
	br	L(L1)


C n mod 4 = 0
L(b0):	ldq	r1, 8(r17)
	ldq	r2, 16(r17)
	ldq	r3, 24(r17)
	lda	r17, 32(r17)
	lda	r16, -24(r16)		C L0 and cj4 store at 24(r16)
	mulq	r24, r19, r4		C U1
	umulh	r24, r19, r20		C U1
	bgt	r18, L(gt4)

	mulq	r1, r19, r5		C U1
	umulh	r1, r19, r21		C U1
	mulq	r2, r19, r6		C U1
	umulh	r2, r19, r22		C U1
	mulq	r3, r19, r7		C U1
	br	L(cj4)

L(gt4):	ldq	r0, 0(r17)
	mulq	r1, r19, r5		C U1
	umulh	r1, r19, r21		C U1
	ldq	r1, 8(r17)
	mulq	r2, r19, r6		C U1
	umulh	r2, r19, r22		C U1
	ldq	r2, 16(r17)
	mulq	r3, r19, r7		C U1
	br	L(L0)

C Four limbs per iteration.  Each group of six instructions completes one
C limb (borrow, subtract low half, add high half into r28, store) while the
C products and the load for later limbs are started.  L3, L2, L1 and L0 are
C the entry points used by the feed-in code above.
C *** MAIN LOOP START ***
	ALIGN(16)
L(top):	mulq	r0, r19, r4		C U1
	subq	r8, r28, r8		C h -= high half + borrow
L(L3):	umulh	r0, r19, r20		C U1
	cmpult	r8, r5, r28		C borrow = h < p0
	ldq	r0, 0(r17)
	subq	r8, r5, r8		C h -= p0
	addq	r21, r28, r28		C r28 = p1 + borrow
	stq	r8, 0(r16)

	mulq	r1, r19, r5		C U1
	subq	r8, r28, r8
L(L2):	umulh	r1, r19, r21		C U1
	cmpult	r8, r6, r28
	ldq	r1, 8(r17)
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 8(r16)

	mulq	r2, r19, r6		C U1
	subq	r8, r28, r8
L(L1):	umulh	r2, r19, r22		C U1
	cmpult	r8, r7, r28
	ldq	r2, 16(r17)
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 16(r16)

	mulq	r3, r19, r7		C U1
	subq	r8, r28, r8
L(L0):	umulh	r3, r19, r23		C U1
	cmpult	r8, r4, r28
	ldq	r3, 24(r17)
	subq	r8, r4, r8
	addq	r20, r28, r28
	stq	r8, 24(r16)

	lda	r18, -4(r18)
	lda	r17, 32(r17)
	lda	r16, 32(r16)
	bgt	r18, L(top)
C *** MAIN LOOP END ***

C Wind-down: the same per-limb steps without further loads.  The last three
C steps also issue no multiplies.  cj7 down to cj1 are entered with that
C many limbs still to be stored.
	mulq	r0, r19, r4		C U1
	subq	r8, r28, r8
L(cj7):	umulh	r0, r19, r20		C U1
	cmpult	r8, r5, r28
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 0(r16)
	mulq	r1, r19, r5		C U1
	subq	r8, r28, r8
L(cj6):	umulh	r1, r19, r21		C U1
	cmpult	r8, r6, r28
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 8(r16)
	mulq	r2, r19, r6		C U1
	subq	r8, r28, r8
L(cj5):	umulh	r2, r19, r22		C U1
	cmpult	r8, r7, r28
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 16(r16)
	mulq	r3, r19, r7		C U1
	subq	r8, r28, r8
L(cj4):	umulh	r3, r19, r23		C U1
	cmpult	r8, r4, r28
	subq	r8, r4, r8
	addq	r20, r28, r28
	stq	r8, 24(r16)
	subq	r8, r28, r8
L(cj3):	cmpult	r8, r5, r28
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 32(r16)
	subq	r8, r28, r8
L(cj2):	cmpult	r8, r6, r28
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 40(r16)
	subq	r8, r28, r8
L(cj1):	cmpult	r8, r7, r28
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 48(r16)
	subq	r8, r28, r0		C return value = final h
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()