github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/divrem_2.asm (about) 1 dnl Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. 2 3 dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C norm frac 34 C ev4 35 C ev5 70 70 36 C ev6 29 29 37 38 C TODO 39 C * Perhaps inline mpn_invert_limb, that would allow us to not save/restore 40 C any registers (thus save ~10 cycles per call). 41 C * Use negated d1 and/or d0 to speed carry propagation. Might save a cycle 42 C or two. 43 C * Check cluster delays (for ev6). We very likely could save some cycles. 44 C * Use branch-free code for computing di. 45 C * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call. 
C INPUT PARAMETERS
define(`qp',		`r16')
define(`fn',		`r17')
define(`up_param',	`r18')
define(`un_param',	`r19')
define(`dp',		`r20')

C mpn_divrem_2 (qp, fn, up, un, dp)
C
C Divides the un-limb number at up, extended by fn zero fraction limbs, by
C the two-limb divisor at dp.  Quotient limbs are stored through qp, the two
C remainder limbs are written back into the up area, and the most significant
C quotient limb (0 or 1) is returned in r0.
C
C Stack frame (80 bytes):
C    0(r30)  r26 (return address)
C    8(r30)  r9
C   16(r30)  r10
C   24(r30)  r11
C   32(r30)  r12
C   40(r30)  r13
C   48(r30)  spill slot for the loop counter around the mpn_invert_limb call
C   56(r30)  r15
C   64(r30)  incoming qp
C   72(r30)  most significant quotient limb
C
C Register roles after setup:
C   r9   n1, second highest remainder limb
C   r10  d0 = dp[0]
C   r11  n2, highest remainder limb
C   r12  d1 = dp[1]
C   r13  pointer into up, initially &up[un-3]
C   r15  fn
C   r16  quotient store pointer (inside the loop)
C   r19  loop counter i, starting at fn + un - 3
C   r0   di, the inverse returned by mpn_invert_limb and then adjusted for d0
C
C r19 is an argument register and is therefore not preserved across calls by
C the calling standard, so the counter is spilled to 48(r30) around the call
C instead of relying on mpn_invert_limb leaving r19 intact.

ASM_START()
PROLOGUE(mpn_divrem_2,gp)
	lda	r30, -80(r30)
	stq	r26, 0(r30)
	stq	r9, 8(r30)
	stq	r10, 16(r30)
	stq	r11, 24(r30)
	stq	r12, 32(r30)
	stq	r13, 40(r30)
C	r14 is not used and hence not saved; 48(r30) holds the r19 spill below
	stq	r15, 56(r30)
	.prologue	1
	stq	r16, 64(r30)		C keep qp for after the call
	bis	r31, r17, r15		C r15 = fn
	s8addq	r19, r18, r13		C r13 = up + 8*un
	lda	r13, -24(r13)		C r13 = &up[un-3]
	ldq	r12, 8(r20)		C d1
	ldq	r10, 0(r20)		C d0
	ldq	r11, 16(r13)		C n2 = up[un-1]
	ldq	r9, 8(r13)		C n1 = up[un-2]

C If the two high numerator limbs are >= the divisor, subtract the divisor
C once and record a most significant quotient limb of 1.
	bis	r31, r31, r3		C most_significant_q_limb = 0
	cmpult	r11, r12, r1
	bne	r1, L(L8)
	cmpule	r11, r12, r1
	cmpult	r9, r10, r2
	and	r1, r2, r1
	bne	r1, L(L8)
	subq	r11, r12, r11
	subq	r11, r2, r11
	subq	r9, r10, r9
	lda	r3, 1(r31)		C most_significant_q_limb = 1
L(L8):	stq	r3, 72(r30)

	addq	r15, r19, r19
	lda	r19, -3(r19)		C i = fn + un - 3
	blt	r19, L(L10)		C no quotient limbs to develop
	stq	r19, 48(r30)		C spill i, r19 is not call-preserved
	bis	r31, r12, r16		C argument: d1
	jsr	r26, mpn_invert_limb
	LDGP(	r29, 0(r26))
	ldq	r19, 48(r30)		C reload i

C Adjust the inverse of d1 so that it accounts for d0 as well.
	mulq	r0, r12, r4		C t0 = LO(di * d1)
	umulh	r0, r10, r2		C s1 = HI(di * d0)
	addq	r4, r10, r4		C t0 += d0
	cmpule	r10, r4, r7		C !(t0 < d0), i.e. 1 when no carry out
	addq	r4, r2, r4		C t0 += s1
	cmpult	r4, r2, r1		C cy for: t0 += s1
	subq	r1, r7, r7		C t1 (-1, 0, or 1)
	blt	r7, L(L42)
L(L22):
	lda	r0, -1(r0)		C di--
	cmpult	r4, r12, r1		C cy for: t0 -= d1 (below)
	subq	r7, r1, r7		C t1 -= cy
	subq	r4, r12, r4		C t0 -= d1
	bge	r7, L(L22)
L(L42):
	ldq	r16, 64(r30)		C qp
	s8addq	r19, r16, r16		C r16 = &qp[i]
	ALIGN(16)
L(loop):
	mulq	r11, r0, r5		C q0 (early)
	umulh	r11, r0, r6		C q  (early)
	addq	r5, r9, r8		C q0 += n1
	addq	r6, r11, r6		C q  += n2
	cmpult	r8, r5, r1		C cy for: q0 += n1
	addq	r6, r1, r6		C q  += cy
	unop
	mulq	r12, r6, r1		C LO(d1 * q)
	umulh	r10, r6, r7		C t1 = HI(d0 * q)
	subq	r9, r1, r9		C n1 -= LO(d1 * q)
	mulq	r10, r6, r4		C t0 = LO(d0 * q)
	unop
	cmple	r15, r19, r5		C fn <= i?  r5 is also n0 = 0 when false
	beq	r5, L(L31)		C fraction part: keep n0 = 0
	ldq	r5, 0(r13)		C n0 = next numerator limb
	lda	r13, -8(r13)
L(L31):	subq	r9, r12, r9		C n1 -= d1
	cmpult	r5, r10, r1		C cy for: n0 -= d0
	subq	r9, r1, r9		C n1 -= cy
	subq	r5, r10, r5		C n0 -= d0
	subq	r9, r7, r9		C n1 -= t1
	cmpult	r5, r4, r1		C cy for: n0 -= t0
	subq	r9, r1, r2		C n1 -= cy
	subq	r5, r4, r5		C n0 -= t0
	cmpult	r2, r8, r1		C (n1 < q0)
	addq	r6, r1, r6		C q += cond
	lda	r1, -1(r1)		C -(n1 >= q0)
	and	r1, r10, r4		C mask & d0
	addq	r5, r4, r9		C n0 += mask & d0
	and	r1, r12, r1		C mask & d1
	cmpult	r9, r5, r11		C cy for: n0 += mask & d0
	addq	r2, r1, r1		C n1 += mask & d1
	addq	r1, r11, r11		C n1 += cy
	cmpult	r11, r12, r1		C n1 < d1?
	beq	r1, L(fix)		C no: remainder may still be >= d
L(bck):	stq	r6, 0(r16)		C store quotient limb
	lda	r16, -8(r16)
	lda	r19, -1(r19)		C i--
	bge	r19, L(loop)

C Store the two remainder limbs, fetch the return value, restore and return.
L(L10):	stq	r9, 8(r13)
	stq	r11, 16(r13)
	ldq	r0, 72(r30)		C return most_significant_q_limb
	ldq	r26, 0(r30)
	ldq	r9, 8(r30)
	ldq	r10, 16(r30)
	ldq	r11, 24(r30)
	ldq	r12, 32(r30)
	ldq	r13, 40(r30)
	ldq	r15, 56(r30)
	lda	r30, 80(r30)
	ret	r31, (r26), 1

C Reached when the high remainder limb is >= d1.  If the remainder is still
C below the divisor (high limbs equal and low limb < d0) nothing is changed,
C otherwise subtract the divisor once more and increment the quotient limb.
L(fix):	cmpule	r11, r12, r1
	cmpult	r9, r10, r2
	and	r1, r2, r1
	bne	r1, L(bck)
	subq	r11, r12, r11
	subq	r11, r2, r11
	subq	r9, r10, r9
	lda	r6, 1(r6)
	br	L(bck)
EPILOGUE()
ASM_END()