github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/divrem_1.asm (about) 1 dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. 2 3 dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C 486 approx 43 maybe 36 C P5 44 37 C P6 39 38 C P6MMX 39 39 C K6 22 40 C K7 42 41 C P4 58 42 43 44 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 45 C mp_srcptr src, mp_size_t size, mp_limb_t divisor); 46 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 47 C mp_srcptr src, mp_size_t size, mp_limb_t divisor, 48 C mp_limb_t carry); 49 C 50 C Divide src,size by divisor and store the quotient in dst+xsize,size. 51 C Extend the division to fractional quotient limbs in dst,xsize. Return the 52 C remainder. Either or both xsize and size can be 0. 53 C 54 C mpn_divrem_1c takes a carry parameter which is an initial high limb, 55 C effectively one extra limb at the top of src,size. Must have 56 C carry<divisor. 57 C 58 C 59 C Essentially the code is the same as the division based part of 60 C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl 61 C instruction even when gcc is not being used (when longlong.h only has the 62 C rather slow generic C udiv_qrnnd(). 63 C 64 C A test is done to see if the high limb is less than the divisor, and if so 65 C one less div is done. A div is between 20 and 40 cycles on the various 66 C x86s, so assuming high<divisor about half the time, then this test saves 67 C half that amount. The branch misprediction penalty on each chip is less 68 C than half a div. 69 C 70 C 71 C Notes for P5: 72 C 73 C It might be thought that moving the load down to pair with the store would 74 C save 1 cycle, but that doesn't seem to happen in practice, and in any case 75 C would be a mere 2.2% saving, so it's hardly worth bothering about. 76 C 77 C A mul-by-inverse might be a possibility for P5, as done in 78 C mpn/x86/pentium/mod_1.asm. The number of auxiliary instructions required 79 C is a hinderance, but there could be a 10-15% speedup available. 80 C 81 C 82 C Notes for K6: 83 C 84 C K6 has its own version of this code, using loop and paying attention to 85 C cache line boundary crossings. The target 20 c/l can be had with the 86 C decl+jnz of the present code by pairing up the load and store in the 87 C loops. But it's considered easier not to introduce complexity just for 88 C that, but instead let k6 have its own code. 89 C 90 91 defframe(PARAM_CARRY, 24) 92 defframe(PARAM_DIVISOR,20) 93 defframe(PARAM_SIZE, 16) 94 defframe(PARAM_SRC, 12) 95 defframe(PARAM_XSIZE, 8) 96 defframe(PARAM_DST, 4) 97 98 TEXT 99 ALIGN(16) 100 101 PROLOGUE(mpn_divrem_1c) 102 deflit(`FRAME',0) 103 104 movl PARAM_SIZE, %ecx 105 pushl %edi FRAME_pushl() 106 107 movl PARAM_SRC, %edi 108 pushl %esi FRAME_pushl() 109 110 movl PARAM_DIVISOR, %esi 111 pushl %ebx FRAME_pushl() 112 113 movl PARAM_DST, %ebx 114 pushl %ebp FRAME_pushl() 115 116 movl PARAM_XSIZE, %ebp 117 orl %ecx, %ecx 118 119 movl PARAM_CARRY, %edx 120 jz L(fraction) 121 122 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part 123 jmp L(integer_top) 124 125 EPILOGUE() 126 127 128 PROLOGUE(mpn_divrem_1) 129 deflit(`FRAME',0) 130 131 movl PARAM_SIZE, %ecx 132 pushl %edi FRAME_pushl() 133 134 movl PARAM_SRC, %edi 135 pushl %esi FRAME_pushl() 136 137 movl PARAM_DIVISOR, %esi 138 orl %ecx,%ecx 139 140 jz L(size_zero) 141 pushl %ebx FRAME_pushl() 142 143 movl -4(%edi,%ecx,4), %eax C src high limb 144 xorl %edx, %edx 145 146 movl PARAM_DST, %ebx 147 pushl %ebp FRAME_pushl() 148 149 movl PARAM_XSIZE, %ebp 150 cmpl %esi, %eax 151 152 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part 153 jae L(integer_entry) 154 155 156 C high<divisor, so high of dst is zero, and avoid one div 157 158 movl %edx, (%ebx,%ecx,4) 159 decl %ecx 160 161 movl %eax, %edx 162 jz L(fraction) 163 164 165 L(integer_top): 166 C eax scratch (quotient) 167 C ebx dst+4*xsize-4 168 C ecx counter 169 C edx scratch (remainder) 170 C esi divisor 171 C edi src 172 C ebp xsize 173 174 movl -4(%edi,%ecx,4), %eax 175 L(integer_entry): 176 177 divl %esi 178 179 movl %eax, (%ebx,%ecx,4) 180 decl %ecx 181 jnz L(integer_top) 182 183 184 L(fraction): 185 orl %ebp, %ecx 186 jz L(done) 187 188 movl PARAM_DST, %ebx 189 190 191 L(fraction_top): 192 C eax scratch (quotient) 193 C ebx dst 194 C ecx counter 195 C edx scratch (remainder) 196 C esi divisor 197 C edi 198 C ebp 199 200 xorl %eax, %eax 201 202 divl %esi 203 204 movl %eax, -4(%ebx,%ecx,4) 205 decl %ecx 206 jnz L(fraction_top) 207 208 209 L(done): 210 popl %ebp 211 movl %edx, %eax 212 popl %ebx 213 popl %esi 214 popl %edi 215 ret 216 217 218 L(size_zero): 219 deflit(`FRAME',8) 220 movl PARAM_XSIZE, %ecx 221 xorl %eax, %eax 222 223 movl PARAM_DST, %edi 224 225 cld C better safe than sorry, see mpn/x86/README 226 227 rep 228 stosl 229 230 popl %esi 231 popl %edi 232 ret 233 EPILOGUE()