dnl  Listing of github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/divrem_1.asm

dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 20 cycles/limb


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                          mp_limb_t carry);
C
C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
C instead of decl+jnz, since it comes out 2 cycles/limb faster.
C
C A test is done to see if the high limb is less than the divisor, and if so
C one less div is done.  A div is 20 cycles, so assuming high<divisor about
C half the time, then this test saves half that amount.
C The branch misprediction penalty of the high<divisor test is less than the
C half-a-div it saves on average.
C
C Back-to-back div instructions run at 20 cycles, the same as the loop here,
C so it seems there is nothing to gain by rearranging the loop.  Pairing the
C mov and loop instructions was found to gain nothing.
C
C Enhancements:
C
C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
C that algorithm has been found to suffer from the relatively poor carry
C handling on K6 and too many auxiliary instructions.  The fractional part
C however could be done at about 13 c/l, if it mattered enough.


C Stack arguments, as byte offsets from the stack pointer at function entry.
C FRAME (maintained by FRAME_pushl) accounts for the registers pushed since
C entry, so every PARAM_* reference must sit textually after the matching
C pushes.

defframe(PARAM_CARRY,  24)
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

	TEXT


C -----------------------------------------------------------------------------
C mpn_divrem_1c
C
C Entry point taking an extra carry argument, which is loaded into edx as the
C starting remainder.  After setting up the registers it joins the loops of
C mpn_divrem_1 below: the integer loop when size!=0, the fraction section
C when size==0.  All four callee-saved registers are pushed here so that the
C stack matches what L(finish) pops.

	ALIGN(32)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%edi		FRAME_pushl()

	movl	PARAM_SRC, %edi
	pushl	%esi		FRAME_pushl()

	movl	PARAM_DIVISOR, %esi
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_DST, %ebx
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_XSIZE, %ebp
	orl	%ecx, %ecx		C set ZF from size

	movl	PARAM_CARRY, %edx	C initial remainder (mov keeps the flags)
	jz	L(frac)			C size==0, no integer limbs to divide

	leal	-4(%ebx,%ebp,4), %ebx	C ebx = dst + 4*xsize - 4
	jmp	L(int_top)

EPILOGUE()


C -----------------------------------------------------------------------------
C mpn_divrem_1
C
C Starts with a zero remainder.  The quotient limbs are stored to dst, the
C integer ones above the xsize fraction ones, and the final remainder is
C returned in eax.

	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%edi		FRAME_pushl()

	movl	PARAM_SRC, %edi
	pushl	%esi		FRAME_pushl()

	movl	PARAM_DIVISOR, %esi
	orl	%ecx,%ecx		C set ZF from size

	jz	L(no_src)		C size==0, only edi and esi pushed so far
	pushl	%ebx		FRAME_pushl()

	movl	-4(%edi,%ecx,4), %eax	C eax = src[size-1], the high limb
	xorl	%edx, %edx		C remainder starts at zero

	movl	PARAM_DST, %ebx
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_XSIZE, %ebp
	cmpl	%esi, %eax		C high limb against divisor

	leal	-4(%ebx,%ebp,4), %ebx	C ebx = dst + 4*xsize - 4 (lea keeps flags)
	jae	L(int_entry)		C high>=divisor, divide it like the rest


	C Here high<divisor: the top quotient limb is zero and the high limb
	C itself becomes the running remainder, which saves one div.

	movl	%edx, (%ebx,%ecx,4)	C store zero as the top quotient limb
	decl	%ecx

	movl	%eax, %edx		C remainder = high limb (mov keeps flags)
	jz	L(frac)			C that was the only src limb


C Integer part: one div per source limb, working from the high limb down.

L(int_top):
	C eax	scratch (quotient)
	C ebx	dst+4*xsize-4
	C ecx	counter, source limbs still to do
	C edx	scratch (remainder)
	C esi	divisor
	C edi	src
	C ebp	xsize

	movl	-4(%edi,%ecx,4), %eax	C next source limb
L(int_entry):

	divl	%esi			C edx:eax / divisor

	movl	%eax, (%ebx,%ecx,4)	C quotient limb
	loop	L(int_top)		C ecx--, repeat while non-zero


C Fraction part: xsize further quotient limbs, dividing remainder:0 each time.
C Every path arriving here has ecx==0, so the orl leaves ecx = xsize.

L(frac):
	orl	%ebp, %ecx
	jz	L(finish)		C xsize==0, nothing more to produce

	movl	PARAM_DST, %ebx


L(frac_top):
	C eax	scratch (quotient)
	C ebx	dst
	C ecx	counter, fraction limbs still to do
	C edx	scratch (remainder)
	C esi	divisor
	C edi
	C ebp

	xorl	%eax, %eax		C low dividend limb is zero

	divl	%esi

	movl	%eax, -4(%ebx,%ecx,4)	C dst[ecx-1]
	loop	L(frac_top)


C Common exit for the paths which pushed all four registers.

L(finish):
	popl	%ebp
	movl	%edx, %eax		C return the remainder
	popl	%ebx
	popl	%esi
	popl	%edi
	ret


C size==0 in mpn_divrem_1: store xsize zero limbs at dst and return zero.
C Only edi and esi have been pushed on this path, hence FRAME is 8.

L(no_src):
deflit(`FRAME',8)
	movl	PARAM_XSIZE, %ecx	C count for rep
	xorl	%eax, %eax		C fill value, and the return value

	movl	PARAM_DST, %edi

	cld	C better safe than sorry, see mpn/x86/README

	rep
	stosl

	popl	%esi
	popl	%edi
	ret

EPILOGUE()