dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/dive_1.asm

dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C         divisor
C       odd   even
C P54:  24.5  30.5   cycles/limb
C P55:  23.0  28.0


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C Plain divl is used for small sizes, since the inverse takes a while to
C set up.  Multiplying works out faster for size>=3 when the divisor is odd,
C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
C size==3 for even are about the same speed for both divl or mul, but the
C former is used since it will use up less code cache.
C
C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected.  On P54 in the even case the shrdl pairing nonsense (see
C mpn/x86/pentium/README) costs 1 cycle, but it is not clear why there is a
C further 1.5 slowdown for both odd and even.
C
C Outline of the code below:
C
C   1. When size + (divisor & 1) < 4, a divl loop walks from the high limb
C      down to the low limb, chaining the remainder through edx.
C
C   2. Otherwise the divisor is split as d * 2^shift with d odd.  An inverse
C      of d is built from a binvert_limb_table lookup followed by two Newton
C      steps, and quotient limbs are then produced from the low limb upwards
C      as (src limb - carry) * inverse.  A separate loop handles shift != 0,
C      forming each src limb from two adjacent limbs with shrdl.
C
C The four arguments are read from the stack through the PARAM_* names set up
C by the defframe lines.  The macros defframe, deflit, FRAME_pushl,
C FRAME_popl, LEA, ASSERT, L, shrdl, TEXT, ALIGN, PROLOGUE, EPILOGUE and
C ASM_END are not defined in this file; presumably they come in through the
C include of config.m4 above.
C
C NOTE(review): nothing in this file checks for size == 0.  Presumably the
C caller guarantees size >= 1; confirm against the mpn interface rules.

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space: the dst pointer is copied to edi before the
dnl  inverse is stored, so the PARAM_DST stack slot is free to hold it
define(VAR_INVERSE,`PARAM_DST')

	TEXT

	ALIGN(32)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	C Choose between the divl loop and the multiply-by-inverse code.
	C Argument loads and register saves are interleaved with the test.

	movl	PARAM_DIVISOR, %eax
	movl	PARAM_SIZE, %ecx

	pushl	%esi		FRAME_pushl()
	push	%edi		FRAME_pushl()

	movl	PARAM_SRC, %esi
	andl	$1, %eax	C 1 if divisor odd, 0 if even

	movl	PARAM_DST, %edi
	addl	%ecx, %eax	C size if even, size+1 if odd

	cmpl	$4, %eax
	jae	L(mul_by_inverse)	C unsigned: odd size>=3, even size>=4


	C Small size.  edx is the remainder carried down from the limb above,
	C zero for the high limb.

	xorl	%edx, %edx
L(div_top):
	C eax	scratch, src limb then quotient limb
	C ecx	counter, limbs still to do, counts down to zero
	C edx	remainder from the previous divl
	C esi	src
	C edi	dst

	movl	-4(%esi,%ecx,4), %eax	C src limb at index ecx-1

	divl	PARAM_DIVISOR		C edx:eax / divisor, quotient eax, remainder edx

	movl	%eax, -4(%edi,%ecx,4)	C dst limb at index ecx-1
	decl	%ecx

	jnz	L(div_top)

	popl	%edi
	popl	%esi

	ret



L(mul_by_inverse):
	C Only esi and edi have been pushed at this point; ebx and ebp are
	C saved a little further down, before their first use.

	movl	PARAM_DIVISOR, %eax
	movl	$-1, %ecx		C becomes 0 on the first pass below

L(strip_twos):
	ASSERT(nz, `orl	%eax, %eax')
	shrl	%eax			C low bit goes out into the carry flag
	incl	%ecx			C shift count

	jnc	L(strip_twos)		C incl leaves carry alone, so this tests the bit shifted out

	C Here ecx is the number of low zero bits in the divisor, and eax is
	C the odd part of the divisor shifted right one more bit.

	leal	1(%eax,%eax), %edx	C d = 2*eax+1, the divisor without its twos
	andl	$127, %eax		C d/2, 7 bits, used as the table index

	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

ifdef(`PIC',`dnl
	LEA(	binvert_limb_table, %ebp)
	movzbl	(%eax,%ebp), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C Two steps of inv = 2*inv - inv*inv*d refine the table value.  The
	C ASSERT after them expects d*inv == 1 mod 2^GMP_LIMB_BITS.  Loading
	C size and storing d are slotted in between the arithmetic.

	movl	%eax, %ebp		C inv
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	PARAM_SIZE, %ebx

	movl	%eax, %ebp
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	%edx, PARAM_DIVISOR	C d without twos, replaces the caller value in the stack slot

	leal	(%esi,%ebx,4), %esi	C src end
	leal	(%edi,%ebx,4), %edi	C dst end

	negl	%ebx			C -size, counts up towards zero

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE	C kept in the PARAM_DST slot, dst itself is in edi
	xorl	%ebp, %ebp		C initial carry bit

	movl	(%esi,%ebx,4), %eax	C src low limb
	orl	%ecx, %ecx		C shift, sets ZF when there were no twos

	movl	4(%esi,%ebx,4), %edx	C src second limb (for even)
	jz	L(odd_entry)		C movl does not touch flags, so this still tests the orl

	shrdl(	%cl, %edx, %eax)	C low limb of src >> shift, top bits from the second limb

	incl	%ebx			C the even loop addresses the current limb at offset -4
	jmp	L(even_entry)


	ALIGN(8)
L(odd_top):
	C eax	scratch, holds the quotient limb just stored when arriving here
	C ebx	counter, limbs, negative
	C ecx	not used in this loop
	C edx	scratch, carry limb
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR		C edx = high half of quotient limb * d

	movl	(%esi,%ebx,4), %eax	C src limb
	subl	%ebp, %edx		C ebp is 0 or -1, so this adds the carry bit

	subl	%edx, %eax		C src limb - carry limb

	sbbl	%ebp, %ebp		C new carry bit, -1 if that subtract borrowed

L(odd_entry):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, (%edi,%ebx,4)

	incl	%ebx
	jnz	L(odd_top)


	popl	%ebp
	popl	%ebx

	popl	%edi
	popl	%esi

	ret


L(even_top):
	C eax	scratch, holds the quotient limb just stored when arriving here
	C ebx	counter, limbs, negative
	C ecx	twos
	C edx	scratch, carry limb
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1, also borrowed briefly for the upper src limb

	mull	PARAM_DIVISOR		C edx = high half of quotient limb * d

	subl	%ebp, %edx		C carry bit
	movl	-4(%esi,%ebx,4), %eax	C src limb

	movl	(%esi,%ebx,4), %ebp	C and one above it

	shrdl(	%cl, %ebp, %eax)	C src >> twos for this limb position

	subl	%edx, %eax		C carry limb

	sbbl	%ebp, %ebp		C new carry bit, overwrites the limb held in ebp

L(even_entry):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, -4(%edi,%ebx,4)
	incl	%ebx

	jnz	L(even_top)


	C High limb.  There is no limb above it to shift in, so a plain shrl
	C is used in place of shrdl.

	mull	PARAM_DIVISOR

	movl	-4(%esi), %eax		C src high limb
	subl	%ebp, %edx		C carry bit

	shrl	%cl, %eax

	subl	%edx, %eax		C no carry if division is exact

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	nop				C protect against cache bank clash

	popl	%ebp
	popl	%ebx

	popl	%edi
	popl	%esi

	ret

EPILOGUE()
ASM_END()