dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/bdiv_q_1.asm

dnl  Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact division.

dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C         divisor
C       odd   even
C P54:  24.5  30.5   cycles/limb
C P55:  23.0  28.0

MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)

C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected.  On P54 in the even case the shrdl pairing nonsense (see
C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
C further 1.5 slowdown for both odd and even.

defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  PARAM_DST is copied into edi before the loops start, so its stack slot is
dnl  recycled to hold the inverse.
define(VAR_INVERSE,`PARAM_DST')

	TEXT

	ALIGN(32)
C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C Entry point taking only the divisor.  The low zero bits of the divisor are
C counted and shifted out, an inverse of the remaining odd value d is built
C from a binvert_limb_table lookup followed by two Newton steps, and control
C then passes to L(div_start) inside mpn_pi1_bdiv_q_1 with
C
C	eax = inverse    ebx = size    ecx = shift count
C
C and with d stored back over PARAM_DIVISOR.  ebx and ebp are pushed here,
C matching the two pushes mpn_pi1_bdiv_q_1 does before L(div_start).
C
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	$-1, %ecx		C first pass of the loop makes this 0
	movl	PARAM_DIVISOR, %eax

L(twos_loop):
	ASSERT(nz, `orl %eax, %eax')
	shrl	%eax			C low bit goes to CF
	incl	%ecx			C shift count; incl leaves CF untouched

	jnc	L(twos_loop)		C loop while the bit shifted out was 0

	leal	1(%eax,%eax), %edx	C edx = d, divisor with twos removed
	andl	$127, %eax		C table index, low 7 bits of d/2

	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

ifdef(`PIC',`
ifdef(`DARWIN',`
	LEA(	binvert_limb_table, %ebp)
	movzbl	(%eax,%ebp), %eax
',`
	call	L(pic_base)
L(pic_base):
	popl	%ebp			C ebp = address of this label

	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(pic_base)], %ebp
	C AGI
	movl	binvert_limb_table@GOT(%ebp), %ebp
	C AGI
	movzbl	(%eax,%ebp), %eax
')
',`

dnl non-PIC
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C First Newton step:  inv = 2*inv - inv*inv*d

	movl	%eax, %ebp		C ebp = inv
	addl	%eax, %eax		C eax = 2*inv

	imull	%ebp, %ebp		C ebp = inv*inv

	imull	%edx, %ebp		C ebp = inv*inv*d

	subl	%ebp, %eax		C eax = refined inv
	movl	PARAM_SIZE, %ebx	C size, expected in ebx at L(div_start)

	C Second Newton step, same formula

	movl	%eax, %ebp
	addl	%eax, %eax		C eax = 2*inv

	imull	%ebp, %ebp		C ebp = inv*inv

	imull	%edx, %ebp		C ebp = inv*inv*d

	subl	%ebp, %eax		C eax = final inv
	movl	%edx, PARAM_DIVISOR	C the loops multiply by d without twos

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	jmp	L(div_start)
EPILOGUE()

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Entry point for callers that supply the inverse and the shift count.
C L(div_start) is also the target of the jmp ending mpn_bdiv_q_1.
C
C Every quotient limb stored is (src limb - carry limb) * inverse.  The carry
C limb is the high half of the previous quotient limb times PARAM_DIVISOR,
C adjusted by the borrow kept in ebp from the previous subtract.  A zero
C shift uses the odd loop.  Otherwise the even loop first shifts each source
C limb right by cl, taking its top bits from the limb above, and the last
C limb is handled separately at L(top_limb) with a plain shrl.
C
	ALIGN(32)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_SHIFT, %ecx

	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_SIZE, %ebx
	movl	PARAM_INVERSE, %eax

L(div_start):
	C eax = inverse, ebx = size, ecx = shift; ebx and ebp already pushed

	pushl	%esi		FRAME_pushl()
	pushl	%edi		FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi
	movl	%eax, VAR_INVERSE	C reuses the dst slot, dst is now in edi

	leal	(%esi,%ebx,4), %esi	C esi = src end
	leal	(%edi,%ebx,4), %edi	C edi = dst end

	negl	%ebx			C ebx = -size, counts up towards zero

	xorl	%ebp, %ebp		C no borrow to start with

	orl	%ecx, %ecx		C ZF set when shift is zero
	movl	(%esi,%ebx,4), %eax	C src low limb (movl keeps the flags)
	jz	L(odd_first)

	xorl	%edx, %edx		C carry limb is zero when size is 1
	incl	%ebx
	jz	L(top_limb)		C size 1: only the final step is needed

	movl	(%esi,%ebx,4), %edx	C src second limb
	shrdl(	%cl, %edx, %eax)	C low limb shifted, top bits from second

	jmp	L(even_first)


	ALIGN(8)
L(odd_loop):
	C eax	quotient limb just stored, then scratch
	C ebx	counter, limbs, negative
	C ecx
	C edx	scratch
	C esi	src end
	C edi	dst end
	C ebp	borrow, 0 or -1

	mull	PARAM_DIVISOR		C edx = high limb of quotient limb * d

	movl	(%esi,%ebx,4), %eax	C next src limb
	subl	%ebp, %edx		C apply borrow; ebp is 0 or -1

	subl	%edx, %eax		C src limb - carry limb

	sbbl	%ebp, %ebp		C ebp = 0 or -1 from that subtract

L(odd_first):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, (%edi,%ebx,4)

	incl	%ebx
	jnz	L(odd_loop)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret

L(even_loop):
	C eax	quotient limb just stored, then scratch
	C ebx	counter, limbs, negative, one ahead of the limb in progress
	C ecx	twos (shift count, used as cl)
	C edx	scratch
	C esi	src end
	C edi	dst end
	C ebp	borrow, 0 or -1; briefly holds the src limb above

	mull	PARAM_DIVISOR		C edx = high limb of quotient limb * d

	subl	%ebp, %edx		C apply borrow
	movl	-4(%esi,%ebx,4), %eax	C src limb in progress

	movl	(%esi,%ebx,4), %ebp	C src limb above it

	shrdl(	%cl, %ebp, %eax)	C shifted limb, top bits from the one above

	subl	%edx, %eax		C shifted limb - carry limb

	sbbl	%ebp, %ebp		C ebp = 0 or -1 from that subtract

L(even_first):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, -4(%edi,%ebx,4)
	incl	%ebx

	jnz	L(even_loop)

	mull	PARAM_DIVISOR		C carry limb for the final step

	movl	-4(%esi), %eax		C src high limb
	subl	%ebp, %edx		C apply borrow

L(top_limb):
	shrl	%cl, %eax		C nothing above to shift in

	subl	%edx, %eax		C no carry if division is exact

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	nop				C protect against cache bank clash

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret

EPILOGUE()
ASM_END()