dnl  Path: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/bdiv_q_1.asm

dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.

dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C          cycles/limb
C Athlon:     11.0
C Hammer:      9.0


C This file defines two entry points (the code was rearranged from the
C mpn_divexact_1 implementation in dive_1.asm, see the header above):
C
C mp_limb_t mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                             mp_limb_t divisor, mp_limb_t inverse, int shift);
C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C The dependent chain is mul+imul+sub for 11 cycles and that speed is
C achieved with no special effort.  The load and shrld latencies are hidden
C by out of order execution.
C
C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C
C NOTE(review): both prototypes declare an mp_limb_t result, but neither exit
C path loads a dedicated return value; at each ret, eax holds the quotient
C limb that was just stored.  Confirm what callers expect.

C Stack frame.  The PARAM_ names are the six 4-byte incoming arguments and
C the SAVE_/VAR_ names are the six 4-byte local slots covered by STACK_SPACE.
C NOTE(review): defframe and FRAME come from the included m4 definitions,
C which are not visible here; offsets are presumably relative to the entry
C esp and adjusted by the current FRAME value.
C
C mpn_bdiv_q_1 takes only four arguments, so PARAM_INVERSE and PARAM_SHIFT
C are read only by mpn_pi1_bdiv_q_1.

defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
defframe(VAR_INVERSE, -20)
defframe(VAR_DST_END, -24)

deflit(STACK_SPACE, 24)

	TEXT

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Writes size quotient limbs to dst.  The source is read shifted right by
C shift bits (shrdl across adjacent limbs, shrl on the top limb), and each
C quotient limb is
C
C	q[i] = (shifted_src[i] - carry) * inverse	(low 32 bits)
C
C where carry is the high half of q[i-1] * divisor plus the borrow bit from
C the previous subtraction.
C
C No check for size == 0 is made; src[0] is always read.
C
C ebx, esi, edi and ebp are saved in the frame on entry and reloaded before
C each ret.  eax, ecx and edx are overwritten.
	ALIGN(16)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	PARAM_SHIFT, %ecx	C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	movl	PARAM_INVERSE, %eax	C inv

C Shared tail, also entered by the jmp at the end of mpn_bdiv_q_1.
C On arrival:
C   eax  inverse
C   ecx  shift
C   esi  src end
C   edi  dst end
C   ebp  -size
C   SAVE_EBX, SAVE_ESI, SAVE_EDI, SAVE_EBP hold the incoming register values
C   PARAM_DIVISOR holds the divisor used by the mull instructions below
L(common):
	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	incl	%ebp
	jz	L(one)			C size==1, no neighbouring limb to shift in

	movl	(%esi,%ebp,4), %edx	C src[1]

	shrdl(	%cl, %edx, %eax)

	movl	%edi, VAR_DST_END	C edi is reused as scratch inside the loop
	xorl	%ebx, %ebx		C no carry bit for the first limb
	jmp	L(entry)

	ALIGN(8)
L(top):
	C eax	q
	C ebx	carry bit, 0 or 1
	C ecx	shift
	C edx
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi,%ebp,4), %eax
	movl	(%esi,%ebp,4), %edi

	shrdl(	%cl, %edi, %eax)

	subl	%ebx, %eax		C apply carry bit
	setc	%bl
	movl	VAR_DST_END, %edi

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ebx

L(entry):
	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi,%ebp,4)
	incl	%ebp
	jnz	L(top)


	C Last limb: there is no higher source limb, so a plain shrl is used,
	C and the borrow out of the subtractions is not needed any more.

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi), %eax		C src high limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C apply carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

	C size==1: shift the single limb, multiply by the inverse, store it.
L(one):
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	SAVE_EBP, %ebp

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret
EPILOGUE()

C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                           mp_limb_t divisor);
C
C Front end for the loop in mpn_pi1_bdiv_q_1.  It
C   - counts the trailing zero bits of divisor into ecx (the shift),
C   - overwrites PARAM_DIVISOR with the divisor with those bits removed,
C   - builds the inverse in eax from an 8-bit binvert_limb_table entry and
C     two steps of inv = 2*inv - inv*inv*d,
C   - sets esi, edi, ebp to src end, dst end, -size and fills the SAVE_
C     slots, then jumps to L(common).
C
C divisor must be nonzero: with eax == 0 the shrl in L(strip_twos) never
C sets the carry flag, so that loop would not terminate.

	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C If there's usually only one or two trailing zero bits then this
	C should be faster than bsfl.
	C
	C The loop exits when a 1 bit is shifted out, leaving ecx equal to
	C the number of zero bits skipped and eax equal to the remaining
	C upper bits of the divisor.
L(strip_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(strip_twos)

	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	jmp	L(common)
EPILOGUE()
ASM_END()