dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                          divisor
C                       odd     even
C       K6:            10.0    12.0    cycles/limb
C       K6-2:          10.0    11.5


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C A simple divl is used for size==1.  This is about 10 cycles faster for an
C odd divisor or 20 cycles for an even divisor.
C
C The loops are quite sensitive to code alignment, speeds should be
C rechecked (odd and even divisor, pic and non-pic) if contemplating
C changing anything.
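
C The loops below use the usual exact-division-by-inverse method: the
C divisor has its trailing zero bits stripped, a multiplicative inverse of
C the odd part modulo 2^32 is built from an 8-bit table lookup plus two
C Newton steps, and each quotient limb is then one multiply by that
C inverse, with a borrow propagated from the high half of quotient*divisor.
C As a rough C sketch (not part of the original file; 32-bit limbs assumed,
C odd divisor d, and the names c, c2, q, s invented for illustration):
C
C       inv = binvert_limb_table[(d >> 1) & 127];  /* 8 correct low bits  */
C       inv = 2*inv - inv*inv*d;                   /* 16 correct low bits */
C       inv = 2*inv - inv*inv*d;                   /* 32 correct low bits */
C
C       c = 0;                          /* borrow limb */
C       for (i = 0; i < size; i++)
C         {
C           s = src[i] - c;             /* apply borrow limb       */
C           c2 = (s > src[i]);          /* borrow out of that      */
C           q = s * inv;                /* quotient limb, mod 2^32 */
C           dst[i] = q;
C           c = (mp_limb_t) (((unsigned long long) q * d) >> 32) + c2;
C         }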

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,     8)
defframe(PARAM_DST,     4)

dnl  re-use parameter space
define(VAR_INVERSE,`PARAM_DST')

        TEXT

        ALIGN(32)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx

        movl    PARAM_SRC, %eax
        xorl    %edx, %edx

        cmpl    $1, %ecx
        jnz     L(two_or_more)

        movl    (%eax), %eax

        divl    PARAM_DIVISOR

        movl    PARAM_DST, %ecx
        movl    %eax, (%ecx)

        ret


L(two_or_more):
        movl    PARAM_DIVISOR, %eax
        pushl   %ebx            FRAME_pushl()

        movl    PARAM_SRC, %ebx
        pushl   %ebp            FRAME_pushl()

L(strip_twos):
        shrl    %eax
        incl    %edx                    C will get shift+1

        jnc     L(strip_twos)
        pushl   %esi            FRAME_pushl()

        leal    1(%eax,%eax), %esi      C d without twos
        andl    $127, %eax              C d/2, 7 bits

ifdef(`PIC',`
        LEA(    binvert_limb_table, %ebp)
Zdisp(  movzbl, 0,(%eax,%ebp), %eax)
',`
        movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
')
        pushl   %edi            FRAME_pushl()

        leal    (%eax,%eax), %ebp       C 2*inv

        imull   %eax, %eax              C inv*inv

        movl    PARAM_DST, %edi

        imull   %esi, %eax              C inv*inv*d

        subl    %eax, %ebp              C inv = 2*inv - inv*inv*d
        leal    (%ebp,%ebp), %eax       C 2*inv

        imull   %ebp, %ebp              C inv*inv

        movl    %esi, PARAM_DIVISOR     C d without twos
        leal    (%ebx,%ecx,4), %ebx     C src end

        imull   %esi, %ebp              C inv*inv*d

        leal    (%edi,%ecx,4), %edi     C dst end
        negl    %ecx                    C -size

        subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
        subl    $1, %edx                C shift amount, and clear carry

        ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
        pushl   %eax    FRAME_pushl()
        imull   PARAM_DIVISOR, %eax
        cmpl    $1, %eax
        popl    %eax    FRAME_popl()')

        movl    %eax, VAR_INVERSE
        jnz     L(even)

        movl    (%ebx,%ecx,4), %esi     C src low limb
        jmp     L(odd_entry)


        ALIGN(16)
        nop                     C code alignment
L(odd_top):
        C eax   scratch
        C ebx   src end
        C ecx   counter, limbs, negative
        C edx   inverse
        C esi   next limb, adjusted for carry
        C edi   dst end
        C ebp   carry bit, 0 or -1

        imull   %edx, %esi

        movl    PARAM_DIVISOR, %eax
        movl    %esi, -4(%edi,%ecx,4)

        mull    %esi                    C carry limb in edx

        subl    %ebp, %edx              C apply carry bit
        movl    (%ebx,%ecx,4), %esi

L(odd_entry):
        subl    %edx, %esi              C apply carry limb
        movl    VAR_INVERSE, %edx

        sbbl    %ebp, %ebp              C 0 or -1

        incl    %ecx
        jnz     L(odd_top)


        imull   %edx, %esi

        movl    %esi, -4(%edi,%ecx,4)

        popl    %edi
        popl    %esi

        popl    %ebp
        popl    %ebx

        ret


L(even):
        C eax
        C ebx   src end
        C ecx   -size
        C edx   twos
        C esi
        C edi   dst end
        C ebp

        xorl    %ebp, %ebp
Zdisp(  movq,   0,(%ebx,%ecx,4), %mm0)  C src[0,1]

        movd    %edx, %mm7
        movl    VAR_INVERSE, %edx

        addl    $2, %ecx
        psrlq   %mm7, %mm0

        movd    %mm0, %esi
        jz      L(even_two)             C if only two limbs


C Out-of-order execution is good enough to hide the load/rshift/movd
C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
C been found.  Maybe the fact every second movq is unaligned costs the extra
C 0.5.
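
C For an even divisor, the loop below is the same multiply-by-inverse
C recurrence as the odd loop above, except that each source limb is taken
C from a 64-bit movq/psrlq of two adjacent limbs, shifted down by the
C number of trailing zero bits stripped from the divisor (held in mm7).
C In C terms (a sketch only; twos is the stripped bit count, 1 <= twos < 32):
C
C       s = (src[i] >> twos) | (src[i+1] << (32 - twos));
C
C after which q = s * inverse and the borrow handling are unchanged.  The
C final limb, handled at L(even_two), has no limb above it and is simply
C shifted down.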

L(even_top):
        C eax   scratch
        C ebx   src end
        C ecx   counter, limbs, negative
        C edx   inverse
        C esi   next limb, adjusted for carry
        C edi   dst end
        C ebp   carry bit, 0 or -1
        C
        C mm0   scratch, source limbs
        C mm7   twos

        imull   %edx, %esi

        movl    %esi, -8(%edi,%ecx,4)
        movl    PARAM_DIVISOR, %eax

        mull    %esi                    C carry limb in edx

        movq    -4(%ebx,%ecx,4), %mm0
        psrlq   %mm7, %mm0

        movd    %mm0, %esi
        subl    %ebp, %edx              C apply carry bit

        subl    %edx, %esi              C apply carry limb
        movl    VAR_INVERSE, %edx

        sbbl    %ebp, %ebp              C 0 or -1

        incl    %ecx
        jnz     L(even_top)


L(even_two):
        movd    -4(%ebx), %mm0          C src high limb
        psrlq   %mm7, %mm0

        imull   %edx, %esi

        movl    %esi, -8(%edi)
        movl    PARAM_DIVISOR, %eax

        mull    %esi                    C carry limb in edx

        movd    %mm0, %esi
        subl    %ebp, %edx              C apply carry bit

        movl    VAR_INVERSE, %eax
        subl    %edx, %esi              C apply carry limb

        imull   %eax, %esi

        movl    %esi, -4(%edi)

        popl    %edi
        popl    %esi

        popl    %ebp
        popl    %ebx

        emms_or_femms

        ret

EPILOGUE()
ASM_END()
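
C A usage sketch (illustrative only, not part of the original file): with
C the prototype given at the top of this file and 32-bit limbs, dividing a
C two-limb operand known to be an exact multiple of 12 would look like
C
C       mp_limb_t src[2] = { 24, 36 };          /* value 36*2^32 + 24 */
C       mp_limb_t dst[2];
C       mpn_divexact_1 (dst, src, 2, 12);       /* 12 = 4*3, an even divisor */
C       /* dst[0] == 2, dst[1] == 3, i.e. (36*2^32 + 24) / 12 */
C
C The divisor must divide the operand exactly; nothing in the code checks
C this.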