github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/gcd_1.asm (about) 1 dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd. 2 3 dnl Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C K6: 9.5 cycles/bit (approx) 1x1 gcd 35 C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) 36 37 38 C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y); 39 C 40 C This code is nothing very special, but offers a speedup over what gcc 2.95 41 C can do with mpn/generic/gcd_1.c. 42 C 43 C Future: 44 C 45 C Using a lookup table to count trailing zeros seems a touch quicker, but 46 C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used 47 C it too. 48 49 50 dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits 51 dnl bigger than y, then a division x%y is done to reduce it. 52 dnl 53 dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so 54 dnl there should be an advantage in the divl at about 4 or 5 bits, which is 55 dnl what's found. 56 57 deflit(DIV_THRESHOLD, 5) 58 59 60 defframe(PARAM_LIMB, 12) 61 defframe(PARAM_SIZE, 8) 62 defframe(PARAM_SRC, 4) 63 64 TEXT 65 ALIGN(16) 66 67 PROLOGUE(mpn_gcd_1) 68 deflit(`FRAME',0) 69 70 ASSERT(ne, `cmpl $0, PARAM_LIMB') 71 ASSERT(ae, `cmpl $1, PARAM_SIZE') 72 73 74 movl PARAM_SRC, %eax 75 pushl %ebx FRAME_pushl() 76 77 movl PARAM_LIMB, %edx 78 movl $-1, %ecx 79 80 movl (%eax), %ebx C src low limb 81 82 movl %ebx, %eax C src low limb 83 orl %edx, %ebx 84 85 L(common_twos): 86 shrl %ebx 87 incl %ecx 88 89 jnc L(common_twos) C 1/4 chance on random data 90 shrl %cl, %edx C y 91 92 cmpl $1, PARAM_SIZE 93 ja L(size_two_or_more) 94 95 96 ASSERT(nz, `orl %eax, %eax') C should have src limb != 0 97 98 shrl %cl, %eax C x 99 100 101 C Swap if necessary to make x>=y. Measures a touch quicker as a 102 C jump than a branch free calculation. 103 C 104 C eax x 105 C ebx 106 C ecx common twos 107 C edx y 108 109 movl %eax, %ebx 110 cmpl %eax, %edx 111 112 jb L(noswap) 113 movl %edx, %eax 114 115 movl %ebx, %edx 116 movl %eax, %ebx 117 L(noswap): 118 119 120 C See if it's worth reducing x with a divl. 121 C 122 C eax x 123 C ebx x 124 C ecx common twos 125 C edx y 126 127 shrl $DIV_THRESHOLD, %ebx 128 129 cmpl %ebx, %edx 130 ja L(nodiv) 131 132 133 C Reduce x to x%y. 134 C 135 C eax x 136 C ebx 137 C ecx common twos 138 C edx y 139 140 movl %edx, %ebx 141 xorl %edx, %edx 142 143 divl %ebx 144 145 orl %edx, %edx C y 146 nop C code alignment 147 148 movl %ebx, %eax C x 149 jz L(done_shll) 150 L(nodiv): 151 152 153 C eax x 154 C ebx 155 C ecx common twos 156 C edx y 157 C esi 158 C edi 159 C ebp 160 161 L(strip_y): 162 shrl %edx 163 jnc L(strip_y) 164 165 leal 1(%edx,%edx), %edx 166 movl %ecx, %ebx C common twos 167 168 leal 1(%eax), %ecx 169 jmp L(strip_x_and) 170 171 172 C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch 173 C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is 174 C only 1/4. 175 C 176 C A second computed %cl shift was tried, but that measured a touch slower 177 C than branching back. 178 C 179 C A branch-free abs(x-y) and min(x,y) calculation was tried, but that 180 C measured about 1 cycle/bit slower. 181 182 C eax x 183 C ebx common twos 184 C ecx scratch 185 C edx y 186 187 ALIGN(4) 188 L(swap): 189 addl %eax, %edx C x-y+y = x 190 negl %eax C -(x-y) = y-x 191 192 L(strip_x): 193 shrl %eax C odd-odd = even, so always one to strip 194 ASSERT(nz) 195 196 L(strip_x_leal): 197 leal 1(%eax), %ecx 198 199 L(strip_x_and): 200 andl $1, %ecx C (x^1)&1 201 202 shrl %cl, %eax C shift if x even 203 204 testb $1, %al 205 jz L(strip_x) 206 207 ASSERT(nz,`testl $1, %eax') C x, y odd 208 ASSERT(nz,`testl $1, %edx') 209 210 subl %edx, %eax 211 jb L(swap) 212 ja L(strip_x) 213 214 215 movl %edx, %eax 216 movl %ebx, %ecx 217 218 L(done_shll): 219 shll %cl, %eax 220 popl %ebx 221 222 ret 223 224 225 C ----------------------------------------------------------------------------- 226 C Two or more limbs. 227 C 228 C x={src,size} is reduced modulo y using either a plain mod_1 style 229 C remainder, or a modexact_1 style exact division. 230 231 deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4)) 232 233 ALIGN(8) 234 L(size_two_or_more): 235 C eax 236 C ebx 237 C ecx common twos 238 C edx y, without common twos 239 C esi 240 C edi 241 C ebp 242 243 deflit(FRAME_TWO_OR_MORE, FRAME) 244 245 pushl %edi defframe_pushl(SAVE_EDI) 246 movl PARAM_SRC, %ebx 247 248 L(y_twos): 249 shrl %edx 250 jnc L(y_twos) 251 252 movl %ecx, %edi C common twos 253 movl PARAM_SIZE, %ecx 254 255 pushl %esi defframe_pushl(SAVE_ESI) 256 leal 1(%edx,%edx), %esi C y (odd) 257 258 movl -4(%ebx,%ecx,4), %eax C src high limb 259 260 cmpl %edx, %eax C carry if high<divisor 261 262 sbbl %edx, %edx C -1 if high<divisor 263 264 addl %edx, %ecx C skip one limb if high<divisor 265 andl %eax, %edx 266 267 cmpl $MODEXACT_THRESHOLD, %ecx 268 jae L(modexact) 269 270 271 L(divide_top): 272 C eax scratch (quotient) 273 C ebx src 274 C ecx counter, size-1 to 1 275 C edx carry (remainder) 276 C esi divisor (odd) 277 C edi 278 C ebp 279 280 movl -4(%ebx,%ecx,4), %eax 281 divl %esi 282 loop L(divide_top) 283 284 285 movl %edx, %eax C x 286 movl %esi, %edx C y (odd) 287 288 movl %edi, %ebx C common twos 289 popl %esi 290 291 popl %edi 292 leal 1(%eax), %ecx 293 294 orl %eax, %eax 295 jnz L(strip_x_and) 296 297 298 movl %ebx, %ecx 299 movl %edx, %eax 300 301 shll %cl, %eax 302 popl %ebx 303 304 ret 305 306 307 ALIGN(8) 308 L(modexact): 309 C eax 310 C ebx src ptr 311 C ecx size or size-1 312 C edx 313 C esi y odd 314 C edi common twos 315 C ebp 316 317 movl PARAM_SIZE, %eax 318 pushl %esi FRAME_pushl() 319 320 pushl %eax FRAME_pushl() 321 322 pushl %ebx FRAME_pushl() 323 324 ifdef(`PIC_WITH_EBX',` 325 nop C code alignment 326 call L(movl_eip_ebx) 327 add $_GLOBAL_OFFSET_TABLE_, %ebx 328 ') 329 CALL( mpn_modexact_1_odd) 330 331 movl %esi, %edx C y odd 332 movl SAVE_ESI, %esi 333 334 movl %edi, %ebx C common twos 335 movl SAVE_EDI, %edi 336 337 addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp 338 orl %eax, %eax 339 340 leal 1(%eax), %ecx 341 jnz L(strip_x_and) 342 343 344 movl %ebx, %ecx 345 movl %edx, %eax 346 347 shll %cl, %eax 348 popl %ebx 349 350 ret 351 352 353 ifdef(`PIC_WITH_EBX',` 354 L(movl_eip_ebx): 355 movl (%esp), %ebx 356 ret_internal 357 ') 358 359 EPILOGUE()