github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mmx/mul_1.asm (about) 1 dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication. 2 3 dnl Copyright 2000-2002 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C P5: 12.0 for 32-bit multiplier 36 C 7.0 for 16-bit multiplier 37 38 39 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 40 C mp_limb_t multiplier); 41 C 42 C When the multiplier is 16 bits some special case MMX code is used. Small 43 C multipliers might arise reasonably often from mpz_mul_ui etc. If the size 44 C is odd there's roughly a 5 cycle penalty, so times for say size==7 and 45 C size==8 end up being quite close. If src isn't aligned to an 8 byte 46 C boundary then one limb is processed separately with roughly a 5 cycle 47 C penalty, so in that case it's say size==8 and size==9 which are close. 48 C 49 C Alternatives: 50 C 51 C MMX is not believed to be of any use for 32-bit multipliers, since for 52 C instance the current method would just have to be more or less duplicated 53 C for the high and low halves of the multiplier, and would probably 54 C therefore run at about 14 cycles, which is slower than the plain integer 55 C at 12. 56 C 57 C Adding the high and low MMX products using integer code seems best. An 58 C attempt at using paddd and carry bit propagation with pcmpgtd didn't give 59 C any joy. Perhaps something could be done keeping the values signed and 60 C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or 61 C perhaps not. 62 C 63 C Future: 64 C 65 C An mpn_mul_1c entrypoint would need a double carry out of the low result 66 C limb in the 16-bit code, unless it could be assumed the carry fits in 16 67 C bits, possibly as carry<multiplier, this being true of a big calculation 68 C done piece by piece. But let's worry about that if/when mul_1c is 69 C actually used. 70 71 defframe(PARAM_MULTIPLIER,16) 72 defframe(PARAM_SIZE, 12) 73 defframe(PARAM_SRC, 8) 74 defframe(PARAM_DST, 4) 75 76 TEXT 77 78 ALIGN(8) 79 PROLOGUE(mpn_mul_1) 80 deflit(`FRAME',0) 81 82 movl PARAM_SIZE, %ecx 83 movl PARAM_SRC, %edx 84 85 cmpl $1, %ecx 86 jne L(two_or_more) 87 88 C one limb only 89 90 movl PARAM_MULTIPLIER, %eax 91 movl PARAM_DST, %ecx 92 93 mull (%edx) 94 95 movl %eax, (%ecx) 96 movl %edx, %eax 97 98 ret 99 100 101 L(two_or_more): 102 C eax size 103 C ebx 104 C ecx carry 105 C edx 106 C esi src 107 C edi 108 C ebp 109 110 pushl %esi FRAME_pushl() 111 pushl %edi FRAME_pushl() 112 113 movl %edx, %esi C src 114 movl PARAM_DST, %edi 115 116 movl PARAM_MULTIPLIER, %eax 117 pushl %ebx FRAME_pushl() 118 119 leal (%esi,%ecx,4), %esi C src end 120 leal (%edi,%ecx,4), %edi C dst end 121 122 negl %ecx C -size 123 124 pushl %ebp FRAME_pushl() 125 cmpl $65536, %eax 126 127 jb L(small) 128 129 130 L(big): 131 xorl %ebx, %ebx C carry limb 132 sarl %ecx C -size/2 133 134 jnc L(top) C with carry flag clear 135 136 137 C size was odd, process one limb separately 138 139 mull 4(%esi,%ecx,8) C m * src[0] 140 141 movl %eax, 4(%edi,%ecx,8) 142 incl %ecx 143 144 orl %edx, %ebx C carry limb, and clear carry flag 145 146 147 L(top): 148 C eax 149 C ebx carry 150 C ecx counter, negative 151 C edx 152 C esi src end 153 C edi dst end 154 C ebp (scratch carry) 155 156 adcl $0, %ebx 157 movl (%esi,%ecx,8), %eax 158 159 mull PARAM_MULTIPLIER 160 161 movl %edx, %ebp 162 addl %eax, %ebx 163 164 adcl $0, %ebp 165 movl 4(%esi,%ecx,8), %eax 166 167 mull PARAM_MULTIPLIER 168 169 movl %ebx, (%edi,%ecx,8) 170 addl %ebp, %eax 171 172 movl %eax, 4(%edi,%ecx,8) 173 incl %ecx 174 175 movl %edx, %ebx 176 jnz L(top) 177 178 179 adcl $0, %ebx 180 popl %ebp 181 182 movl %ebx, %eax 183 popl %ebx 184 185 popl %edi 186 popl %esi 187 188 ret 189 190 191 L(small): 192 C Special case for 16-bit multiplier. 193 C 194 C eax multiplier 195 C ebx 196 C ecx -size 197 C edx src 198 C esi src end 199 C edi dst end 200 C ebp multiplier 201 202 C size<3 not supported here. At size==3 we're already a couple of 203 C cycles faster, so there's no threshold as such, just use the MMX 204 C as soon as possible. 205 206 cmpl $-3, %ecx 207 ja L(big) 208 209 movd %eax, %mm7 C m 210 pxor %mm6, %mm6 C initial carry word 211 212 punpcklwd %mm7, %mm7 C m replicated 2 times 213 addl $2, %ecx C -size+2 214 215 punpckldq %mm7, %mm7 C m replicated 4 times 216 andl $4, %edx C test alignment, clear carry flag 217 218 movq %mm7, %mm0 C m 219 jz L(small_entry) 220 221 222 C Source is unaligned, process one limb separately. 223 C 224 C Plain integer code is used here, since it's smaller and is about 225 C the same 13 cycles as an mmx block would be. 226 C 227 C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence 228 C the use of separate incl and orl. 229 230 mull -8(%esi,%ecx,4) C m * src[0] 231 232 movl %eax, -8(%edi,%ecx,4) C dst[0] 233 incl %ecx C one limb processed 234 235 movd %edx, %mm6 C initial carry 236 237 orl %eax, %eax C clear carry flag 238 jmp L(small_entry) 239 240 241 C The scheduling here is quite tricky, since so many instructions have 242 C pairing restrictions. In particular the js won't pair with a movd, and 243 C can't be paired with an adc since it wants flags from the inc, so 244 C instructions are rotated to the top of the loop to find somewhere useful 245 C for it. 246 C 247 C Trouble has been taken to avoid overlapping successive loop iterations, 248 C since that would greatly increase the size of the startup and finishup 249 C code. Actually there's probably not much advantage to be had from 250 C overlapping anyway, since the difficulties are mostly with pairing, not 251 C with latencies as such. 252 C 253 C In the comments x represents the src data and m the multiplier (16 254 C bits, but replicated 4 times). 255 C 256 C The m signs calculated in %mm3 are a loop invariant and could be held in 257 C say %mm5, but that would save only one instruction and hence be no faster. 258 259 L(small_top): 260 C eax l.low, then l.high 261 C ebx (h.low) 262 C ecx counter, -size+2 to 0 or 1 263 C edx (h.high) 264 C esi &src[size] 265 C edi &dst[size] 266 C ebp 267 C 268 C %mm0 (high products) 269 C %mm1 (low products) 270 C %mm2 (adjust for m using x signs) 271 C %mm3 (adjust for x using m signs) 272 C %mm4 273 C %mm5 274 C %mm6 h.low, then carry 275 C %mm7 m replicated 4 times 276 277 movd %mm6, %ebx C h.low 278 psrlq $32, %mm1 C l.high 279 280 movd %mm0, %edx C h.high 281 movq %mm0, %mm6 C new c 282 283 adcl %eax, %ebx 284 incl %ecx 285 286 movd %mm1, %eax C l.high 287 movq %mm7, %mm0 288 289 adcl %eax, %edx 290 movl %ebx, -16(%edi,%ecx,4) 291 292 movl %edx, -12(%edi,%ecx,4) 293 psrlq $32, %mm6 C c 294 295 L(small_entry): 296 pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high 297 movq %mm7, %mm1 298 299 pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low 300 movq %mm7, %mm3 301 302 movq -8(%esi,%ecx,4), %mm2 C x 303 psraw $15, %mm3 C m signs 304 305 pand -8(%esi,%ecx,4), %mm3 C x selected by m signs 306 psraw $15, %mm2 C x signs 307 308 paddw %mm3, %mm0 C add x to h if m neg 309 pand %mm7, %mm2 C m selected by x signs 310 311 paddw %mm2, %mm0 C add m to h if x neg 312 incl %ecx 313 314 movd %mm1, %eax C l.low 315 punpcklwd %mm0, %mm6 C c + h.low << 16 316 317 psrlq $16, %mm0 C h.high 318 js L(small_top) 319 320 321 322 323 movd %mm6, %ebx C h.low 324 psrlq $32, %mm1 C l.high 325 326 adcl %eax, %ebx 327 popl %ebp FRAME_popl() 328 329 movd %mm0, %edx C h.high 330 psrlq $32, %mm0 C l.high 331 332 movd %mm1, %eax C l.high 333 334 adcl %eax, %edx 335 movl %ebx, -12(%edi,%ecx,4) 336 337 movd %mm0, %eax C c 338 339 adcl $0, %eax 340 movl %edx, -8(%edi,%ecx,4) 341 342 orl %ecx, %ecx 343 jnz L(small_done) C final %ecx==1 means even, ==0 odd 344 345 346 C Size odd, one extra limb to process. 347 C Plain integer code is used here, since it's smaller and is about 348 C the same speed as another mmx block would be. 349 350 movl %eax, %ecx 351 movl PARAM_MULTIPLIER, %eax 352 353 mull -4(%esi) 354 355 addl %ecx, %eax 356 357 adcl $0, %edx 358 movl %eax, -4(%edi) 359 360 movl %edx, %eax 361 L(small_done): 362 popl %ebx 363 364 popl %edi 365 popl %esi 366 367 emms 368 369 ret 370 371 EPILOGUE()