github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/aorsmul_1.asm (about) 1 dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. 2 3 dnl Copyright 1999-2003, 2005 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C P5 36 C P6 model 0-8,10-12 5.94 37 C P6 model 9 (Banias) 5.51 38 C P6 model 13 (Dothan) 5.57 39 C P4 model 0 (Willamette) 40 C P4 model 1 (?) 41 C P4 model 2 (Northwood) 42 C P4 model 3 (Prescott) 43 C P4 model 4 (Nocona) 44 C AMD K6 7.65-8.5 (data dependent) 45 C AMD K7 46 C AMD K8 47 48 49 dnl K6: large multipliers small multipliers 50 dnl UNROLL_COUNT cycles/limb cycles/limb 51 dnl 4 9.5 7.78 52 dnl 8 9.0 7.78 53 dnl 16 8.4 7.65 54 dnl 32 8.4 8.2 55 dnl 56 dnl Maximum possible unrolling with the current code is 32. 57 dnl 58 dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 59 dnl byte block, which might explain the good speed at that unrolling. 60 61 deflit(UNROLL_COUNT, 16) 62 63 64 ifdef(`OPERATION_addmul_1', ` 65 define(M4_inst, addl) 66 define(M4_function_1, mpn_addmul_1) 67 define(M4_function_1c, mpn_addmul_1c) 68 ',`ifdef(`OPERATION_submul_1', ` 69 define(M4_inst, subl) 70 define(M4_function_1, mpn_submul_1) 71 define(M4_function_1c, mpn_submul_1c) 72 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 73 ')')') 74 75 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) 76 77 78 C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 79 C mp_limb_t mult); 80 C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, 81 C mp_limb_t mult, mp_limb_t carry); 82 C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 83 C mp_limb_t mult); 84 C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, 85 C mp_limb_t mult, mp_limb_t carry); 86 C 87 C The jadcl0()s in the unrolled loop makes the speed data dependent. Small 88 C multipliers (most significant few bits clear) result in few carry bits and 89 C speeds up to 7.65 cycles/limb are attained. Large multipliers (most 90 C significant few bits set) make the carry bits 50/50 and lead to something 91 C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l. 92 C 93 C It's important that the gains for jadcl0 on small multipliers don't come 94 C at the cost of slowing down other data. Tests on uniformly distributed 95 C random data, designed to confound branch prediction, show about a 7% 96 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all 97 C overheads included). 98 C 99 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus 100 C 11.0 cycles/limb), and hence isn't used. 101 C 102 C In the simple loop, note that running ecx from negative to zero and using 103 C it as an index in the two movs wouldn't help. It would save one 104 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired 105 C that would be collapsed by this. 106 C 107 C Attempts at a simpler main loop, with less unrolling, haven't yielded much 108 C success, generally running over 9 c/l. 109 C 110 C 111 C jadcl0 112 C ------ 113 C 114 C jadcl0() being faster than adcl $0 seems to be an artifact of two things, 115 C firstly the instruction decoding and secondly the fact that there's a 116 C carry bit for the jadcl0 only on average about 1/4 of the time. 117 C 118 C The code in the unrolled loop decodes something like the following. 119 C 120 C decode cycles 121 C mull %ebp 2 122 C M4_inst %esi, disp(%edi) 1 123 C adcl %eax, %ecx 2 124 C movl %edx, %esi \ 1 125 C jnc 1f / 126 C incl %esi \ 1 127 C 1: movl disp(%ebx), %eax / 128 C --- 129 C 7 130 C 131 C In a back-to-back style test this measures 7 with the jnc not taken, or 8 132 C with it taken (both when correctly predicted). This is opposite to the 133 C measurements showing small multipliers running faster than large ones. 134 C Don't really know why. 135 C 136 C It's not clear how much branch misprediction might be costing. The K6 137 C doco says it will be 1 to 4 cycles, but presumably it's near the low end 138 C of that range to get the measured results. 139 C 140 C 141 C In the code the two carries are more or less the preceding mul product and 142 C the calculation is roughly 143 C 144 C x*y + u*b+v 145 C 146 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and 147 C v are the two limbs it's added to (being the low of the next mul, and a 148 C limb from the destination). 149 C 150 C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and 151 C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of 152 C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 153 C and b-1, then the total probability can be summed over x and y, 154 C 155 C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) 156 C --- * sum sum --- = --- * ------- * ------- = 1/4 157 C b^2 x=0 y=1 b^2 b^4 2 2 158 C 159 C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, 160 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. 161 162 163 ifdef(`PIC',` 164 deflit(UNROLL_THRESHOLD, 9) 165 ',` 166 deflit(UNROLL_THRESHOLD, 6) 167 ') 168 169 defframe(PARAM_CARRY, 20) 170 defframe(PARAM_MULTIPLIER,16) 171 defframe(PARAM_SIZE, 12) 172 defframe(PARAM_SRC, 8) 173 defframe(PARAM_DST, 4) 174 175 TEXT 176 ALIGN(32) 177 178 PROLOGUE(M4_function_1c) 179 pushl %esi 180 deflit(`FRAME',4) 181 movl PARAM_CARRY, %esi 182 jmp L(start_nc) 183 EPILOGUE() 184 185 PROLOGUE(M4_function_1) 186 push %esi 187 deflit(`FRAME',4) 188 xorl %esi, %esi C initial carry 189 190 L(start_nc): 191 movl PARAM_SIZE, %ecx 192 pushl %ebx 193 deflit(`FRAME',8) 194 195 movl PARAM_SRC, %ebx 196 pushl %edi 197 deflit(`FRAME',12) 198 199 cmpl $UNROLL_THRESHOLD, %ecx 200 movl PARAM_DST, %edi 201 202 pushl %ebp 203 deflit(`FRAME',16) 204 jae L(unroll) 205 206 207 C simple loop 208 209 movl PARAM_MULTIPLIER, %ebp 210 211 L(simple): 212 C eax scratch 213 C ebx src 214 C ecx counter 215 C edx scratch 216 C esi carry 217 C edi dst 218 C ebp multiplier 219 220 movl (%ebx), %eax 221 addl $4, %ebx 222 223 mull %ebp 224 225 addl $4, %edi 226 addl %esi, %eax 227 228 adcl $0, %edx 229 230 M4_inst %eax, -4(%edi) 231 232 adcl $0, %edx 233 234 movl %edx, %esi 235 loop L(simple) 236 237 238 popl %ebp 239 popl %edi 240 241 popl %ebx 242 movl %esi, %eax 243 244 popl %esi 245 ret 246 247 248 249 C ----------------------------------------------------------------------------- 250 C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop 251 C the carries are ecx=lo, esi=hi, then they swap for each limb processed. 252 C For the computed jump an odd size means they start one way around, an even 253 C size the other. 254 C 255 C VAR_JUMP holds the computed jump temporarily because there's not enough 256 C registers at the point of doing the mul for the initial two carry limbs. 257 C 258 C The add/adc for the initial carry in %esi is necessary only for the 259 C mpn_addmul/submul_1c entry points. Duplicating the startup code to 260 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good 261 C idea. 262 263 dnl overlapping with parameters already fetched 264 define(VAR_COUNTER, `PARAM_SIZE') 265 define(VAR_JUMP, `PARAM_DST') 266 267 L(unroll): 268 C eax 269 C ebx src 270 C ecx size 271 C edx 272 C esi initial carry 273 C edi dst 274 C ebp 275 276 movl %ecx, %edx 277 decl %ecx 278 279 subl $2, %edx 280 negl %ecx 281 282 shrl $UNROLL_LOG2, %edx 283 andl $UNROLL_MASK, %ecx 284 285 movl %edx, VAR_COUNTER 286 movl %ecx, %edx 287 288 shll $4, %edx 289 negl %ecx 290 291 C 15 code bytes per limb 292 ifdef(`PIC',` 293 call L(pic_calc) 294 L(here): 295 ',` 296 leal L(entry) (%edx,%ecx,1), %edx 297 ') 298 movl (%ebx), %eax C src low limb 299 300 movl PARAM_MULTIPLIER, %ebp 301 movl %edx, VAR_JUMP 302 303 mull %ebp 304 305 addl %esi, %eax C initial carry (from _1c) 306 jadcl0( %edx) 307 308 309 leal 4(%ebx,%ecx,4), %ebx 310 movl %edx, %esi C high carry 311 312 movl VAR_JUMP, %edx 313 leal (%edi,%ecx,4), %edi 314 315 testl $1, %ecx 316 movl %eax, %ecx C low carry 317 318 jz L(noswap) 319 movl %esi, %ecx C high,low carry other way around 320 321 movl %eax, %esi 322 L(noswap): 323 324 jmp *%edx 325 326 327 ifdef(`PIC',` 328 L(pic_calc): 329 C See mpn/x86/README about old gas bugs 330 leal (%edx,%ecx,1), %edx 331 addl $L(entry)-L(here), %edx 332 addl (%esp), %edx 333 ret_internal 334 ') 335 336 337 C ----------------------------------------------------------- 338 ALIGN(32) 339 L(top): 340 deflit(`FRAME',16) 341 C eax scratch 342 C ebx src 343 C ecx carry lo 344 C edx scratch 345 C esi carry hi 346 C edi dst 347 C ebp multiplier 348 C 349 C 15 code bytes per limb 350 351 leal UNROLL_BYTES(%edi), %edi 352 353 L(entry): 354 forloop(`i', 0, UNROLL_COUNT/2-1, ` 355 deflit(`disp0', eval(2*i*4)) 356 deflit(`disp1', eval(disp0 + 4)) 357 358 Zdisp( movl, disp0,(%ebx), %eax) 359 mull %ebp 360 Zdisp( M4_inst,%ecx, disp0,(%edi)) 361 adcl %eax, %esi 362 movl %edx, %ecx 363 jadcl0( %ecx) 364 365 movl disp1(%ebx), %eax 366 mull %ebp 367 M4_inst %esi, disp1(%edi) 368 adcl %eax, %ecx 369 movl %edx, %esi 370 jadcl0( %esi) 371 ') 372 373 decl VAR_COUNTER 374 375 leal UNROLL_BYTES(%ebx), %ebx 376 jns L(top) 377 378 379 popl %ebp 380 M4_inst %ecx, UNROLL_BYTES(%edi) 381 382 popl %edi 383 movl %esi, %eax 384 385 popl %ebx 386 jadcl0( %eax) 387 388 popl %esi 389 ret 390 391 EPILOGUE()