github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/aorsmul_1.asm (about) 1 dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. 2 3 dnl Copyright 1999-2002, 2005 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C P5 36 C P6 model 0-8,10-12 6.44 37 C P6 model 9 (Banias) 6.15 38 C P6 model 13 (Dothan) 6.11 39 C P4 model 0 (Willamette) 40 C P4 model 1 (?) 41 C P4 model 2 (Northwood) 42 C P4 model 3 (Prescott) 43 C P4 model 4 (Nocona) 44 C AMD K6 45 C AMD K7 46 C AMD K8 47 48 49 dnl P6 UNROLL_COUNT cycles/limb 50 dnl 8 6.7 51 dnl 16 6.35 52 dnl 32 6.3 53 dnl 64 6.3 54 dnl Maximum possible with the current code is 64. 
deflit(UNROLL_COUNT, 16)


dnl  Per-operation names, chosen by whichever OPERATION_* symbol is defined.
dnl  OPERATION_addmul_1 is tested first, then OPERATION_submul_1; if neither
dnl  is defined m4_error is invoked.
dnl
dnl    M4_inst          instruction applied to each dst limb
dnl    M4_function_1    name of the entry point without a carry-in
dnl    M4_function_1c   name of the entry point with a carry-in
dnl    M4_description   wording used in the interface comment below
dnl    M4_desc_retval   wording used in the interface comment below

ifdef(`OPERATION_addmul_1',`
	define(`M4_function_1',  `mpn_addmul_1')
	define(`M4_function_1c', `mpn_addmul_1c')
	define(`M4_inst',        `addl')
	define(`M4_desc_retval', `carry')
	define(`M4_description', `add it to')
',
`ifdef(`OPERATION_submul_1',`
	define(`M4_function_1',  `mpn_submul_1')
	define(`M4_function_1c', `mpn_submul_1c')
	define(`M4_inst',        `subl')
	define(`M4_desc_retval', `borrow')
	define(`M4_description', `subtract it from')
',
`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)


C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                          mp_limb_t mult);
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                           mp_limb_t mult, mp_limb_t carry);
C
C Multiply the size-limb operand at src by the single limb mult and
C M4_description the size limbs at dst.  The M4_desc_retval limb coming out
C of the top of the result is the return value.
C
C The K6 code is the model for this file: the unrolled loop is identical,
C and only the setup code and the simple loop carry a few scheduling tweaks.
C
C Several other shapes of the unrolled loop have been tried -- one carry or
C two, loads scheduled earlier -- and none of them ran faster than
C 6 cycles/limb.

C Size in limbs at or above which L(unroll) is used instead of L(simple).
C Both branches currently give the same value.  NOTE(review): presumably the
C PIC and non-PIC cases are kept apart so that they can be tuned separately.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

C Stack offsets of the incoming arguments.  NOTE(review): defframe and FRAME
C come from the shared x86 m4 definitions; the deflit of FRAME after each push
C below presumably keeps these names pointing at the same stack slots as %esp
C moves -- confirm in x86-defs.m4.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	TEXT
	ALIGN(32)

C M4_function_1c: entry point taking a carry-in limb.  It loads PARAM_CARRY
C into %ebx and joins the common code at L(start_nc).
PROLOGUE(M4_function_1c)
	pushl	%ebx
deflit(`FRAME',4)
	movl	PARAM_CARRY, %ebx
	jmp	L(start_nc)
EPILOGUE()

C M4_function_1: entry point without a carry-in; the carry starts at zero.
C
C On return %eax holds the final carry limb, and %ebx, %esi, %edi and %ebp
C have been restored from the stack.
PROLOGUE(M4_function_1)
	pushl	%ebx
deflit(`FRAME',4)
	xorl	%ebx, %ebx	C initial carry

L(start_nc):
	C Common setup for both entry points: the argument loads are
	C interleaved with the pushes of the registers used below.
	movl	PARAM_SIZE, %ecx
	pushl	%esi
deflit(`FRAME',8)

	movl	PARAM_SRC, %esi
	pushl	%edi
deflit(`FRAME',12)

	movl	PARAM_DST, %edi
	pushl	%ebp
deflit(`FRAME',16)
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact
	jae	L(unroll)		C unsigned size >= UNROLL_THRESHOLD


	C simple loop
	C this is offset 0x22, so close enough to aligned
	C
	C One limb per pass: the low word of src limb * multiplier plus the
	C running carry is applied to the dst limb with M4_inst, and the high
	C word plus the carry or borrow out of that update becomes the next
	C running carry.
	C
	C NOTE(review): size is assumed to be at least 1 here -- the counter
	C is decremented before it is tested, so 0 would wrap around.
L(simple):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	movl	(%esi), %eax
	addl	$4, %edi		C dst stepped early, hence -4 below

	mull	%ebp			C edx:eax = src limb * multiplier

	addl	%ebx, %eax		C fold in the running carry
	adcl	$0, %edx

	M4_inst	%eax, -4(%edi)
	movl	%edx, %ebx

	adcl	$0, %ebx		C carry or borrow out of the dst update
	decl	%ecx

	leal	4(%esi), %esi		C leal keeps the decl flags for jnz
	jnz	L(simple)


	popl	%ebp
	popl	%edi

	popl	%esi
	movl	%ebx, %eax		C return value is the final carry limb

	popl	%ebx
	ret



C------------------------------------------------------------------------------
C VAR_JUMP holds the computed jump temporarily because there's not enough
C registers when doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %ebx is necessary only for the
C mpn_add/submul_1c entry points.  Duplicating the startup code to
C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
C idea.

dnl  overlapping with parameters already fetched
define(VAR_COUNTER,`PARAM_SIZE')
define(VAR_JUMP,   `PARAM_DST')

	C this is offset 0x43, so close enough to aligned
L(unroll):
	C eax
	C ebx	initial carry
	C ecx	size
	C edx
	C esi	src
	C edi	dst
	C ebp
	C
	C With n = (1-size) & UNROLL_MASK, the setup below produces
	C	VAR_COUNTER = (size-2) >> UNROLL_LOG2
	C	VAR_JUMP    = L(entry) + 15*n
	C	esi         = src + 4 - 4*n
	C	edi         = dst - 4*n
	C (esi and edi get a further 128 when UNROLL_BYTES is 256), so the
	C first pass through the unrolled code skips its first n limb slots
	C and the slot jumped to reads src[1] and updates dst[0].  The product
	C of src[0] is formed here and enters the loop as the pending carries.

	movl	%ecx, %edx
	decl	%ecx

	subl	$2, %edx
	negl	%ecx

	shrl	$UNROLL_LOG2, %edx
	andl	$UNROLL_MASK, %ecx	C ecx = n

	movl	%edx, VAR_COUNTER
	movl	%ecx, %edx

	C 15 code bytes per limb
	C
	C Either variant leaves ecx = -n and edx = L(entry) + 16*n - n.  The
	C PIC variant does the same arithmetic in L(pic_calc) using the return
	C address pushed by the call, which is the address of L(here).
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	shll	$4, %edx
	negl	%ecx

	leal	L(entry) (%edx,%ecx,1), %edx
')
	movl	(%esi), %eax		C src low limb

	movl	%edx, VAR_JUMP		C edx is about to be used by mull
	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi

	mull	%ebp

	addl	%ebx, %eax	C initial carry (from _1c)
	adcl	$0, %edx

	movl	%edx, %ebx	C high carry
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi

	C ecx is -n, so bit 0 is the parity of n.  An odd n enters at the
	C second limb of a pair, where ebx is the limb applied to dst and ecx
	C collects the next one, so the two carries swap places.
	movl	VAR_JUMP, %edx
	testl	$1, %ecx
	movl	%eax, %ecx	C low carry

	cmovnz(	%ebx, %ecx)	C high,low carry other way around
	cmovnz(	%eax, %ebx)

	jmp	*%edx


ifdef(`PIC',`
L(pic_calc):
	C On entry edx = ecx = n.  The return address at the top of the stack
	C is the address of L(here), so adding L(entry)-L(here) to it gives
	C the run-time address of L(entry).
	shll	$4, %edx
	negl	%ecx

	C See mpn/x86/README about old gas bugs
	leal	(%edx,%ecx,1), %edx
	addl	$L(entry)-L(here), %edx

	addl	(%esp), %edx

	ret_internal
')


C -----------------------------------------------------------
	ALIGN(32)
L(top):
deflit(`FRAME',16)
	C eax	scratch
	C ebx	carry hi
	C ecx	carry lo
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier
	C
	C VAR_COUNTER	loop counter
	C
	C 15 code bytes per limb
	C
	C Each limb slot loads the next src limb, multiplies it, applies the
	C pending low carry to the previous dst limb with M4_inst, and then
	C adds the new low word plus the carry or borrow from that update into
	C the pending high carry.  ebx and ecx trade roles from one limb to
	C the next, which is why the limbs are generated in pairs.
	C
	C NOTE(review): Zdisp is defined elsewhere; presumably it keeps the
	C instruction length unchanged when disp0 is zero so that every limb
	C slot stays at 15 bytes for the computed jump -- confirm in
	C x86-defs.m4.

	addl	$UNROLL_BYTES, %edi

L(entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	mull	%ebp
Zdisp(	M4_inst,%ecx, disp0,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx

	movl	disp1(%esi), %eax
	mull	%ebp
	M4_inst	%ebx, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
')

	C The body runs until VAR_COUNTER goes negative.  leal is used to
	C advance src so that the flags from decl survive until jns.
	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi

	jns	L(top)


deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))

	C edi was not advanced after the last pass, so the final dst limb is
	C UNROLL_BYTES further on.  Apply the last pending low carry there.
	M4_inst	%ecx, disp0(%edi)
	movl	%ebx, %eax

	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx
	adcl	$0, %eax	C movl and popl left the M4_inst carry flag alone

	ret

EPILOGUE()