dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/aors_n.asm
dnl
dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
C Operation selection.  If OPERATION_add_n is defined the add variant is
C built; otherwise, if OPERATION_sub_n is defined, the subtract variant is
C built; if neither is defined m4_error is invoked.
C
C   M4_inst         carry-propagating instruction used on every limb
C   M4_function_n   entry point without a carry-in argument
C   M4_function_nc  entry point with a carry-in argument
C   M4_description  word substituted into the interface comment below

ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C	                      mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result
C (1 or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
C Stack arguments, addressed relative to %esp.  FRAME and FRAME_pushl are
C defined outside this file; presumably FRAME_pushl bumps FRAME after each
C pushl so that these offsets keep naming the same slots -- confirm against
C the macro definitions.

define(PARAM_CARRY, `FRAME+20(%esp)')
define(PARAM_SIZE,  `FRAME+16(%esp)')
define(PARAM_SRC2,  `FRAME+12(%esp)')
define(PARAM_SRC1,  `FRAME+8(%esp)')
define(PARAM_DST,   `FRAME+4(%esp)')
deflit(`FRAME',0)

dnl  minimum 5 because the unrolled code can't handle less
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)

C M4_function_nc: load the caller-supplied carry-in into %eax and join the
C common code at L(start).

PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()


C M4_function_n: same as above with a carry-in of zero.
C
C Overview of the common code:
C   size <  UNROLL_THRESHOLD          L(simple), one limb per iteration
C   size >= UNROLL_THRESHOLD          L(unroll), four limbs per iteration,
C                                     with an in-place variant used when
C                                     dst==src1 (and, for add, dst==src2)
C
C On every path the carry-in is moved from bit 0 of %eax into the carry
C flag with shrl, and from then on only instructions that leave the carry
C flag alone (movl, leal, decl, loop, pushl, popl, jcc) are placed between
C successive M4_inst instructions.  The instruction order therefore matters.
C
C %ebx, %edi and (on the unrolled paths) %esi are saved and restored.

PROLOGUE(M4_function_n)
	xorl	%eax, %eax
L(start):
	movl	PARAM_SIZE, %ecx
	pushl	%ebx
FRAME_pushl()

	movl	PARAM_SRC1, %ebx
	pushl	%edi
FRAME_pushl()

	movl	PARAM_SRC2, %edx
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_DST, %edi
	jae	L(unroll)


	shrl	%eax		C initial carry flag

	C offset 0x21 here, close enough to aligned
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp
	C
	C The store to (%edi) could be done with a stosl; it'd be smaller
	C code, but there's no speed gain and a cld would have to be added
	C (per mpn/x86/README).

	movl	(%ebx), %eax
	leal	4(%ebx), %ebx

	M4_inst	(%edx), %eax

	movl	%eax, (%edi)
	leal	4(%edi), %edi

	leal	4(%edx), %edx
	loop	L(simple)


	C movl rather than xorl, so the carry flag survives until the setc
	movl	$0, %eax
	popl	%edi

	setc	%al

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
L(unroll):
	C eax	carry
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	C no FRAME_pushl after this push: no PARAM_ macro is used below
	cmpl	%edi, %ebx
	pushl	%esi

	je	L(inplace)

ifdef(`OPERATION_add_n',`
	cmpl	%edi, %edx

	je	L(inplace_reverse)
')

	C ecx = size rounded down to a multiple of 4 (limbs done in the loop)
	C esi = size mod 4                           (limbs done after it)
	movl	%ecx, %esi

	andl	$-4, %ecx
	andl	$3, %esi

	C point each operand just past its looped part, then index backwards
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx
	leal	(%edi,%ecx,4), %edi

	negl	%ecx
	shrl	%eax		C initial carry flag

	ALIGN(32)
L(normal_top):
	C eax	scratch
	C ebx	src1, past the looped limbs
	C ecx	counter, limbs, negative
	C edx	src2, past the looped limbs
	C esi	remaining limbs after the loop, 0 to 3
	C edi	dst, past the looped limbs
	C ebp
	C
	C leal adds 5 to the counter and loop subtracts 1, a net step of 4
	C limbs per pass.  The -20 displacements cancel the 5*4 bytes of
	C bias introduced by the leal.

	movl	(%ebx,%ecx,4), %eax
	leal	5(%ecx), %ecx
	M4_inst	-20(%edx,%ecx,4), %eax
	movl	%eax, -20(%edi,%ecx,4)

	movl	4-20(%ebx,%ecx,4), %eax
	M4_inst	4-20(%edx,%ecx,4), %eax
	movl	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%ebx,%ecx,4), %eax
	M4_inst	8-20(%edx,%ecx,4), %eax
	movl	%eax, 8-20(%edi,%ecx,4)

	movl	12-20(%ebx,%ecx,4), %eax
	M4_inst	12-20(%edx,%ecx,4), %eax
	movl	%eax, 12-20(%edi,%ecx,4)

	loop	L(normal_top)


	C ecx is zero here.  esi is 0 to 3 limbs still to do.

	decl	%esi
	jz	L(normal_finish_one)
	js	L(normal_done)

	C two or three more limbs

	movl	(%ebx), %eax
	M4_inst	(%edx), %eax
	movl	%eax, (%edi)

	movl	4(%ebx), %eax
	M4_inst	4(%edx), %eax
	decl	%esi
	movl	%eax, 4(%edi)

	jz	L(normal_done)
	movl	$2, %ecx

L(normal_finish_one):
	C ecx is the limb index of the last limb, 0 or 2
	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)

L(normal_done):
	popl	%esi
	popl	%edi

	C movl rather than xorl, so the carry flag survives until the setc
	movl	$0, %eax
	popl	%ebx

	setc	%al

	ret


C -----------------------------------------------------------------------------

ifdef(`OPERATION_add_n',`
L(inplace_reverse):
	C dst==src2
	C add only: make src1 the other operand and fall into the dst==src1 code

	movl	%ebx, %edx
')

L(inplace):
	C eax	initial carry
	C ebx
	C ecx	size
	C edx	src
	C esi
	C edi	dst
	C ebp

	C ecx = size-1 rounded down to a multiple of 4 (limbs done in the loop)
	C esi = (size-1) mod 4, so 1 to 4 limbs are left for after the loop
	leal	-1(%ecx), %esi
	decl	%ecx

	andl	$-4, %ecx
	andl	$3, %esi

	movl	(%edx), %ebx		C src low limb
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx

	shrl	%eax		C initial carry flag


	ALIGN(32)
L(inplace_top):
	C eax	scratch
	C ebx	next src limb
	C ecx	counter, limbs, negative
	C edx	src, past the looped limbs
	C esi	remaining limbs after the loop, less one, 0 to 3
	C edi	dst, past the looped limbs
	C ebp
	C
	C Each pass applies four src limbs to dst in memory, then preloads
	C into ebx the src limb that the next pass (or the tail code below)
	C starts with.  As in the normal loop, leal +5 and loop -1 give a
	C net step of 4, and the -20 displacements cancel the leal bias.

	M4_inst	%ebx, (%edi,%ecx,4)

	movl	4(%edx,%ecx,4), %eax
	leal	5(%ecx), %ecx

	M4_inst	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%edx,%ecx,4), %eax
	movl	12-20(%edx,%ecx,4), %ebx

	M4_inst	%eax, 8-20(%edi,%ecx,4)
	M4_inst	%ebx, 12-20(%edi,%ecx,4)

	movl	16-20(%edx,%ecx,4), %ebx
	loop	L(inplace_top)


	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more

	M4_inst	%ebx, (%edi)

	decl	%esi
	jz	L(inplace_finish_one)
	js	L(inplace_done)

	C two or three more limbs

	movl	4(%edx), %eax
	movl	8(%edx), %ebx
	M4_inst	%eax, 4(%edi)
	M4_inst	%ebx, 8(%edi)

	decl	%esi
	movl	$2, %ecx

	jz	L(inplace_done)

L(inplace_finish_one):
	C ecx is 0 or 2, selecting the limb at index 1 or 3
	movl	4(%edx,%ecx,4), %eax
	M4_inst	%eax, 4(%edi,%ecx,4)

L(inplace_done):
	popl	%esi
	popl	%edi

	C movl rather than xorl, so the carry flag survives until the setc
	movl	$0, %eax
	popl	%ebx

	setc	%al

	ret

EPILOGUE()