dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/aors_n.asm

dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.

dnl  Copyright 1999-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.64 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           8           1.9
dnl          16           1.64
dnl          32           1.7
dnl          64           2.0
dnl  Maximum possible with the current code is 64.
dnl  Number of limbs processed by one pass of the unrolled loop.

deflit(UNROLL_COUNT, 16)


dnl  Pick the carry-propagating instruction, the two exported function names
dnl  and the word used in the description below, according to which
dnl  OPERATION_* symbol is defined.  OPERATION_add_n is tested first; if
dnl  neither symbol is defined the problem is reported through m4_error.

ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

dnl  NOTE(review): MULFUNC_PROLOGUE presumably announces to the build system
dnl  every entry point this one source file can generate; confirm against the
dnl  shared m4 definitions.

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C	                   mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result (1
C or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C This code runs at 1.64 cycles/limb, which might be the best possible with
C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
C which can be done each cycle, leading to 1.5 c/l.

dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Operand counts below UNROLL_THRESHOLD are handled by L(simple), one limb
dnl  per iteration; counts at or above it branch to L(unroll).  The PIC and
dnl  non-PIC builds currently use the same value.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

dnl  Stack frame.  PARAM_* name the caller-supplied arguments; SAVE_* name the
dnl  four slots inside the STACK_SPACE bytes this code reserves, used to
dnl  preserve ebp, esi, ebx and edi.
dnl  NOTE(review): defframe presumably resolves each offset against the entry
dnl  stack pointer using the current FRAME value; confirm in x86-defs.m4.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

defframe(SAVE_EBP, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
defframe(SAVE_EDI, -16)
deflit(STACK_SPACE, 16)

	TEXT
	ALIGN(32)
deflit(`FRAME',0)

C Entry point with an explicit carry-in (expected to be 0 or 1).  It loads the
C carry into eax and joins the common code at L(start).
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()

C Entry point without a carry-in.
C
C In:   PARAM_DST, PARAM_SRC1, PARAM_SRC2, PARAM_SIZE on the stack.
C Out:  eax = carry (add) or borrow (subtract) out of the top limb, 0 or 1.
C Uses: ecx and edx as scratch; ebx, esi, edi and ebp are saved and restored.
C       In the unrolled path PARAM_SIZE on the stack is overwritten with its
C       own low bit.
PROLOGUE(M4_function_n)

	xorl	%eax, %eax	C carry
L(start):
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	movl	%ebx, SAVE_EBX
	cmpl	$UNROLL_THRESHOLD, %ecx

	C The movl instructions leave the flags from cmpl intact for jae.
	movl	PARAM_SRC2, %edx
	movl	PARAM_SRC1, %ebx
	jae	L(unroll)

	C Point all three pointers just past the operands and count a negative
	C index up towards zero.
	movl	PARAM_DST, %edi
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx
	shrl	%eax		C carry-in bit 0 moves into CF, done last

	C This loop is in a single 16 byte code block already, so no
	C alignment necessary.
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)
	incl	%ecx		C incl leaves CF alone for the next limb
	jnz	L(simple)

	movl	$0, %eax	C movl, not xorl, so CF survives until setc
	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	setc	%al
	addl	$STACK_SPACE, %esp

	ret


C -----------------------------------------------------------------------------
C Unrolled path.  The even part of size goes through the unrolled loop, which
C is entered part way through its first pass by a computed jump; a possible
C odd limb is handled after the loop.
C NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are not defined in
C this file; presumably they are derived from UNROLL_COUNT, confirm.

	C This is at 0x55, close enough to aligned.
L(unroll):
deflit(`FRAME',STACK_SPACE)
	movl	%ebp, SAVE_EBP
	andl	$-2, %ecx		C size low bit masked out
	andl	$1, PARAM_SIZE		C size low bit kept

	movl	%ecx, %edi
	decl	%ecx
	movl	PARAM_DST, %ebp

	shrl	$UNROLL_LOG2, %ecx	C loop passes minus one, loop ends on sign
	negl	%edi
	movl	%esi, SAVE_ESI

	andl	$UNROLL_MASK, %edi	C limbs to skip in the first pass

	C esi = L(entry) + 9 * edi, the address at which to enter the loop.
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
')
	negl	%edi
	shrl	%eax		C carry-in into CF; leal and jmp keep it

	C Back the pointers off by the skipped limbs, adding a 128 byte bias
	C when UNROLL_BYTES is 256.
	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi

	jmp	*%esi


C PIC helper.  The return address found at (%esp) is L(here), so adding the
C assemble-time difference L(entry)-L(here) gives the run-time L(entry).
C NOTE(review): ret_internal is defined elsewhere; presumably a plain return.
ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%edi,%edi,8), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	zero
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi	scratch (was computed jump)
	C edi	dst
	C ebp	scratch

	C src2 is advanced here rather than at the bottom, so the first pass,
	C which enters at L(entry), does not advance it.
	leal	UNROLL_BYTES(%edx), %edx

L(entry):
C Each forloop pass emits one chunk of CHUNK_COUNT limbs, using esi and ebp
C alternately.  Displacements carry a -128 bias when UNROLL_BYTES is 256,
C cancelling the bias put on the pointers before the computed jump.
C NOTE(review): Zdisp is defined elsewhere; presumably it forces an explicit
C displacement byte so that every limb occupies the same 9 code bytes.
deflit(CHUNK_COUNT, 2)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%ebx), %esi)
	movl	disp1(%ebx), %ebp
Zdisp(	M4_inst,disp0,(%edx), %esi)
Zdisp(	movl,	%esi, disp0,(%edi))
	M4_inst	disp1(%edx), %ebp
	movl	%ebp, disp1(%edi)
')

	decl	%ecx		C sets the sign for jns, leaves CF alone
	leal	UNROLL_BYTES(%ebx), %ebx
	leal	UNROLL_BYTES(%edi), %edi
	jns	L(top)


	C PARAM_SIZE now holds only the low bit of size: 1 if one odd limb
	C remains.  None of movl, decl or js disturbs CF.
	movl	PARAM_SIZE, %esi
	movl	SAVE_EBP, %ebp
	movl	$0, %eax

	decl	%esi
	js	L(even)

	C Odd limb.  edx trails ebx and edi by UNROLL_BYTES because it is only
	C advanced at L(top).  All three pointers still carry the 128 byte bias
	C when UNROLL_BYTES is 256, so the displacements remove it here; for
	C other unroll sizes these expand to the plain unbiased operands.
	movl	ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
	M4_inst	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
	movl	%ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
L(even):

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	setc	%al		C eax was zeroed above, so eax = carry out

	movl	SAVE_ESI, %esi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()