dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mul_1.asm

dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.

dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                           cycles/limb
C P5
C P6 model 0-8,10-12             5.5
C P6 model 9  (Banias)
C P6 model 13 (Dothan)           4.87
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C AMD K6                         6.25
C AMD K7
C AMD K8


C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t multiplier);
C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       mp_limb_t multiplier, mp_limb_t carry);
C
C Multiply src,size by multiplier and store the result in dst,size.
C Return the carry limb from the top of the result.
C
C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
C the low limb of the result.
C
C mpn_mul_1c only saves %esi, loads its carry argument into it and jumps to
C L(start_nc) inside mpn_mul_1, so both entry points share all the code below.
C mpn_mul_1 reaches L(start_nc) with %esi saved the same way and cleared.

defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

dnl  Minimum 5 because the unrolled code can't handle less: it always runs
dnl  one full 4-limb pass of L(top) and then finishes 1 to 4 further limbs.
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_mul_1c)
	pushl	%esi
deflit(`FRAME',4)
	movl	PARAM_CARRY, %esi	C caller-supplied initial carry
	jmp	L(start_nc)
EPILOGUE()


PROLOGUE(mpn_mul_1)
	pushl	%esi
deflit(`FRAME',4)
	xorl	%esi, %esi		C initial carry

L(start_nc):
	C Common path.  %esi is already saved and holds the carry-in.
	C Parameter loads are interleaved with the remaining register saves.

	movl	PARAM_SIZE, %ecx
	pushl	%ebx
FRAME_pushl()

	movl	PARAM_SRC, %ebx
	pushl	%edi
FRAME_pushl()

	movl	PARAM_DST, %edi
	pushl	%ebp
FRAME_pushl()

	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags alone

	jae	L(unroll)		C unsigned: size >= UNROLL_THRESHOLD


	C code offset 0x22 here, close enough to aligned
L(simple):
	C One limb per iteration, used for sizes below UNROLL_THRESHOLD.
	C
	C eax	scratch
	C ebx	src
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst
	C ebp	multiplier
	C
	C this loop 8 cycles/limb

	movl	(%ebx), %eax		C next src limb
	addl	$4, %ebx

	mull	%ebp			C edx:eax = limb * multiplier

	addl	%esi, %eax		C low half + carry-in
	movl	$0, %esi		C movl, not xorl: CF must survive

	adcl	%edx, %esi		C carry-out = high half + CF

	movl	%eax, (%edi)
	addl	$4, %edi

	loop	L(simple)		C ecx--, repeat while non-zero


	popl	%ebp

	popl	%edi
	popl	%ebx

	movl	%esi, %eax		C return value: final carry limb
	popl	%esi

	ret


C -----------------------------------------------------------------------------
C The code for each limb is 6 cycles, with instruction decoding being the
C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
C cycles/limb in total.
C
C The secret ingredient to get 6.25 is to start the loop with the mul and
C have the load/store pair at the end.  Rotating the load/store to the top
C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
C
C The whole unrolled loop fits nicely in exactly 80 bytes.
C
C Every limb step below is the same six instructions as in L(simple), except
C that the load fetches the limb for the following step.


	ALIGN(16)	C already aligned to 16 here actually
L(unroll):
	movl	(%ebx), %eax		C preload src[0]
	leal	-16(%ebx,%ecx,4), %ebx	C ebx = &src[size-4]

	leal	-16(%edi,%ecx,4), %edi	C edi = &dst[size-4]
	subl	$4, %ecx

	negl	%ecx			C ecx = -(size-4), negative here


	ALIGN(16)	C one byte nop for this alignment
L(top):
	C eax	scratch
	C ebx	&src[size-4]
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	&dst[size-4]
	C ebp	multiplier

	C limb 0 of this pass
	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi		C keeps CF for the adcl

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	movl	4(%ebx,%ecx,4), %eax


	C limb 1
	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 4(%edi,%ecx,4)
	movl	8(%ebx,%ecx,4), %eax


	C limb 2
	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 8(%edi,%ecx,4)
	movl	12(%ebx,%ecx,4), %eax


	C limb 3
	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 12(%edi,%ecx,4)
	movl	16(%ebx,%ecx,4), %eax	C preload for the next pass or the tail


	addl	$4, %ecx
	js	L(top)			C loop while the counter is still negative



	C eax	next src limb
	C ebx	&src[size-4]
	C ecx	0 to 3 representing respectively 4 to 1 further limbs
	C edx
	C esi	carry
	C edi	&dst[size-4]

	testb	$2, %cl
	jnz	L(finish_not_two)	C ecx is 2 or 3: no pair to do

	C ecx is 0 or 1: do two limbs
	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	movl	4(%ebx,%ecx,4), %eax


	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 4(%edi,%ecx,4)
	movl	8(%ebx,%ecx,4), %eax

	addl	$2, %ecx		C ecx is now 2 or 3
L(finish_not_two):


	testb	$1, %cl
	jnz	L(finish_not_one)	C ecx is 3: only the last limb remains

	C ecx is 2: do src[size-2], then preload src[size-1]
	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 8(%edi)
	movl	12(%ebx), %eax
L(finish_not_one):


	C Last limb.  The high half plus CF is formed directly in edx as the
	C return value.  The register restores are interleaved with it; popl
	C does not change the flags, so CF reaches the adcl intact.

	mull	%ebp

	addl	%esi, %eax
	popl	%ebp

	adcl	$0, %edx

	movl	%eax, 12(%edi)		C dst[size-1]
	popl	%edi

	popl	%ebx
	movl	%edx, %eax		C return value: final carry limb

	popl	%esi

	ret

EPILOGUE()