github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/sqr_basecase.asm

dnl  x86 generic mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C          cycles/crossproduct  cycles/triangleproduct
C P5
C P6
C K6
C K7
C P4


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the size is
C small.
C
C The mul1 loop is not unrolled like mul_1.asm; it doesn't seem worth the
C code size to do so here.
C
C Enhancements:
C
C The addmul loop here is also not unrolled, unlike aorsmul_1.asm and
C mul_basecase.asm.  Perhaps it should be done.  It'd add to the
C complexity, but if it's worth doing in the other places then it should be
C worthwhile here.
C
C A fully-unrolled style like the other sqr_basecase.asm versions (k6, k7, p6)
C might be worth considering.  That'd add quite a bit to the code size, but
C only as much as is used would be dragged into L1 cache.
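C
C For orientation, a rough C rendering of the scheme the code below follows
C (an illustrative sketch only, not code this file is generated from, and it
C assumes size >= 2; mpn_mul_1, mpn_addmul_1 and mpn_lshift are the usual
C mpn primitives):
C
C	/* cross products src[i]*src[j], i<j, into dst[1..2*size-2] */
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	for (n = 1; n <= size-2; n++)
C	  dst[size+n] = mpn_addmul_1 (dst+2*n+1, src+n+1, size-1-n, src[n]);
C	/* (the n = size-2 step, a single product, is L(corner) below) */
C
C	/* double the cross products, high bit out going to dst[2*size-1] */
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C
C	/* finally add src[i]^2 at dst[2*i],dst[2*i+1] for i=0..size-1,
C	   with dst[0] simply set to the low limb of src[0]^2 */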

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	cmpl	$2, %edx
	movl	PARAM_DST, %ecx

	je	L(two_limbs)
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src
	C ebx
	C ecx	dst
	C edx

	movl	(%eax), %eax
	mull	%eax
	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
	ALIGN(8)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	dst
	C edx

	pushl	%ebx
	pushl	%ebp

	movl	%eax, %ebx
	movl	(%eax), %eax

	mull	%eax		C src[0]^2

	pushl	%esi
	pushl	%edi

	movl	%edx, %esi	C dst[1]
	movl	%eax, (%ecx)	C dst[0]

	movl	4(%ebx), %eax
	mull	%eax		C src[1]^2

	movl	%eax, %edi	C dst[2]
	movl	%edx, %ebp	C dst[3]

	movl	(%ebx), %eax
	mull	4(%ebx)		C src[0]*src[1]

	addl	%eax, %esi

	adcl	%edx, %edi

	adcl	$0, %ebp
	addl	%esi, %eax

	adcl	%edi, %edx
	movl	%eax, 4(%ecx)

	adcl	$0, %ebp

	movl	%edx, 8(%ecx)
	movl	%ebp, 12(%ecx)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret


C -----------------------------------------------------------------------------
	ALIGN(8)
L(three_or_more):
deflit(`FRAME',0)
	C eax	src
	C ebx
	C ecx	dst
	C edx	size

	pushl	%ebx	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	pushl	%esi	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
	leal	(%eax,%edx,4), %esi	C &src[size]

	C First multiply src[0]*src[1..size-1] and store at dst[1..size].

	movl	(%eax), %ebp		C src[0], multiplier
	movl	%edx, %ecx

	negl	%ecx			C -size
	xorl	%ebx, %ebx		C clear carry limb

	incl	%ecx			C -(size-1)

L(mul1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, limbs, negative
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax
	mull	%ebp
	addl	%eax, %ebx
	adcl	$0, %edx
	movl	%ebx, (%edi,%ecx,4)
	movl	%edx, %ebx
	incl	%ecx
	jnz	L(mul1)

	movl	%ebx, (%edi)


	C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
	C n=1..size-2.
	C
	C The last product, src[size-2]*src[size-1], which is the end corner
	C of the product triangle, is handled separately at the end to save
	C looping overhead.  If size is 3 then it's only this that needs to
	C be done.
	C
	C In the outer loop %esi is a constant, and %edi just advances by 1
	C limb each time.  The size of the operation decreases by 1 limb
	C each time.
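	C
	C One pass of this addmul in rough C, as an illustration only (wp,
	C sp, m, len and cy are ad hoc names here; umul_ppmm is the usual
	C longlong.h macro):
	C
	C	cy = 0;                            /* %ebx */
	C	for (j = 0; j < len; j++)
	C	  {
	C	    mp_limb_t  hi, lo;
	C	    umul_ppmm (hi, lo, sp[j], m);    /* mull %ebp */
	C	    lo += cy;
	C	    hi += (lo < cy);                 /* addl, adcl $0 */
	C	    wp[j] += lo;
	C	    hi += (wp[j] < lo);              /* addl to dst, adcl $0 */
	C	    cy = hi;
	C	  }
	C	wp[len] = cy;                      /* movl %ebx, (%edi) */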

	C eax
	C ebx	carry (needing carry flag added)
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	subl	$3, %ecx
	jz	L(corner)

	negl	%ecx

dnl  re-use parameter space
define(VAR_OUTER,`PARAM_DST')

L(outer):
	C eax
	C ebx
	C ecx	outer loop counter, -(size-3) to -1
	C edx
	C esi	&src[size]
	C edi	dst, pointing at stored carry limb of previous loop
	C ebp

	movl	%ecx, VAR_OUTER
	addl	$4, %edi		C advance dst end

	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
	subl	$1, %ecx

	xorl	%ebx, %ebx		C initial carry limb

L(inner):
	C eax	scratch
	C ebx	carry (needing carry flag added)
	C ecx	counter, -n-1 to -1
	C edx	scratch
	C esi	&src[size]
	C edi	dst end of this addmul
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax
	mull	%ebp
	addl	%ebx, %eax
	adcl	$0, %edx
	addl	%eax, (%edi,%ecx,4)
	adcl	$0, %edx
	movl	%edx, %ebx
	addl	$1, %ecx
	jl	L(inner)


	movl	%ebx, (%edi)
	movl	VAR_OUTER, %ecx
	incl	%ecx
	jnz	L(outer)


L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-3]

	movl	-4(%esi), %eax
	mull	-8(%esi)		C src[size-1]*src[size-2]
	addl	%eax, 0(%edi)
	adcl	$0, %edx
	movl	%edx, 4(%edi)		C dst high limb


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].

	movl	PARAM_SIZE, %eax
	negl	%eax
	addl	$1, %eax		C -(size-1) and clear carry

L(lshift):
	C eax	counter, negative
	C ebx	next limb
	C ecx
	C edx
	C esi
	C edi	&dst[2*size-3]
	C ebp

	rcll	8(%edi,%eax,8)
	rcll	12(%edi,%eax,8)
	incl	%eax
	jnz	L(lshift)


	adcl	%eax, %eax		C high bit out
	movl	%eax, 8(%edi)		C dst most significant limb


C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.

	movl	PARAM_SRC, %esi
	movl	(%esi), %eax		C src[0]
	mull	%eax			C src[0]^2

	movl	PARAM_SIZE, %ecx
	leal	(%esi,%ecx,4), %esi	C src end

	negl	%ecx			C -size
	movl	%edx, %ebx		C initial carry

	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
	incl	%ecx			C -(size-1)

L(diag):
	C eax	scratch (low product)
	C ebx	carry limb
	C ecx	counter, -(size-1) to -1
	C edx	scratch (high product)
	C esi	&src[size]
	C edi	&dst[2*size-3]
	C ebp	scratch (fetched dst limbs)

	movl	(%esi,%ecx,4), %eax
	mull	%eax

	addl	%ebx, 8(%edi,%ecx,8)
	movl	%edx, %ebx

	adcl	%eax, 12(%edi,%ecx,8)
	adcl	$0, %ebx

	incl	%ecx
	jnz	L(diag)


	addl	%ebx, 8(%edi)		C dst most significant limb

	popl	%ebp
	popl	%esi

	popl	%edi
	popl	%ebx

	ret

EPILOGUE()
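
C The diagonal pass (the L(diag) loop above) in rough C, for illustration
C only (cy, hi, lo and p are ad hoc names; umul_ppmm and mpn_add_n are the
C usual GMP primitives):
C
C	umul_ppmm (cy, dst[0], src[0], src[0]);  /* dst[0] set, not added */
C	for (i = 1; i < size; i++)
C	  {
C	    mp_limb_t  hi, lo, p[2];
C	    umul_ppmm (hi, lo, src[i], src[i]);
C	    p[0] = cy;
C	    p[1] = lo;
C	    cy = hi + mpn_add_n (dst+2*i-1, dst+2*i-1, p, 2);
C	  }
C	dst[2*size-1] += cy;   /* dst[2*size-1] holds the lshift's high bit */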