github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/sqr_basecase.asm (about) 1 dnl Intel P5 mpn_sqr_basecase -- square an mpn number. 2 3 dnl Copyright 1999-2002 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular 35 C product at around 20x20 limbs. 36 37 38 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); 39 C 40 C Calculate src,size squared, storing the result in dst,2*size. 41 C 42 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a 43 C lot of function call overheads are avoided, especially when the size is 44 C small. 45 46 defframe(PARAM_SIZE,12) 47 defframe(PARAM_SRC, 8) 48 defframe(PARAM_DST, 4) 49 50 TEXT 51 ALIGN(8) 52 PROLOGUE(mpn_sqr_basecase) 53 deflit(`FRAME',0) 54 55 movl PARAM_SIZE, %edx 56 movl PARAM_SRC, %eax 57 58 cmpl $2, %edx 59 movl PARAM_DST, %ecx 60 61 je L(two_limbs) 62 63 movl (%eax), %eax 64 ja L(three_or_more) 65 66 C ----------------------------------------------------------------------------- 67 C one limb only 68 C eax src 69 C ebx 70 C ecx dst 71 C edx 72 73 mull %eax 74 75 movl %eax, (%ecx) 76 movl %edx, 4(%ecx) 77 78 ret 79 80 C ----------------------------------------------------------------------------- 81 ALIGN(8) 82 L(two_limbs): 83 C eax src 84 C ebx 85 C ecx dst 86 C edx size 87 88 pushl %ebp 89 pushl %edi 90 91 pushl %esi 92 pushl %ebx 93 94 movl %eax, %ebx 95 movl (%eax), %eax 96 97 mull %eax C src[0]^2 98 99 movl %eax, (%ecx) C dst[0] 100 movl %edx, %esi C dst[1] 101 102 movl 4(%ebx), %eax 103 104 mull %eax C src[1]^2 105 106 movl %eax, %edi C dst[2] 107 movl %edx, %ebp C dst[3] 108 109 movl (%ebx), %eax 110 111 mull 4(%ebx) C src[0]*src[1] 112 113 addl %eax, %esi 114 popl %ebx 115 116 adcl %edx, %edi 117 118 adcl $0, %ebp 119 addl %esi, %eax 120 121 adcl %edi, %edx 122 movl %eax, 4(%ecx) 123 124 adcl $0, %ebp 125 popl %esi 126 127 movl %edx, 8(%ecx) 128 movl %ebp, 12(%ecx) 129 130 popl %edi 131 popl %ebp 132 133 ret 134 135 136 C ----------------------------------------------------------------------------- 137 ALIGN(8) 138 L(three_or_more): 139 C eax src low limb 140 C ebx 141 C ecx dst 142 C edx size 143 144 cmpl $4, %edx 145 pushl %ebx 146 deflit(`FRAME',4) 147 148 movl PARAM_SRC, %ebx 149 jae L(four_or_more) 150 151 152 C ----------------------------------------------------------------------------- 153 C three limbs 154 C eax src low limb 155 C ebx src 156 C ecx dst 157 C edx size 158 159 pushl %ebp 160 pushl %edi 161 162 mull %eax C src[0] ^ 2 163 164 movl %eax, (%ecx) 165 movl %edx, 4(%ecx) 166 167 movl 4(%ebx), %eax 168 xorl %ebp, %ebp 169 170 mull %eax C src[1] ^ 2 171 172 movl %eax, 8(%ecx) 173 movl %edx, 12(%ecx) 174 175 movl 8(%ebx), %eax 176 pushl %esi C risk of cache bank clash 177 178 mull %eax C src[2] ^ 2 179 180 movl %eax, 16(%ecx) 181 movl %edx, 20(%ecx) 182 183 movl (%ebx), %eax 184 185 mull 4(%ebx) C src[0] * src[1] 186 187 movl %eax, %esi 188 movl %edx, %edi 189 190 movl (%ebx), %eax 191 192 mull 8(%ebx) C src[0] * src[2] 193 194 addl %eax, %edi 195 movl %edx, %ebp 196 197 adcl $0, %ebp 198 movl 4(%ebx), %eax 199 200 mull 8(%ebx) C src[1] * src[2] 201 202 xorl %ebx, %ebx 203 addl %eax, %ebp 204 205 C eax 206 C ebx zero, will be dst[5] 207 C ecx dst 208 C edx dst[4] 209 C esi dst[1] 210 C edi dst[2] 211 C ebp dst[3] 212 213 adcl $0, %edx 214 addl %esi, %esi 215 216 adcl %edi, %edi 217 218 adcl %ebp, %ebp 219 220 adcl %edx, %edx 221 movl 4(%ecx), %eax 222 223 adcl $0, %ebx 224 addl %esi, %eax 225 226 movl %eax, 4(%ecx) 227 movl 8(%ecx), %eax 228 229 adcl %edi, %eax 230 movl 12(%ecx), %esi 231 232 adcl %ebp, %esi 233 movl 16(%ecx), %edi 234 235 movl %eax, 8(%ecx) 236 movl %esi, 12(%ecx) 237 238 adcl %edx, %edi 239 popl %esi 240 241 movl 20(%ecx), %eax 242 movl %edi, 16(%ecx) 243 244 popl %edi 245 popl %ebp 246 247 adcl %ebx, %eax C no carry out of this 248 popl %ebx 249 250 movl %eax, 20(%ecx) 251 252 ret 253 254 255 C ----------------------------------------------------------------------------- 256 ALIGN(8) 257 L(four_or_more): 258 C eax src low limb 259 C ebx src 260 C ecx dst 261 C edx size 262 C esi 263 C edi 264 C ebp 265 C 266 C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 267 268 deflit(`FRAME',4) 269 270 pushl %edi 271 FRAME_pushl() 272 pushl %esi 273 FRAME_pushl() 274 275 pushl %ebp 276 FRAME_pushl() 277 leal (%ecx,%edx,4), %edi C dst end of this mul1 278 279 leal (%ebx,%edx,4), %esi C src end 280 movl %ebx, %ebp C src 281 282 negl %edx C -size 283 xorl %ebx, %ebx C clear carry limb and carry flag 284 285 leal 1(%edx), %ecx C -(size-1) 286 287 L(mul1): 288 C eax scratch 289 C ebx carry 290 C ecx counter, negative 291 C edx scratch 292 C esi &src[size] 293 C edi &dst[size] 294 C ebp src 295 296 adcl $0, %ebx 297 movl (%esi,%ecx,4), %eax 298 299 mull (%ebp) 300 301 addl %eax, %ebx 302 303 movl %ebx, (%edi,%ecx,4) 304 incl %ecx 305 306 movl %edx, %ebx 307 jnz L(mul1) 308 309 310 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for 311 C n=1..size-2. 312 C 313 C The last two products, which are the end corner of the product 314 C triangle, are handled separately to save looping overhead. These 315 C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. 316 C If size is 4 then it's only these that need to be done. 317 C 318 C In the outer loop %esi is a constant, and %edi just advances by 1 319 C limb each time. The size of the operation decreases by 1 limb 320 C each time. 321 322 C eax 323 C ebx carry (needing carry flag added) 324 C ecx 325 C edx 326 C esi &src[size] 327 C edi &dst[size] 328 C ebp 329 330 adcl $0, %ebx 331 movl PARAM_SIZE, %edx 332 333 movl %ebx, (%edi) 334 subl $4, %edx 335 336 negl %edx 337 jz L(corner) 338 339 340 L(outer): 341 C ebx previous carry limb to store 342 C edx outer loop counter (negative) 343 C esi &src[size] 344 C edi dst, pointing at stored carry limb of previous loop 345 346 pushl %edx C new outer loop counter 347 leal -2(%edx), %ecx 348 349 movl %ebx, (%edi) 350 addl $4, %edi 351 352 addl $4, %ebp 353 xorl %ebx, %ebx C initial carry limb, clear carry flag 354 355 L(inner): 356 C eax scratch 357 C ebx carry (needing carry flag added) 358 C ecx counter, negative 359 C edx scratch 360 C esi &src[size] 361 C edi dst end of this addmul 362 C ebp &src[j] 363 364 adcl $0, %ebx 365 movl (%esi,%ecx,4), %eax 366 367 mull (%ebp) 368 369 addl %ebx, %eax 370 movl (%edi,%ecx,4), %ebx 371 372 adcl $0, %edx 373 addl %eax, %ebx 374 375 movl %ebx, (%edi,%ecx,4) 376 incl %ecx 377 378 movl %edx, %ebx 379 jnz L(inner) 380 381 382 adcl $0, %ebx 383 popl %edx C outer loop counter 384 385 incl %edx 386 jnz L(outer) 387 388 389 movl %ebx, (%edi) 390 391 L(corner): 392 C esi &src[size] 393 C edi &dst[2*size-4] 394 395 movl -8(%esi), %eax 396 movl -4(%edi), %ebx C risk of data cache bank clash here 397 398 mull -12(%esi) C src[size-2]*src[size-3] 399 400 addl %eax, %ebx 401 movl %edx, %ecx 402 403 adcl $0, %ecx 404 movl -4(%esi), %eax 405 406 mull -12(%esi) C src[size-1]*src[size-3] 407 408 addl %ecx, %eax 409 movl (%edi), %ecx 410 411 adcl $0, %edx 412 movl %ebx, -4(%edi) 413 414 addl %eax, %ecx 415 movl %edx, %ebx 416 417 adcl $0, %ebx 418 movl -4(%esi), %eax 419 420 mull -8(%esi) C src[size-1]*src[size-2] 421 422 movl %ecx, (%edi) 423 addl %eax, %ebx 424 425 adcl $0, %edx 426 movl PARAM_SIZE, %eax 427 428 negl %eax 429 movl %ebx, 4(%edi) 430 431 addl $1, %eax C -(size-1) and clear carry 432 movl %edx, 8(%edi) 433 434 435 C ----------------------------------------------------------------------------- 436 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 437 438 L(lshift): 439 C eax counter, negative 440 C ebx next limb 441 C ecx 442 C edx 443 C esi 444 C edi &dst[2*size-4] 445 C ebp 446 447 movl 12(%edi,%eax,8), %ebx 448 449 rcll %ebx 450 movl 16(%edi,%eax,8), %ecx 451 452 rcll %ecx 453 movl %ebx, 12(%edi,%eax,8) 454 455 movl %ecx, 16(%edi,%eax,8) 456 incl %eax 457 458 jnz L(lshift) 459 460 461 adcl %eax, %eax C high bit out 462 movl PARAM_SRC, %esi 463 464 movl PARAM_SIZE, %ecx C risk of cache bank clash 465 movl %eax, 12(%edi) C dst most significant limb 466 467 468 C ----------------------------------------------------------------------------- 469 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., 470 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the 471 C low limb of src[0]^2. 472 473 movl (%esi), %eax C src[0] 474 leal (%esi,%ecx,4), %esi C src end 475 476 negl %ecx 477 478 mull %eax 479 480 movl %eax, 16(%edi,%ecx,8) C dst[0] 481 movl %edx, %ebx 482 483 addl $1, %ecx C size-1 and clear carry 484 485 L(diag): 486 C eax scratch (low product) 487 C ebx carry limb 488 C ecx counter, negative 489 C edx scratch (high product) 490 C esi &src[size] 491 C edi &dst[2*size-4] 492 C ebp scratch (fetched dst limbs) 493 494 movl (%esi,%ecx,4), %eax 495 adcl $0, %ebx 496 497 mull %eax 498 499 movl 16-4(%edi,%ecx,8), %ebp 500 501 addl %ebp, %ebx 502 movl 16(%edi,%ecx,8), %ebp 503 504 adcl %eax, %ebp 505 movl %ebx, 16-4(%edi,%ecx,8) 506 507 movl %ebp, 16(%edi,%ecx,8) 508 incl %ecx 509 510 movl %edx, %ebx 511 jnz L(diag) 512 513 514 adcl $0, %edx 515 movl 16-4(%edi), %eax C dst most significant limb 516 517 addl %eax, %edx 518 popl %ebp 519 520 movl %edx, 16-4(%edi) 521 popl %esi C risk of cache bank clash 522 523 popl %edi 524 popl %ebx 525 526 ret 527 528 EPILOGUE()