dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in a wind-down code, and use when iterating
C    outer each loop.  ("Overlapping software pipelining")
C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
C    all pushes.
C  * Perhaps write special code for n < M, for some small M.
C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
C    with even less pipelined code.
C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
C    Consider breaking out earlier, saving high the cost of short loops.

C void mpn_sqr_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn);

C Register roles (32-bit cdecl; args at 12/16/20(%esp) after the two pushes):
C   rp (edi)  result pointer, walks the triangular product
C   up (esi)  source operand pointer
C   n  (ecx)  loop count, negated below ('neg n') so 'inc n' counts toward 0
C   un (ebp)  inner-loop count, also negated
C   mm7       current invariant U limb; mm6 running 64-bit accumulator

define(`rp',  `%edi')
define(`up',  `%esi')
define(`n',   `%ecx')

define(`un',  `%ebp')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	push	%edi
	push	%esi
	mov	12(%esp), rp
	mov	16(%esp), up
	mov	20(%esp), n

	lea	4(rp), rp	C write triangular product starting at rp[1]
	dec	n
	movd	(up), %mm7

	jz	L(one)		C xn = 1: single-limb square, fast exit
	lea	4(up), up
	push	%ebx
	push	%ebp
	mov	n, %eax

	movd	(up), %mm0
	neg	n
	pmuludq	%mm7, %mm0
	pxor	%mm6, %mm6	C clear carry accumulator
	mov	n, un

	and	$3, %eax	C dispatch on (xn-1) mod 4 to the matching
	jz	L(of0)		C mul_1 feed-in point below
	cmp	$2, %eax
	jc	L(of1)
	jz	L(of2)

C ================================================================
C mul_1 loop, entry for count = 3 (mod 4)
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 8(rp)
	lea	16(up), up
	js	L(lm3)

	psrlq	$32, %mm6
	movd	%mm6, 12(rp)	C store final carry limb

	inc	n
C	jz	L(done)
	lea	-12(up), up
	lea	4(rp), rp
	jmp	L(ol2)

C ================================================================
C mul_1 loop, entry for count = 0 (mod 4)
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 12(rp)
	lea	16(up), up
	js	L(lm0)

	psrlq	$32, %mm6
	movd	%mm6, 16(rp)	C store final carry limb

	inc	n
C	jz	L(done)
	lea	-8(up), up
	lea	8(rp), rp
	jmp	L(ol3)

C ================================================================
C mul_1 loop, entry for count = 1 (mod 4)
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, (rp)
	lea	16(up), up
	js	L(lm1)

	psrlq	$32, %mm6
	movd	%mm6, 4(rp)	C store final carry limb

	inc	n
	jz	L(done)		C goes away when we add special n=2 code
	lea	-20(up), up
	lea	-4(rp), rp
	jmp	L(ol0)

C ================================================================
C mul_1 loop, entry for count = 2 (mod 4); falls through into L(ol1)
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 4(rp)
	lea	16(up), up
	js	L(lm2)

	psrlq	$32, %mm6
	movd	%mm6, 8(rp)	C store final carry limb

	inc	n
C	jz	L(done)
	lea	-16(up), up
C	lea	(rp), rp
C	jmp	L(ol1)

C ================================================================
C addmul_1 outer loop, phase 1 of 4

L(ol1):	lea	4(up,n,4), up
	movd	(up), %mm7	C read next U invariant limb
	lea	8(rp,n,4), rp
	mov	n, un

	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	inc	un
	jz	L(re1)

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a1)

L(la1):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
L(a1):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la1)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================
C addmul_1 outer loop, phase 2 of 4

L(ol0):	lea	(up,n,4), up
	movd	4(up), %mm7	C read next U invariant limb
	lea	4(rp,n,4), rp
	mov	n, un

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	12(up), %mm1
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a0)

L(la0):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
L(a0):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la0)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================
C addmul_1 outer loop, phase 3 of 4

L(ol3):	lea	12(up,n,4), up
	movd	-8(up), %mm7	C read next U invariant limb
	lea	(rp,n,4), rp	C put rp back
	mov	n, un

	movd	-4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	movd	(up), %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a3)

L(la3):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
L(a3):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la3)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================
C addmul_1 outer loop, phase 4 of 4; loops back to L(ol1)

L(ol2):	lea	8(up,n,4), up
	movd	-4(up), %mm7	C read next U invariant limb
	lea	12(rp,n,4), rp
	mov	n, un

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx
	sar	$2, un
	movd	4(up), %mm1
	test	un, un		C clear carry
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	inc	un
	jnz	L(a2)
	jmp	L(re2)		C 2x1 addmul_1 remains: wind down

L(la2):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a2):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la2)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n
	jmp	L(ol1)

C ================================================================
C Wind-down for the final 2-limb and 1-limb addmul_1 steps
L(re2):	psrlq	$32, %mm0
	movd	(up), %mm7	C read next U invariant limb
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	lea	4(rp), rp
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	movd	4(up), %mm1
	adc	un, %eax
	add	%ebx, (rp)
	pmuludq	%mm7, %mm1
	adc	un, %eax
	mov	%eax, 4(rp)
	movd	%mm1, %ebx

L(re1):	psrlq	$32, %mm1
	add	%ebx, 4(rp)
	movd	%mm1, %eax
	adc	un, %eax
	xor	n, n		C make n zeroness assumption below true
	mov	%eax, 8(rp)

C Final pass: double the triangular product and add the diagonal squares
C x[i]^2 (pmuludq of a limb with itself); the carry bit is threaded through
C n via adc n,n / rcr n across each 4-limb group.
L(done):			C n is zero here
	mov	24(%esp), up
	mov	28(%esp), %eax

	movd	(up), %mm0
	inc	%eax
	pmuludq	%mm0, %mm0	C x[0]^2
	lea	4(up), up
	mov	20(%esp), rp
	shr	%eax
	movd	%mm0, (rp)
	psrlq	$32, %mm0
	lea	-12(rp), rp
	mov	%eax, 28(%esp)
	jnc	L(odd)

	movd	%mm0, %ebp
	movd	(up), %mm0
	lea	8(rp), rp
	pmuludq	%mm0, %mm0
	lea	-4(up), up
	add	8(rp), %ebp
	movd	%mm0, %edx
	adc	12(rp), %edx
	rcr	n
	jmp	L(ent)

C	ALIGN(16)		C alignment seems irrelevant
L(top):	movd	(up), %mm1
	adc	n, n		C restore saved carry into CF
	movd	%mm0, %eax
	pmuludq	%mm1, %mm1
	movd	4(up), %mm0
	adc	(rp), %eax
	movd	%mm1, %ebx
	pmuludq	%mm0, %mm0
	psrlq	$32, %mm1
	adc	4(rp), %ebx
	movd	%mm1, %ebp
	movd	%mm0, %edx
	adc	8(rp), %ebp
	adc	12(rp), %edx
	rcr	n		C FIXME: isn't this awfully slow on atom???
	adc	%eax, (rp)
	adc	%ebx, 4(rp)
L(ent):	lea	8(up), up
	adc	%ebp, 8(rp)
	psrlq	$32, %mm0
	adc	%edx, 12(rp)
L(odd):	decl	28(%esp)
	lea	16(rp), rp
	jnz	L(top)

L(end):	adc	n, n
	movd	%mm0, %eax
	adc	n, %eax
	mov	%eax, (rp)	C top limb

L(rtn):	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret

C xn = 1: product is simply x[0]^2, stored as two limbs
L(one):	pmuludq	%mm7, %mm7
	movq	%mm7, -4(rp)
	emms
	pop	%esi
	pop	%edi
	ret
EPILOGUE()