dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C     product (measured on the speed difference between 17 and 33 limbs,
C     which is roughly the Karatsuba recursing range).


dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl  code supports.  This value is used only by the tune program to know
dnl  what it can go up to.  (An attempt to compile with a bigger value will
dnl  trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl  The value is determined by requiring the displacements in the unrolled
dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)


dnl  Allow a value from the tune program to override config.m4.

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')


dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl  The first addmul is the biggest, and this takes the second least
dnl  significant limb and multiplies it by the third least significant and
dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which if understood could give a simple speedup.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax

	cmpl	$2, %ecx
	je	L(two_limbs)

	movl	PARAM_DST, %edx
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	C dst pointer saved in %ecx because the mull below overwrites %edx
	movl	%edx, %ecx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src
deflit(`FRAME',4)

	movl	(%ebx), %eax
	movl	PARAM_DST, %ecx

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0]*src[1]

	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	popl	%ebx
	C cross product added a second time, giving the doubled
	C 2*src[0]*src[1] middle term of the square
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
deflit(`FRAME',0)
	cmpl	$4, %ecx
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs
	C eax	src
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src

	movl	(%ebx), %eax
	movl	%edx, %ecx	C dst

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)
	pushl	%esi

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax

	movl	%edx, 12(%ecx)
	pushl	%edi

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 20(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax

	movl	%edx, %edi
	movl	8(%ebx), %edx

	pushl	%ebp
	xorl	%ebp, %ebp

	mull	%edx		C src[0] * src[2]

	addl	%eax, %edi
	movl	4(%ebx), %eax

	adcl	%edx, %ebp

	movl	8(%ebx), %edx

	mull	%edx		C src[1] * src[2]

	addl	%eax, %ebp

	adcl	$0, %edx


	C eax	will be dst[5]
	C ebx
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	C double the cross products; the bit shifted out of the top
	C becomes the high limb dst[5] in %eax
	xorl	%eax, %eax
	addl	%esi, %esi
	adcl	%edi, %edi
	adcl	%ebp, %ebp
	adcl	%edx, %edx
	adcl	$0, %eax

	addl	%esi, 4(%ecx)
	adcl	%edi, 8(%ecx)
	adcl	%ebp, 12(%ecx)

	popl	%ebp
	popl	%edi

	adcl	%edx, 16(%ecx)

	popl	%esi
	popl	%ebx

	adcl	%eax, 20(%ecx)
	ASSERT(nc)

	ret


C -----------------------------------------------------------------------------

defframe(SAVE_EBX,   -4)
defframe(SAVE_ESI,   -8)
defframe(SAVE_EDI,  -12)
defframe(SAVE_EBP,  -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,   -24)
deflit(STACK_SPACE, 24)

	ALIGN(16)
L(four_or_more):

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	leal	4(%edx), %edi

	movl	%ebx, SAVE_EBX
	leal	4(%eax), %ebx

	movl	%esi, SAVE_ESI
	xorl	%esi, %esi

	movl	%ebp, SAVE_EBP

	C eax
	C ebx	src+4
	C ecx	size
	C edx
	C esi
	C edi	dst+4
	C ebp

	movl	(%eax), %ebp	C multiplier
	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary


	ALIGN(16)
L(mul_1):
	C eax	scratch
	C ebx	src ptr
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst ptr
	C ebp	multiplier

	movl	(%ebx), %eax
	addl	$4, %ebx

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi)
	addl	$4, %edi

	loop	L(mul_1)


C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is good
C actually because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.


dnl  This value is also implicitly encoded in a shift and add.
dnl
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31.  Above that an offset must be added to them.
dnl
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))

	C eax
	C ebx	&src[size]
	C ecx
	C edx
	C esi	carry
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%esi, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %ebx')

	shll	$4, %ecx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %edi')

	negl	%ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx


	C The calculated jump mustn't be before the start of the available
	C code.  This is the limitation UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.
	C
	ASSERT(ae,`
	movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx
	')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	&src[size], constant
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	high limb to store
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

	testb	$1, %cl

	movl	%edx, %esi	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	movl	%edx, VAR_JMP
	leal	4(%edi), %edi

	C A branch-free version of this using some xors was found to be a
	C touch slower than just a conditional jump, despite the jump
	C switching between taken and not taken on every loop.

ifelse(eval(UNROLL_COUNT%2),0,
	jz,jnz)	L(unroll_noswap)
	movl	%esi, %eax	C high,low carry other way around

	movl	%ecx, %esi
	movl	%eax, %ecx
L(unroll_noswap):

	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/esi should start.
	C
	C An attempt was made at padding here to get the end of the unrolled
	C code to come out on a good alignment, to save padding before
	C L(corner).  This worked, but turned out to run slower than just an
	C ALIGN(2).  The reason for this is not clear, it might be related
	C to the different speeds on different UNROLL_COUNTs noted above.

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	src
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	dst
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/esi swapped on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%esi, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0( %esi)
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0( %ecx)
')
')
L(unroll_inner_end):

	addl	%esi, -4+OFFSET(%edi)

	movl	VAR_COUNTER, %edx
	jadcl0( %ecx)

	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %ebx
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C ebx	&src[size]
	C edi	&dst[2*size-5]

	movl	-12(%ebx), %ebp

	movl	-8(%ebx), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	adcl	$0, %edx

	movl	-4(%ebx), %eax
	movl	%edx, %esi
	movl	%eax, %ebx

	mull	%ebp

	addl	%esi, %eax
	adcl	$0, %edx

	addl	%eax, (%edi)
	adcl	$0, %edx

	movl	%edx, %esi
	movl	%ebx, %eax

	mull	%ecx

	addl	%esi, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx

	movl	%edx, 8(%edi)


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C The loop measures about 6 cycles/iteration, though it looks like it should
C decode in 5.

L(lshift_start):
	movl	PARAM_SIZE, %ecx

	movl	PARAM_DST, %edi
	subl	$1, %ecx		C size-1 and clear carry

	movl	PARAM_SRC, %ebx
	movl	%ecx, %edx

	xorl	%eax, %eax		C ready for adcl


	ALIGN(16)
L(lshift):
	C eax
	C ebx	src (for later use)
	C ecx	counter, decrementing
	C edx	size-1 (for later use)
	C esi
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)
	leal	8(%edi), %edi
	loop	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%ebx), %eax		C src[0]

	leal	4(%ebx,%edx,4), %ebx	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.


	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


	ALIGN(16)
L(diag):
	C eax	scratch
	C ebx	&src[size]
	C ecx	counter, negative
	C edx	carry
	C esi	scratch
	C edi	dst[2*size-2]
	C ebp

	movl	(%ebx,%ecx,4), %eax
	movl	%edx, %esi

	mull	%eax

	addl	%esi, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret



C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()