github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/sqr_basecase.asm (about) 1 dnl AMD K7 mpn_sqr_basecase -- square an mpn number. 2 3 dnl Copyright 1999-2002 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product 35 C (measured on the speed difference between 25 and 50 limbs, which is 36 C roughly the Karatsuba recursing range). 37 38 39 dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for 40 dnl some comments. 
deflit(SQR_TOOM2_THRESHOLD_MAX, 66)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.


C Parameters are at fixed offsets from %esp on entry (cdecl, 32-bit).
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	C Dispatch on the operand size: size==1 falls through, size==2 and
	C size>=3 branch to their own code paths.
	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax
	cmpl	$2, %ecx

	movl	PARAM_DST, %edx
	je	L(two_limbs)
	ja	L(three_or_more)


C------------------------------------------------------------------------------
C one limb only
C eax	src
C ecx	size
C edx	dst

	C dst[1],dst[0] = src[0]^2 (mull leaves the 64-bit product in edx:eax)
	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%edx, 4(%ecx)
	movl	%eax, (%ecx)
	ret


C------------------------------------------------------------------------------
C
C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second gets store to load forwarding.
	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst
deflit(`FRAME',0)

	pushl	%ebx	FRAME_pushl()
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)	C dst[1]

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)	C dst[2]
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)	C dst[3]

	mull	4(%ebx)		C src[0]*src[1]

	popl	%ebx

	C The cross product src[0]*src[1] must be doubled; it's added into
	C dst[1..3] twice rather than shifted.  Neither pass can carry out of
	C dst[3] (the result fits in 2*size limbs), hence the ASSERT(nc)s.
	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	ret


C------------------------------------------------------------------------------
C Stack frame slots for the callee-saved registers (below %esp after the
C subl in L(three_or_more)).
defframe(SAVE_EBX,  -4)
defframe(SAVE_ESI,  -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(STACK_SPACE, 16)

L(three_or_more):
	subl	$STACK_SPACE, %esp
	cmpl	$4, %ecx
	jae	L(four_or_more)
deflit(`FRAME',STACK_SPACE)


C------------------------------------------------------------------------------
C Three limbs
C
C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcls to memory.
C eax	src
C ecx	size
C edx	dst

	movl	%ebx, SAVE_EBX
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst
	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C Store the three squares at dst[0..5] first; the doubled cross
	C products are added in afterwards.
	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax
	movl	%edx, 4(%ecx)

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax
	movl	%edx, 12(%ecx)

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax
	movl	%edx, 20(%ecx)

	C Cross products accumulate in registers: esi:edi:ebp:edx spanning
	C limb positions 1..4.
	mull	4(%ebx)		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax
	movl	%edx, %edi

	mull	8(%ebx)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%ebp, SAVE_EBP
	movl	$0, %ebp

	movl	4(%ebx), %eax
	adcl	%edx, %ebp

	mull	8(%ebx)		C src[1] * src[2]

	xorl	%ebx, %ebx
	addl	%eax, %ebp

	adcl	$0, %edx

	C eax
	C ebx	zero, will be dst[5]
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	C NOTE(review): the adcl above already absorbed the carry from the
	C addl, which leaves CF clear, so this second adcl $0, %edx looks
	C like a no-op -- confirm against the upstream GMP sources.
	adcl	$0, %edx
	C Double the cross products: shift the esi:edi:ebp:edx chain left by
	C one bit, top bit landing in ebx (dst[5]).
	addl	%esi, %esi

	adcl	%edi, %edi
	movl	4(%ecx), %eax

	adcl	%ebp, %ebp

	adcl	%edx, %edx

	adcl	$0, %ebx
	C Add the doubled cross products into the squares already stored at
	C dst[1..5], using separate loads and stores (see comment above).
	addl	%eax, %esi
	movl	8(%ecx), %eax

	adcl	%eax, %edi
	movl	12(%ecx), %eax
	movl	%esi, 4(%ecx)

	adcl	%eax, %ebp
	movl	16(%ecx), %eax
	movl	%edi, 8(%ecx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	adcl	%eax, %edx
	movl	20(%ecx), %eax
	movl	%ebp, 12(%ecx)

	adcl	%ebx, %eax
	ASSERT(nc)		C no carry out of the top limb
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	movl	%edx, 16(%ecx)
	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret


C------------------------------------------------------------------------------
L(four_or_more):

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.
C eax	src
C ebx
C ecx	size
C edx	dst
C esi
C edi
C ebp

defframe(`VAR_COUNTER',-20)
defframe(`VAR_JMP',    -24)
deflit(EXTRA_STACK_SPACE, 8)

	movl	%ebx, SAVE_EBX
	movl	%edi, SAVE_EDI
	leal	(%edx,%ecx,4), %edi	C &dst[size]

	movl	%esi, SAVE_ESI
	movl	%ebp, SAVE_EBP
	leal	(%eax,%ecx,4), %esi	C &src[size]

	movl	(%eax), %ebp	C multiplier
	movl	$0, %ebx
	decl	%ecx

	C ecx becomes -(size-1): the loop indexes upwards from src[1]
	C through src[size-1] relative to &src[size].
	negl	%ecx
	subl	$EXTRA_STACK_SPACE, %esp
FRAME_subl_esp(EXTRA_STACK_SPACE)

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul_1)


C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as in mpn_addmul_1, see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems no way to avoid this.

dnl  Each unrolled chunk must assemble to exactly this many bytes, since
dnl  the computed jump steps by this amount.
dnl  This value also hard coded in some shifts and adds
deflit(CODE_BYTES_PER_LIMB, 17)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31, but above that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))

dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.

deflit(UNROLL_INNER_END,
`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	C Store the final carry of the mul_1 pass as dst[size].
	movl	PARAM_SIZE, %ecx
	movl	%ebx, (%edi)

	subl	$4, %ecx
	jz	L(corner)	C size==4: only the corner products remain

	negl	%ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

	C Compute the initial jump address: ecx = 16*count and edx = count,
	C so the (%ecx,%edx) addressing below forms count*17, i.e.
	C count*CODE_BYTES_PER_LIMB, relative to the unrolled code end.
	movl	%ecx, %edx
	shll	$4, %ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')


	C The calculated jump mustn't come out to before the start of the
	C code available.  This is the limit UNROLL_COUNT puts on the src
	C operand size, but checked here directly using the jump address.
ASSERT(ae,
`movl_text_address(L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C------------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand

	movl	%edx, VAR_COUNTER

	mull	%ebp

dnl  The unrolled chunks alternate which of ecx/ebx holds the low and high
dnl  carry; cmovX picks the parity appropriate for where the computed jump
dnl  lands (tested via the low bit of the jump address in %cl).
define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl
	movl	%edx, %ebx	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse

	C Step the jump one chunk forward for this outer iteration, so each
	C pass does one fewer inner product.
	leal	CODE_BYTES_PER_LIMB(%edx), %eax
	xorl	%edx, %edx
	leal	4(%edi), %edi

	movl	%eax, VAR_JMP

	jmp	*%eax


ifdef(`PIC',`
L(pic_calc):
	C PIC variant of the leal above: the return address on the stack
	C supplies the runtime address of L(here).
	addl	(%esp), %ecx
	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
	addl	%edx, %ecx
	ret_internal
')


	C Must be an even address to preserve the significance of the low
	C bit of the jump address indicating which way around ecx/ebx should
	C start.
	ALIGN(2)

L(unroll_inner_start):
	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

dnl  Generates UNROLL_COUNT chunks of addmul code, each assembling to
dnl  exactly CODE_BYTES_PER_LIMB bytes (Zdisp forces an explicit zero
dnl  displacement byte where needed to keep the size fixed).  Successive
dnl  chunks alternate ecx/ebx as the low/high carry.
forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
	Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

	Zdisp(	addl,	%ecx, disp_dst,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx

',`
	dnl  this bit comes out last
	Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ecx

	mull	%ebp

	Zdisp(	addl,	%ebx, disp_dst,(%edi))

ifelse(forloop_last,0,
`	movl	$0, %ebx')

	adcl	%eax, %ecx
')
')

	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	C Flush the remaining low and high carries into the top two dst limbs
	C of this addmul pass, then advance the outer loop.
	adcl	$0, %edx
	addl	%ecx, -4+OFFSET(%edi)
	movl	VAR_JMP, %ecx

	adcl	$0, %edx

	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
	movl	VAR_COUNTER, %edx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C------------------------------------------------------------------------------
L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-5]
	C
	C The bottom-right corner of the product triangle:
	C src[size-3]*src[size-2], src[size-3]*src[size-1] and
	C src[size-2]*src[size-1].

	movl	-12(%esi), %ebp
	movl	-8(%esi), %eax
	movl	%eax, %ecx

	mull	%ebp		C src[size-3]*src[size-2]

	addl	%eax, -4(%edi)
	movl	-4(%esi), %eax

	adcl	$0, %edx
	movl	%edx, %ebx
	movl	%eax, %esi

	mull	%ebp		C src[size-3]*src[size-1]

	addl	%ebx, %eax

	adcl	$0, %edx
	addl	%eax, (%edi)
	movl	%esi, %eax

	adcl	$0, %edx
	movl	%edx, %ebx

	mull	%ecx		C src[size-2]*src[size-1]

	addl	%ebx, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx
	movl	%edx, 8(%edi)



C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
L(lshift_start):
	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edi
	xorl	%ecx, %ecx	C clear carry

	leal	(%edi,%eax,8), %edi
	notl	%eax		C -size-1, preserve carry

	leal	2(%eax), %eax	C -(size-1)

L(lshift):
	C eax	counter, negative
	C ebx
	C ecx
	C edx
	C esi
	C edi	dst, pointing just after last limb
	C ebp
	C
	C Two limbs are shifted per iteration (the 8-scale index steps two
	C 4-byte limbs at a time), with rcl carrying the bit across.

	rcll	-4(%edi,%eax,8)
	rcll	(%edi,%eax,8)
	incl	%eax
	jnz	L(lshift)

	setc	%al		C bit shifted out of the top

	movl	PARAM_SRC, %esi
	movl	%eax, -4(%edi)	C dst most significant limb

	movl	PARAM_SIZE, %ecx


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.

	movl	(%esi), %eax	C src[0]

	mull	%eax

	leal	(%esi,%ecx,4), %esi	C src point just after last limb
	negl	%ecx

	movl	%eax, (%edi,%ecx,8)	C dst[0]
	incl	%ecx

L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	src just after last limb
	C edi	dst just after last limb
	C ebp
	C
	C Each square spans two dst limbs; the high limb of one square is
	C carried (in edx) into the low limb of the next.

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, -4(%edi,%ecx,8)
	adcl	%eax, (%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, -4(%edi)	C dst most significant limb
	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret

EPILOGUE()