dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
C     product (measured on the speed difference between 20 and 40 limbs,
C     which is the Karatsuba recursing range).


dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
dnl  a description.  The only difference here is that UNROLL_COUNT can go up
dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
42 43 deflit(SQR_TOOM2_THRESHOLD_MAX, 67) 44 45 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE', 46 `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)') 47 48 m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD') 49 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3)) 50 51 52 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); 53 C 54 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a 55 C lot of function call overheads are avoided, especially when the given size 56 C is small. 57 C 58 C The code size might look a bit excessive, but not all of it is executed so 59 C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases 60 C clearly apply only to those sizes; mid sizes like 10x10 only need part of 61 C the unrolled addmul; and big sizes like 40x40 that do use the full 62 C unrolling will least be making good use of it, because 40x40 will take 63 C something like 7000 cycles. 64 65 defframe(PARAM_SIZE,12) 66 defframe(PARAM_SRC, 8) 67 defframe(PARAM_DST, 4) 68 69 TEXT 70 ALIGN(32) 71 PROLOGUE(mpn_sqr_basecase) 72 deflit(`FRAME',0) 73 74 movl PARAM_SIZE, %edx 75 76 movl PARAM_SRC, %eax 77 78 cmpl $2, %edx 79 movl PARAM_DST, %ecx 80 je L(two_limbs) 81 82 movl (%eax), %eax 83 ja L(three_or_more) 84 85 86 C ----------------------------------------------------------------------------- 87 C one limb only 88 C eax src limb 89 C ebx 90 C ecx dst 91 C edx 92 93 mull %eax 94 95 movl %eax, (%ecx) 96 movl %edx, 4(%ecx) 97 98 ret 99 100 101 C ----------------------------------------------------------------------------- 102 L(two_limbs): 103 C eax src 104 C ebx 105 C ecx dst 106 C edx 107 108 defframe(SAVE_ESI, -4) 109 defframe(SAVE_EBX, -8) 110 defframe(SAVE_EDI, -12) 111 defframe(SAVE_EBP, -16) 112 deflit(`STACK_SPACE',16) 113 114 subl $STACK_SPACE, %esp 115 deflit(`FRAME',STACK_SPACE) 116 117 movl %esi, SAVE_ESI 118 movl %eax, %esi 119 movl (%eax), %eax 120 121 mull %eax C src[0]^2 122 123 movl %eax, (%ecx) C dst[0] 124 movl 4(%esi), 
%eax 125 126 movl %ebx, SAVE_EBX 127 movl %edx, %ebx C dst[1] 128 129 mull %eax C src[1]^2 130 131 movl %edi, SAVE_EDI 132 movl %eax, %edi C dst[2] 133 movl (%esi), %eax 134 135 movl %ebp, SAVE_EBP 136 movl %edx, %ebp C dst[3] 137 138 mull 4(%esi) C src[0]*src[1] 139 140 addl %eax, %ebx 141 movl SAVE_ESI, %esi 142 143 adcl %edx, %edi 144 145 adcl $0, %ebp 146 addl %ebx, %eax 147 movl SAVE_EBX, %ebx 148 149 adcl %edi, %edx 150 movl SAVE_EDI, %edi 151 152 adcl $0, %ebp 153 154 movl %eax, 4(%ecx) 155 156 movl %ebp, 12(%ecx) 157 movl SAVE_EBP, %ebp 158 159 movl %edx, 8(%ecx) 160 addl $FRAME, %esp 161 162 ret 163 164 165 C ----------------------------------------------------------------------------- 166 L(three_or_more): 167 C eax src low limb 168 C ebx 169 C ecx dst 170 C edx size 171 deflit(`FRAME',0) 172 173 pushl %esi defframe_pushl(`SAVE_ESI') 174 cmpl $4, %edx 175 176 movl PARAM_SRC, %esi 177 jae L(four_or_more) 178 179 180 C ----------------------------------------------------------------------------- 181 C three limbs 182 183 C eax src low limb 184 C ebx 185 C ecx dst 186 C edx 187 C esi src 188 C edi 189 C ebp 190 191 pushl %ebp defframe_pushl(`SAVE_EBP') 192 pushl %edi defframe_pushl(`SAVE_EDI') 193 194 mull %eax C src[0] ^ 2 195 196 movl %eax, (%ecx) 197 movl %edx, 4(%ecx) 198 199 movl 4(%esi), %eax 200 xorl %ebp, %ebp 201 202 mull %eax C src[1] ^ 2 203 204 movl %eax, 8(%ecx) 205 movl %edx, 12(%ecx) 206 movl 8(%esi), %eax 207 208 pushl %ebx defframe_pushl(`SAVE_EBX') 209 210 mull %eax C src[2] ^ 2 211 212 movl %eax, 16(%ecx) 213 movl %edx, 20(%ecx) 214 215 movl (%esi), %eax 216 217 mull 4(%esi) C src[0] * src[1] 218 219 movl %eax, %ebx 220 movl %edx, %edi 221 222 movl (%esi), %eax 223 224 mull 8(%esi) C src[0] * src[2] 225 226 addl %eax, %edi 227 movl %edx, %ebp 228 229 adcl $0, %ebp 230 movl 4(%esi), %eax 231 232 mull 8(%esi) C src[1] * src[2] 233 234 xorl %esi, %esi 235 addl %eax, %ebp 236 237 C eax 238 C ebx dst[1] 239 C ecx dst 240 C edx dst[4] 241 C esi 
zero, will be dst[5] 242 C edi dst[2] 243 C ebp dst[3] 244 245 adcl $0, %edx 246 addl %ebx, %ebx 247 248 adcl %edi, %edi 249 250 adcl %ebp, %ebp 251 252 adcl %edx, %edx 253 movl 4(%ecx), %eax 254 255 adcl $0, %esi 256 addl %ebx, %eax 257 258 movl %eax, 4(%ecx) 259 movl 8(%ecx), %eax 260 261 adcl %edi, %eax 262 movl 12(%ecx), %ebx 263 264 adcl %ebp, %ebx 265 movl 16(%ecx), %edi 266 267 movl %eax, 8(%ecx) 268 movl SAVE_EBP, %ebp 269 270 movl %ebx, 12(%ecx) 271 movl SAVE_EBX, %ebx 272 273 adcl %edx, %edi 274 movl 20(%ecx), %eax 275 276 movl %edi, 16(%ecx) 277 movl SAVE_EDI, %edi 278 279 adcl %esi, %eax C no carry out of this 280 movl SAVE_ESI, %esi 281 282 movl %eax, 20(%ecx) 283 addl $FRAME, %esp 284 285 ret 286 287 288 289 C ----------------------------------------------------------------------------- 290 defframe(VAR_COUNTER,-20) 291 defframe(VAR_JMP, -24) 292 deflit(`STACK_SPACE',24) 293 294 L(four_or_more): 295 C eax src low limb 296 C ebx 297 C ecx 298 C edx size 299 C esi src 300 C edi 301 C ebp 302 deflit(`FRAME',4) dnl %esi already pushed 303 304 C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 305 306 subl $STACK_SPACE-FRAME, %esp 307 deflit(`FRAME',STACK_SPACE) 308 movl $1, %ecx 309 310 movl %edi, SAVE_EDI 311 movl PARAM_DST, %edi 312 313 movl %ebx, SAVE_EBX 314 subl %edx, %ecx C -(size-1) 315 316 movl %ebp, SAVE_EBP 317 movl $0, %ebx C initial carry 318 319 leal (%esi,%edx,4), %esi C &src[size] 320 movl %eax, %ebp C multiplier 321 322 leal -4(%edi,%edx,4), %edi C &dst[size-1] 323 324 325 C This loop runs at just over 6 c/l. 
326 327 L(mul_1): 328 C eax scratch 329 C ebx carry 330 C ecx counter, limbs, negative, -(size-1) to -1 331 C edx scratch 332 C esi &src[size] 333 C edi &dst[size-1] 334 C ebp multiplier 335 336 movl %ebp, %eax 337 338 mull (%esi,%ecx,4) 339 340 addl %ebx, %eax 341 movl $0, %ebx 342 343 adcl %edx, %ebx 344 movl %eax, 4(%edi,%ecx,4) 345 346 incl %ecx 347 jnz L(mul_1) 348 349 350 movl %ebx, 4(%edi) 351 352 353 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. 354 C 355 C The last two addmuls, which are the bottom right corner of the product 356 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] 357 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner 358 C cases that need to be done. 359 C 360 C The unrolled code is the same as mpn_addmul_1(), see that routine for some 361 C comments. 362 C 363 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. 364 C 365 C VAR_JMP is the computed jump into the unrolled code, stepped by one code 366 C chunk each outer loop. 367 368 dnl This is also hard-coded in the address calculation below. 369 deflit(CODE_BYTES_PER_LIMB, 15) 370 371 dnl With &src[size] and &dst[size-1] pointers, the displacements in the 372 dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above 373 dnl that an offset must be added to them. 
374 deflit(OFFSET, 375 ifelse(eval(UNROLL_COUNT>32),1, 376 eval((UNROLL_COUNT-32)*4), 377 0)) 378 379 C eax 380 C ebx carry 381 C ecx 382 C edx 383 C esi &src[size] 384 C edi &dst[size-1] 385 C ebp 386 387 movl PARAM_SIZE, %ecx 388 389 subl $4, %ecx 390 jz L(corner) 391 392 movl %ecx, %edx 393 negl %ecx 394 395 shll $4, %ecx 396 ifelse(OFFSET,0,,`subl $OFFSET, %esi') 397 398 ifdef(`PIC',` 399 call L(pic_calc) 400 L(here): 401 ',` 402 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx 403 ') 404 negl %edx 405 406 ifelse(OFFSET,0,,`subl $OFFSET, %edi') 407 408 C The calculated jump mustn't be before the start of the available 409 C code. This is the limit that UNROLL_COUNT puts on the src operand 410 C size, but checked here using the jump address directly. 411 412 ASSERT(ae, 413 `movl_text_address( L(unroll_inner_start), %eax) 414 cmpl %eax, %ecx') 415 416 417 C ----------------------------------------------------------------------------- 418 ALIGN(16) 419 L(unroll_outer_top): 420 C eax 421 C ebx high limb to store 422 C ecx VAR_JMP 423 C edx VAR_COUNTER, limbs, negative 424 C esi &src[size], constant 425 C edi dst ptr, second highest limb of last addmul 426 C ebp 427 428 movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier 429 movl %edx, VAR_COUNTER 430 431 movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand 432 433 mull %ebp 434 435 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') 436 437 testb $1, %cl 438 439 movl %edx, %ebx C high carry 440 leal 4(%edi), %edi 441 442 movl %ecx, %edx C jump 443 444 movl %eax, %ecx C low carry 445 leal CODE_BYTES_PER_LIMB(%edx), %edx 446 447 cmovX( %ebx, %ecx) C high carry reverse 448 cmovX( %eax, %ebx) C low carry reverse 449 movl %edx, VAR_JMP 450 jmp *%edx 451 452 453 C Must be on an even address here so the low bit of the jump address 454 C will indicate which way around ecx/ebx should start. 
455 456 ALIGN(2) 457 458 L(unroll_inner_start): 459 C eax scratch 460 C ebx carry high 461 C ecx carry low 462 C edx scratch 463 C esi src pointer 464 C edi dst pointer 465 C ebp multiplier 466 C 467 C 15 code bytes each limb 468 C ecx/ebx reversed on each chunk 469 470 forloop(`i', UNROLL_COUNT, 1, ` 471 deflit(`disp_src', eval(-i*4 + OFFSET)) 472 deflit(`disp_dst', eval(disp_src)) 473 474 m4_assert(`disp_src>=-128 && disp_src<128') 475 m4_assert(`disp_dst>=-128 && disp_dst<128') 476 477 ifelse(eval(i%2),0,` 478 Zdisp( movl, disp_src,(%esi), %eax) 479 mull %ebp 480 Zdisp( addl, %ebx, disp_dst,(%edi)) 481 adcl %eax, %ecx 482 movl %edx, %ebx 483 adcl $0, %ebx 484 ',` 485 dnl this one comes out last 486 Zdisp( movl, disp_src,(%esi), %eax) 487 mull %ebp 488 Zdisp( addl, %ecx, disp_dst,(%edi)) 489 adcl %eax, %ebx 490 movl %edx, %ecx 491 adcl $0, %ecx 492 ') 493 ') 494 L(unroll_inner_end): 495 496 addl %ebx, m4_empty_if_zero(OFFSET)(%edi) 497 498 movl VAR_COUNTER, %edx 499 adcl $0, %ecx 500 501 movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) 502 movl VAR_JMP, %ecx 503 504 incl %edx 505 jnz L(unroll_outer_top) 506 507 508 ifelse(OFFSET,0,,` 509 addl $OFFSET, %esi 510 addl $OFFSET, %edi 511 ') 512 513 514 C ----------------------------------------------------------------------------- 515 ALIGN(16) 516 L(corner): 517 C eax 518 C ebx 519 C ecx 520 C edx 521 C esi &src[size] 522 C edi &dst[2*size-5] 523 C ebp 524 525 movl -12(%esi), %eax 526 527 mull -8(%esi) 528 529 addl %eax, (%edi) 530 movl -12(%esi), %eax 531 movl $0, %ebx 532 533 adcl %edx, %ebx 534 535 mull -4(%esi) 536 537 addl %eax, %ebx 538 movl -8(%esi), %eax 539 540 adcl $0, %edx 541 542 addl %ebx, 4(%edi) 543 movl $0, %ebx 544 545 adcl %edx, %ebx 546 547 mull -4(%esi) 548 549 movl PARAM_SIZE, %ecx 550 addl %ebx, %eax 551 552 adcl $0, %edx 553 554 movl %eax, 8(%edi) 555 556 movl %edx, 12(%edi) 557 movl PARAM_DST, %edi 558 559 560 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. 
561 562 subl $1, %ecx C size-1 563 xorl %eax, %eax C ready for final adcl, and clear carry 564 565 movl %ecx, %edx 566 movl PARAM_SRC, %esi 567 568 569 L(lshift): 570 C eax 571 C ebx 572 C ecx counter, size-1 to 1 573 C edx size-1 (for later use) 574 C esi src (for later use) 575 C edi dst, incrementing 576 C ebp 577 578 rcll 4(%edi) 579 rcll 8(%edi) 580 581 leal 8(%edi), %edi 582 decl %ecx 583 jnz L(lshift) 584 585 586 adcl %eax, %eax 587 588 movl %eax, 4(%edi) C dst most significant limb 589 movl (%esi), %eax C src[0] 590 591 leal 4(%esi,%edx,4), %esi C &src[size] 592 subl %edx, %ecx C -(size-1) 593 594 595 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., 596 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the 597 C low limb of src[0]^2. 598 599 600 mull %eax 601 602 movl %eax, (%edi,%ecx,8) C dst[0] 603 604 605 L(diag): 606 C eax scratch 607 C ebx scratch 608 C ecx counter, negative 609 C edx carry 610 C esi &src[size] 611 C edi dst[2*size-2] 612 C ebp 613 614 movl (%esi,%ecx,4), %eax 615 movl %edx, %ebx 616 617 mull %eax 618 619 addl %ebx, 4(%edi,%ecx,8) 620 adcl %eax, 8(%edi,%ecx,8) 621 adcl $0, %edx 622 623 incl %ecx 624 jnz L(diag) 625 626 627 movl SAVE_ESI, %esi 628 movl SAVE_EBX, %ebx 629 630 addl %edx, 4(%edi) C dst most significant limb 631 632 movl SAVE_EDI, %edi 633 movl SAVE_EBP, %ebp 634 addl $FRAME, %esp 635 ret 636 637 638 639 C ----------------------------------------------------------------------------- 640 ifdef(`PIC',` 641 L(pic_calc): 642 addl (%esp), %ecx 643 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx 644 addl %edx, %ecx 645 ret_internal 646 ') 647 648 649 EPILOGUE()