github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/mmx/divrem_1.asm (about) 1 dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. 2 3 dnl Copyright 1999-2002 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. 35 36 37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 38 C mp_srcptr src, mp_size_t size, 39 C mp_limb_t divisor); 40 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 41 C mp_srcptr src, mp_size_t size, 42 C mp_limb_t divisor, mp_limb_t carry); 43 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, 44 C mp_srcptr src, mp_size_t size, 45 C mp_limb_t divisor, mp_limb_t inverse, 46 C unsigned shift); 47 C 48 C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, 49 C see that file for some comments. It's possible what's here can be improved. 
dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts means that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to make the integer part right.  At 4 limbs the
dnl  div and mul are about the same there, but on the fractional part the
dnl  mul is much faster.

deflit(MUL_THRESHOLD, 4)


dnl  Incoming stack parameters (positive offsets) and locals (negative
dnl  offsets, below the four saved registers).  PARAM_PREINV_INVERSE and
dnl  PARAM_CARRY share offset 24 because they belong to different
dnl  entrypoints (mpn_preinv_divrem_1 vs mpn_divrem_1c) with different
dnl  argument lists.

defframe(PARAM_PREINV_SHIFT,   28)	dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)	dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)		dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,   8)
defframe(PARAM_DST,     4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)

defframe(VAR_NORM,    -20)	dnl normalization shift count l
defframe(VAR_INVERSE, -24)	dnl pre-computed inverse of the divisor
defframe(VAR_SRC,     -28)	dnl running src pointer in the integer loop
defframe(VAR_DST,     -32)	dnl running dst pointer in the integer loop
defframe(VAR_DST_STOP,-36)	dnl &dst[xsize+2], integer loop stop point

deflit(STACK_SPACE, 36)

	TEXT
	ALIGN(16)

C mpn_preinv_divrem_1 entrypoint.  The caller supplies the normalization
C shift and the pre-computed divisor inverse, so the bsrl/divl inverse
C setup is skipped and control joins the common code at L(start_preinv).

PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
	movl	PARAM_XSIZE, %ecx
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edx

	movl	-4(%esi,%ebx,4), %eax	C src high limb
	xorl	%edi, %edi		C initial carry (if can't skip a div)

	C

	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
	xor	%ecx, %ecx

	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edi)		C high is carry if high<divisor

	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)

	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb

	C sbbl consumes the carry from the cmpl above: high<divisor lets one
	C division be skipped, so effectively size-1 limbs remain.
	sbbl	$0, %ebx		C skip one division if high<divisor
	movl	PARAM_PREINV_SHIFT, %ecx

	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
	movl	$32, %eax

	movl	%edx, VAR_DST		C &dst[xsize+size]

	shll	%cl, %ebp		C d normalized
	subl	%ecx, %eax
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C rshift
	movl	PARAM_PREINV_INVERSE, %eax
	jmp	L(start_preinv)

EPILOGUE()



	ALIGN(16)

C mpn_divrem_1c entrypoint: like mpn_divrem_1 but with an initial carry
C (partial remainder) supplied by the caller, so the "skip a division if
C high<divisor" shortcut is not attempted; joins at L(start_1c).

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


C Plain mpn_divrem_1 entrypoint.
	C offset 0x31, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	orl	%ecx, %ecx		C size

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jz	L(no_skip_div)		C if size==0

	movl	-4(%esi,%ecx,4), %eax	C src high limb
	xorl	%esi, %esi
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor

	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
					C (the latter in case src==dst)

	movl	%esi, (%edi,%ecx,4)	C dst high limb

	C Carry from the cmpl above still live: drop the limb count by one
	C when the high limb became the initial carry.
	sbbl	$0, %ecx		C size-1 if high<divisor
	movl	PARAM_SRC, %esi		C reload

L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	C Choose between plain divl and multiply-by-inverse based on the
	C total number of quotient limbs to produce.
	leal	(%ebx,%ecx), %eax	C size+xsize
	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)

	orl	%ecx, %ecx
	jz	L(divide_no_integer)

C Small-operand path: one divl per limb, remainder carried in %edx.
L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	scratch (remainder)
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax

	divl	%ebp

	movl	%eax, (%edi,%ecx,4)
	decl	%ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	movl	PARAM_DST, %edi
	orl	%ebx, %ebx
	jnz	L(divide_fraction)

L(divide_done):
	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	movl	%edx, %eax		C return the remainder

	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	ret


C Fraction limbs: numerator limbs are implicitly zero below the radix
C point, hence the movl $0 before each divl.
L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	scratch (remainder)
	C esi
	C edi	dst
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp

	movl	%eax, -4(%edi,%ebx,4)
	decl	%ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

C Multiply-by-inverse path: normalize the divisor, compute the inverse
C m = floor((b*(b-d)-1)/d) with b=2^32, then fall into L(start_preinv).

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop

	movl	%ebx, VAR_DST_STOP
	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]

	movl	%edi, VAR_DST
	movl	%ecx, %ebx		C size

	bsrl	%ebp, %ecx		C 31-l
	movl	%edx, %edi		C carry

	leal	1(%ecx), %eax		C 32-l
	xorl	$31, %ecx		C l

	movl	%ecx, VAR_NORM
	movl	$-1, %edx

	shll	%cl, %ebp		C d normalized
	movd	%eax, %mm7

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d

L(start_preinv):
	C eax	inverse
	C ebx	size
	C ecx	shift
	C edx
	C esi	src
	C edi	carry
	C ebp	divisor
	C
	C mm7	rshift

	movl	%eax, VAR_INVERSE
	orl	%ebx, %ebx		C size
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	movl	%eax, VAR_SRC
	jz	L(start_zero)

	movl	8(%eax), %esi		C src high limb
	cmpl	$1, %ebx
	jz	L(start_one)

L(start_two_or_more):
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	cmpl	$2, %ebx
	je	L(integer_two_left)
	jmp	L(integer_top)


L(start_one):
	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shll	%cl, %esi		C n10 = high << l
	jmp	L(integer_one_left)


L(start_zero):
	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
	C skipped a division.

	shll	%cl, %edi		C n2 = carry << l
	movl	%edi, %eax		C return value for zero_done
	cmpl	$0, PARAM_XSIZE

	je	L(zero_done)
	jmp	L(fraction_some)



C -----------------------------------------------------------------------------
C
C This loop runs at about 25 cycles, which is probably sub-optimal, and
C certainly more than the dependent chain would suggest.  A better loop, or
C a better rough analysis of what's possible, would be welcomed.
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C		       uops
C		n2+n1	1   (addl)
C		mul	5
C		q1+1	3   (addl/adcl)
C		mul	5
C		sub	3   (subl/sbbl)
C		addback	2   (cmov)
C			---
C			19
C
C Lack of registers hinders explicit scheduling and it might be that the
C normal out of order execution isn't able to hide enough under the mul
C latencies.
C
C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
C combination was tried for the addback (despite the fact it would lengthen
C the dependent chain) but found to be no faster.


	ALIGN(16)
L(integer_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	d
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax		C -n1
	movl	VAR_SRC, %ecx

	andl	%eax, %ebx		C -n1 & d
	negl	%eax			C n1

	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax		C n2+n1
	movq	(%ecx), %mm0		C next src limb and the one below it

	mull	VAR_INVERSE		C m*(n2+n1)

	subl	$4, %ecx

	movl	%ecx, VAR_SRC

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	movl	%ebp, %eax		C d
	leal	1(%edi), %ebx		C n2+1

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)		C q1+1 wrapped to 0: handle specially

	mull	%ebx			C (q1+1)*d

	movl	VAR_DST, %ecx
	psrlq	%mm7, %mm0		C unnormalize next two src limbs

	C

	C

	C

	subl	%eax, %esi
	movl	VAR_DST_STOP, %eax

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	movd	%mm0, %esi

	sbbl	$0, %ebx		C q
	subl	$4, %ecx

	movl	%ebx, (%ecx)
	cmpl	%eax, %ecx

	movl	%ecx, VAR_DST
	jne	L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This make the code a bit smaller and simpler, and
C costs only 2 cycles (each).

C Second-to-last integer limb: same quotient step as L(integer_top) but
C the next n10 comes from the single remaining src low limb.

L(integer_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift


	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax		C -n1
	movl	PARAM_SRC, %ecx

	andl	%eax, %ebx		C -n1 & d
	negl	%eax			C n1

	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movd	(%ecx), %mm0		C src low limb

	movl	VAR_DST_STOP, %ecx

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C q1 if q1+1 overflowed (no jz here)

	mull	%ebx			C (q1+1)*d

	psllq	$32, %mm0		C position low limb as high dword

	psrlq	%mm7, %mm0		C unnormalize

	C

	C

	subl	%eax, %esi

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	movd	%mm0, %esi

	sbbl	$0, %ebx		C q

	movl	%ebx, -4(%ecx)


C -----------------------------------------------------------------------------
C Last integer limb; falls through to the fraction part (or the return)
C afterwards.
L(integer_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift


	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax		C -n1
	movl	VAR_DST_STOP, %ecx

	andl	%eax, %ebx		C -n1 & d
	negl	%eax			C n1

	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	C

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C q1 if q1+1 overflowed

	mull	%ebx

	C

	C

	C

	C

	subl	%eax, %esi
	movl	PARAM_XSIZE, %eax

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1

	sbbl	$0, %ebx		C q

	movl	%ebx, -8(%ecx)
	subl	$8, %ecx



	orl	%eax, %eax		C xsize
	jnz	L(fraction_some)

	movl	%edi, %eax		C remainder (still shifted left by l)
L(fraction_done):
	movl	VAR_NORM, %ecx
L(zero_done):
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi

	movl	SAVE_ESI, %esi

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	shrl	%cl, %eax		C undo the normalization shift
	emms				C leave MMX state clean for FP code

	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx
	C edx
	C esi	n10
	C edi	n2
	C ebp	divisor

	movl	VAR_DST, %ecx
	movl	VAR_DST_STOP, %edx
	subl	$4, %ecx

	movl	%ecx, VAR_DST
	psrlq	%mm7, %mm0		C unnormalize next two src limbs
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2

	movl	$-1, (%ecx)
	movd	%mm0, %esi		C next n10

	cmpl	%ecx, %edx
	jne	L(integer_top)

	jmp	L(integer_loop_done)



C -----------------------------------------------------------------------------
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C		       uops
C		mul	5
C		q1+1	1   (addl)
C		mul	5
C		sub	3   (negl/sbbl)
C		addback	2   (cmov)
C			---
C			16
C
C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
C the addback was found to be a touch slower.


	ALIGN(16)
C Fraction part: n10 is implicitly zero, which shortens the quotient
C estimate (no nadj term needed).
L(fraction_some):
	C eax
	C ebx
	C ecx
	C edx
	C esi
	C edi	carry
	C ebp	divisor

	movl	PARAM_DST, %esi
	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
	movl	%edi, %eax

	subl	$8, %ecx		C &dst[xsize]


	ALIGN(16)
L(fraction_top):
	C eax	n2, then scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst, decrementing
	C edx	scratch
	C esi	dst stop point
	C edi	n2
	C ebp	divisor

	mull	VAR_INVERSE	C m*n2

	movl	%ebp, %eax	C d
	subl	$4, %ecx	C dst
	leal	1(%edi), %ebx

	C

	C

	C

	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1

	mull	%ebx		C (q1+1)*d

	C

	C

	C

	C

	negl	%eax		C low of n - (q1+1)*d

	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
	leal	(%ebp,%eax), %edx

	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1

	sbbl	$0, %ebx	C q
	movl	%eax, %edi	C remainder->n2
	cmpl	%esi, %ecx

	movl	%ebx, (%ecx)	C previous q
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()