github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/divrem_1.asm (about) 1 dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb 2 dnl division. 3 4 dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc. 5 6 dnl This file is part of the GNU MP Library. 7 dnl 8 dnl The GNU MP Library is free software; you can redistribute it and/or modify 9 dnl it under the terms of either: 10 dnl 11 dnl * the GNU Lesser General Public License as published by the Free 12 dnl Software Foundation; either version 3 of the License, or (at your 13 dnl option) any later version. 14 dnl 15 dnl or 16 dnl 17 dnl * the GNU General Public License as published by the Free Software 18 dnl Foundation; either version 2 of the License, or (at your option) any 19 dnl later version. 20 dnl 21 dnl or both in parallel, as here. 22 dnl 23 dnl The GNU MP Library is distributed in the hope that it will be useful, but 24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 dnl for more details. 27 dnl 28 dnl You should have received copies of the GNU General Public License and the 29 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30 dnl see https://www.gnu.org/licenses/. 31 32 include(`../config.m4') 33 34 35 C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. 
36 37 38 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 39 C mp_srcptr src, mp_size_t size, 40 C mp_limb_t divisor); 41 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 42 C mp_srcptr src, mp_size_t size, 43 C mp_limb_t divisor, mp_limb_t carry); 44 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, 45 C mp_srcptr src, mp_size_t size, 46 C mp_limb_t divisor, mp_limb_t inverse, 47 C unsigned shift); 48 C 49 C Algorithm: 50 C 51 C The method and nomenclature follow part 8 of "Division by Invariant 52 C Integers using Multiplication" by Granlund and Montgomery, reference in 53 C gmp.texi. 54 C 55 C The "and"s shown in the paper are done here with "cmov"s. "m" is written 56 C for m', and "d" for d_norm, which won't cause any confusion since it's 57 C only the normalized divisor that's of any use in the code. "b" is written 58 C for 2^N, the size of a limb, N being 32 here. 59 C 60 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as 61 C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If 62 C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case 63 C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then 64 C if q1==0xFFFFFFFF that must be the right value. 65 C 66 C For the last and second last steps q1==0xFFFFFFFF is instead handled by an 67 C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This 68 C then goes through as normal, and finding no addback required. sbbl costs 69 C an extra cycle over what the main loop code does, but it keeps code size 70 C and complexity down. 71 C 72 C Notes: 73 C 74 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high 75 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero 76 C carry, since in normal circumstances that will be a very rare event. 77 C 78 C The test for skipping a division is branch free (once size>=1 is tested). 
79 C The store to the destination high limb is 0 when a divide is skipped, or 80 C if it's not skipped then a copy of the src high limb is used. The latter 81 C is in case src==dst. 82 C 83 C There's a small bias towards expecting xsize==0, by having code for 84 C xsize==0 in a straight line and xsize!=0 under forward jumps. 85 C 86 C Alternatives: 87 C 88 C If the divisor is normalized (high bit set) then a division step can 89 C always be skipped, since the high destination limb is always 0 or 1 in 90 C that case. It doesn't seem worth checking for this though, since it 91 C probably occurs infrequently, in particular note that big_base for a 92 C decimal mpn_get_str is not normalized in a 32-bit limb. 93 94 95 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by 96 dnl inverse method is used, rather than plain "divl"s. Minimum value 1. 97 dnl 98 dnl The inverse takes about 50 cycles to calculate, but after that the 99 dnl multiply is 17 c/l versus division at 42 c/l. 100 dnl 101 dnl At 3 limbs the mul is a touch faster than div on the integer part, and 102 dnl even more so on the fractional part. 
deflit(MUL_THRESHOLD, 3)


defframe(PARAM_PREINV_SHIFT,  28)   dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE,24)   dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY, 24)           dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,  16)
defframe(PARAM_SRC,   12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)

defframe(VAR_NORM,   -20)
defframe(VAR_INVERSE,-24)
defframe(VAR_SRC,    -28)
defframe(VAR_DST,    -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)

	TEXT
	ALIGN(32)

C Entry point when the caller supplies a precomputed inverse and shift
C (PARAM_PREINV_INVERSE, PARAM_PREINV_SHIFT), skipping the bsrl/divl
C setup below; joins the common code at L(start_preinv).
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
	movl	PARAM_XSIZE, %ecx
	movl	PARAM_DST, %edx
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
	movl	%edi, SAVE_EDI
	xorl	%edi, %edi		C carry

	movl	-4(%esi,%ebx,4), %eax	C src high limb
	xor	%ecx, %ecx		C dst high limb value if div skipped

	C

	C

	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edi)		C high is carry if high<divisor
	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
	C (the latter in case src==dst)

	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
	sbbl	$0, %ebx		C skip one division if high<divisor
	movl	PARAM_PREINV_SHIFT, %ecx

	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
	movl	$32, %eax

	movl	%edx, VAR_DST		C &dst[xsize+size]

	shll	%cl, %ebp		C d normalized
	subl	%ecx, %eax
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C rshift
	movl	PARAM_PREINV_INVERSE, %eax
	jmp	L(start_preinv)

EPILOGUE()


	ALIGN(16)

C Entry point with an initial carry (partial remainder) in PARAM_CARRY;
C no division can be skipped since the carry is assumed non-zero, so this
C joins the common code at L(start_1c).
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


C offset 0xa1, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp
	orl	%ecx, %ecx		C size

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb
	xorl	%esi, %esi

	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor
	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not

	movl	%esi, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C size-1 if high<divisor
	movl	PARAM_SRC, %esi		C reload
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)


C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
C It'd be possible to write them out without the looping, but no speedup
C would be expected.
C
C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
C integer part, but curiously not on the fractional part, where %ebp is a
C (fixed) couple of cycles faster.

	orl	%ecx, %ecx
	jz	L(divide_no_integer)

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	scratch (remainder)
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax

	divl	PARAM_DIVISOR

	movl	%eax, (%edi,%ecx,4)
	decl	%ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	movl	PARAM_DST, %edi
	orl	%ebx, %ebx
	jnz	L(divide_fraction)

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	%edx, %eax		C return the remainder

	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	scratch (remainder)
	C esi
	C edi	dst
	C ebp	divisor

	movl	$0, %eax		C fraction limbs are zero

	divl	%ebp

	movl	%eax, -4(%edi,%ebx,4)
	decl	%ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l

	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]

	movl	%edi, VAR_DST
	movl	%ebx, VAR_DST_STOP

	movl	%ecx, %ebx		C size
	movl	$31, %ecx

	movl	%edx, %edi		C carry
	movl	$-1, %edx

	C

	xorl	%eax, %ecx		C l
	incl	%eax			C 32-l

	shll	%cl, %ebp		C d normalized
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C rshift = 32-l

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d

L(start_preinv):
	C eax	inverse
	C ebx	size
	C ecx	shift
	C edx
	C esi	src
	C edi	carry
	C ebp	divisor
	C
	C mm7	rshift

	orl	%ebx, %ebx		C size
	movl	%eax, VAR_INVERSE
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	jz	L(start_zero)
	movl	%eax, VAR_SRC
	cmpl	$1, %ebx

	movl	8(%eax), %esi		C src high limb
	jz	L(start_one)

L(start_two_or_more):
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	cmpl	$2, %ebx
	je	L(integer_two_left)
	jmp	L(integer_top)


L(start_one):
	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shll	%cl, %esi		C n10 = high << l
	movl	%eax, VAR_SRC
	jmp	L(integer_one_left)


L(start_zero):
	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
	C skipped a division.

	shll	%cl, %edi		C n2 = carry << l
	movl	%edi, %eax		C return value for zero_done
	cmpl	$0, PARAM_XSIZE

	je	L(zero_done)
	jmp	L(fraction_some)



C -----------------------------------------------------------------------------
C
C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
C execution.  The instruction scheduling is important, with various
C apparently equivalent forms running 1 to 5 cycles slower.
C
C A lower bound for the time would seem to be 16 cycles, based on the
C following successive dependencies.
C
C		cycles
C	n2+n1	  1
C	mul	  6
C	q1+1	  1
C	mul	  6
C	sub	  1
C	addback	  1
C		---
C		 16
C
C This chain is what the loop has already, but 16 cycles isn't achieved.
C K7 has enough decode, and probably enough execute (depending maybe on what
C a mul actually consumes), but nothing running under 17 has been found.
C
C In theory n2+n1 could be done in the sub and addback stages (by
C calculating both n2 and n2+n1 there), but lack of registers makes this an
C unlikely proposition.
C
C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
C chain, and nothing better than 18 cycles has been found when using it.
C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
C be an extremely rare event.
C
C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
C if some special data is coming out with this always, the q1_ff special
C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
C induce the q1_ff case, for speed measurements or testing.  Note that
C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
C
C The instruction groupings and empty comments show the cycles for a naive
C in-order view of the code (conveniently ignoring the load latency on
C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
C to the extent that out-of-order execution rearranges it.  In this case
C there's 19 cycles shown, but it executes at 17.

	ALIGN(16)
L(integer_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2
	movl	VAR_SRC, %ecx

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movq	(%ecx), %mm0		C next limb and the one below it
	subl	$4, %ecx

	movl	%ecx, VAR_SRC

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)
	movl	VAR_DST, %ecx

	mull	%ebx			C (q1+1)*d

	psrlq	%mm7, %mm0

	leal	-4(%ecx), %ecx

	C

	subl	%eax, %esi
	movl	VAR_DST_STOP, %eax

	C

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q
	cmpl	%eax, %ecx

	movl	%ebx, (%ecx)
	movl	%ecx, VAR_DST
	jne	L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 1 cycle (each).

L(integer_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift

	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2
	movl	PARAM_SRC, %ecx

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movd	(%ecx), %mm0		C src low limb

	movl	VAR_DST_STOP, %ecx

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C q1 if q1+1 overflowed

	mull	%ebx			C (q1+1)*d

	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q

	movl	%ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift

	movl	VAR_DST_STOP, %ecx
	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	C

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C q1 if q1+1 overflowed

	mull	%ebx

	C

	C

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q

	movl	%ebx, -8(%ecx)
	subl	$8, %ecx



L(integer_none):
	cmpl	$0, PARAM_XSIZE
	jne	L(fraction_some)

	movl	%edi, %eax
L(fraction_done):
	movl	VAR_NORM, %ecx
L(zero_done):
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi
	movl	SAVE_ESI, %esi

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	shrl	%cl, %eax		C undo normalization for returned remainder
	emms

	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx
	C edx
	C esi	n10
	C edi	n2
	C ebp	divisor

	movl	VAR_DST, %ecx
	movl	VAR_DST_STOP, %edx
	subl	$4, %ecx

	psrlq	%mm7, %mm0
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
	movl	%ecx, VAR_DST

	movd	%mm0, %esi		C next n10

	movl	$-1, (%ecx)
	cmpl	%ecx, %edx
	jne	L(integer_top)

	jmp	L(integer_loop_done)



C -----------------------------------------------------------------------------
C
C Being the fractional part, the "source" limbs are all zero, meaning
C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
C
C The loop runs at 15 cycles.  The dependent chain is the same as the
C general case above, but without the n2+n1 stage (due to n1==0), so 15
C would seem to be the lower bound.
C
C A not entirely obvious simplification is that q1+1 never overflows a limb,
C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
C rnd() means rounding down to a multiple of d.
C
C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
C		     = m*d + b*d - m - b
C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
C		     = rnd(b(b-d)-1) + b*d - m - b
C		     = rnd(b(b-d)-1 + b*d) - m - b
C		     = rnd(b*b-1) - m - b
C		     <= (b-2)*b
C
C Unchanged from the general case is that the final quotient limb q can be
C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
C equation 8.4 of the paper which simplifies as follows when n1==0 and
C n0==0.
C
C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
C
C As before, the instruction groupings and empty comments show a naive
C in-order view of the code, which is made a nonsense by out of order
C execution.  There's 17 cycles shown, but it executes at 15.
C
C Rotating the store q and remainder->n2 instructions up to the top of the
C loop gets the run time down from 16 to 15.

	ALIGN(16)
L(fraction_some):
	C eax
	C ebx
	C ecx
	C edx
	C esi
	C edi	carry
	C ebp	divisor

	movl	PARAM_DST, %esi
	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
	movl	%edi, %eax

	subl	$8, %ecx		C &dst[xsize]
	jmp	L(fraction_entry)


	ALIGN(16)
L(fraction_top):
	C eax	n2 carry, then scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst, decrementing
	C edx	scratch
	C esi	dst stop point
	C edi	(will be n2)
	C ebp	divisor

	movl	%ebx, (%ecx)		C previous q
	movl	%eax, %edi		C remainder->n2

L(fraction_entry):
	mull	VAR_INVERSE		C m*n2

	movl	%ebp, %eax		C d
	subl	$4, %ecx		C dst
	leal	1(%edi), %ebx

	C

	C

	C

	C

	addl	%edx, %ebx		C 1 + high(n2<<32 + m*n2) = q1+1

	mull	%ebx			C (q1+1)*d

	C

	C

	C

	negl	%eax			C low of n - (q1+1)*d

	C

	sbbl	%edx, %edi		C high of n - (q1+1)*d, caring only about carry
	leal	(%ebp,%eax), %edx

	cmovc(	%edx, %eax)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q
	cmpl	%esi, %ecx

	jne	L(fraction_top)


	movl	%ebx, (%ecx)
	jmp	L(fraction_done)

EPILOGUE()