github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/divrem_1.asm (about) 1 dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. 2 3 dnl Copyright 1999-2004 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part. 
35 36 37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 38 C mp_srcptr src, mp_size_t size, 39 C mp_limb_t divisor); 40 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 41 C mp_srcptr src, mp_size_t size, 42 C mp_limb_t divisor, mp_limb_t carry); 43 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, 44 C mp_srcptr src, mp_size_t size, 45 C mp_limb_t divisor, mp_limb_t inverse, 46 C unsigned shift); 47 C 48 C Algorithm: 49 C 50 C The method and nomenclature follow part 8 of "Division by Invariant 51 C Integers using Multiplication" by Granlund and Montgomery, reference in 52 C gmp.texi. 53 C 54 C "m" is written for what is m' in the paper, and "d" for d_norm, which 55 C won't cause any confusion since it's only the normalized divisor that's of 56 C any use in the code. "b" is written for 2^N, the size of a limb, N being 57 C 32 here. 58 C 59 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as 60 C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets 61 C us have just a psubq on the dependent chain. 62 C 63 C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here, 64 C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much. 65 C 66 C Notes: 67 C 68 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high 69 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero 70 C carry, since in normal circumstances that will be a very rare event. 71 C 72 C The test for skipping a division is branch free (once size>=1 is tested). 73 C The store to the destination high limb is 0 when a divide is skipped, or 74 C if it's not skipped then a copy of the src high limb is stored. The 75 C latter is in case src==dst. 76 C 77 C There's a small bias towards expecting xsize==0, by having code for 78 C xsize==0 in a straight line and xsize!=0 under forward jumps. 
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extras.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl  At 4 limbs the div is a touch faster than the mul (and of course
dnl  simpler), so start the mul from 5 limbs.

deflit(MUL_THRESHOLD, 5)


dnl  Stack frame offsets of the incoming parameters (cdecl, 4-byte slots).
dnl  PARAM_CARRY and PARAM_PREINV_INVERSE share offset 24: the three entry
dnl  points have different argument lists past the common first five.

defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)  dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space: callee-saved registers are stashed in the
dnl  caller's argument slots once those arguments have been loaded.
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')

	TEXT

C mpn_preinv_divrem_1: caller supplies a precomputed inverse of the
C normalized divisor and the normalization shift, so the setup in
C L(mul_by_inverse) is skipped and we jump straight to L(start_preinv).

	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C carry if can't skip a div

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movd	PARAM_PREINV_INVERSE, %mm4

	movd	PARAM_PREINV_SHIFT, %mm7	C l
	cmpl	%ebp, %eax		C high cmp divisor

	C Branch-free skip of one division: if src high < divisor then the
	C high quotient limb is 0 and src high becomes the initial carry.
	cmovc(	%eax, %edx)		C high is carry if high<divisor
	movd	%edx, %mm0		C carry

	movd	%edx, %mm1		C carry
	movl	$0, %edx

	movd	%ebp, %mm5		C d
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	movl	%edx, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C skip one division if high<divisor
	movl	$32, %eax

	subl	PARAM_PREINV_SHIFT, %eax
	psllq	%mm7, %mm5		C d normalized
	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]

	movd	%eax, %mm6		C 32-l
	jmp	L(start_preinv)

EPILOGUE()


C mpn_divrem_1c: same as mpn_divrem_1 but with an explicit initial carry.
C No division can be skipped here since the carry is assumed non-zero in
C normal use (see notes above).

	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


C mpn_divrem_1: divide {src,size} limbs plus xsize implicit zero fraction
C limbs by divisor, storing quotient limbs at dst and returning the
C remainder in eax.

	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C initial carry (if can't skip a div)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	orl	%ecx, %ecx		C size
	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C high cmp divisor

	C Branch-free high-limb handling, as in mpn_preinv_divrem_1 above:
	C store 0 (division skipped) or a copy of the src high limb.
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
	movl	%edx, (%edi,%ecx,4)	C dst high limb

	movl	$0, %edx
	cmovc(	%eax, %edx)		C high is carry if high<divisor

	sbbl	$0, %ecx		C size-1 if high<divisor
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]

	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)

	C Small operand: plain divl loops are faster than computing the
	C inverse (see MUL_THRESHOLD notes above).

	orl	%ecx, %ecx
	jz	L(divide_no_integer)	C if size==0

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	carry
	C esi	src, decrementing
	C edi	dst, decrementing
	C ebp	divisor

	movl	(%esi), %eax
	subl	$4, %esi

	divl	%ebp			C edx:eax / divisor, remainder left in edx

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	orl	%ebx, %ebx
	jnz	L(divide_fraction)	C if xsize!=0

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax		C return remainder
	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	carry
	C esi
	C edi	dst, decrementing
	C ebp	divisor

	movl	$0, %eax		C fraction limbs are implicit zeros

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp	divisor

	C Compute l = count of leading zeros on d, the normalized divisor
	C d = divisor<<l, and the inverse m = floor((b*(b-d)-1)/d) of the
	C Granlund/Montgomery method (see the algorithm notes above).

	bsrl	%ebp, %eax		C 31-l
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx

	C

	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax

	shll	%cl, %ebp		C d normalized
	movd	%ecx, %mm7		C l
	movl	%edx, %ecx		C size

	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax

	C

	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1 / d)
	movd	%ebp, %mm5		C d

	C
	movd	%eax, %mm4		C m


L(start_preinv):
	C eax	inverse
	C ebx	xsize
	C ecx	size
	C edx
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp
	C
	C mm0	carry
	C mm1	carry
	C mm2
	C mm4	m
	C mm5	d
	C mm6	31-l
	C mm7	l

	psllq	%mm7, %mm0		C n2 = carry << l, for size==0

	subl	$1, %ecx
	jb	L(integer_none)

	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
	jz	L(integer_last)


C The dependent chain here consists of
C
C	2   paddd    n1+n2
C	8   pmuludq  m*(n1+n2)
C	2   paddq    n2:nadj + m*(n1+n2)
C	2   psrlq    q1
C	8   pmuludq  d*q1
C	2   psubq    (n-d)-q1*d
C	2   psrlq    high n-(q1+1)*d mask
C	2   pand     d masked
C	2   paddd    n2+d addback
C	--
C	30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.

	ALIGN(16)
L(integer_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 0
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	C Two partial src limbs combined and shifted give the next
	C normalized limb n10 = low 32 bits of (src[i]:src[i-1] << l).
	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi

	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	paddd	%mm1, %mm2		C q1+1
	pmuludq	%mm5, %mm1		C q1*d

	punpckldq %mm0, %mm3		C n = n2:n10
	pxor	%mm0, %mm0

	psubq	%mm5, %mm3		C n - d

	C

	C "n-d - q1*d" form: same two-limb result as n - (q1+1)*d but
	C keeps just one psubq on the dependent chain (see notes above).
	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C copy remainder -> new n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ecx
	ja	L(integer_top)


L(integer_last):
	C Final integer-part step: only src[0] remains, shifted left by l
	C with zeros shifted in (no lower limb to supply bits).
	C
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi	&src[0]
	C edi	&dst[xsize]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1+1

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi


L(integer_none):
	C eax
	C ebx	xsize

	orl	%ebx, %ebx
	jnz	L(fraction_some)	C if xsize!=0


L(fraction_done):
	C Un-normalize the remainder (shift right by l) and restore the
	C callee-saved registers from the parameter slots.
	movl	SAVE_EBP, %ebp
	psrld	%mm7, %mm0		C remainder

	movl	SAVE_EDI, %edi
	movd	%mm0, %eax

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	emms
	ret



C -----------------------------------------------------------------------------
C

L(fraction_some):
	C Fraction limbs are implicit zeros, so n10=0 each step and the
	C quotient step simplifies: q1 = high(n2:0 + m*n2).
	C
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi
	C edi	&dst[xsize-1]
	C ebp


L(fraction_top):
	C eax
	C ebx	counter, xsize iterations
	C ecx
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movq	%mm0, %mm1		C n2
	pmuludq	%mm4, %mm0		C m*n2

	pcmpeqd	%mm2, %mm2
	psrlq	$63, %mm2		C 1

	C

	psrlq	$32, %mm0		C high(m*n2)

	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)

	paddd	%mm0, %mm2		C q1+1
	pmuludq	%mm5, %mm0		C q1*d

	psllq	$32, %mm1		C n = n2:0
	psubq	%mm5, %mm1		C n - d

	C

	psubq	%mm0, %mm1		C r = n - (q1+1)*d
	pxor	%mm0, %mm0

	por	%mm1, %mm0		C r -> n2
	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm1, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm1, %mm2		C q
	pand	%mm5, %mm1		C mask & d

	paddd	%mm1, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ebx
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()