dnl  Intel P6 mpn_mul_basecase -- multiply two mpn numbers.

dnl  Copyright 1999-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).


dnl  P6 UNROLL_COUNT cycles/product (approx)
dnl           8           7
dnl          16           6.5
dnl          32           6.4
dnl  Maximum possible with the current code is 32.

dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl  inner loop (the loop body is generated UNROLL_COUNT/2 times, two limbs
dnl  per copy).
deflit(UNROLL_COUNT, 16)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Multiply the xsize-limb operand at xp by the ysize-limb operand at yp and
C store the xsize+ysize limb product at wp.  Limbs are stored least
C significant first.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() startup
C calculations only once.

C UNROLL_THRESHOLD is the smallest xsize that is sent to the unrolled addmul
C code (see the cmpl/jae just ahead of the simple addmul loop).  The PIC and
C non-PIC settings currently have the same value.

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

C Incoming stack arguments, listed from last to first in prototype order.
C The number is the offset given to defframe for each argument (presumably
C bytes from the stack pointer at function entry, where FRAME is 0 -- confirm
C against the defframe definition).

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(16)

PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	C Load xsize, xp and yp[0], then dispatch on xsize (unsigned compare):
	C above 2 goes to the general code, exactly 2 to two_by_something, and
	C anything else falls through to the one by one case.
	C
	C NOTE(review): ysize is not examined on the fall-through path, and an
	C xsize of 0 would also land there.  Presumably callers guarantee
	C xsize >= ysize >= 1 -- confirm against the mpn_mul_basecase contract.

	movl	PARAM_XSIZE, %ecx

	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx

	movl	(%eax), %eax		C yp[0]
	cmpl	$2, %ecx
	ja	L(xsize_more_than_two)
	je	L(two_by_something)


	C one limb by one limb
	C
	C No registers need saving here: only eax, ecx and edx are used.

	mull	(%edx)			C edx:eax = xp[0] * yp[0]

	movl	PARAM_WP, %ecx
	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
deflit(`FRAME',0)

dnl  re-use parameter space
dnl  (no stack frame is made on this path; ebx and esi are parked in the
dnl  argument slots that are no longer needed)
define(SAVE_EBX, `PARAM_XSIZE')
define(SAVE_ESI, `PARAM_YSIZE')

	C The compare against ysize must come before SAVE_ESI is written, as
	C SAVE_ESI is the same stack slot as PARAM_YSIZE.  The movl
	C instructions between the cmpl and the jne do not alter the flags.

	movl	%ebx, SAVE_EBX
	cmpl	$1, PARAM_YSIZE
	movl	%eax, %ecx		C yp[0]

	movl	%esi, SAVE_ESI		C save esi
	movl	PARAM_WP, %ebx
	movl	%edx, %esi		C xp

	movl	(%edx), %eax		C xp[0]
	jne	L(two_by_two)		C any ysize other than 1


	C two limbs by one limb
	C
	C eax	xp[0]
	C ebx	wp
	C ecx	yp[0]
	C edx
	C esi	xp

	mull	%ecx			C xp[0] * yp[0]

	movl	%eax, (%ebx)
	movl	4(%esi), %eax
	movl	%edx, %esi	C carry

	mull	%ecx			C xp[1] * yp[0]

	addl	%eax, %esi

	movl	%esi, 4(%ebx)
	movl	SAVE_ESI, %esi

	adcl	$0, %edx		C carry from the addl above

	movl	%edx, 8(%ebx)
	movl	SAVE_EBX, %ebx

	ret



C -----------------------------------------------------------------------------

	ALIGN(16)
L(two_by_two):
	C eax	xp[0]
	C ebx	wp
	C ecx	yp[0]
	C edx
	C esi	xp
	C edi
	C ebp

dnl  more parameter space re-use
dnl  (wp is already held in ebx, so its argument slot is free for edi)
define(SAVE_EDI, `PARAM_WP')

	mull	%ecx		C xp[0] * yp[0]

	movl	%edi, SAVE_EDI
	movl	%edx, %edi	C carry, for wp[1]

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx
	movl	4(%ecx), %ecx	C yp[1]

	movl	%edi, 4(%ebx)
	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi
	movl	(%esi), %eax	C xp[0]

	adcl	$0, %edx
	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	C The carry flag is threaded through the next three add/adc steps;
	C the movl instructions in between leave it untouched.

	addl	%eax, 4(%ebx)
	movl	%esi, %eax

	adcl	%edx, %edi
	movl	SAVE_ESI, %esi

	movl	%edi, 8(%ebx)

	adcl	$0, %eax
	movl	SAVE_EDI, %edi

	movl	%eax, 12(%ebx)
	movl	SAVE_EBX, %ebx

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two):

C The first limb of yp is processed with a simple mpn_mul_1 loop running at
C about 6.2 c/l.  Unrolling this doesn't seem worthwhile since it's only run
C once (whereas the addmul_1 below is run ysize-1 many times).  A call to
C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
C and doesn't seem likely to be worthwhile on the typical sizes reaching
C here from the Karatsuba code.

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp

C Local frame for this path: four register save slots plus the variables
C used by the unrolled code, STACK_SPACE bytes in total.  The SAVE_ names
C are quoted because they were defined as parameter aliases further up.

defframe(`SAVE_EBX',    -4)
defframe(`SAVE_ESI',    -8)
defframe(`SAVE_EDI',   -12)
defframe(`SAVE_EBP',   -16)
defframe(VAR_COUNTER,  -20)  dnl for use in the unroll case
defframe(VAR_ADJUST,   -24)
defframe(VAR_JMP,      -28)
defframe(VAR_SWAP,     -32)
defframe(VAR_XP_LOW,   -36)
deflit(STACK_SPACE, 36)

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	movl	PARAM_WP, %edi

	movl	%ebx, SAVE_EBX

	movl	%ebp, SAVE_EBP
	movl	%eax, %ebp		C multiplier, yp[0]

	movl	%esi, SAVE_ESI
	xorl	%ebx, %ebx		C initial carry of zero
	leal	(%edx,%ecx,4), %esi	C xp end

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx			C counter runs from -xsize up to 0


L(mul1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, negative
	C edx	scratch
	C esi	xp end
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	C ebx is cleared with movl rather than xorl so that the carry flag
	C from the addl is still intact for the adcl.

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx

	movl	%ebx, (%edi)		C final carry
	movl	PARAM_XSIZE, %ecx
	decl	%edx

	jz	L(done)			C if ysize==1

	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax
	jae	L(unroll)


C -----------------------------------------------------------------------------
	C simple addmul looping
	C
	C eax	yp
	C ebx
	C ecx	xsize
	C edx	ysize-1
	C esi	xp end
	C edi	wp end of mul1
	C ebp

	C From here on the PARAM_YSIZE, PARAM_XSIZE and PARAM_YP stack slots
	C are overwritten and used as loop variables: negative counters and
	C the yp end pointer.

	leal	4(%eax,%edx,4), %ebp	C yp end
	negl	%ecx
	negl	%edx

	movl	%edx, PARAM_YSIZE	C -(ysize-1)
	movl	(%esi,%ecx,4), %eax	C xp low limb
	incl	%ecx

	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
	xorl	%ebx, %ebx		C initial carry

	movl	%ebp, PARAM_YP
	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
	jmp	L(simple_outer_entry)


L(simple_outer_top):
	C ebp	ysize counter, negative

	movl	PARAM_YP, %edx

	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
	xorl	%ebx, %ebx		C carry

	movl	%ebp, PARAM_YSIZE
	addl	$4, %edi		C next position in wp

	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier

	movl	-4(%esi,%ecx,4), %eax	C xp low limb


L(simple_outer_entry):

L(simple_inner_top):
	C eax	xp limb
	C ebx	carry limb
	C ecx	loop counter (negative)
	C edx	scratch
	C esi	xp end
	C edi	wp end
	C ebp	multiplier

	mull	%ebp

	addl	%eax, %ebx
	adcl	$0, %edx

	addl	%ebx, (%edi,%ecx,4)
	movl	(%esi,%ecx,4), %eax	C next xp limb, flags unaffected
	adcl	$0, %edx

	incl	%ecx
	movl	%edx, %ebx
	jnz	L(simple_inner_top)


	C separate code for last limb so outer loop counter handling can be
	C interleaved

	mull	%ebp

	movl	PARAM_YSIZE, %ebp
	addl	%eax, %ebx

	adcl	$0, %edx

	addl	%ebx, (%edi)

	adcl	$0, %edx
	incl	%ebp			C sets ZF for the jnz; CF is not changed

	movl	%edx, 4(%edi)
	jnz	L(simple_outer_top)


L(done):
	C Common exit for the mul1-only and simple addmul paths: restore the
	C saved registers and release the STACK_SPACE frame.

	movl	SAVE_EBX, %ebx

	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret



C -----------------------------------------------------------------------------
C
C The unrolled loop is the same as in mpn_addmul_1, see that code for some
C comments.
C
C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
C increment xp and wp.  This is used to adjust xp and wp, and is rshifted to
C give an initial VAR_COUNTER at the top of the outer loop.
C
C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
C up to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
C initial ebx and ecx on entry to the unrolling.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C The trick with the VAR_ADJUST value means it's only necessary to do one
C fetch in the outer loop to take care of xp, wp and the inner loop counter.


L(unroll):
	C eax	yp
	C ebx
	C ecx	xsize
	C edx	ysize-1
	C esi	xp end
	C edi	wp end of mul1
	C ebp

	movl	PARAM_XP, %esi

	movl	4(%eax), %ebp		C multiplier (yp second limb)
	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing

	movl	%eax, PARAM_YP
	movl	PARAM_WP, %edi
	negl	%edx

	movl	%edx, PARAM_YSIZE
	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
	decl	%ecx				C xsize-1

	movl	(%esi), %eax		C xp low limb
	andl	$-UNROLL_MASK-1, %ebx
	negl	%ecx			C -(xsize-1)

	negl	%ebx
	andl	$UNROLL_MASK, %ecx

	C With k = (-(xsize-1)) & UNROLL_MASK, the next instructions leave
	C ecx = 16*k and edx = -k, so that the address calculation below
	C yields unroll_inner_entry + 15*k.

	movl	%ebx, VAR_ADJUST
	movl	%ecx, %edx
	shll	$4, %ecx

	movl	%eax, VAR_XP_LOW
	sarl	$UNROLL_LOG2, %ebx
	negl	%edx

	C 15 code bytes per limb
	C
	C In the PIC case pic_calc adds its own return address, which is the
	C address of unroll_here, to the difference between the two labels.
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_inner_entry) (%ecx,%edx,1), %ecx
')

	movl	%ecx, VAR_JMP
	movl	%edx, %ecx		C -k
	shll	$31, %edx

	sarl	$31, %edx		C 0 or -1 as xsize odd or even
	leal	4(%edi,%ecx,4), %edi	C wp and xp, adjust for unrolling,
	leal	4(%esi,%ecx,4), %esi	C  and start at second limb

	movl	%edx, VAR_SWAP
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%edx,1), %ecx
	addl	$L(unroll_inner_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx
	ret_internal
')


C --------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx
	C ecx
	C edx
	C esi	xp + offset
	C edi	wp + offset
	C ebp	ysize counter, negative

	movl	VAR_ADJUST, %ebx
	movl	PARAM_YP, %edx

	movl	VAR_XP_LOW, %eax
	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter

	leal	eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
	leal	(%esi,%ebx,4), %esi
	sarl	$UNROLL_LOG2, %ebx

	movl	(%edx,%ebp,4), %ebp	C yp next multiplier

L(unroll_outer_entry):
	C eax	xp low limb
	C ebx	initial inner loop counter, negative
	C ebp	multiplier

	mull	%ebp

	movl	%ebx, VAR_COUNTER
	movl	%edx, %ebx	C carry high
	movl	%eax, %ecx	C carry low

	C Branch-free conditional exchange of ebx and ecx: eax becomes
	C (low xor high) masked by VAR_SWAP, so the two xorl instructions
	C swap the registers when VAR_SWAP is all ones and leave them alone
	C when it is zero.

	xorl	%edx, %eax
	movl	VAR_JMP, %edx

	andl	VAR_SWAP, %eax

	xorl	%eax, %ebx	C carries other way for odd index
	xorl	%eax, %ecx

	jmp	*%edx


C -----------------------------------------------------------------------------

L(unroll_inner_top):
	C eax	xp limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	xp+8
	C edi	wp
	C ebp	yp multiplier limb
	C
	C VAR_COUNTER  loop counter, negative
	C
	C 15 bytes each limb

	addl	$UNROLL_BYTES, %edi

L(unroll_inner_entry):

C Each pass of the forloop emits code for CHUNK_COUNT (two) limbs, with the
C roles of ebx and ecx exchanged between the first and second limb.  The
C displacements are biased by -128 when UNROLL_BYTES is 256.
C
C The computed jump relies on every limb occupying the same number of code
C bytes, so instructions here must not be added, removed or re-encoded.
C Zdisp presumably forces an explicit displacement byte even when disp0 is
C zero, for that reason -- confirm against the Zdisp definition.

deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp0,(%edi))
	adcl	%eax, %ebx	C new carry low
	movl	%edx, %ecx
	adcl	$0, %ecx	C new carry high

	movl	disp1(%esi), %eax
	mull	%ebp
	addl	%ebx, disp1(%edi)
	adcl	%eax, %ecx	C new carry low
	movl	%edx, %ebx
	adcl	$0, %ebx	C new carry high
')


	C leal is used to advance esi so that the flags from the incl are
	C still in place for the jnz.

	incl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi
	jnz	L(unroll_inner_top)


	C eax
	C ebx	carry high
	C ecx	carry low
	C edx
	C esi
	C edi	wp, pointing at second last limb
	C ebp

deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
deflit(`disp1', eval(disp0 + 4))

	movl	PARAM_YSIZE, %ebp
	addl	%ecx, disp0(%edi)	C carry low

	adcl	$0, %ebx
	incl	%ebp			C sets ZF for the jnz below

	movl	%ebx, disp1(%edi)	C carry high
	jnz	L(unroll_outer_top)


	C All rows done: restore the saved registers and release the frame.

	movl	SAVE_ESI, %esi

	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp

	ret

EPILOGUE()