github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mul_basecase.asm (about) 1 dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers. 2 3 dnl Copyright 1999-2002 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16 35 C limbs/loop unrolling). 36 37 38 39 dnl K7 UNROLL_COUNT cycles/product (at around 20x20) 40 dnl 8 4.67 41 dnl 16 4.59 42 dnl 32 4.42 43 dnl Maximum possible with the current code is 32. 44 dnl 45 dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get 46 dnl done with a straight run through a block of code, no inner loop. Using 47 dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache. 48 49 deflit(UNROLL_COUNT, 32) 50 51 52 C void mpn_mul_basecase (mp_ptr wp, 53 C mp_srcptr xp, mp_size_t xsize, 54 C mp_srcptr yp, mp_size_t ysize); 55 C 56 C Calculate xp,xsize multiplied by yp,ysize, storing the result in 57 C wp,xsize+ysize. 58 C 59 C This routine is essentially the same as mpn/generic/mul_basecase.c, but 60 C it's faster because it does most of the mpn_addmul_1() startup 61 C calculations only once. The saving is 15-25% on typical sizes coming from 62 C the Karatsuba multiply code. 63 64 ifdef(`PIC',` 65 deflit(UNROLL_THRESHOLD, 5) 66 ',` 67 deflit(UNROLL_THRESHOLD, 5) 68 ') 69 70 defframe(PARAM_YSIZE,20) 71 defframe(PARAM_YP, 16) 72 defframe(PARAM_XSIZE,12) 73 defframe(PARAM_XP, 8) 74 defframe(PARAM_WP, 4) 75 76 TEXT 77 ALIGN(32) 78 PROLOGUE(mpn_mul_basecase) 79 deflit(`FRAME',0) 80 81 movl PARAM_XSIZE, %ecx 82 movl PARAM_YP, %eax 83 84 movl PARAM_XP, %edx 85 movl (%eax), %eax C yp low limb 86 87 cmpl $2, %ecx 88 ja L(xsize_more_than_two) 89 je L(two_by_something) 90 91 92 C one limb by one limb 93 94 mull (%edx) 95 96 movl PARAM_WP, %ecx 97 movl %eax, (%ecx) 98 movl %edx, 4(%ecx) 99 ret 100 101 102 C ----------------------------------------------------------------------------- 103 L(two_by_something): 104 deflit(`FRAME',0) 105 decl PARAM_YSIZE 106 pushl %ebx defframe_pushl(`SAVE_EBX') 107 movl %eax, %ecx C yp low limb 108 109 movl PARAM_WP, %ebx 110 pushl %esi defframe_pushl(`SAVE_ESI') 111 movl %edx, %esi C xp 112 113 movl (%edx), %eax C xp low limb 114 jnz L(two_by_two) 115 116 117 C two limbs by one limb 118 119 mull %ecx 120 121 movl %eax, (%ebx) 122 movl 4(%esi), %eax 123 movl %edx, %esi C carry 124 125 mull %ecx 126 127 addl %eax, %esi 128 129 movl %esi, 4(%ebx) 130 movl SAVE_ESI, %esi 131 132 adcl $0, %edx 133 134 movl %edx, 8(%ebx) 135 movl SAVE_EBX, %ebx 136 addl $FRAME, %esp 137 138 ret 139 140 141 142 C ----------------------------------------------------------------------------- 143 C Could load yp earlier into another register. 144 145 ALIGN(16) 146 L(two_by_two): 147 C eax xp low limb 148 C ebx wp 149 C ecx yp low limb 150 C edx 151 C esi xp 152 C edi 153 C ebp 154 155 dnl FRAME carries on from previous 156 157 mull %ecx C xp[0] * yp[0] 158 159 push %edi defframe_pushl(`SAVE_EDI') 160 movl %edx, %edi C carry, for wp[1] 161 162 movl %eax, (%ebx) 163 movl 4(%esi), %eax 164 165 mull %ecx C xp[1] * yp[0] 166 167 addl %eax, %edi 168 movl PARAM_YP, %ecx 169 170 adcl $0, %edx 171 movl 4(%ecx), %ecx C yp[1] 172 movl %edi, 4(%ebx) 173 174 movl 4(%esi), %eax C xp[1] 175 movl %edx, %edi C carry, for wp[2] 176 177 mull %ecx C xp[1] * yp[1] 178 179 addl %eax, %edi 180 181 adcl $0, %edx 182 movl (%esi), %eax C xp[0] 183 184 movl %edx, %esi C carry, for wp[3] 185 186 mull %ecx C xp[0] * yp[1] 187 188 addl %eax, 4(%ebx) 189 adcl %edx, %edi 190 movl %edi, 8(%ebx) 191 192 adcl $0, %esi 193 movl SAVE_EDI, %edi 194 movl %esi, 12(%ebx) 195 196 movl SAVE_ESI, %esi 197 movl SAVE_EBX, %ebx 198 addl $FRAME, %esp 199 200 ret 201 202 203 C ----------------------------------------------------------------------------- 204 ALIGN(16) 205 L(xsize_more_than_two): 206 207 C The first limb of yp is processed with a simple mpn_mul_1 style loop 208 C inline. Unrolling this doesn't seem worthwhile since it's only run once 209 C (whereas the addmul below is run ysize-1 many times). A call to the 210 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and 211 C popping, and doesn't seem likely to be worthwhile on the typical 13-26 212 C limb operations the Karatsuba code calls here with. 213 214 C eax yp[0] 215 C ebx 216 C ecx xsize 217 C edx xp 218 C esi 219 C edi 220 C ebp 221 222 dnl FRAME doesn't carry on from previous, no pushes yet here 223 defframe(`SAVE_EBX',-4) 224 defframe(`SAVE_ESI',-8) 225 defframe(`SAVE_EDI',-12) 226 defframe(`SAVE_EBP',-16) 227 deflit(`FRAME',0) 228 229 subl $16, %esp 230 deflit(`FRAME',16) 231 232 movl %edi, SAVE_EDI 233 movl PARAM_WP, %edi 234 235 movl %ebx, SAVE_EBX 236 movl %ebp, SAVE_EBP 237 movl %eax, %ebp 238 239 movl %esi, SAVE_ESI 240 xorl %ebx, %ebx 241 leal (%edx,%ecx,4), %esi C xp end 242 243 leal (%edi,%ecx,4), %edi C wp end of mul1 244 negl %ecx 245 246 247 L(mul1): 248 C eax scratch 249 C ebx carry 250 C ecx counter, negative 251 C edx scratch 252 C esi xp end 253 C edi wp end of mul1 254 C ebp multiplier 255 256 movl (%esi,%ecx,4), %eax 257 258 mull %ebp 259 260 addl %ebx, %eax 261 movl %eax, (%edi,%ecx,4) 262 movl $0, %ebx 263 264 adcl %edx, %ebx 265 incl %ecx 266 jnz L(mul1) 267 268 269 movl PARAM_YSIZE, %edx 270 movl PARAM_XSIZE, %ecx 271 272 movl %ebx, (%edi) C final carry 273 decl %edx 274 275 jnz L(ysize_more_than_one) 276 277 278 movl SAVE_EDI, %edi 279 movl SAVE_EBX, %ebx 280 281 movl SAVE_EBP, %ebp 282 movl SAVE_ESI, %esi 283 addl $FRAME, %esp 284 285 ret 286 287 288 L(ysize_more_than_one): 289 cmpl $UNROLL_THRESHOLD, %ecx 290 movl PARAM_YP, %eax 291 292 jae L(unroll) 293 294 295 C ----------------------------------------------------------------------------- 296 C simple addmul looping 297 C 298 C eax yp 299 C ebx 300 C ecx xsize 301 C edx ysize-1 302 C esi xp end 303 C edi wp end of mul1 304 C ebp 305 306 leal 4(%eax,%edx,4), %ebp C yp end 307 negl %ecx 308 negl %edx 309 310 movl (%esi,%ecx,4), %eax C xp low limb 311 movl %edx, PARAM_YSIZE C -(ysize-1) 312 incl %ecx 313 314 xorl %ebx, %ebx C initial carry 315 movl %ecx, PARAM_XSIZE C -(xsize-1) 316 movl %ebp, PARAM_YP 317 318 movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier 319 jmp L(simple_outer_entry) 320 321 322 C this is offset 0x121 so close enough to aligned 323 L(simple_outer_top): 324 C ebp ysize counter, negative 325 326 movl PARAM_YP, %edx 327 movl PARAM_XSIZE, %ecx C -(xsize-1) 328 xorl %ebx, %ebx C carry 329 330 movl %ebp, PARAM_YSIZE 331 addl $4, %edi C next position in wp 332 333 movl (%edx,%ebp,4), %ebp C yp limb - multiplier 334 movl -4(%esi,%ecx,4), %eax C xp low limb 335 336 337 L(simple_outer_entry): 338 339 L(simple_inner): 340 C eax xp limb 341 C ebx carry limb 342 C ecx loop counter (negative) 343 C edx scratch 344 C esi xp end 345 C edi wp end 346 C ebp multiplier 347 348 mull %ebp 349 350 addl %eax, %ebx 351 adcl $0, %edx 352 353 addl %ebx, (%edi,%ecx,4) 354 movl (%esi,%ecx,4), %eax 355 adcl $0, %edx 356 357 incl %ecx 358 movl %edx, %ebx 359 jnz L(simple_inner) 360 361 362 mull %ebp 363 364 movl PARAM_YSIZE, %ebp 365 addl %eax, %ebx 366 367 adcl $0, %edx 368 addl %ebx, (%edi) 369 370 adcl $0, %edx 371 incl %ebp 372 373 movl %edx, 4(%edi) 374 jnz L(simple_outer_top) 375 376 377 movl SAVE_EBX, %ebx 378 movl SAVE_ESI, %esi 379 380 movl SAVE_EDI, %edi 381 movl SAVE_EBP, %ebp 382 addl $FRAME, %esp 383 384 ret 385 386 387 388 C ----------------------------------------------------------------------------- 389 C 390 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some 391 C comments. 392 C 393 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop 394 C increment xp and wp. This is used to adjust back xp and wp, and rshifted 395 C to given an initial VAR_COUNTER at the top of the outer loop. 396 C 397 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT 398 C up to -1, inclusive. 399 C 400 C VAR_JMP is the computed jump into the unrolled loop. 401 C 402 C VAR_XP_LOW is the least significant limb of xp, which is needed at the 403 C start of the unrolled loop. 404 C 405 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, 406 C inclusive. 407 C 408 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be 409 C added to give the location of the next limb of yp, which is the multiplier 410 C in the unrolled loop. 411 C 412 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the 413 C outer loop to take care of xp, wp and the inner loop counter. 414 415 defframe(VAR_COUNTER, -20) 416 defframe(VAR_ADJUST, -24) 417 defframe(VAR_JMP, -28) 418 defframe(VAR_XP_LOW, -32) 419 deflit(VAR_EXTRA_SPACE, 16) 420 421 422 L(unroll): 423 C eax yp 424 C ebx 425 C ecx xsize 426 C edx ysize-1 427 C esi xp end 428 C edi wp end of mul1 429 C ebp 430 431 movl PARAM_XP, %esi 432 movl 4(%eax), %ebp C multiplier (yp second limb) 433 leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing 434 435 movl PARAM_WP, %edi 436 movl %eax, PARAM_YP 437 negl %edx 438 439 movl %edx, PARAM_YSIZE 440 leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1 441 decl %ecx C xsize-1 442 443 movl (%esi), %eax C xp low limb 444 andl $-UNROLL_MASK-1, %ebx 445 negl %ecx 446 447 subl $VAR_EXTRA_SPACE, %esp 448 deflit(`FRAME',16+VAR_EXTRA_SPACE) 449 negl %ebx 450 andl $UNROLL_MASK, %ecx 451 452 movl %ebx, VAR_ADJUST 453 movl %ecx, %edx 454 shll $4, %ecx 455 456 sarl $UNROLL_LOG2, %ebx 457 458 C 17 code bytes per limb 459 ifdef(`PIC',` 460 call L(pic_calc) 461 L(unroll_here): 462 ',` 463 leal L(unroll_entry) (%ecx,%edx,1), %ecx 464 ') 465 negl %edx 466 467 movl %eax, VAR_XP_LOW 468 movl %ecx, VAR_JMP 469 leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling, 470 leal 4(%esi,%edx,4), %esi C and start at second limb 471 jmp L(unroll_outer_entry) 472 473 474 ifdef(`PIC',` 475 L(pic_calc): 476 C See mpn/x86/README about old gas bugs 477 leal (%ecx,%edx,1), %ecx 478 addl $L(unroll_entry)-L(unroll_here), %ecx 479 addl (%esp), %ecx 480 ret_internal 481 ') 482 483 484 C -------------------------------------------------------------------------- 485 ALIGN(32) 486 L(unroll_outer_top): 487 C ebp ysize counter, negative 488 489 movl VAR_ADJUST, %ebx 490 movl PARAM_YP, %edx 491 492 movl VAR_XP_LOW, %eax 493 movl %ebp, PARAM_YSIZE C store incremented ysize counter 494 495 leal 4(%edi,%ebx,4), %edi 496 leal (%esi,%ebx,4), %esi 497 sarl $UNROLL_LOG2, %ebx 498 499 movl (%edx,%ebp,4), %ebp C yp next multiplier 500 movl VAR_JMP, %ecx 501 502 L(unroll_outer_entry): 503 mull %ebp 504 505 testb $1, %cl C and clear carry bit 506 movl %ebx, VAR_COUNTER 507 movl $0, %ebx 508 509 movl $0, %ecx 510 cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb 511 cmovnz( %eax, %ebx) 512 513 C Extra fetch of VAR_JMP is bad, but registers are tight 514 jmp *VAR_JMP 515 516 517 C ----------------------------------------------------------------------------- 518 ALIGN(32) 519 L(unroll_top): 520 C eax xp limb 521 C ebx carry high 522 C ecx carry low 523 C edx scratch 524 C esi xp+8 525 C edi wp 526 C ebp yp multiplier limb 527 C 528 C VAR_COUNTER loop counter, negative 529 C 530 C 17 bytes each limb 531 532 L(unroll_entry): 533 534 deflit(CHUNK_COUNT,2) 535 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` 536 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) 537 deflit(`disp1', eval(disp0 + 4)) 538 539 Zdisp( movl, disp0,(%esi), %eax) 540 adcl %edx, %ebx 541 542 mull %ebp 543 544 Zdisp( addl, %ecx, disp0,(%edi)) 545 movl $0, %ecx 546 547 adcl %eax, %ebx 548 549 550 movl disp1(%esi), %eax 551 adcl %edx, %ecx 552 553 mull %ebp 554 555 addl %ebx, disp1(%edi) 556 movl $0, %ebx 557 558 adcl %eax, %ecx 559 ') 560 561 562 incl VAR_COUNTER 563 leal UNROLL_BYTES(%esi), %esi 564 leal UNROLL_BYTES(%edi), %edi 565 566 jnz L(unroll_top) 567 568 569 C eax 570 C ebx zero 571 C ecx low 572 C edx high 573 C esi 574 C edi wp, pointing at second last limb) 575 C ebp 576 C 577 C carry flag to be added to high 578 579 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) 580 deflit(`disp1', eval(disp0-0 + 4)) 581 582 movl PARAM_YSIZE, %ebp 583 adcl $0, %edx 584 addl %ecx, disp0(%edi) 585 586 adcl $0, %edx 587 incl %ebp 588 589 movl %edx, disp1(%edi) 590 jnz L(unroll_outer_top) 591 592 593 movl SAVE_ESI, %esi 594 movl SAVE_EBP, %ebp 595 596 movl SAVE_EDI, %edi 597 movl SAVE_EBX, %ebx 598 addl $FRAME, %esp 599 600 ret 601 602 EPILOGUE()