dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
dnl
dnl  NOTE(review): 32-bit x86 in GMP's m4-preprocessed GAS/AT&T dialect.
dnl  Parameters arrive on the stack (cdecl); PROLOGUE/defframe/deflit and
dnl  friends come from mpn/asm-defs.m4 and mpn/x86/x86-defs.m4.

dnl  Copyright 1999-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
C     unrolling).


dnl  K6: UNROLL_COUNT  cycles/product (approx)
dnl            8            9.75
dnl           16            9.3
dnl           32            9.3
dnl  Maximum possible with the current code is 32.
dnl
dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
dnl  might explain its good performance.

deflit(UNROLL_COUNT, 16)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() entry code only
C once.  The saving is about 10-20% on typical sizes coming from the
C Karatsuba multiply code.
C
C Enhancements:
C
C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
C
C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
C would be 256 bytes.

dnl  Minimum xsize (in limbs) for which the unrolled addmul loop is used;
dnl  below this the simple addmul loop wins.  Same value for PIC and non-PIC.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

dnl  cdecl stack-parameter offsets relative to the entry %esp (FRAME
dnl  tracking via deflit(`FRAME',...) keeps them valid after pushes).
defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	C Dispatch on xsize: >2 limbs, ==2 limbs, or fall through to 1x1.
	cmpl	$2, %ecx
	ja	L(xsize_more_than_two_limbs)
	je	L(two_by_something)


	C one limb by one limb

	movl	(%edx), %edx	C xp low limb
	movl	PARAM_WP, %ecx

	mull	%edx		C edx:eax = xp[0] * yp[0]

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
	C ZF from this decl survives across the movs/pushes below (they
	C don't touch flags) and is consumed by the jnz further down.
	decl	PARAM_YSIZE
	pushl	%ebx
deflit(`FRAME',4)

	movl	PARAM_WP, %ebx
	pushl	%esi
deflit(`FRAME',8)

	movl	%eax, %ecx	C yp low limb
	movl	(%edx), %eax	C xp low limb

	movl	%edx, %esi	C xp
	jnz	L(two_by_two)	C ysize != 1


	C two limbs by one limb

	mull	%ecx		C xp[0] * yp[0]

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	movl	%edx, %esi	C carry

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %esi
	movl	%esi, 4(%ebx)	C mov preserves CF for the adcl below

	adcl	$0, %edx

	movl	%edx, 8(%ebx)
	popl	%esi

	popl	%ebx
	ret



C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp
deflit(`FRAME',8)

	mull	%ecx		C xp[0] * yp[0]

	push	%edi
deflit(`FRAME',12)
	movl	%eax, (%ebx)

	movl	4(%esi), %eax
	movl	%edx, %edi	C carry, for wp[1]

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx

	movl	%edi, 4(%ebx)
	movl	4(%ecx), %ecx	C yp[1]

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx

	movl	(%esi), %eax	C xp[0]
	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	C Fold xp[0]*yp[1] into the partial results: low into wp[1] in
	C memory, carries rippling up through edi (wp[2]) and esi (wp[3]).
	addl	%eax, 4(%ebx)
	adcl	%edx, %edi
	adcl	$0, %esi

	movl	%edi, 8(%ebx)
	popl	%edi

	movl	%esi, 12(%ebx)
	popl	%esi

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two_limbs):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 many times).  A call to the
C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
C popping, and doesn't seem likely to be worthwhile on the typical 10-20
C limb operations the Karatsuba code calls here with.

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp
deflit(`FRAME',0)

	pushl	%edi		defframe_pushl(SAVE_EDI)
	pushl	%ebp		defframe_pushl(SAVE_EBP)

	movl	PARAM_WP, %edi
	pushl	%esi		defframe_pushl(SAVE_ESI)

	movl	%eax, %ebp	C yp[0] = multiplier for the mul1 pass
	pushl	%ebx		defframe_pushl(SAVE_EBX)

	leal	(%edx,%ecx,4), %ebx	C xp end
	xorl	%esi, %esi		C initial carry

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx			C index counts -xsize up to 0


L(mul1):
	C eax	scratch
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi	C mov, not xor: must preserve CF

	adcl	%edx, %esi	C esi = high limb + carry out of low add

	movl	%eax, (%edi,%ecx,4)
	incl	%ecx

	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)		C final carry

	movl	PARAM_XSIZE, %ecx
	decl	%edx

	jnz	L(ysize_more_than_one_limb)

	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


L(ysize_more_than_one_limb):
	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
C Simple addmul loop.
C
C Using ebx and edi pointing at the ends of their respective locations saves
C a couple of instructions in the outer loop.  The inner loop is still 11
C cycles, the same as the simple loop in aorsmul_1.asm.

	C eax	yp
	C ebx	xp end
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1
	C ebp

	movl	4(%eax), %ebp		C multiplier
	negl	%ecx

	C PARAM_XSIZE/PARAM_YP/PARAM_YSIZE stack slots are recycled to hold
	C the loop-invariant -xsize, yp end, and negative ysize counter.
	movl	%ecx, PARAM_XSIZE	C -xsize
	xorl	%esi, %esi		C initial carry

	leal	4(%eax,%edx,4), %eax	C yp end
	negl	%edx

	movl	%eax, PARAM_YP
	movl	%edx, PARAM_YSIZE

	jmp	L(simple_outer_entry)


	C aligning here saves a couple of cycles
	ALIGN(16)
L(simple_outer_top):
	C edx	ysize counter, negative

	movl	PARAM_YP, %eax		C yp end
	xorl	%esi, %esi		C carry

	movl	PARAM_XSIZE, %ecx	C -xsize
	movl	%edx, PARAM_YSIZE

	movl	(%eax,%edx,4), %ebp	C yp limb multiplier
L(simple_outer_entry):
	addl	$4, %edi		C wp advances one limb per yp limb


L(simple_inner):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of this addmul
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi	C mov, not xor: must preserve CF

	adcl	$0, %edx
	addl	%eax, (%edi,%ecx,4)
	adcl	%edx, %esi

	incl	%ecx
	jnz	L(simple_inner)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)	C store carry limb above this addmul row

	incl	%edx
	jnz	L(simple_outer_top)


	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


C -----------------------------------------------------------------------------
C Unrolled loop.
C
C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
C some comments.
C
C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
C 0, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
C is entered.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.  This can't just be fetched through the xp
C pointer because of the offset applied to it.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
C to give the starting point in the destination for each unrolled loop (this
C point is one limb upwards for each limb of yp processed).
C
C Having PARAM_YSIZE count negative to zero means it's not necessary to
C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
C the stack remain constant and on each loop a leal adjusts them with the
C PARAM_YSIZE counter value.


defframe(VAR_COUNTER,      -20)
defframe(VAR_COUNTER_INIT, -24)
defframe(VAR_JMP,          -28)
defframe(VAR_XP_LOW,       -32)
deflit(VAR_STACK_SPACE, 16)

dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
dnl  slower in this code, hence the defframe empty-if-zero feature is
dnl  disabled.
dnl
dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
dnl  anything isn't clear.
dnl
define(`defframe_empty_if_zero_disabled',1)

L(unroll):
	C eax	yp (not used)
	C ebx	xp end (not used)
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1 (not used)
	C ebp
deflit(`FRAME', 16)

	leal	-2(%ecx), %ebp	C one limb processed at start,
	decl	%ecx		C and ebp is one less

	shrl	$UNROLL_LOG2, %ebp	C full 16-limb chunks remaining
	negl	%ecx

	subl	$VAR_STACK_SPACE, %esp
deflit(`FRAME', 16+VAR_STACK_SPACE)
	andl	$UNROLL_MASK, %ecx	C limbs handled by the partial entry

	movl	%ecx, %esi
	shll	$4, %ecx	C ecx = 16*n; with esi = -n below this
				C gives 16n-n = 15n, matching the 15 code
				C bytes per limb in the unrolled loop

	movl	%ebp, VAR_COUNTER_INIT
	negl	%esi

	C 15 code bytes per limb
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
')

	movl	PARAM_XP, %ebx
	movl	%ebp, VAR_COUNTER

	movl	PARAM_WP, %edi
	movl	%ecx, VAR_JMP

	movl	(%ebx), %eax
	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1

	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling

	movl	%eax, VAR_XP_LOW	C xp[0], re-fetched each outer loop

	movl	%ebx, PARAM_XP
	movl	PARAM_YP, %ebx

	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
	movl	4(%ebx), %ebp		C multiplier (yp second limb)

	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing

	movl	%ecx, PARAM_WP

	leal	1(%esi), %ecx	C adjust parity for decl %ecx above

	movl	%ebx, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C Compute the entry address position-independently: the return
	C address on the stack gives L(unroll_here) at run time.
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%esi,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx
	ret_internal
')


C -----------------------------------------------------------------------------
	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
	C cost any extra space, since the inner unrolled loop below is
	C aligned to 32.
	ALIGN(32)
L(unroll_outer_top):
	C edx	ysize

	movl	PARAM_YP, %eax
	movl	%edx, PARAM_YSIZE	C incremented ysize counter

	movl	PARAM_WP, %edi

	movl	VAR_COUNTER_INIT, %ebx
	movl	(%eax,%edx,4), %ebp	C next multiplier

	movl	PARAM_XSIZE, %ecx
	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp

	movl	VAR_XP_LOW, %eax
	movl	%ebx, VAR_COUNTER

L(unroll_outer_entry):
	mull	%ebp	C xp[0] * multiplier, done before entering the loop

	C The entry-point parity (odd/even limb count, tested via cl bit 0)
	C decides which register carries high and which low into the loop.
	C using testb is a tiny bit faster than testl
	testb	$1, %cl

	movl	%eax, %ecx	C low carry
	movl	VAR_JMP, %eax

	movl	%edx, %esi	C high carry
	movl	PARAM_XP, %ebx

	jnz	L(unroll_noswap)
	movl	%ecx, %esi	C high,low carry other way around

	movl	%edx, %ecx
L(unroll_noswap):

	jmp	*%eax	C computed entry into the unrolled loop



C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	scratch
	C ebx	xp
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	wp
	C ebp	multiplier
	C VAR_COUNTER loop counter
	C
	C 15 code bytes each limb

	leal	UNROLL_BYTES(%edi), %edi

L(unroll_entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4))
	deflit(`disp1', eval(disp0 + 4))
	deflit(`disp2', eval(disp1 + 4))

	C Two limbs per chunk, carry regs alternating roles.  Zdisp forces
	C an explicit 0 displacement byte (keeps each limb at 15 code bytes);
	C jadcl0 presumably adds CF into the register via a jnc-skipped incl
	C (defined in x86-defs.m4 -- confirm there).
	movl	disp1(%ebx), %eax
	mull	%ebp
Zdisp(	addl,	%ecx, disp0,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0( %ecx)

	movl	disp2(%ebx), %eax
	mull	%ebp
	addl	%esi, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0( %esi)
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%ebx), %ebx

	jns	L(unroll_top)


	movl	PARAM_YSIZE, %edx
	addl	%ecx, UNROLL_BYTES(%edi)	C final low carry into wp

	adcl	$0, %esi

	incl	%edx
	movl	%esi, UNROLL_BYTES+4(%edi)	C final high carry limb

	jnz	L(unroll_outer_top)


	C Restore callee-saved registers from their pushed slots and drop
	C the variable stack space.
	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	addl	$FRAME, %esp
	ret

EPILOGUE()