github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/redc_1.asm

dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * This looks different from other current redc_1.asm variants.  Consider
C    adapting this to the mainstream style.
C  * Is this code really faster than more approaches which compute q0 later?
C    Is the use of a jump table faster?  Or is the edge of this due to the
C    inlined add_n code?
C  * Put initial m[0] x q0 computation in header.
C  * Put basecases at the file's end, single them out before the pushes.
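
C The routine below performs Montgomery reduction (REDC) with a one-limb
C inverse: given the 2n-limb input at up, the n-limb modulus at mp, and
C u0inv assumed to be -1/mp[0] mod 2^64, it writes to rp a value congruent
C to up * B^-n mod mp (B = 2^64) and returns the top carry.  For orientation
C only, the C sketch below illustrates the same algorithm in the style of
C GMP's portable mpn/generic/redc_1.c; it describes the job this file does,
C not this file's register scheduling or unrolling:
C
C	mp_limb_t
C	mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n,
C	            mp_limb_t u0inv)
C	{
C	  mp_size_t j;
C	  mp_limb_t cy;
C	  for (j = n; j > 0; j--)
C	    {
C	      /* q0 = up[0] * u0inv makes the low limb of up + q0 * mp zero */
C	      cy = mpn_addmul_1 (up, mp, n, up[0] * u0inv);
C	      up[0] = cy;	/* save the addmul carry in the zeroed slot */
C	      up++;		/* drop one limb, i.e. divide by B */
C	    }
C	  cy = mpn_add_n (rp, up, up - n, n);	/* fold in the n saved carries */
C	  return cy;		/* caller handles the returned carry */
C	}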

define(`rp',       `%rdi')   C rcx
define(`up',       `%rsi')   C rdx
define(`mp_param', `%rdx')   C r8
define(`n',        `%rcx')   C r9
define(`u0inv',    `%r8')    C stack

define(`i',        `%r11')
define(`nneg',     `%r12')
define(`mp',       `%r13')
define(`q0',       `%rbp')
define(`vp',       `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbp
	mov	(up), q0		C up[0]
	push	%rbx
	imul	u0inv, q0		C first q0, for all execution paths
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	n, nneg
	neg	nneg
	lea	(mp_param,n,8), mp	C mp += n
	lea	-16(up,n,8), up		C up += n

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	lea	4(%rax), %r9
	cmp	$4, R32(n)
	cmovg	%r9, %rax
	lea	L(tab)(%rip), %r9
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')

	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
	TEXT

	ALIGN(16)
L(1):	mov	(mp_param), %rax
	mul	q0
	add	8(up), %rax
	adc	16(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


	ALIGN(16)
L(2):	mov	(mp_param), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	8(up), %r9
	adc	$0, %r14
	mov	%r9, q0
	imul	u0inv, q0
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r11
	mul	q0
	add	%r9, %r10
	adc	%rax, %r11
	adc	%rdx, %rbx
	add	16(up), %r11
	adc	$0, %rbx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	24(up), %rbx
	mov	%r14, (rp)
	mov	%rbx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


L(3):	mov	(mp_param), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	-8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	(up), %r10
	mov	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 8(up)
	adc	$0, %r14
	mov	%r14, -8(up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	8(up), %r10
	mov	%r10, 8(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 16(up)
	adc	$0, %r14
	mov	%r14, (up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	16(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	24(up), %r9
	adc	$0, %r14

	xor	R32(%rax), R32(%rax)
	add	-8(up), %r10
	adc	(up), %r9
	adc	32(up), %r14
	mov	%r10, (rp)
	mov	%r9, 8(rp)
	mov	%r14, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


	ALIGN(16)
L(2m4):
L(lo2):	mov	(mp,nneg,8), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r9
	mul	q0
	add	16(up,nneg,8), %r10
	adc	%rax, %r9
	mov	16(mp,nneg,8), %rax
	adc	%rdx, %r14
	mul	q0
	mov	$0, R32(%r10)		C xor?
	lea	2(nneg), i
	add	%r9, %r15
	imul	u0inv, %r15
	jmp	L(e2)

	ALIGN(16)
L(li2):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
L(e2):	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li2)

L(le2):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo2)

	mov	nneg, n
	sar	$2, n
	lea	32(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-16(up), %r8
	mov	-8(up), %r9
	add	-16(vp), %r8
	adc	-8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	lea	16(rp), rp
	jmp	L(addx)


	ALIGN(16)
L(1m4):
L(lo1):	mov	(mp,nneg,8), %rax
	xor	%r9, %r9
	xor	R32(%rbx), R32(%rbx)
	mul	q0
	mov	%rax, %r9
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r14
	mov	$0, R32(%r10)		C xor?
	mul	q0
	add	16(up,nneg,8), %r9
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	16(mp,nneg,8), %rax
	mul	q0
	lea	1(nneg), i
	add	%r14, %r15
	imul	u0inv, %r15
	jmp	L(e1)

	ALIGN(16)
L(li1):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
L(e1):	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li1)

L(le1):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo1)

	mov	nneg, n
	sar	$2, n
	lea	24(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-8(up), %r8
	add	-8(vp), %r8
	mov	%r8, (rp)
	lea	8(rp), rp
	jmp	L(addx)


	ALIGN(16)
L(0):
L(0m4):
L(lo0):	mov	(mp,nneg,8), %rax
	mov	nneg, i
	mul	q0
	xor	R32(%r10), R32(%r10)
	mov	%rax, %r14
	mov	%rdx, %rbx
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %r14
	adc	%rax, %rbx
	adc	%rdx, %r10
	add	%rbx, %r15
	imul	u0inv, %r15
	jmp	L(e0)

	ALIGN(16)
L(li0):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(e0):	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li0)

L(le0):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo0)

	mov	nneg, n
	sar	$2, n
	clc
	lea	16(up,nneg,8), up
	lea	(up,nneg,8), vp
	jmp	L(addy)


	ALIGN(16)
L(3m4):
L(lo3):	mov	(mp,nneg,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %rbx	C result is zero, might carry
	mov	$0, R32(%rbx)		C zero
	mov	%rbx, %r14		C zero
	adc	%rax, %r10
	mov	16(mp,nneg,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	add	%r10, %r15
	mul	q0
	lea	3(nneg), i
	imul	u0inv, %r15
C	jmp	L(li3)

	ALIGN(16)
L(li3):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li3)

L(le3):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	mov	%r15, q0
	lea	8(up), up
	dec	n
	jnz	L(lo3)


C ==== Addition code ====
	mov	nneg, n
	sar	$2, n
	lea	40(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-24(up), %r8
	mov	-16(up), %r9
	mov	-8(up), %r10
	add	-24(vp), %r8
	adc	-16(vp), %r9
	adc	-8(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	lea	24(rp), rp

L(addx):inc	n
	jz	L(ad3)

L(addy):mov	(up), %r8
	mov	8(up), %r9
	inc	n
	jmp	L(mid)

C	ALIGN(16)
L(al3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	inc	n
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(al3)

L(ae3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

L(ad3):	mov	R32(n), R32(%rax)	C zero
	adc	R32(%rax), R32(%rax)

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
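
C For context, one Montgomery multiplication step is a full product followed
C by this reduction.  mpn_redc_1 is a GMP-internal entry point (declared in
C gmp-impl.h), so the wrapper name mul_mont and the carry-handling policy in
C the sketch below are illustrative only, loosely modelled on how the powm
C code combines a multiply with the reduction:
C
C	/* tp is scratch space holding 2*n limbs */
C	static void
C	mul_mont (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_srcptr mp,
C	          mp_size_t n, mp_limb_t u0inv, mp_ptr tp)
C	{
C	  mp_limb_t cy;
C	  mpn_mul_n (tp, ap, bp, n);		/* tp = a * b, 2n limbs */
C	  cy = mpn_redc_1 (rp, tp, mp, n, u0inv); /* rp = tp * B^-n mod m */
C	  if (cy != 0)
C	    mpn_sub_n (rp, rp, mp, n);		/* cancel the returned carry */
C	}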