dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/redc_1.asm

dnl  X86-64 mpn_redc_1 optimised for Intel Atom.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 5.0
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.
C  * Make lead-in code for the inner loops be more similar.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')
define(`w0',          `%rbp')
define(`w1',          `%r9')
define(`w2',          `%r10')
define(`w3',          `%r11')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C Arguments arrive in the registers named by the defines above; the trailing
C column there gives the DOS64 locations, where u0inv is fetched from the
C stack by the IFDOS load at function entry.
C
C Outline, as implemented below:
C   * q0 = low 64 bits of up[0] * u0inv.
C   * Outer loop, run n times with counter j: add q0 * {mp,n} into the
C     current n limb window of up, store the carry-out limb into the slot of
C     the lowest limb of the window (the stores commented up[0]), and step up
C     by one limb.  The q limb for the following pass is formed inside the
C     pass from the updated second limb of the window (imul u0inv, %rbx).
C   * L(cj): call mpn_add_n on rp, the n high limbs and the n stored carry
C     limbs.  rax is not written between that call and the final ret.
C   * n = 1, 2 and 3 never reach mpn_add_n; they store to rp directly and
C     leave the carry of their final addition (0 or 1) in rax.  n = 4 has a
C     straight-line pass of its own but still finishes through L(cj).
C
C rbx, rbp and r12-r15 are pushed on entry and popped again at L(ret).
C
C NOTE(review): n is compared and negated through R32(n), so the code
C presumes the limb count fits in 32 bits -- confirm against callers.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

C Point mp and up just past their first n limbs and negate n, so that
C (mp,n,8) and (up,n,8) address limb 0 of the current window.
	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp
	lea	(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

C Dispatch on the two low bits of the negated count.  Each residue class has
C its own outer loop, which jumps into the 4-way unrolled inner loop at a
C different entry label (e0, e1, e2 or e3).
	test	$1, R8(n)
	jz	L(bx0)

L(bx1):	test	$2, R8(n)
	jz	L(b3)

L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

C Outer loop for the class entered when n is not 1 and bits 0 and 1 of the
C negated count are both set.  The add of (up,n,8) below only produces a
C carry; its register result is overwritten by the following mov.
L(otp1):lea	1(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbp
	mov	8(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	add	(up,n,8), %rbp
	mov	%rax, %rbp
	adc	%r9, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	32(mp,n,8), %rax
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

	ALIGNx
L(tp1):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
L(e1):	add	$4, i
	mov	%rdx, %r10
	js	L(tp1)

C Wind-down.  The loop above falls through once i is no longer negative; i
C starts as a negative multiple of 4 in this class, so it is zero here and
C both address forms accepted by I() name the same limbs.
L(ed1):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

C Outer loop for the class with bit 0 set and bit 1 clear in the negated
C count, n not 3.  Same scheme as L(otp1), entering the inner loop at L(e3).
L(otp3):lea	3(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbp
	mov	8(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	add	(up,n,8), %rbp
	mov	%rax, %rbp
	mov	24(mp,n,8), %rax
	adc	%r9, %rbx
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %r11
	mov	32(mp,n,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

	ALIGNx
L(tp3):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
L(e3):	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
	add	$4, i
	mov	%rdx, %r10
	js	L(tp3)

L(ed3):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

C Common join for all looped sizes: set up the mpn_add_n arguments.  up has
C been stepped n limbs by the outer loop and n is still negative here, so
C (up,n,8) walks back by n limbs each time it is applied.
L(cj):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C Adjust rsp around the call; the same amount is added back afterwards.
C NOTE(review): presumably 8 bytes of alignment padding for STD64 and shadow
C space plus padding for DOS64 -- FUNC_ENTRY is defined elsewhere, confirm.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Shared exit: restore the registers pushed at entry, in reverse order.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(bx0):	test	$2, R8(n)
	jnz	L(b2)

L(b0):	cmp	$-4, R32(n)
	jz	L(n4)

C Outer loop for the class with bits 0 and 1 both clear in the negated count,
C n not 4.  Register roles of rbp/r11 and r9/r10 are swapped relative to
C L(otp1) so that the inner loop can be entered at L(e0).
L(otp0):lea	4(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %r11
	mov	8(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up,n,8), %r11
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
L(e0):	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
	add	$4, i
	mov	%rdx, %r10
	js	L(tp0)

L(ed0):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

C Outer loop for the class with bit 0 clear and bit 1 set in the negated
C count, n not 2.  Same lead-in as L(otp0), entering the inner loop at L(e2).
L(otp2):lea	2(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %r11
	mov	8(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up,n,8), %r11
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
L(tp2):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
L(e2):	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
	add	$4, i
	mov	%rdx, %r10
	js	L(tp2)

L(ed2):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

C n = 1.  mp_param is read before mul overwrites rdx.  rax is cleared with
C mov rather than xor because the carry flag from the adc above has to reach
C the adc below, which turns it into the 0/1 value left in rax.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-8(up), %rax
	adc	(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 2, fully unrolled: two passes, the second q0 being formed from the
C second limb after the first pass.  r14 is used as plain scratch here.
L(n2):	mov	(mp_param), %rax
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	8(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 3.  One pass per trip round L(n3), three trips in all (j starts at n).
C The code after the loop adds the untouched high limbs to the values left in
C rbp, r10 and r11 by the last pass and stores the three result limbs to rp.
	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-24(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	-8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -16(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -8(up)
	mov	%r11, -24(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

	mov	-48(up), %rdx
	mov	-40(up), %rbx
	xor	R32(%rax), R32(%rax)
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	-8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 4.  Straight-line pass with fixed offsets, repeated j times, then the
C common mpn_add_n tail.  dec sets the flags tested by jnz; the lea between
C them leaves flags untouched.
L(n4):	mov	-32(mp), %rax
	mul	q0
	mov	%rax, %r11
	mov	-24(mp), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	-16(mp), %rax
	mov	%rdx, %r9
	mul	q0
	add	-32(up), %r11
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	-8(mp), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
	add	-24(up), %rbx
	mov	%rbx, -24(up)
	adc	%r9, %r11
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	add	%r11, -16(up)
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, -8(up)
	adc	$0, %rdx
	mov	%rdx, -32(up)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	dec	j
	lea	8(up), up		C up++
	jnz	L(n4)
	jmp	L(cj)
EPILOGUE()
ASM_END()