dnl  X86-64 mpn_redc_1 optimised for AMD bobcat.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 5.0
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.

C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C Arguments, as bound to registers by the defines below.  The trailing
C comment on each define is presumably the DOS64 location of the same
C argument before FUNC_ENTRY remaps it (FUNC_ENTRY is defined elsewhere).
C   rp        destination pointer, stored to by the n1/n2/n3 code and handed
C             to mpn_add_n as its first argument on the general path
C   up        source limbs, updated in place by the outer loops
C   mp_param  limbs that get multiplied by each q limb
C   n         limb count, also the number of outer loop passes (copied to j)
C   u0inv     factor multiplied (low 64 bits, imul) into a limb of up to
C             form each q limb
C
C Outline:
C   1. q0 = up[0] * u0inv.  mp and up are then advanced by n limbs and n is
C      negated, so (mp,n,8) and (up,n,8) address limb 0 and the inner index i
C      counts upward until its sign bit clears.
C   2. The low two bits of the negated n select one of four outer loops
C      (otp0..otp3).  They differ in the starting value of i and in the
C      entry point (e0..e3) into the 4-way unrolled inner loop.  n equal to
C      1, 2 or 3 is diverted to the straight-line code at n1, n2, n3.
C   3. One outer pass adds q0 times the mp limbs into up, stores the last
C      carry limb at (up,n,8), takes the q limb that was computed early in
C      rbx as the next q0, and steps up forward by one limb.
C   4. After j passes, L(cj) calls mpn_add_n.
C
C Return value: on the n1/n2/n3 paths eax holds the carry of the final adc
C (0 or 1).  On the general path nothing in this file writes rax between the
C CALL and ret, so the mpn_add_n result is what is returned, assuming
C FUNC_EXIT leaves rax alone.
C
C rbx, rbp and r12-r15 are pushed on entry and popped at L(ret).

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')
define(`w0',          `%rbp')
define(`w1',          `%r9')
define(`w2',          `%r10')
define(`w3',          `%r11')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp
	lea	(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

C Dispatch on the low two bits of the negated count.
	test	$1, R8(n)
	jz	L(bx0)

L(bx1):	test	$2, R8(n)
	jz	L(b3)

L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

L(otp1):lea	1(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w1
	add	(up,n,8), w2
	adc	w3, %rbx
	adc	$0, w1
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w1, w2
	adc	$0, w3
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

	ALIGNx
L(tp1):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
L(e1):	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp1)

L(ed1):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

L(otp3):lea	3(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w1
	add	(up,n,8), w2
	adc	w3, %rbx
	adc	$0, w1
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w1, w2
	adc	$0, w3
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

	ALIGNx
L(tp3):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
L(e3):	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp3)

L(ed3):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

C Common tail of the four outer loops: set up the mpn_add_n arguments as
C described by the param comments, call it, then fall into the shared
C register-restore sequence at L(ret).
L(cj):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C Stack adjustment around the call.  Presumably the 8 bytes pad the six
C pushes to a 16-byte aligned call site, and the DOS64 figure adds the
C 32-byte shadow area on top (FUNC_ENTRY stack usage is not visible here).
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(bx0):	test	$2, R8(n)
	jnz	L(b2)

L(b0):
L(otp0):lea	(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w3
	add	(up,n,8), w0
	adc	w1, %rbx
	adc	$0, w3
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w3, w0
	adc	$0, w1
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
L(e0):	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp0)

L(ed0):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

L(otp2):lea	2(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w3
	add	(up,n,8), w0
	adc	w1, %rbx
	adc	$0, w3
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w3, w0
	adc	$0, w1
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
L(tp2):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
L(e2):	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp2)

L(ed2):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

C n = 1.  mp_param (rdx) is still intact here since no mul has run yet.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-8(up), %rax
	adc	(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)		C mov, not xor, so the carry of the adc above survives
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 2, both passes written out straight-line.
L(n2):	mov	(mp_param), %rax
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	8(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 3.  L(n3) is itself the loop head and runs j times before the final
C add into rp.
	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-24(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	-8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -16(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -8(up)
	mov	%r11, -24(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

	mov	-48(up), %rdx
	mov	-40(up), %rbx
	xor	R32(%rax), R32(%rax)
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	-8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
EPILOGUE()
ASM_END()