dnl  Path: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/mulmid_basecase.asm

dnl  AMD64 mpn_mulmid_basecase

dnl  Contributed by David Harvey.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.375  (2.5 when un - vn is "small")
C K10:		 ?
C P4:		 ?
C P6-15:	 ?

C ===========================================================
C mpn_mulmid_basecase (rp, up, un_param, vp_param, vn)
C
C Syntax: AT&T, preprocessed by m4 through config.m4.
C ABIs:   STD64 and DOS64 (see ABI_SUPPORT below).  Under DOS64 the
C         fifth argument is fetched from the stack by the IFDOS line.
C
C Overview of what the code below does:
C   * un is recomputed as the row length un_param - vn + 1, and rp is
C     advanced by that many limbs, so output limbs are addressed with
C     negative indices and the two topmost limbs are (rp) and 8(rp).
C   * Row length < 4: L(diagonal) accumulates each output limb as a
C     sum of up[i] * vp[j] products in the 3-limb accumulator
C     w2:w1:w0.
C   * Otherwise the first pass is L(mul_1) (vn odd, uses vp[0]) or
C     L(mul_2) (vn even, uses vp[0] and vp[1]); both store to rp.
C     Every later pass is addmul_2, which consumes two more limbs of
C     vp and adds into rp.
C   * outer_addr holds the entry point that each later pass jumps to
C     (jmp *outer_addr).  It is chosen once, from the row length
C     mod 4, because all inner loops are unrolled four ways.
C
C Carry handling: inside add/adc chains registers are cleared with
C "mov $0" and not "xor", because mov leaves the carry flag intact.
C
C Saved and restored: rbx, rbp, r12, r13, r14, r15.
C ===========================================================

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`un_param',`%rdx')
define(`vp_param',`%rcx')
define(`vn',      `%r8')

C v0, v1: the one or two limbs of vp used by the current pass
define(`v0', `%r12')
define(`v1', `%r9')

C w0..w3: rotating product/carry accumulators.
C w1 shares %rcx with vp_param, so vp_param is copied to vp on entry.
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')

define(`n',          `%r11')
define(`outer_addr', `%r14')
define(`un',         `%r13')
define(`vp',         `%r15')

C vp_inner shares %r10 with w3; it is only used in the diagonal code,
C which does not use w3.
define(`vp_inner', `%r10')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mulmid_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	vp_param, vp		C free %rcx for use as w1

	C use un for row length (= un_param - vn + 1)
	lea	1(un_param), un
	sub	vn, un

	lea	(rp,un,8), rp		C rp now points just past the row

	cmp	$4, un			C TODO: needs tuning
	jc	L(diagonal)		C row length < 4 (unsigned)

	lea	(up,un_param,8), up

	test	$1, vn
	jz	L(mul_2)

C ===========================================================
C     mul_1 for vp[0] if vn is odd

L(mul_1):
	mov	R32(un), R32(w0)	C keep row length; low 2 bits pick the entry

	neg	un
	mov	(up,un,8), %rax
	mov	(vp), v0
	mul	v0

	and	$-4, un			C round down to multiple of 4
	mov	un, n

	and	$3, R32(w0)
	jz	L(mul_1_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_1_prologue_1)
	jz	L(mul_1_prologue_2)

L(mul_1_prologue_3):
	mov	%rax, w3
	mov	%rdx, w0
	lea	L(addmul_prologue_3)(%rip), outer_addr
	jmp	L(mul_1_entry_3)

	ALIGN(16)
L(mul_1_prologue_0):
	mov	%rax, w2
	mov	%rdx, w3		C note already w0 == 0
	lea	L(addmul_prologue_0)(%rip), outer_addr
	jmp	L(mul_1_entry_0)

	ALIGN(16)
L(mul_1_prologue_1):
	add	$4, n
	mov	%rax, w1
	mov	%rdx, w2
	mov	$0, R32(w3)
	mov	(up,n,8), %rax
	lea	L(addmul_prologue_1)(%rip), outer_addr
	jmp	L(mul_1_entry_1)

	ALIGN(16)
L(mul_1_prologue_2):
	mov	%rax, w0
	mov	%rdx, w1
	mov	24(up,n,8), %rax
	mov	$0, R32(w2)
	mov	$0, R32(w3)
	lea	L(addmul_prologue_2)(%rip), outer_addr
	jmp	L(mul_1_entry_2)


	C this loop is 10 c/loop = 2.5 c/l on K8

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
L(mul_1_entry_1):
	mov	$0, R32(w0)
	mul	v0
	mov	w1, -8(rp,n,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,n,8), %rax
	mul	v0
	mov	w2, (rp,n,8)
	add	%rax, w3
	adc	%rdx, w0
L(mul_1_entry_3):
	mov	16(up,n,8), %rax
	mul	v0
	mov	w3, 8(rp,n,8)
	mov	$0, R32(w2)		C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,n,8), %rax
	mov	w2, w1			C zero
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, n
	js	L(mul_1_top)		C loop while n is still negative

	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	mov	w2, 8(rp)		C zero last limb of output
	adc	%rdx, w2		C mov above kept the carry from the add
	mov	w2, (rp)

	dec	vn
	jz	L(ret)

	C set up the first addmul_2 pass: one limb of vp consumed
	lea	-8(up), up
	lea	8(vp), vp

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

C ===========================================================
C     mul_2 for vp[0], vp[1] if vn is even

	ALIGN(16)
L(mul_2):
	mov	R32(un), R32(w0)	C keep row length; low 2 bits pick the entry

	neg	un
	mov	-8(up,un,8), %rax
	mov	(vp), v0
	mov	8(vp), v1
	mul	v1

	and	$-4, un			C round down to multiple of 4
	mov	un, n

	and	$3, R32(w0)
	jz	L(mul_2_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_2_prologue_1)
	jz	L(mul_2_prologue_2)

L(mul_2_prologue_3):
	mov	%rax, w1
	mov	%rdx, w2
	lea	L(addmul_prologue_3)(%rip), outer_addr
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_prologue_0):
	mov	%rax, w0
	mov	%rdx, w1
	lea	L(addmul_prologue_0)(%rip), outer_addr
	jmp	L(mul_2_entry_0)

	ALIGN(16)
L(mul_2_prologue_1):
	mov	%rax, w3
	mov	%rdx, w0
	mov	$0, R32(w1)
	lea	L(addmul_prologue_1)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_2):
	mov	%rax, w2
	mov	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,n,8), %rax
	lea	L(addmul_prologue_2)(%rip), outer_addr
	jmp	L(mul_2_entry_2)


	C this loop is 18 c/loop = 2.25 c/l on K8

	ALIGN(16)
L(mul_2_top):
	mov	-8(up,n,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
L(mul_2_entry_0):
	mov	$0, R32(w2)
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w0
	mov	(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	mov	w0, (rp,n,8)
	adc	%rdx, w2
L(mul_2_entry_3):
	mov	8(up,n,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	$0, R32(w0)		C mov, not xor: carry is still live
	adc	$0, R32(w3)
	mov	8(up,n,8), %rax
	mov	w1, 8(rp,n,8)
	mul	v1
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
L(mul_2_entry_2):
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, 16(rp,n,8)
	adc	%rdx, w0
L(mul_2_entry_1):
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, n
	mov	w3, -8(rp,n,8)		C mov keeps ZF from the add for jnz
	jnz	L(mul_2_top)

	mov	w0, (rp)
	mov	w1, 8(rp)

	sub	$2, vn
	jz	L(ret)

	C set up the next addmul_2 pass: two limbs of vp consumed
	lea	16(vp), vp
	lea	-16(up), up

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

C ===========================================================
C     addmul_2 for remaining vp's

	C Each prologue computes the first product with v1, seeds the
	C accumulators and enters the unrolled loop at the matching label.

	ALIGN(16)
L(addmul_prologue_0):
	mov	-8(up,n,8), %rax
	mul	v1
	mov	%rax, w1
	mov	%rdx, w2
	mov	$0, R32(w3)
	jmp	L(addmul_entry_0)

	ALIGN(16)
L(addmul_prologue_1):
	mov	16(up,n,8), %rax
	mul	v1
	mov	%rax, w0
	mov	%rdx, w1
	mov	$0, R32(w2)
	mov	24(up,n,8), %rax
	jmp	L(addmul_entry_1)

	ALIGN(16)
L(addmul_prologue_2):
	mov	8(up,n,8), %rax
	mul	v1
	mov	%rax, w3
	mov	%rdx, w0
	mov	$0, R32(w1)
	jmp	L(addmul_entry_2)

	ALIGN(16)
L(addmul_prologue_3):
	mov	(up,n,8), %rax
	mul	v1
	mov	%rax, w2
	mov	%rdx, w3
	mov	$0, R32(w0)
	mov	$0, R32(w1)
	jmp	L(addmul_entry_3)

	C this loop is 19 c/loop = 2.375 c/l on K8

	ALIGN(16)
L(addmul_top):
	mov	$0, R32(w3)
	add	%rax, w0
	mov	-8(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1
	add	w0, -8(rp,n,8)
	adc	%rax, w1
	adc	%rdx, w2
L(addmul_entry_0):
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	w1, (rp,n,8)
	mov	$0, R32(w1)		C mov, not xor: carry is still live
	adc	%rax, w2
	mov	$0, R32(w0)		C likewise
	adc	%rdx, w3
L(addmul_entry_3):
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w2
	mov	8(up,n,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	w2, 8(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
L(addmul_entry_2):
	mov	16(up,n,8), %rax
	mul	v0
	add	%rax, w3
	mov	16(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	w3, 16(rp,n,8)
	nop			C don't ask...
	adc	%rax, w0
	mov	$0, R32(w2)
	mov	24(up,n,8), %rax
	adc	%rdx, w1
L(addmul_entry_1):
	mul	v0
	add	$4, n
	jnz	L(addmul_top)

	C fold the last product in and add the top three limbs into rp
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)

	add	w0, -8(rp)
	adc	w1, (rp)
	adc	w2, 8(rp)

	sub	$2, vn
	jz	L(ret)

	C set up the next addmul_2 pass: two more limbs of vp consumed
	lea	16(vp), vp
	lea	-16(up), up

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

C ===========================================================
C     accumulate along diagonals if un - vn is small

	C w2:w1:w0 is a 3-limb accumulator.  One pass of the loop below
	C sums up[vn-1-j] * vp[j] over j, stores the low limb, then the
	C accumulator is shifted down a limb and up advances by one limb.
	C
	C The loop is unrolled four ways.  For vn mod 4 != 0 the prologues
	C bias vp and vn so that entering the loop partway still pairs
	C up[vn-1] with vp[0] and the count in n reaches zero on exit.

	ALIGN(16)
L(diagonal):
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	xor	R32(w2), R32(w2)

	neg	un			C un counts up to zero over the outputs

	mov	R32(vn), %eax
	and	$3, %eax
	jz	L(diag_prologue_0)
	cmp	$2, %eax
	jc	L(diag_prologue_1)
	jz	L(diag_prologue_2)

L(diag_prologue_3):
	lea	-8(vp), vp
	mov	vp, vp_inner
	add	$1, vn
	mov	vn, n
	lea	L(diag_entry_3)(%rip), outer_addr
	jmp	L(diag_entry_3)

L(diag_prologue_0):
	mov	vp, vp_inner
	mov	vn, n
	lea	0(%rip), outer_addr	C address of the next instruction: later
					C passes restart at the load below
	mov	-8(up,n,8), %rax
	jmp	L(diag_entry_0)

L(diag_prologue_1):
	lea	8(vp), vp
	mov	vp, vp_inner
	add	$3, vn
	mov	vn, n
	lea	0(%rip), outer_addr	C later passes restart at the load below
	mov	-8(vp_inner), %rax
	jmp	L(diag_entry_1)

L(diag_prologue_2):
	lea	-16(vp), vp
	mov	vp, vp_inner
	add	$2, vn
	mov	vn, n
	lea	0(%rip), outer_addr	C later passes restart at the load below
	mov	16(vp_inner), %rax
	jmp	L(diag_entry_2)


	C this loop is 10 c/loop = 2.5 c/l on K8

	ALIGN(16)
L(diag_top):
	add	%rax, w0
	adc	%rdx, w1
	mov	-8(up,n,8), %rax
	adc	$0, w2
L(diag_entry_0):
	mulq	(vp_inner)
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, w2
L(diag_entry_3):
	mov	-16(up,n,8), %rax
	mulq	8(vp_inner)
	add	%rax, w0
	mov	16(vp_inner), %rax
	adc	%rdx, w1
	adc	$0, w2
L(diag_entry_2):
	mulq	-24(up,n,8)
	add	%rax, w0
	mov	24(vp_inner), %rax
	adc	%rdx, w1
	lea	32(vp_inner), vp_inner	C lea: advances without touching flags
	adc	$0, w2
L(diag_entry_1):
	mulq	-32(up,n,8)
	sub	$4, n
	jnz	L(diag_top)

	add	%rax, w0
	adc	%rdx, w1
	adc	$0, w2

	mov	w0, (rp,un,8)		C store finished output limb

	inc	un
	jz	L(diag_end)

	C reset the inner-loop state for the next diagonal
	mov	vn, n
	mov	vp, vp_inner

	lea	8(up), up
	mov	w1, w0			C shift accumulator down one limb
	mov	w2, w1
	xor	R32(w2), R32(w2)

	jmp	*outer_addr

L(diag_end):
	mov	w1, (rp)		C two topmost output limbs
	mov	w2, 8(rp)

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()