github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/mul_basecase.asm

dnl  AMD64 mpn_mul_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.

dnl  Copyright 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C AMD K8,K9	 2.375
C AMD K10	 2.375
C Intel P4	15-16
C Intel core2	 4.45
C Intel corei	 4.35
C Intel atom	 ?
C VIA nano	 4.5

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Use fewer registers.  (how??? I can't see it -- david)
C  * Avoid some "mov $0,r" and instead use "xor r,r".
C  * Can the top of each L(addmul_outer_n) prologue be folded into the
C    mul_1/mul_2 prologues, saving a LEA (%rip)?  It would slow down the
C    case where vn = 1 or 2; is it worth it?
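
C For reference, the operation implemented here is the schoolbook product
C {rp, un+vn} = {up, un} * {vp, vn}, with un >= vn >= 1 and rp disjoint
C from both inputs.  A minimal C model of the same result built from the
C public mpn primitives (a sketch for orientation only, not part of this
C file's build; the ref_mul_basecase name is ours):
C
C	#include <gmp.h>
C
C	static void
C	ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
C			  const mp_limb_t *vp, mp_size_t vn)
C	{
C	  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);	/* first row writes rp */
C	  for (mp_size_t i = 1; i < vn; i++)		/* later rows accumulate */
C	    rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C	}
C
C The code below produces the same rows, but peels vp[0] (or vp[0] and
C vp[1]) with mul_1/mul_2 and then folds the remaining rows in pairs via
C a 4-way unrolled addmul_2 loop.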
C INPUT PARAMETERS
define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
define(`vp',       `%rcx')
define(`vn',       `%r8')

define(`v0', `%r12')
define(`v1', `%r9')

define(`w0', `%rbx')
define(`w1', `%r15')
define(`w2', `%rbp')
define(`w3', `%r10')

define(`n',          `%r11')
define(`outer_addr', `%r14')
define(`un',         `%r13')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	xor	R32(un), R32(un)
	mov	(up), %rax
	mov	(vp), v0

	sub	un_param, un		C rdx used by mul
	mov	un, n
	mov	R32(un_param), R32(w0)

	lea	(rp,un_param,8), rp
	lea	(up,un_param,8), up

	mul	v0

	test	$1, R8(vn)
	jz	L(mul_2)

C ===========================================================
C     mul_1 for vp[0] if vn is odd

L(mul_1):
	and	$3, R32(w0)
	jz	L(mul_1_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_1_prologue_1)
	jz	L(mul_1_prologue_2)

L(mul_1_prologue_3):
	add	$-1, n
	lea	L(addmul_outer_3)(%rip), outer_addr
	mov	%rax, w3
	mov	%rdx, w0
	jmp	L(mul_1_entry_3)

L(mul_1_prologue_0):
	mov	%rax, w2
	mov	%rdx, w3		C note: already w0 == 0
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_1_entry_0)

L(mul_1_prologue_1):
	cmp	$-1, un
	jne	2f
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	jmp	L(ret)
2:	add	$1, n
	lea	L(addmul_outer_1)(%rip), outer_addr
	mov	%rax, w1
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	mov	(up,n,8), %rax
	jmp	L(mul_1_entry_1)

L(mul_1_prologue_2):
	add	$-2, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	%rax, w0
	mov	%rdx, w1
	mov	24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	xor	R32(w3), R32(w3)
	jmp	L(mul_1_entry_2)


C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
L(mul_1_entry_1):
	xor	R32(w0), R32(w0)
	mul	v0
	mov	w1, -8(rp,n,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,n,8), %rax
	mul	v0
	mov	w2, (rp,n,8)
	add	%rax, w3
	adc	%rdx, w0
L(mul_1_entry_3):
	mov	16(up,n,8), %rax
	mul	v0
	mov	w3, 8(rp,n,8)
	xor	R32(w2), R32(w2)	C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,n,8), %rax
	mov	w2, w1			C zero
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, n
	js	L(mul_1_top)

	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	adc	%rdx, w2
	mov	w2, (rp)

	add	$-1, vn			C vn -= 1
	jz	L(ret)

	mov	8(vp), v0
	mov	16(vp), v1

	lea	8(vp), vp		C vp += 1
	lea	8(rp), rp		C rp += 1

	jmp	*outer_addr
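
C Both mul_1 above and mul_2 below choose their entry into the 4-way
C unrolled loops from un mod 4; the compare-and-branch chain after
C "and $3, R32(w0)" amounts to (a C sketch; mul_2 orders the branches
C slightly differently but the mapping is identical):
C
C	switch (un_param & 3)
C	  {
C	  case 0: goto prologue_0;	/* jz  */
C	  case 1: goto prologue_1;	/* jc: w0 < 2 and w0 != 0 */
C	  case 2: goto prologue_2;	/* jz after cmp $2 */
C	  case 3: goto prologue_3;	/* falls through */
C	  }
C
C Each prologue also loads outer_addr with the matching L(addmul_outer_*)
C address, so the "jmp *outer_addr" above re-enters the addmul_2 code at
C the correct phase without re-dispatching on every outer iteration.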
C ===========================================================
C     mul_2 for vp[0], vp[1] if vn is even

	ALIGN(16)
L(mul_2):
	mov	8(vp), v1

	and	$3, R32(w0)
	jz	L(mul_2_prologue_0)
	cmp	$2, R32(w0)
	jz	L(mul_2_prologue_2)
	jc	L(mul_2_prologue_1)

L(mul_2_prologue_3):
	lea	L(addmul_outer_3)(%rip), outer_addr
	add	$2, n
	mov	%rax, -16(rp,n,8)
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	mov	-16(up,n,8), %rax
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_prologue_0):
	add	$3, n
	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	-24(up,n,8), %rax
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_2_entry_0)

	ALIGN(16)
L(mul_2_prologue_1):
	mov	%rax, w3
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_2):
	add	$1, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	$0, R32(w0)
	mov	$0, R32(w1)
	mov	%rax, w2
	mov	-8(up,n,8), %rax
	mov	%rdx, w3
	jmp	L(mul_2_entry_2)

C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_2_top):
	mov	-32(up,n,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
L(mul_2_entry_0):
	mul	v1
	add	%rax, w1
	mov	w0, -24(rp,n,8)
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(rp,n,8)
L(mul_2_entry_3):
	mul	v1
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	adc	R32(w1), R32(w0)	C adc $0, w0
L(mul_2_entry_2):
	mul	v1
	add	%rax, w3
	mov	w2, -8(rp,n,8)
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
L(mul_2_entry_1):
	add	$4, n
	mov	w3, -32(rp,n,8)
	js	L(mul_2_top)

	mov	-32(up,n,8), %rax	C FIXME: n is constant
	mul	v1
	add	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	mov	16(vp), v0
	mov	24(vp), v1

	lea	16(vp), vp		C vp += 2
	lea	16(rp), rp		C rp += 2

	jmp	*outer_addr


C ===========================================================
C     addmul_2 for remaining vp's

C in the following prologues, we reuse un to store the
C adjusted value of n that is reloaded on each iteration

L(addmul_outer_0):
	add	$3, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-24(up,un,8), %rax
	mul	v0
	mov	%rax, w0
	mov	-24(up,un,8), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(addmul_entry_0)

L(addmul_outer_1):
	mov	un, n
	mov	(up,un,8), %rax
	mul	v0
	mov	%rax, w3
	mov	(up,un,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	jmp	L(addmul_entry_1)

L(addmul_outer_2):
	add	$1, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-8(up,un,8), %rax
	mul	v0
	xor	R32(w0), R32(w0)
	mov	%rax, w2
	xor	R32(w1), R32(w1)
	mov	%rdx, w3
	mov	-8(up,un,8), %rax
	jmp	L(addmul_entry_2)

L(addmul_outer_3):
	add	$2, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-16(up,un,8), %rax
	xor	R32(w3), R32(w3)
	mul	v0
	mov	%rax, w1
	mov	-16(up,un,8), %rax
	mov	%rdx, w2
	jmp	L(addmul_entry_3)
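
C Each pass through the loop below folds two more rows of the product into
C one pipeline.  Its effect matches two chained mpn_addmul_1 calls (a C
C sketch, valid because rp[n] and rp[n+1] are still unwritten at this
C point in the basecase; the ref_addmul_2 name is ours):
C
C	static void
C	ref_addmul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
C		      const mp_limb_t *vp)
C	{
C	  rp[n]     = mpn_addmul_1 (rp,     up, n, vp[0]);	/* v0 row */
C	  rp[n + 1] = mpn_addmul_1 (rp + 1, up, n, vp[1]);	/* v1 row, one limb up */
C	}
C
C Fusing the rows means each rp[] limb is read and written once per pair
C of rows instead of twice, with the running window held in w0..w3.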
C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(addmul_top):
	add	w3, -32(rp,n,8)
	adc	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	R32(w2), R32(w2)	C adc $0, w2
L(addmul_entry_0):
	mul	v1
	xor	R32(w3), R32(w3)
	add	w0, -24(rp,n,8)
	adc	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(addmul_entry_3):
	mul	v1
	add	w1, -16(rp,n,8)
	adc	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	xor	R32(w0), R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	-8(up,n,8), %rax
	adc	R32(w1), R32(w0)	C adc $0, w0
L(addmul_entry_2):
	mul	v1
	add	w2, -8(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	mov	(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(addmul_entry_1):
	mul	v1
	add	$4, n
	js	L(addmul_top)

	add	w3, -8(rp)
	adc	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	lea	16(rp), rp		C rp += 2
	lea	16(vp), vp		C vp += 2

	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

	ALIGN(16)
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()
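
C Usage note: mpn_mul_basecase is an internal GMP entry point, declared in
C gmp-impl.h rather than the public gmp.h, and is what mpn_mul falls back
C to for small operands.  A direct call, assuming the internal prototype
C void mpn_mul_basecase (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t),
C would look like:
C
C	mp_limb_t r[5 + 3], u[5] = {1, 2, 3, 4, 5}, v[3] = {6, 7, 8};
C	mpn_mul_basecase (r, u, 5, v, 3);	/* fills all 5+3 limbs of r */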