github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bobcat/mul_basecase.asm

dnl  AMD64 mpn_mul_basecase optimised for AMD bobcat.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C AMD K8,K9          4.5
C AMD K10            4.5
C AMD bd1            4.75
C AMD bobcat         5
C Intel P4          17.7
C Intel core2        5.5
C Intel NHM          5.43
C Intel SBR          3.92
C Intel atom        23
C VIA nano           5.63

C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
C the same way for all n, then it splits into 4 different wind-down blocks and
C 4 separate addmul_1 loops.
C
C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations.  Doing that could save
C substantial code volume, but would also probably add some overhead.

C TODO
C  * Tune un < 3 code.
C  * Fix slowdown for un=vn=3 (67->71) compared to default code.
C  * This is 1263 bytes, compared to 1099 bytes for default code.  Consider
C    combining addmul loops like that code.  Tolerable slowdown?
C  * Lots of space could be saved by replacing the "switch" code by gradual
C    jumps out from mul_1 winddown code, perhaps with no added overhead.
C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.
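
C The comments above describe the routine in terms of mul_1 and addmul_1.  As
C a rough orientation sketch only (this file inlines everything and calls
C neither function; mpn_mul_1 and mpn_addmul_1 simply name the usual GMP
C limb-vector primitives, and the usual mpn_mul_basecase contract of
C un >= vn >= 1 with room for un+vn result limbs at rp is assumed), the
C schedule amounts to:
C
C   mpn_mul_1 (rp, up, un, vp[0]);             /* first row, rp[0..un] */
C   for (i = 1; i < vn; i++)
C     mpn_addmul_1 (rp + i, up, un, vp[i]);    /* accumulate row i */
C
C The mul_1 pass below is entered the same way for every operand size; the
C switch on n after it then selects one of the four addmul_1 loops.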

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Standard parameters
define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
define(`vp',       `%rcx')
define(`vn',       `%r8')
C Standard allocations
define(`un',       `%rbx')
define(`w0',       `%r10')
define(`w1',       `%r11')
define(`w2',       `%r12')
define(`w3',       `%r13')
define(`n',        `%rbp')
define(`v0',       `%r9')

C Temp macro for allowing control over indexing.
C Define to return $1 for more conservative ptr handling.
define(`X',`$2')


ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_mul_basecase)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8d  ')

        mov     (up), %rax
        mov     (vp), v0

        cmp     $2, un_param
        ja      L(ge3)
        jz      L(u2)

        mul     v0                      C u0 x v0
        mov     %rax, (rp)
        mov     %rdx, 8(rp)
        FUNC_EXIT()
        ret

L(u2):  mul     v0                      C u0 x v0
        mov     %rax, (rp)
        mov     8(up), %rax
        mov     %rdx, w0
        mul     v0
        add     %rax, w0
        mov     %rdx, w1
        adc     $0, w1
        cmp     $1, R32(vn)
        jnz     L(u2v2)
        mov     w0, 8(rp)
        mov     w1, 16(rp)
        FUNC_EXIT()
        ret

L(u2v2):mov     8(vp), v0
        mov     (up), %rax
        mul     v0
        add     %rax, w0
        mov     w0, 8(rp)
        mov     %rdx, %r8               C CAUTION: r8 realloc
        adc     $0, %r8
        mov     8(up), %rax
        mul     v0
        add     w1, %r8
        adc     $0, %rdx
        add     %r8, %rax
        adc     $0, %rdx
        mov     %rax, 16(rp)
        mov     %rdx, 24(rp)
        FUNC_EXIT()
        ret


L(ge3): push    %rbx
        push    %rbp
        push    %r12
        push    %r13

        lea     8(vp), vp

        lea     -24(rp,un_param,8), rp
        lea     -24(up,un_param,8), up
        xor     R32(un), R32(un)
        mov     $2, R32(n)
        sub     un_param, un
        sub     un_param, n

        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        jmp     L(L3)

        ALIGN(16)
L(top): mov     w0, -16(rp,n,8)
        add     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, -8(rp,n,8)
        add     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        mov     w0, (rp,n,8)
        add     w1, w2
        adc     $0, w3
L(L3):  mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, 8(rp,n,8)
        add     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(top)

        mov     w0, -16(rp,n,8)
        add     w1, w2
        adc     $0, w3

C Switch on n into right addmul_l loop
        test    n, n
        jz      L(r2)
        cmp     $2, R32(n)
        ja      L(r3)
        jz      L(r0)
        jmp     L(r1)


L(r3):  mov     w2, X(-8(rp,n,8),16(rp))
        mov     w3, X((rp,n,8),24(rp))
        add     $2, un

C outer loop(3)
L(to3): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     8(up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        jmp     L(al3)

        ALIGN(16)
L(ta3): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
L(al3): mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta3)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to3)


L(r2):  mov     X(0(up,n,8),(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(-8(rp,n,8),-8(rp))
        add     w3, w0
        adc     $0, w1
        mov     X(8(up,n,8),8(up)), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        mov     w0, X((rp,n,8),(rp))
        add     w1, w2
        adc     $0, w3
        mov     X(16(up,n,8),16(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(8(rp,n,8),8(rp))
        add     w3, w0
        adc     $0, w1
        mov     w0, X(16(rp,n,8),16(rp))
        adc     $0, w3
        mov     w1, X(24(rp,n,8),24(rp))
        inc     un

C outer loop(2)
L(to2): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     16(up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        jmp     L(al2)

        ALIGN(16)
L(ta2): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
L(al2): mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta2)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to2)


L(r1):  mov     X(0(up,n,8),8(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(-8(rp,n,8),(rp))
        add     w3, w0
        adc     $0, w1
        mov     X(8(up,n,8),16(up)), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        mov     w0, X((rp,n,8),8(rp))
        add     w1, w2
        adc     $0, w3
        mov     w2, X(8(rp,n,8),16(rp))
        mov     w3, X(16(rp,n,8),24(rp))
        add     $4, un

C outer loop(1)
L(to1): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     -8(up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        jmp     L(al1)

        ALIGN(16)
L(ta1): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
L(al1): mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta1)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to1)


L(r0):  mov     X((up,n,8),16(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(-8(rp,n,8),8(rp))
        add     w3, w0
        adc     $0, w1
        mov     w0, X((rp,n,8),16(rp))
        mov     w1, X(8(rp,n,8),24(rp))
        add     $3, un

C outer loop(0)
L(to0): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     (up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        jmp     L(al0)

        ALIGN(16)
L(ta0): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
L(al0): mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta0)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to0)


L(ret): pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()