dnl  AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_1		mul_2		mul_3		addmul_2
C AMD K8,K9
C AMD K10
C AMD bull	~4.8		~4.55		-		~4.3
C AMD pile	~4.6		~4.55		-		~4.55
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Merge bull-specific mul_1, if it is not slower in the TOOM22 range.
C    Alternatively, we could tweak the present code (which was loopmixed for a
C    different CPU).
C  * Merge faster mul_2, such as the one in the same directory as this file.
C  * Further micro-optimise.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.  With $1 the wind-down stores use the constant
C (first) operand of each I(,) pair; with $2 they use the indexed form.
define(`I',`$1')

C Register roles.  The first five are the SysV AMD64 argument registers;
C on DOS64 the fifth argument (vn) is fetched from the stack by the
C IFDOS line after FUNC_ENTRY below.
define(`rp',       `%rdi')	C result area
define(`up',       `%rsi')	C first source operand
define(`un_param', `%rdx')	C limb count of {up,un}; copied away so rdx is free for mul
define(`vp',       `%rcx')	C second source operand
define(`vn',       `%r8')	C limb count of {vp,vn} (NOTE(review): standard
				C basecase contract is 1 <= vn <= un — confirm)

define(`un', `%rbx')		C negated un_param, used as an up-counting index

define(`w0', `%r10')		C w0-w3: rotating window of partial products
define(`w1', `%r11')
define(`w2', `%r12')
define(`w3', `%r13')
define(`n',  `%rbp')		C inner-loop index, negative, counts up towards 0
define(`v0', `%r9')		C current v limb (v1 is defined per-section below)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	mov	un_param, un		C free up rdx
	neg	un			C index from operand end upwards

	mov	(up), %rax		C shared for mul_1 and mul_2
	lea	(up,un_param,8), up	C point at operand end
	lea	(rp,un_param,8), rp	C point at rp[un-1]

	mov	(vp), v0		C shared for mul_1 and mul_2
	mul	v0			C shared for mul_1 and mul_2

	test	$1, R8(vn)
	jz	L(do_mul_2)		C even vn: start with a mul_2 pass

C Odd vn: one rp = up * v0 (mul_1) pass, then addmul_2 for the rest.
C Dispatch on un mod 4 to the matching entry point of the unrolled loop.
L(do_mul_1):
	test	$1, R8(un)
	jnz	L(m1x1)

L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
	mov	%rdx, w1
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jnz	L(m110)

L(m100):lea	2(un), n		C un = 4, 8, 12, ...
	jmp	L(m1l0)

L(m110):lea	(un), n			C un = 2, 6, 10, ...
	jmp	L(m1l2)

L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
	mov	%rdx, w0
	test	$2, R8(un)
	jz	L(m111)

L(m101):lea	3(un), n		C un = 1, 5, 9, ...
	test	n, n
	js	L(m1l1)
	mov	%rax, -8(rp)		C un = 1: store the lone product and return
	mov	%rdx, (rp)		C (presumably vn = 1 here, as vn <= un)
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(m111):lea	1(un), n		C un = 3, 7, 11, ...
	mov	8(up,un,8), %rax
	jmp	L(m1l3)

C 4-way unrolled mul_1 loop; m1l0..m1l3 are the un mod 4 entry points.
C Carry propagates through w0/w1 via the add/adc pairs; `add $4, n`
C sets CF=0 on all but the final iteration, so jnc loops until n wraps.
	ALIGN(16)
L(m1tp):mov	%rdx, w0
	add	%rax, w1
L(m1l1):mov	-16(up,n,8), %rax
	adc	$0, w0
	mul	v0
	add	%rax, w0
	mov	w1, -24(rp,n,8)
	mov	-8(up,n,8), %rax
	mov	%rdx, w1
	adc	$0, w1
L(m1l0):mul	v0
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	%rdx, w0
	mov	(up,n,8), %rax
	adc	$0, w0
L(m1l3):mul	v0
	mov	w1, -8(rp,n,8)
	mov	%rdx, w1
	add	%rax, w0
	mov	8(up,n,8), %rax
	adc	$0, w1
L(m1l2):mul	v0
	mov	w0, (rp,n,8)
	add	$4, n
	jnc	L(m1tp)

L(m1ed):add	%rax, w1
	adc	$0, %rdx
	mov	w1, I(-8(rp),-24(rp,n,8))
	mov	%rdx, I((rp),-16(rp,n,8))

	dec	R32(vn)
	jz	L(ret2)			C vn was 1: all done

	lea	8(vp), vp		C step past the consumed v limb
	lea	8(rp), rp
	push	%r12			C addmul section needs w2/w3/X0
	push	%r13
	push	%r14
	jmp	L(do_addmul)

C Even vn: one rp = up * {v0,v1} (mul_2) pass, then addmul_2 for the rest.
L(do_mul_2):
define(`v1', `%r14')
	push	%r12
	push	%r13
	push	%r14

	mov	8(vp), v1

	test	$1, R8(un)
	jnz	L(m2b1)

L(m2b0):lea	(un), n
	mov	%rax, w2		C 0
	mov	(up,un,8), %rax
	mov	%rdx, w1		C 1
	mul	v1
	mov	%rax, w0		C 1
	mov	w2, (rp,un,8)		C 0
	mov	8(up,un,8), %rax
	mov	%rdx, w2		C 2
	jmp	L(m2l0)

L(m2b1):lea	1(un), n
	mov	%rax, w0		C 1
	mov	%rdx, w3		C 2
	mov	(up,un,8), %rax
	mul	v1
	mov	w0, (rp,un,8)		C 1
	mov	%rdx, w0		C 3
	mov	%rax, w2		C 0
	mov	8(up,un,8), %rax
	jmp	L(m2l1)

C 2-way unrolled mul_2 loop; m2l0/m2l1 are the un mod 2 entry points.
C The `C k` annotations track each value's limb offset relative to rp[n].
	ALIGN(32)
L(m2tp):add	%rax, w2		C 0
	mov	(up,n,8), %rax
	adc	$0, w0			C 1
L(m2l1):mul	v0
	add	%rax, w2		C 0
	mov	(up,n,8), %rax
	mov	%rdx, w1		C 1
	adc	$0, w1			C 1
	mul	v1
	add	w3, w2			C 0
	adc	$0, w1			C 1
	add	%rax, w0		C 1
	mov	w2, (rp,n,8)		C 0
	mov	8(up,n,8), %rax
	mov	%rdx, w2		C 2
	adc	$0, w2			C 2
L(m2l0):mul	v0
	add	%rax, w0		C 1
	mov	%rdx, w3		C 2
	adc	$0, w3			C 2
	add	w1, w0			C 1
	adc	$0, w3			C 2
	mov	8(up,n,8), %rax
	mul	v1
	add	$2, n
	mov	w0, -8(rp,n,8)		C 1
	mov	%rdx, w0		C 3
	jnc	L(m2tp)

L(m2ed):add	%rax, w2
	adc	$0, %rdx
	add	w3, w2
	adc	$0, %rdx
	mov	w2, I((rp),(rp,n,8))
	mov	%rdx, I(8(rp),8(rp,n,8))

	add	$-2, R32(vn)
	jz	L(ret5)			C vn was 2: all done

	lea	16(vp), vp		C step past the two consumed v limbs
	lea	16(rp), rp

C Main addmul_2 section: each L(outer) iteration adds up * {v0,v1} into rp,
C consuming two v limbs.  vn moves to a stack slot so %r8 can serve as v1.
L(do_addmul):
	push	%r15
	push	vn			C save vn in new stack slot
define(`vn', `(%rsp)')
define(`X0', `%r14')
define(`X1', `%r15')
define(`v1', `%r8')

L(outer):
	mov	(vp), v0
	mov	8(vp), v1

	mov	(up,un,8), %rax
	mul	v0

	test	$1, R8(un)
	jnz	L(bx1)

L(bx0):	mov	%rax, X1
	mov	(up,un,8), %rax
	mov	%rdx, X0
	mul	v1
	test	$2, R8(un)
	jnz	L(b10)

L(b00):	lea	(un), n			C un = 4, 8, 12, ...
	mov	(rp,un,8), w3
	mov	%rax, w0
	mov	8(up,un,8), %rax
	mov	%rdx, w1
	jmp	L(lo0)

L(b10):	lea	2(un), n		C un = 2, 6, 10, ...
	mov	(rp,un,8), w1
	mov	%rdx, w3
	mov	%rax, w2
	mov	8(up,un,8), %rax
	jmp	L(lo2)

L(bx1):	mov	%rax, X0
	mov	(up,un,8), %rax
	mov	%rdx, X1
	mul	v1
	test	$2, R8(un)
	jz	L(b11)

L(b01):	lea	1(un), n		C un = 1, 5, 9, ...
	mov	(rp,un,8), w2
	mov	%rdx, w0
	mov	%rax, w3
	jmp	L(lo1)

L(b11):	lea	-1(un), n		C un = 3, 7, 11, ...
	mov	(rp,un,8), w0
	mov	%rax, w1
	mov	8(up,un,8), %rax
	mov	%rdx, w2
	jmp	L(lo3)

C 4-way unrolled addmul_2 inner loop; lo0..lo3 are the un mod 4 entries.
C X0/X1 accumulate the v0 column while w0-w3 rotate the v1 column and the
C rp limbs being added in; `add $4, n` + jnc terminates as in mul_1.
	ALIGN(32)
L(top):
L(lo2):	mul	v0
	add	w1, X1
	mov	X1, -16(rp,n,8)
	mov	%rdx, X1
	adc	%rax, X0
	adc	$0, X1
	mov	-8(up,n,8), %rax
	mul	v1
	mov	-8(rp,n,8), w1
	mov	%rdx, w0
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo1):	mov	(up,n,8), %rax
	mul	v0
	add	w2, X0
	mov	X0, -8(rp,n,8)
	mov	%rdx, X0
	adc	%rax, X1
	mov	(up,n,8), %rax
	adc	$0, X0
	mov	(rp,n,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	mov	8(up,n,8), %rax
	mov	%rdx, w1
	adc	$0, w1
L(lo0):	mul	v0
	add	w3, X1
	mov	X1, (rp,n,8)
	adc	%rax, X0
	mov	8(up,n,8), %rax
	mov	%rdx, X1
	adc	$0, X1
	mov	8(rp,n,8), w3
	mul	v1
	add	w3, w0
	adc	%rax, w1
	mov	16(up,n,8), %rax
	mov	%rdx, w2
	adc	$0, w2
L(lo3):	mul	v0
	add	w0, X0
	mov	X0, 8(rp,n,8)
	mov	%rdx, X0
	adc	%rax, X1
	adc	$0, X0
	mov	16(up,n,8), %rax
	mov	16(rp,n,8), w0
	mul	v1
	mov	%rdx, w3
	add	w0, w1
	adc	%rax, w2
	adc	$0, w3
	mov	24(up,n,8), %rax
	add	$4, n
	jnc	L(top)

C Wind-down: fold the last two products and the straggling carries into the
C top three result limbs.  I(a,b) picks constant vs indexed addressing.
L(end):	mul	v0
	add	w1, X1
	mov	X1, I(-16(rp),-16(rp,n,8))
	mov	%rdx, X1
	adc	%rax, X0
	adc	$0, X1
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	mov	I(-8(rp),-8(rp,n,8)), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, %rdx
	add	w2, X0
	adc	$0, X1
	mov	X0, I(-8(rp),-8(rp,n,8))
	add	w3, X1
	mov	X1, I((rp),(rp,n,8))
	adc	$0, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))

	addl	$-2, vn			C vn lives in the stack slot here
	lea	16(vp), vp
	lea	16(rp), rp
	jnz	L(outer)

	pop	%rax			C deallocate vn slot
	pop	%r15
L(ret5):pop	%r14
	pop	%r13
	pop	%r12
L(ret2):pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()