dnl  AMD64 mpn_mullo_basecase.
dnl
dnl  Path of this copy:
dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/mullo_basecase.asm

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C NOTES
C  * There is a major stupidity in that we call mpn_mul_1 initially, for a
C    large trip count.  Instead, we should start with mul_2 for any operand
C    size congruence class.
C  * Stop iterating addmul_2 earlier, falling into straight-line triangle code
C    for the last 2-3 iterations.
C  * Perhaps implement n=4 special code.
C  * The reload of the outer loop jump address hurts branch prediction.
C  * The addmul_2 loop ends with an MUL whose high part is not used upon loop
C    exit.

C OVERVIEW
C  mpn_mullo_basecase (rp, up, vp, n)
C
C  Writes n limbs at rp: the n least significant limbs of the product of the
C  n-limb operands at up and vp.  The C prototype is declared outside this
C  file (NOTE(review): expected to be
C    void mpn_mullo_basecase (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)
C  in gmp-impl.h, confirm there).
C
C  Structure:
C   * n = 0 is not allowed (jump table slot 0 points back at the table).
C   * n = 1, 2, 3 are straight-line code reached through a jump table.  These
C     paths use only caller-saved registers and push nothing.
C   * n >= 4 takes the general path at L(gen), which saves and restores
C     rbx, rbp, r13, r14, r15.  It first runs one pass that stores to rp:
C     a one-limb multiply (mul_1) when n is odd, or a two-limb multiply
C     (mul_2) when n is even.  It then runs accumulate passes (addmul) that
C     each consume two more limbs of vp.  Successive addmul passes alternate
C     between the entry points L(addmul_outer_1) and L(addmul_outer_3); the
C     next entry point is kept in outer_addr and reached by indirect jump.
C   * In every pass the most significant result limb is formed with imul,
C     since only the low half of those products is part of the result.
C
C  In the general path rp and up are first advanced past the end of their
C  operands and n is negated, so limbs are addressed with negative indices
C  that count up towards zero.

C INPUT PARAMETERS
define(`rp',	   `%rdi')
define(`up',	   `%rsi')
define(`vp_param', `%rdx')
define(`n',	   `%rcx')

C General path register roles.  The n <= 3 paths use r8-r11 and rcx directly
C as scratch instead.
define(`vp',	     `%r11')	C copy of vp_param, since mul clobbers rdx
define(`outer_addr', `%r8')	C address of the next addmul entry point
define(`j',	     `%r9')	C negative inner loop index, counts up by 4
define(`v0',	     `%r13')	C current low multiplier limb
define(`v1',	     `%r14')	C current high multiplier limb
define(`w0',	     `%rbx')	C w0-w3: rotating accumulator limbs
define(`w1',	     `%r15')
define(`w2',	     `%rbp')
define(`w3',	     `%r10')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)			C macro from config.m4; NOTE(review): presumably maps DOS64 args, confirm
	cmp	$4, n
	jge	L(gen)			C n >= 4 (signed compare): general path
	mov	(up), %rax		C u0
	mov	(vp_param), %r8		C v0

C Dispatch on n = 1..3.  In a PIC build each table entry is read as a sign
C extended 32-bit value that is added to the table address.  Otherwise each
C entry is read as an 8-byte jump target.
	lea	L(tab)(%rip), %r9
ifdef(`PIC',
`	movslq	(%r9,%rcx,4), %r10
	add	%r10, %r9
	jmp	*%r9
',`
	jmp	*(%r9,n,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(tab), L(tab))			C not allowed
	JMPENT(	L(1), L(tab))			C 1
	JMPENT(	L(2), L(tab))			C 2
	JMPENT(	L(3), L(tab))			C 3
dnl	JMPENT(	L(0m4), L(tab))			C 4
dnl	JMPENT(	L(1m4), L(tab))			C 5
dnl	JMPENT(	L(2m4), L(tab))			C 6
dnl	JMPENT(	L(3m4), L(tab))			C 7
dnl	JMPENT(	L(0m4), L(tab))			C 8
dnl	JMPENT(	L(1m4), L(tab))			C 9
dnl	JMPENT(	L(2m4), L(tab))			C 10
dnl	JMPENT(	L(3m4), L(tab))			C 11
	TEXT

C n = 1: r0 = low half of u0 x v0.
L(1):	imul	%r8, %rax
	mov	%rax, (rp)
	FUNC_EXIT()
	ret

C n = 2: r0 = lo(u0 x v0), r1 = hi(u0 x v0) + lo(u0 x v1) + lo(u1 x v0).
C v1 is loaded before the mul because mul overwrites rdx (vp_param).
L(2):	mov	8(vp_param), %r11
	imul	%rax, %r11		C u0 x v1
	mul	%r8			C u0 x v0
	mov	%rax, (rp)
	imul	8(up), %r8		C u1 x v0
	lea	(%r11, %rdx), %rax
	add	%r8, %rax
	mov	%rax, 8(rp)
	FUNC_EXIT()
	ret

C n = 3: full products feed r1 and r2, imul supplies the terms that only
C contribute to r2.  rcx (n) is reused as the r1 accumulator and r9 as the
C r2 accumulator.  v1 and v2 are loaded before the first mul clobbers rdx.
L(3):	mov	8(vp_param), %r9	C v1
	mov	16(vp_param), %r11	C v2
	mul	%r8			C u0 x v0 -> <r1,r0>
	mov	%rax, (rp)		C r0
	mov	(up), %rax		C u0
	mov	%rdx, %rcx		C r1
	mul	%r9			C u0 x v1 -> <r2,r1>
	imul	8(up), %r9		C u1 x v1 -> r2
	mov	16(up), %r10
	imul	%r8, %r10		C u2 x v0 -> r2
	add	%rax, %rcx
	adc	%rdx, %r9
	add	%r10, %r9
	mov	8(up), %rax		C u1
	mul	%r8			C u1 x v0 -> <r2,r1>
	add	%rax, %rcx
	adc	%rdx, %r9
	mov	%r11, %rax
	imul	(up), %rax		C u0 x v2 -> r2
	add	%rax, %r9
	mov	%rcx, 8(rp)
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret

C The Nm4 labels are referenced only by the disabled (dnl) table entries
C above.  They all alias the general entry point.
L(0m4):
L(1m4):
L(2m4):
L(3m4):
L(gen):	push	%rbx			C save the callee-saved registers used
	push	%rbp			C as v0, v1, w0, w1, w2
	push	%r13
	push	%r14
	push	%r15

	mov	(up), %rax		C u0
	mov	(vp_param), v0
	mov	vp_param, vp		C keep vp, rdx is about to be clobbered

	lea	(rp,n,8), rp		C rp and up now point just past the end
	lea	(up,n,8), up		C of their n-limb operands
	neg	n			C n is negative from here on

	mul	v0			C u0 x v0, consumed by the chosen prologue

	test	$1, R8(n)
	jz	L(mul_2)		C n even: first pass uses two v limbs

C First pass for odd n: rp[] = up[] x v0, four limbs per loop iteration.
C The prologue is picked from bit 1 of the negated n, and also selects
C which addmul entry point the second pass will use.
L(mul_1):
	lea	-8(rp), rp
	lea	-8(up), up
	test	$2, R8(n)
	jnz	L(mul_1_prologue_3)

L(mul_1_prologue_2):		C n = 7, 11, 15, ...
	lea	-1(n), j
	lea	L(addmul_outer_1)(%rip), outer_addr
	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	xor	R32(w3), R32(w3)
	mov	16(up,n,8), %rax
	jmp	L(mul_1_entry_2)

L(mul_1_prologue_3):		C n = 5, 9, 13, ...
	lea	1(n), j
	lea	L(addmul_outer_3)(%rip), outer_addr
	mov	%rax, w2
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	jmp	L(mul_1_entry_0)

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,j,8)
	add	%rax, w1
	mov	(up,j,8), %rax
	adc	%rdx, w2
	xor	R32(w0), R32(w0)
	mul	v0
	mov	w1, -8(rp,j,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,j,8), %rax
	mul	v0
	mov	w2, (rp,j,8)
	add	%rax, w3
	adc	%rdx, w0
	mov	16(up,j,8), %rax
	mul	v0
	mov	w3, 8(rp,j,8)
	xor	R32(w2), R32(w2)	C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,j,8), %rax
	mov	w2, w1			C zero, via mov so CF from the add survives
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, j
	js	L(mul_1_top)		C loop while j is still negative

C mul_1 wind-down: store the last two full limbs, then form the top limb.
	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	adc	%rdx, w2

	imul	(up), v0		C top limb: low half of u[n-1] x v0 only
	add	v0, w2
	mov	w2, (rp)

	add	$1, n			C one v limb consumed
	jz	L(ret)

	mov	8(vp), v0		C next two v limbs for the addmul pass
	mov	16(vp), v1

	lea	16(up), up		C after this, -16(up,n,8) addresses u[0]
	lea	8(vp), vp
	lea	24(rp), rp

	jmp	*outer_addr


C First pass for even n: rp[] = up[] x {v0,v1}, four limbs per iteration.
L(mul_2):
	mov	8(vp), v1
	test	$2, R8(n)
	jz	L(mul_2_prologue_3)

	ALIGN(16)
L(mul_2_prologue_1):
	lea	0(n), j
	mov	%rax, w3
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	mov	(up,n,8), %rax
	lea	L(addmul_outer_3)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_3):
	lea	2(n), j
	mov	$0, R32(w3)
	mov	%rax, w1
	mov	(up,n,8), %rax
	mov	%rdx, w2
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_top):
	mov	-32(up,j,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	mov	w0, -24(rp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
L(mul_2_entry_3):
	mov	$0, R32(w0)
	mov	w1, -16(rp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	R32(w1), R32(w0)	C w1 is zero here: w0 = carry out
	mul	v1
	add	%rax, w3
	mov	w2, -8(rp,j,8)
	adc	%rdx, w0
	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
L(mul_2_entry_1):
	add	$4, j
	mov	w3, -32(rp,j,8)		C mov leaves the flags from add for js
	js	L(mul_2_top)

C mul_2 wind-down: the top limb takes low halves only.
	imul	-16(up), v1		C u[n-2] x v1
	add	v1, w0
	imul	-8(up), v0		C u[n-1] x v0
	add	v0, w0
	mov	w0, -8(rp)

	add	$2, n			C two v limbs consumed
	jz	L(ret)

	mov	16(vp), v0		C next two v limbs for the addmul pass
	mov	24(vp), v1

	lea	16(vp), vp
	lea	16(rp), rp

	jmp	*outer_addr


C Accumulate passes: rp[] += up[] x {v0,v1}, four limbs per iteration.
C On entry to either prologue -16(up,n,8) addresses u[0].  Each prologue
C stores the other prologue in outer_addr for the following pass.
L(addmul_outer_1):
	lea	-2(n), j
	mov	-16(up,n,8), %rax
	mul	v0
	mov	%rax, w3
	mov	-16(up,n,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	lea	L(addmul_outer_3)(%rip), outer_addr
	jmp	L(addmul_entry_1)

L(addmul_outer_3):
	lea	0(n), j
	mov	-16(up,n,8), %rax
	xor	R32(w3), R32(w3)
	mul	v0
	mov	%rax, w1
	mov	-16(up,n,8), %rax
	mov	%rdx, w2
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(addmul_entry_3)

	ALIGN(16)
L(addmul_top):
	add	w3, -32(rp,j,8)
	adc	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	R32(w2), R32(w2)	C w2 was just zeroed: w2 = carry out
	mul	v1
	xor	R32(w3), R32(w3)
	add	w0, -24(rp,j,8)
	adc	%rax, w1
	mov	-16(up,j,8), %rax
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	-16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(addmul_entry_3):
	mul	v1
	add	w1, -16(rp,j,8)
	adc	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	xor	R32(w0), R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)		C mov, not xor: CF from the adc is still needed
	mov	-8(up,j,8), %rax
	adc	R32(w1), R32(w0)	C w0 = carry out
	mul	v1
	add	w2, -8(rp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(addmul_entry_1):
	mul	v1
	add	$4, j
	js	L(addmul_top)

C addmul wind-down.  Only the low half (rax) of the final mul v1 is used.
	add	w3, -32(rp)
	adc	%rax, w0

	imul	-24(up), v0		C top limb: low half only
	add	v0, w0
	add	w0, -24(rp)

	add	$2, n			C two v limbs consumed
	jns	L(ret)			C done once n is no longer negative

	lea	16(vp), vp

	mov	(vp), v0
	mov	8(vp), v1

	lea	-16(up), up		C keeps -16(up,n,8) at u[0] as n moved up by 2

	jmp	*outer_addr

C Common exit for the general path: restore in reverse push order.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()