dnl  Listing source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mulx/adx/addmul_1.asm

dnl  AMD64 mpn_addmul_1 for CPUs with mulx and adx.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 -
C AMD K10	 -
C AMD bd1	 -
C AMD bobcat	 -
C Intel P4	 -
C Intel PNR	 -
C Intel NHM	 -
C Intel SBR	 -
C Intel HWL	 -
C Intel BWL	 ?
C Intel atom	 -
C VIA nano	 -

C Symbolic names for the registers that hold the four incoming arguments.
C NOTE(review): the trailing dnl notes (rcx, rdx, r8, r9) look like the
C matching Win64 argument registers -- confirm; nothing visible here remaps
C the arguments for that ABI.
define(`rp',      `%rdi')   dnl rcx
define(`up',      `%rsi')   dnl rdx
define(`n_param', `%rdx')   dnl r8
define(`v0_param',`%rcx')   dnl r9

C Names used after the xchg in the prologue has swapped %rcx and %rdx:
C the limb count then lives in %rcx and the multiplier in %rdx.
define(`n',       `%rcx')   dnl
define(`v0',      `%rdx')   dnl

C Testing mechanism for running this on older AMD64 processors
C When FAKE_MULXADX is 1 the file missing-call.m4 is included instead
C (NOTE(review): presumably it supplies substitutes for adox, adcx and
C mulx -- confirm).  Otherwise adox, adcx and mulx are defined as macros
C that expand to the plain instructions with the same operands.
ifelse(FAKE_MULXADX,1,`
  include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
',`
  define(`adox',	``adox'	$1, $2')
  define(`adcx',	``adcx'	$1, $2')
  define(`mulx',	``mulx'	$1, $2, $3')
')

C mpn_addmul_1
C
C On entry: rp = %rdi, up = %rsi, n_param = %rdx, v0_param = %rcx.
C
C Multiplies each of the n_param 8-byte limbs at up by v0_param, adds the
C products into the limbs at rp and stores the sums back to rp.  The limb
C carried out of the top is left in %rax at the ret.
C
C Two carry chains run interleaved: adcx propagates through CF and adox
C through OF.  Between the point where the flags are cleared and the final
C fold into %rax, only instructions that leave CF and OF intact are used
C for bookkeeping (lea, mov, jmp, jrcxz, mulx), so do not reorder or
C substitute instructions here without rechecking the flags.
C
C Each mulx(src, lo, hi) writes the low half of src * %rdx to lo and the
C high half to hi.
C
C %rbx, %r12 and %r13 are saved and restored.  %rax, %rcx, %rdx, %rsi,
C %rdi, %r8-%r11 and the flags are modified.
C
C R32() and R8() are defined outside this file; the comments below assume
C they name the 32-bit and 8-bit views of a register.
C NOTE(review): a count of zero is not handled by this code -- callers
C presumably guarantee n_param >= 1; confirm.
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
C NOTE(review): %r8 is overwritten by a mulx before it is read on every
C path below; this load looks like an early touch of the first source
C limb -- confirm before removing.
	mov	(up), %r8

	push	%rbx			C callee-saved, used as scratch below
	push	%r12
	push	%r13

	lea	(up,n_param,8), up	C up = one past the last source limb
	lea	-16(rp,n_param,8), rp	C rp = two limbs before the end of dest
	mov	R32(n_param), R32(%rax)	C copy of count, picks the entry point
C Swap %rcx and %rdx: the multiplier moves to %rdx (the implicit mulx
C source) and the count moves to %rcx (the register jrcxz tests).
	xchg	v0_param, v0		C FIXME: is this insn fast?

	neg	n			C n = -count, counts up towards zero

C Dispatch on count mod 4.  The and also clears CF and OF; the cmp leaves
C both clear when the remainder is 2 or 3.
	and	$3, R8(%rax)
	jz	L(b0)
	cmp	$2, R8(%rax)
	jl	L(b1)
	jz	L(b2)

C count mod 4 == 3: form the first three products and join the loop at lo3.
C The dec (which does not write CF) biases n so that the 24(rp,n,8) operand
C at lo3 addresses the first destination limb.
L(b3):	mulx(	(up,n,8), %r11, %r10)
	mulx(	8(up,n,8), %r13, %r12)
	mulx(	16(up,n,8), %rbx, %rax)
	dec	n
	jmp	L(lo3)

C count mod 4 == 0: form the first three products and join the loop at lo0.
L(b0):	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	mulx(	16(up,n,8), %r13, %r12)
	jmp	L(lo0)

C count mod 4 == 2: form the first two products.  If the count was exactly
C two, n is now zero and the wd2 tail finishes the job; otherwise form a
C third product and join the loop at lo2.
L(b2):	mulx(	(up,n,8), %r13, %r12)
	mulx(	8(up,n,8), %rbx, %rax)
	lea	2(n), n			C lea: advance n without writing flags
	jrcxz	L(wd2)
L(gt2):	mulx(	(up,n,8), %r9, %r8)
	jmp	L(lo2)

C count mod 4 == 1: the cmp above left CF set (1 - 2 borrows), so the and
C clears CF and OF again.  If the count was exactly one, the wd1 tail
C finishes the job; otherwise form two more products and join at lo1.
L(b1):	and	R8(%rax), R8(%rax)
	mulx(	(up,n,8), %rbx, %rax)
	lea	1(n), n
	jrcxz	L(wd1)
	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	jmp	L(lo1)

C Wind-down.  end is reached from the loop when n hits zero and finishes
C the last three limbs; wd2 and wd1 are also entered directly for counts
C of two and one.  Every way in is a taken jrcxz (or falls through from
C one), so %rcx is zero from here to the ret.
L(end):	adcx(	%r10, %r13)
	mov	%r11, -8(rp)
L(wd2):	adox(	(rp), %r13)
	adcx(	%r12, %rbx)
	mov	%r13, (rp)
L(wd1):	adox(	8(rp), %rbx)
	adcx(	%rcx, %rax)		C %rcx is zero: fold CF into the top limb
	adox(	%rcx, %rax)		C and fold OF into it as well
	mov	%rbx, 8(rp)
	pop	%r13
	pop	%r12
	pop	%rbx
	ret

C Main loop, four limbs per pass.  Every stage has the same shape:
C   adox  adds a destination limb into a low product half    (OF chain)
C   mulx  forms a later product
C   adcx  adds a high product half into the next low half    (CF chain)
C   mov   stores a finished limb back to the destination
C The loop test is jrcxz and the index update is lea, so neither carry
C chain is disturbed between passes.
L(top):	jrcxz	L(end)
	mulx(	(up,n,8), %r9, %r8)
	adcx(	%r10, %r13)
	mov	%r11, -8(rp,n,8)
L(lo2):	adox(	(rp,n,8), %r13)
	mulx(	8(up,n,8), %r11, %r10)
	adcx(	%r12, %rbx)
	mov	%r13, (rp,n,8)
L(lo1):	adox(	8(rp,n,8), %rbx)
	mulx(	16(up,n,8), %r13, %r12)
	adcx(	%rax, %r9)
	mov	%rbx, 8(rp,n,8)
L(lo0):	adox(	16(rp,n,8), %r9)
	mulx(	24(up,n,8), %rbx, %rax)
	adcx(	%r8, %r11)
	mov	%r9, 16(rp,n,8)
L(lo3):	adox(	24(rp,n,8), %r11)
	lea	4(n), n
	jmp	L(top)
EPILOGUE()
ASM_END()