dnl  Source path: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_1_2.asm

dnl  AMD64 mpn_mod_1s_2p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4
C AMD K10	 4
C Intel P4	19
C Intel core2	 8
C Intel NHM	 6.5
C Intel SBR	 4.5
C Intel atom	28
C VIA nano	 8

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT

C ---------------------------------------------------------------------------
C mpn_mod_1s_2p
C
C Reduces a vector of 64-bit limbs to a single limb remainder, consuming two
C limbs per multiply group, using a table of precomputed constants.
C
C Register roles after FUNC_ENTRY(4) as read by the code below.  FUNC_ENTRY
C and FUNC_EXIT come from config.m4 and are not visible in this file.
C   rdi  limb pointer, addressed as (rdi,rsi,8)
C   rsi  limb count
C   rdx  divisor limb, kept in r14
C   rcx  pointer to the constant table, kept in r13
C
C Constant table layout, matching the stores in mpn_mod_1s_2p_cps below:
C    0(tab)  bi       used by the final mulq
C    8(tab)  cnt      shift count, only the low 32 bits are loaded
C   16(tab)  B1modb   kept in r10
C   24(tab)  B2modb   kept in rbx
C   32(tab)  B3modb   kept in rbp
C
C Result: rax, the reduced value shifted right by cnt.
C
C Callee-saved registers used: r14 r13 r12 rbp rbx, pushed on entry and
C popped in reverse order before FUNC_EXIT.
C
C NOTE(review): the result is shifted right by cnt at the end, so the divisor
C argument is presumably passed already shifted left by cnt.  Confirm against
C the caller.
C NOTE(review): at L(1) the high word is built with a right shift by
C (0 - cnt) mod 64 followed by an OR.  For cnt equal to 0 that shift is a
C no-op, so callers presumably guarantee cnt is at least 1.  Confirm.
C ---------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p)
	FUNC_ENTRY(4)
	push	%r14
	test	$1, R8(%rsi)		C parity of the limb count, consumed by je below
	mov	%rdx, %r14		C r14 = divisor
	push	%r13
	mov	%rcx, %r13		C r13 = constant table
	push	%r12
	push	%rbp
	push	%rbx
	mov	16(%rcx), %r10		C r10 = B1modb
	mov	24(%rcx), %rbx		C rbx = B2modb
	mov	32(%rcx), %rbp		C rbp = B3modb
	je	L(b0)			C even count: start from the top two limbs
	dec	%rsi
	je	L(one)			C count was exactly 1
C Odd count of 3 or more: fold the top three limbs into the pair r8:r9.
C   r8:r9 = ap[n-3] + ap[n-2] * B1modb + ap[n-1] * B2modb
	mov	-8(%rdi,%rsi,8), %rax
	mul	%r10
	mov	%rax, %r9
	mov	%rdx, %r8
	mov	(%rdi,%rsi,8), %rax
	add	-16(%rdi,%rsi,8), %r9
	adc	$0, %r8
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	jmp	L(11)

C Even count: the accumulator r8:r9 is simply the top two limbs.
L(b0):	mov	-8(%rdi,%rsi,8), %r8
	mov	-16(%rdi,%rsi,8), %r9

C Here rsi - 2 limbs remain below the accumulator.
L(11):	sub	$4, %rsi
	jb	L(ed2)			C nothing left to fold
	lea	40(%rdi,%rsi,8), %rdi
	mov	-40(%rdi), %r11		C next lower limb pair: low limb
	mov	-32(%rdi), %rax		C next lower limb pair: high limb
	jmp	L(m0)

C Main loop.  Each half folds two more limbs:
C   new = limb_lo + limb_hi * B1modb + acc_lo * B2modb + acc_hi * B3modb
C The accumulator alternates between r8:r9 and r12:r11.  The product by
C B3modb is left in rdx:rax and added at the start of the following half,
C or at L(ed1) when the loop exits.
	ALIGN(16)
L(top):	mov	-24(%rdi), %r9
	add	%rax, %r11
	mov	-16(%rdi), %rax
	adc	%rdx, %r12
	mul	%r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%rdx, %r8
	adc	$0, %r8
	mul	%rbx
	add	%rax, %r9
	mov	%r12, %rax
	adc	%rdx, %r8
	mul	%rbp
	sub	$2, %rsi
	jb	L(ed1)
	mov	-40(%rdi), %r11
	add	%rax, %r9
	mov	-32(%rdi), %rax
	adc	%rdx, %r8
L(m0):	mul	%r10
	add	%rax, %r11
	mov	%r9, %rax
	mov	%rdx, %r12
	adc	$0, %r12
	mul	%rbx
	add	%rax, %r11
	lea	-32(%rdi), %rdi		C ap -= 4
	mov	%r8, %rax
	adc	%rdx, %r12
	mul	%rbp
	sub	$2, %rsi
	jae	L(top)

C Loop exits: move the live accumulator into r8:r9 and add the pending
C product that is still in rdx:rax.
L(ed0):	mov	%r11, %r9
	mov	%r12, %r8
L(ed1):	add	%rax, %r9
	adc	%rdx, %r8
C Two words to one and a bit: rdx:r8 = r9 + r8 * B1modb.
L(ed2):	mov	8(%r13), R32(%rdi)		C cnt
	mov	%r8, %rax
	mov	%r9, %r8
	mul	%r10
	add	%rax, %r8
	adc	$0, %rdx
C Shift rdx:r8 left by cnt into r9:r8, then reduce using bi and the divisor.
L(1):	xor	R32(%rcx), R32(%rcx)
	mov	%r8, %r9
	sub	R32(%rdi), R32(%rcx)	C ecx = 0 - cnt
	shr	R8(%rcx), %r9		C bits of r8 that cross into the high word
	mov	R32(%rdi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %r9
	sal	R8(%rcx), %r8
	mov	%r9, %rax
	mulq	(%r13)			C rdx:rax = high word * bi
	mov	%rax, %rsi
	inc	%r9
	add	%r8, %rsi
	adc	%r9, %rdx		C rdx = quotient estimate
	imul	%r14, %rdx
	sub	%rdx, %r8		C r8 = candidate remainder
	lea	(%r8,%r14), %rax
	cmp	%r8, %rsi
	cmovc	%rax, %r8		C add the divisor back when rsi is below r8
	mov	%r8, %rax
	sub	%r14, %rax
	cmovc	%r8, %rax		C keep r8 when subtracting the divisor borrows
	mov	R32(%rdi), R32(%rcx)
	shr	R8(%rcx), %rax		C undo the shift by cnt
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	FUNC_EXIT()
	ret
C Single limb input: feed it to the final reduction with a zero high word.
L(one):
	mov	(%rdi), %r8
	mov	8(%rcx), R32(%rdi)
	xor	%rdx, %rdx
	jmp	L(1)
EPILOGUE()

C ---------------------------------------------------------------------------
C mpn_mod_1s_2p_cps
C
C Fills the five-entry constant table consumed by mpn_mod_1s_2p.
C
C Register roles after FUNC_ENTRY(2) as read by the code below:
C   rdi  pointer to the table to fill, kept in rbx
C   rsi  divisor b, must be nonzero because bsr is applied to it
C
C Stores:
C    0(tab)  bi       value returned in rax by mpn_invert_limb for b << cnt
C    8(tab)  cnt      63 xor bsr of b, that is the count of leading zero bits
C   16(tab)  B1modb
C   24(tab)  B2modb
C   32(tab)  B3modb
C The three BNmodb values are shifted right by cnt before being stored.
C
C NOTE(review): the names B1modb, B2modb and B3modb come from the original
C store comments.  They are presumably the first three powers of the limb
C base reduced modulo b.  Confirm against the reference implementation.
C NOTE(review): ASSERT, CALL, IFSTD and IFDOS come from config.m4 and are not
C visible here.  The ASSERT line looks like a debug-build stack alignment
C check before the call.  Confirm.
C
C Callee-saved registers used: rbp rbx r12.
C ---------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 32(%rbx)		C store B3modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()