dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_1_4.asm

dnl  AMD64 mpn_mod_1s_4p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3
C AMD K10	 3
C Intel P4	15.5
C Intel core2	 5
C Intel corei	 4
C Intel atom	23
C VIA nano	 4.75

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C-----------------------------------------------------------------------
C mp_limb_t mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b,
C                          const mp_limb_t cps[7])
C
C Return ap[0..n-1] mod the 1-limb divisor, folding 4 limbs per loop
C iteration with the precomputed constants built by mpn_mod_1s_4p_cps
C below (see that routine's stores for the table layout):
C   cps[0] = bi   (limb inverse of the normalized divisor)
C   cps[1] = cnt  (normalization shift count)
C   cps[2..6] = B1modb..B5modb  (powers of B = 2^64 reduced mod b)
C
C In (SysV; on DOS64 the FUNC_ENTRY(4) shim moves the args here):
C   rdi = ap, rsi = n, rdx = b (normalized, i.e. presumably already
C   shifted left by cnt -- confirm against callers), rcx = cps
C Out:
C   rax = remainder (shifted back right by cnt before return)
C
C All six callee-saved GPRs are pushed/popped; flags are clobbered.
C-----------------------------------------------------------------------
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	FUNC_ENTRY(4)
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	%rdx, %r15		C r15 = b, kept for the final reduction
	mov	%rcx, %r14		C r14 = cps table pointer
	mov	16(%rcx), %r11		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	mov	40(%rcx), %r13		C B4modb
	mov	48(%rcx), %r12		C B5modb
	xor	R32(%r8), R32(%r8)	C rh = 0 (high limb of running remainder)
	mov	R32(%rsi), R32(%rdx)
	and	$3, R32(%rdx)		C dispatch on n mod 4 to peel the
	je	L(b0)			C first (partial) chunk of limbs
	cmp	$2, R32(%rdx)
	jc	L(b1)
	je	L(b2)

C n == 3 (mod 4): fold 3 high limbs into (rh,rl) = (r8,r9) before the loop
L(b3):	lea	-24(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	jmp	L(m0)

	ALIGN(8)
C n == 0 (mod 4): fold 4 high limbs before entering the loop
L(b0):	lea	-32(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	mov	24(%rdi), %rax
	mul	%rbp
	jmp	L(m0)

	ALIGN(8)
C n == 1 (mod 4): single high limb, rh already 0
L(b1):	lea	-8(%rdi,%rsi,8), %rdi
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(8)
C n == 2 (mod 4): two high limbs become (rh,rl) directly
L(b2):	lea	-16(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %r8
	mov	(%rdi), %r9
	jmp	L(m1)

C Main loop: each pass consumes 4 more limbs (descending addresses) and
C the previous 2-limb remainder (r9 = rl, r8 = rh), accumulating
C   up[0] + up[1]*B1modb + up[2]*B2modb + up[3]*B3modb
C   + rl*B4modb + rh*B5modb
C into the new (rh,rl).  rcx collects the carries of the adc chain.
	ALIGN(16)
L(top):	mov	-24(%rdi), %rax
	mov	-32(%rdi), %r10
	mul	%r11			C up[1] * B1modb
	add	%rax, %r10
	mov	-16(%rdi), %rax
	mov	$0, R32(%rcx)
	adc	%rdx, %rcx
	mul	%rbx			C up[2] * B2modb
	add	%rax, %r10
	mov	-8(%rdi), %rax
	adc	%rdx, %rcx
	sub	$32, %rdi		C step down 4 limbs
	mul	%rbp			C up[3] * B3modb
	add	%rax, %r10
	mov	%r13, %rax
	adc	%rdx, %rcx
	mul	%r9			C rl * B4modb
	add	%rax, %r10
	mov	%r12, %rax
	adc	%rdx, %rcx
	mul	%r8			C rh * B5modb
	mov	%r10, %r9
	mov	%rcx, %r8
L(m0):	add	%rax, %r9
	adc	%rdx, %r8
L(m1):	sub	$4, %rsi
	ja	L(top)

C Final reduction: fold rh via B1modb, normalize by cnt, then one
C Granlund-Moller division step using bi = cps[0] to get r mod b.
L(end):	mov	8(%r14), R32(%rsi)	C rsi = cnt
	mov	%r8, %rax
	mul	%r11			C rh * B1modb
	mov	%rax, %r8
	add	%r9, %r8
	adc	$0, %rdx
	xor	R32(%rcx), R32(%rcx)
	sub	R32(%rsi), R32(%rcx)	C rcx = 64 - cnt (mod 64, via neg)
	mov	%r8, %rdi
	shr	R8(%rcx), %rdi
	mov	R32(%rsi), R32(%rcx)	C rcx = cnt
	sal	R8(%rcx), %rdx
	or	%rdx, %rdi		C rdi = normalized high part
	mov	%rdi, %rax
	mulq	(%r14)			C multiply by bi (inverse limb)
	mov	%r15, %rbx		C rbx = b
	mov	%rax, %r9
	sal	R8(%rcx), %r8
	inc	%rdi			C quotient estimate q1 = hi + 1
	add	%r8, %r9
	adc	%rdi, %rdx
	imul	%rbx, %rdx
	sub	%rdx, %r8		C candidate remainder
	lea	(%r8,%rbx), %rax	C precompute r + b for the fix-up
	cmp	%r8, %r9
	cmovc	%rax, %r8		C add back b if the estimate overshot
	mov	%r8, %rax
	sub	%rbx, %rax
	cmovc	%r8, %rax		C final conditional subtract of b
	shr	R8(%rcx), %rax		C denormalize: shift remainder back
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret
EPILOGUE()

C-----------------------------------------------------------------------
C void mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
C
C Precompute the constant table used by mpn_mod_1s_4p:
C   cps[0] = bi = mpn_invert_limb (b << cnt)
C   cps[1] = cnt = number of leading zero bits of b (from bsr/xor 63)
C   cps[2..6] = B1modb..B5modb, each stored shifted right by cnt;
C               B^(k+1) mod b is derived iteratively from B^k mod b by
C               the repeated not/imul/lea/cmovnc reduction step below.
C
C In:  rdi = cps, rsi = b (rcx/rdx on DOS64 via FUNC_ENTRY(2))
C Calls mpn_invert_limb; rbx/rbp/r12 are saved around it.
C-----------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)		C rcx = cnt = count of leading zeros
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11		C r11 = bi
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8			C r8 = -(b << cnt)
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	C Manual double-shift: rsi = (1 << cnt) | (bi >> (64 - cnt)),
	C for CPUs where shld is slow.
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi		C rsi = B1modb (normalized)
	mul	%rsi			C rax:rdx = bi * B1modb

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb

C Each of the following stanzas computes the next power:
C B(k+1)modb = reduce(B(k)modb), stored shifted right by cnt.
	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi		C conditional correction of estimate
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 32(%rbx)		C store B3modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 40(%rbx)		C store B4modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 48(%rbx)		C store B5modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()