github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_1_1.asm

dnl  AMD64 mpn_mod_1_1p

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C               cycles/limb
C AMD K8,K9      6
C AMD K10        6
C Intel P4      26
C Intel core2   12.5
C Intel NHM     11.3
C Intel SBR      8.4    (slowdown, old code took 8.0)
C Intel atom    26
C VIA nano      13

define(`B2mb',   `%r10')
define(`B2modb', `%r11')
define(`ap',     `%rdi')
define(`n',      `%rsi')
define(`pre',    `%r8')
define(`b',      `%rbx')

define(`r0', `%rbp')    C r1 kept in %rax
define(`r2', `%rcx')    C kept negated. Also used as shift count
define(`t0', `%r9')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C                        %rdi          %rsi         %rdx            %rcx
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0

C The iteration is almost as follows,
C
C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
C
C where r_2 is a single bit represented as a mask. But to make sure that the
C result fits in two limbs and a bit, carry from the addition
C
C   r_0 + r_2 B2modb
C
C is handled specially. On carry, we subtract b to cancel the carry,
C and instead use the value
C
C   r_0 + B2mb (mod B)
C
C This addition can be issued early since it does not depend on r_2, and it is
C the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
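
C The C sketch below (ours, not part of GMP) restates one iteration of the
C main loop, assuming 64-bit limbs; the name step and the use of
C unsigned __int128 for the product are illustrative choices only.
C
C   #include <stdint.h>
C
C   /* State: r2 is a full-limb mask (0 or ~0), r1:r0 the partial remainder,
C      u the next input limb.  Wrapping unsigned arithmetic is intended.  */
C   static void
C   step (uint64_t *r2, uint64_t *r1, uint64_t *r0,
C         uint64_t u, uint64_t b, uint64_t B2modb)
C   {
C     unsigned __int128 p = (unsigned __int128) *r1 * B2modb;
C     uint64_t p1 = (uint64_t) (p >> 64), p0 = (uint64_t) p;
C
C     uint64_t t = *r0 + (*r2 & B2modb);   /* r_0 + r_2 B2modb */
C     if (t < *r0)                         /* carry: subtract b to cancel it */
C       t = *r0 + B2modb - b;              /* i.e. r_0 + B2mb (mod B) */
C
C     uint64_t s0 = u + p0;                /* low limb of new remainder */
C     uint64_t c0 = s0 < u;
C     uint64_t s1 = t + p1 + c0;           /* high limb of new remainder */
C     uint64_t c1 = (s1 < t) | (c0 & (s1 == t));
C
C     *r0 = s0;
C     *r1 = s1;
C     *r2 = -c1;                           /* keep r_2 as a mask, like the asm */
C   }
C
C The cmov in L(top) corresponds to the branch on the carry out of t above:
C the asm computes both candidates early and selects, rather than branching.
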
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
        FUNC_ENTRY(4)
        push    %rbp
        push    %rbx
        mov     %rdx, b
        mov     %rcx, pre

        mov     -8(ap, n, 8), %rax
        cmp     $3, n
        jnc     L(first)
        mov     -16(ap, n, 8), r0
        jmp     L(reduce_two)

L(first):
        C First iteration, no r2
        mov     24(pre), B2modb
        mul     B2modb
        mov     -24(ap, n, 8), r0
        add     %rax, r0
        mov     -16(ap, n, 8), %rax
        adc     %rdx, %rax
        sbb     r2, r2
        sub     $4, n
        jc      L(reduce_three)

        mov     B2modb, B2mb
        sub     b, B2mb

        ALIGN(16)
L(top): and     B2modb, r2
        lea     (B2mb, r0), t0
        mul     B2modb
        add     r0, r2
        mov     (ap, n, 8), r0
        cmovc   t0, r2
        add     %rax, r0
        mov     r2, %rax
        adc     %rdx, %rax
        sbb     r2, r2
        sub     $1, n
        jnc     L(top)

L(reduce_three):
        C Eliminate r2
        and     b, r2
        sub     r2, %rax

L(reduce_two):
        mov     8(pre), R32(%rcx)
        test    R32(%rcx), R32(%rcx)
        jz      L(normalized)

        C Unnormalized, use B1modb to reduce to size < B (b+1)
        mulq    16(pre)
        xor     t0, t0
        add     %rax, r0
        adc     %rdx, t0
        mov     t0, %rax

        C Left-shift to normalize
ifdef(`SHLD_SLOW',`
        shl     R8(%rcx), %rax
        mov     r0, t0
        neg     R32(%rcx)
        shr     R8(%rcx), t0
        or      t0, %rax
        neg     R32(%rcx)
',`
        shld    R8(%rcx), r0, %rax
')
        shl     R8(%rcx), r0
        jmp     L(udiv)

L(normalized):
        mov     %rax, t0
        sub     b, t0
        cmovnc  t0, %rax

L(udiv):
        lea     1(%rax), t0
        mulq    (pre)
        add     r0, %rax
        adc     t0, %rdx
        imul    b, %rdx
        sub     %rdx, r0
        cmp     r0, %rax
        lea     (b, r0), %rax
        cmovnc  r0, %rax
        cmp     b, %rax
        jnc     L(fix)
L(ok):  shr     R8(%rcx), %rax

        pop     %rbx
        pop     %rbp
        FUNC_EXIT()
        ret
L(fix): sub     b, %rax
        jmp     L(ok)
EPILOGUE()

        ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
        FUNC_ENTRY(2)
        push    %rbp
        bsr     %rsi, %rcx
        push    %rbx
        mov     %rdi, %rbx
        push    %r12
        xor     $63, R32(%rcx)
        mov     %rsi, %r12
        mov     R32(%rcx), R32(%rbp)
        sal     R8(%rcx), %r12
IFSTD(` mov     %r12, %rdi      ')      C pass parameter
IFDOS(` mov     %r12, %rcx      ')      C pass parameter
        ASSERT(nz, `test $15, %rsp')
        CALL(   mpn_invert_limb)
        neg     %r12
        mov     %r12, %r8
        mov     %rax, (%rbx)            C store bi
        mov     %rbp, 8(%rbx)           C store cnt
        imul    %rax, %r12
        mov     %r12, 24(%rbx)          C store B2modb
        mov     R32(%rbp), R32(%rcx)
        test    R32(%rcx), R32(%rcx)
        jz      L(z)

        mov     $1, R32(%rdx)
ifdef(`SHLD_SLOW',`
        C Destroys %rax, unlike shld. Otherwise, we could do B1modb
        C before B2modb, and get rid of the move %r12, %r8 above.

        shl     R8(%rcx), %rdx
        neg     R32(%rcx)
        shr     R8(%rcx), %rax
        or      %rax, %rdx
        neg     R32(%rcx)
',`
        shld    R8(%rcx), %rax, %rdx
')
        imul    %rdx, %r8
        shr     R8(%rcx), %r8
        mov     %r8, 16(%rbx)           C store B1modb
L(z):
        pop     %r12
        pop     %rbx
        pop     %rbp
        FUNC_EXIT()
        ret
EPILOGUE()
ASM_END()
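
C Usage sketch (ours, not part of this file), showing how the two entry points
C pair up.  The mpn_mod_1_1p prototype is the one quoted in the header comment
C above; for the _cps routine we assume the gmp-impl.h declaration
C void mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b).
C
C   mp_limb_t cps[4];                    /* bi, cnt, B1modb, B2modb */
C   mp_limb_t r;
C   mpn_mod_1_1p_cps (cps, b);           /* precompute once per divisor b */
C   r = mpn_mod_1_1p (ap, n, b, cps);    /* remainder of {ap, n} mod b, n >= 2 */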