dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mod_1_4.asm

dnl  x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO:
C  * Optimize.  The present code was written quite straightforwardly.
C  * Optimize post-loop reduction code.
C  * Write a cps function that uses sse2 insns.

C                             cycles/limb
C P6 model 0-8,10-12             -
C P6 model 9   (Banias)          ?
C P6 model 13  (Dothan)          3.4
C P4 model 0-1 (Willamette)      ?
C P4 model 2   (Northwood)       4
C P4 model 3-4 (Prescott)        4.5

C INPUT PARAMETERS (offsets relative to sp at function entry)
C ap		sp + 4
C n		sp + 8
C b		sp + 12
C cps		sp + 16

C Register assignment used by mpn_mod_1s_4p.  The five BKmodb constants
C stay resident in MMX registers for the whole main loop.
define(`B1modb', `%mm1')
define(`B2modb', `%mm2')
define(`B3modb', `%mm3')
define(`B4modb', `%mm4')
define(`B5modb', `%mm5')
define(`ap',     `%edx')
define(`n',      `%eax')

ASM_START()
	TEXT

C ---------------------------------------------------------------------------
C mpn_mod_1s_4p
C
C Stack arguments: ap, n, b, cps (see INPUT PARAMETERS above).
C Result is returned in eax.
C
C Words read from the cps table, by byte offset:
C    0  bi        4  cnt
C    8  B1modb   12  B2modb   16  B3modb   20  B4modb   24  B5modb
C
C Register roles:
C   ap  (edx)  limb pointer, walks from the top of the operand downwards
C   n   (eax)  limbs still to be consumed
C   ecx        cps table pointer, later reused for ql and for cnt
C   ebx        scratch, saved and restored because it is pushed on entry
C   mm7        64-bit running value, written rh:rl in the comments
C   mm0, mm6   scratch
C
C Four limbs are folded into mm7 per loop iteration.  The leading
C n mod 4 limbs (or four limbs when n mod 4 = 0) are folded before the
C loop by one of the entry sequences b0..b3.
C
C There is no check for n = 0 in this code.
C NOTE(review): b appears to be expected already shifted left by cnt,
C since the remainder is shifted right by cnt just before returning.
C Confirm against the C callers.
C ---------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	push	%ebx
	mov	8(%esp), ap		C offsets are +4 because of the push
	mov	12(%esp), n
	mov	20(%esp), %ecx		C cps table pointer

	movd	8(%ecx), B1modb
	movd	12(%ecx), B2modb
	movd	16(%ecx), B3modb
	movd	20(%ecx), B4modb
	movd	24(%ecx), B5modb

C Point ap at the most significant limb, then dispatch on n mod 4.
	mov	n, %ebx
	lea	-4(ap,n,4), ap
	and	$3, %ebx
	je	L(b0)
	cmp	$2, %ebx
	jc	L(b1)
	je	L(b2)

C n mod 4 = 3:  mm7 = a[n-3] + a[n-2]*B1modb + a[n-1]*B2modb
L(b3):	movd	-4(ap), %mm7
	pmuludq	B1modb, %mm7
	movd	-8(ap), %mm6
	paddq	%mm6, %mm7
	movd	(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm7
	lea	-24(ap), ap
	add	$-3, n
	jz	L(end)
	jmp	L(top)

C n mod 4 = 0:  mm7 = a[n-4] + a[n-3]*B1modb + a[n-2]*B2modb + a[n-1]*B3modb
L(b0):	movd	-8(ap), %mm7
	pmuludq	B1modb, %mm7
	movd	-12(ap), %mm6
	paddq	%mm6, %mm7
	movd	-4(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm7
	movd	(ap), %mm6
	pmuludq	B3modb, %mm6
	paddq	%mm6, %mm7
	lea	-28(ap), ap
	add	$-4, n
	jz	L(end)
	jmp	L(top)

C n mod 4 = 1:  mm7 = a[n-1].  With a single limb the high half is zero,
C so the rh fold at L(end) is skipped by entering at L(x).
L(b1):	movd	(ap), %mm7
	lea	-16(ap), ap
	dec	n
	jz	L(x)
	jmp	L(top)

C n mod 4 = 2:  mm7 = a[n-1]:a[n-2] as one 64-bit value.
L(b2):	movd	-4(ap), %mm7		C rl
	punpckldq (ap), %mm7		C rh
	lea	-20(ap), ap
	add	$-2, n
	jz	L(end)

C Main loop.  With a0..a3 the four limbs at ap and rh:rl the old mm7:
C   mm7 = a0 + a1*B1modb + a2*B2modb + a3*B3modb + rl*B4modb + rh*B5modb
C pmuludq multiplies only the low 32 bits of each operand, so applying it
C to the unsplit copy in mm6 yields rl*B4modb.
	ALIGN(8)
L(top):	movd	4(ap), %mm0
	pmuludq	B1modb, %mm0
	movd	0(ap), %mm6
	paddq	%mm6, %mm0

	movd	8(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm0

	movd	12(ap), %mm6
	pmuludq	B3modb, %mm6
	paddq	%mm6, %mm0

	movq	%mm7, %mm6
	psrlq	$32, %mm7		C rh
	pmuludq	B5modb, %mm7
	pmuludq	B4modb, %mm6

	paddq	%mm0, %mm7
	paddq	%mm6, %mm7

	add	$-16, ap
	add	$-4, n
	jnz	L(top)

C Post-loop: fold the high half once more, mm7 = rl + rh*B1modb.
C mm4 (B4modb) is no longer needed and is reused, first as a mask and
C then for cnt.
L(end):	pcmpeqd	%mm4, %mm4
	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
	pand	%mm7, %mm4		C rl
	psrlq	$32, %mm7		C rh
	pmuludq	B1modb, %mm7		C rh * B1modb
	paddq	%mm4, %mm7		C rh,rl

C Final reduction of the two-limb value rh:rl by b using the inverse bi.
L(x):	movd	4(%ecx), %mm4		C cnt
	psllq	%mm4, %mm7		C rh,rl normalized
	movq	%mm7, %mm2		C rl in low half
	psrlq	$32, %mm7		C rh
	movd	(%ecx), %mm1		C bi
	pmuludq	%mm7, %mm1		C qh,ql
	paddq	%mm2, %mm1		C qh-1,ql
	movd	%mm1, %ecx		C ql
	psrlq	$32, %mm1		C qh-1
	movd	16(%esp), %mm3		C b
	pmuludq	%mm1, %mm3		C (qh-1) * b
	psubq	%mm3, %mm2		C r in low half (could use psubd)
	movd	%mm2, %eax		C r
	mov	16(%esp), %ebx		C b
	sub	%ebx, %eax		C r
	cmp	%eax, %ecx		C carry set when ql < r (unsigned)
	lea	(%eax,%ebx), %edx	C r + b, flags untouched
	cmovc(	%edx, %eax)		C if ql < r then r = r + b
	movd	%mm4, %ecx		C cnt
	cmp	%ebx, %eax
	jae	L(fix)			C r >= b needs one more subtraction
	emms
	pop	%ebx
	shr	%cl, %eax		C shift the remainder right by cnt
	ret

L(fix):	sub	%ebx, %eax
	emms
	pop	%ebx
	shr	%cl, %eax		C shift the remainder right by cnt
	ret
EPILOGUE()

C ---------------------------------------------------------------------------
C mpn_mod_1s_4p_cps
C
C Stack arguments: cps table pointer at sp + 4, b at sp + 8 (at entry).
C Fills seven words of the table: bi, cnt, B1modb .. B5modb, at byte
C offsets 0, 4, 8 .. 24.
C
C Register roles:
C   ebp  table pointer
C   ebx  b shifted left by cnt
C   ecx  cnt = 31 xor bsr(b), the number of leading zero bits of b
C   edi  first -b (shifted), then bi
C   esi  current BKmodb value before it is shifted right by cnt
C
C b = 0 is not handled: bsr leaves its destination undefined for a zero
C source and the div below would then divide by zero.
C ---------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
C CAUTION: This is the same code as in k7/mod_1_4.asm
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
	mov	24(%esp), %ebx		C b
	bsr	%ebx, %ecx
	xor	$31, %ecx		C cnt
	sal	%cl, %ebx		C b << cnt

C bi = floor((~b * 2^32 + 0xFFFFFFFF) / b), with b the shifted divisor.
	mov	%ebx, %edx
	not	%edx
	mov	$-1, %eax
	div	%ebx
	xor	%edi, %edi
	sub	%ebx, %edi		C -b
	mov	$1, %esi
	mov	%eax, (%ebp)		C store bi
	mov	%ecx, 4(%ebp)		C store cnt

C First constant: esi = -b * ((1 << cnt) | (bi >> (32 - cnt)))
	shld	%cl, %eax, %esi
	imul	%edi, %esi
	mov	%eax, %edi		C keep bi for the later multiplies
	mul	%esi			C edx:eax = bi * esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 8(%ebp)		C store B1modb

C Each following constant is derived from the previous one in the same
C way: take the candidate ~edx * b, pick candidate + b instead when the
C low product word in eax is below the candidate, then multiply the
C result by bi again to set up the next step.
	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 12(%ebp)		C store B2modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 16(%ebp)		C store B3modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 20(%ebp)		C store B4modb

C Last step: b is not needed afterwards, so the result is built in ebx.
	not	%edx
	imul	%ebx, %edx
	add	%edx, %ebx
	cmp	%edx, %eax
	cmovnc(	%edx, %ebx)

	shr	%cl, %ebx
	mov	%ebx, 24(%ebp)		C store B5modb

	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	ret
EPILOGUE()