dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mod_1_1.asm (about)

dnl  x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO:
C  * Optimize.  The present code was written quite straightforwardly.
C  * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
C  * Write a cps function that uses sse2 insns.

C                          cycles/limb
C P6 model 0-8,10-12             -
C P6 model 9   (Banias)          ?
C P6 model 13  (Dothan)          ?
C P4 model 0-1 (Willamette)      ?
C P4 model 2   (Northwood)      16
C P4 model 3-4 (Prescott)       18

C mpn_mod_1_1p (ap, n, b, cps)
C
C INPUT PARAMETERS (stack offsets on entry, before any push)
C ap		sp + 4		limb array; higher addresses are folded first
C n		sp + 8		limb count; ap[n-1] and ap[n-2] are read
C				unconditionally, so n >= 2 is presumably
C				required (see the FIXME below)
C b		sp + 12		divisor; NOTE(review): presumably already
C				shifted left by cnt, confirm against callers
C cps		sp + 16		table filled in by mpn_mod_1_1p_cps:
C				0: bi   4: cnt   8: B1modb   12: B2modb
C
C Result in eax.  ebx is saved and restored; ecx, edx and mm0-mm7 are
C used as scratch, and emms is executed before returning.

define(`B1modb', `%mm1')
define(`B2modb', `%mm2')
define(`ap',     `%edx')
define(`n',      `%eax')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
	push	%ebx			C holds b during the final fixups
	mov	8(%esp), ap
	mov	12(%esp), n
	mov	20(%esp), %ecx		C ecx = cps
	movd	8(%ecx), B1modb
	movd	12(%ecx), B2modb

	lea	-4(ap,n,4), ap		C point at ap[n-1]

C FIXME: See comment in generic/mod_1_1.c.
	movd	(ap), %mm7		C ap[n-1]
	movd	-4(ap), %mm4		C ap[n-2]
	pmuludq	B1modb, %mm7		C ap[n-1] * B1modb
	paddq	%mm4, %mm7		C 64-bit running value r
	add	$-2, n			C two limbs consumed so far
	jz	L(tail)			C n was 2, nothing left to fold in

C Main loop, one limb per iteration:
C   r = limb + low32(r) * B1modb + high32(r) * B2modb
	ALIGN(8)
L(fold):
	movq	%mm7, %mm6		C copy r; pmuludq reads its low half
	psrlq	$32, %mm7		C rh
	movd	-8(ap), %mm0		C next lower limb
	add	$-4, ap
	pmuludq	B2modb, %mm7		C rh * B2modb
	pmuludq	B1modb, %mm6		C rl * B1modb
	add	$-1, n			C sets ZF for the jnz below
	paddq	%mm0, %mm7
	paddq	%mm6, %mm7
	jnz	L(fold)			C MMX adds leave EFLAGS alone

C Post-loop reduction of the 64-bit r to a single limb.
L(tail):
	pcmpeqd	%mm4, %mm4		C all ones
	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
	pand	%mm7, %mm4		C rl
	psrlq	$32, %mm7		C rh
	pmuludq	B1modb, %mm7		C rh * B1modb
	paddq	%mm4, %mm7		C rh * B1modb + rl
	movd	4(%ecx), %mm4		C cnt
	psllq	%mm4, %mm7		C rh,rl shifted left by cnt
	movq	%mm7, %mm2		C keep a copy; B2modb is dead from here
	psrlq	$32, %mm7		C rh
	movd	(%ecx), %mm1		C bi; B1modb is dead from here
	pmuludq	%mm7, %mm1		C rh * bi
	paddq	%mm2, %mm1		C high half qh-1, low half ql
	movd	%mm1, %ecx		C ql (cps pointer no longer needed)
	psrlq	$32, %mm1		C qh-1
	movd	16(%esp), %mm3		C b
	pmuludq	%mm1, %mm3		C (qh-1) * b
	psubq	%mm3, %mm2		C low half is r (could use psubd)
	movd	%mm2, %eax		C r
	mov	16(%esp), %ebx		C b
	sub	%ebx, %eax		C r - b
	cmp	%eax, %ecx		C carry when ql < r (unsigned)
	lea	(%eax,%ebx), %edx	C r + b, computed without touching flags
	cmovc(	%edx, %eax)		C add b back on carry
	movd	%mm4, %ecx		C cnt, for the closing shift
	cmp	%ebx, %eax
	jb	L(done)			C already below b
	sub	%ebx, %eax		C otherwise take off one more b
L(done):
	emms
	pop	%ebx
	shr	%cl, %eax		C shift the remainder back down by cnt
	ret
EPILOGUE()

C mpn_mod_1_1p_cps (cps, b)
C
C Stack arguments on entry: cps at sp + 4, b at sp + 8.
C Stores four limbs through cps:
C   0: bi	quotient of (~bn * 2^32 + 2^32 - 1) / bn, with bn = b << cnt
C   4: cnt	31 - (index of the highest set bit of b)
C   8: B1modb
C  12: B2modb
C b must be nonzero: bsr leaves its destination undefined for a zero source.

PROLOGUE(mpn_mod_1_1p_cps)
C CAUTION: This is the same code as in k7/mod_1_1.asm
	push	%ebp
	push	%esi
	push	%ebx
	mov	20(%esp), %ebp		C b
	mov	16(%esp), %esi		C cps
	bsr	%ebp, %ecx		C index of the highest set bit
	xor	$31, %ecx		C cnt = 31 - index
	sal	%cl, %ebp		C bn = b << cnt, top bit now set
	mov	%ebp, %edx
	not	%edx			C dividend high word = ~bn
	mov	$-1, %eax		C dividend low word = all ones
	div	%ebp			C eax = bi
	mov	%eax, (%esi)		C store bi
	mov	%ecx, 4(%esi)		C store cnt
	mov	%ebp, %ebx
	neg	%ebx			C -bn
	mov	$1, %edx
	shld	%cl, %eax, %edx		C (1 << cnt) | top cnt bits of bi
					C (stays 1 when cnt is 0)
	imul	%edx, %ebx		C B1modb, still scaled by 2^cnt
	mul	%ebx			C edx:eax = bi * B1modb
	add	%ebx, %edx		C high word + B1modb
	not	%edx
	imul	%ebp, %edx		C times bn: candidate for B2modb
	add	%edx, %ebp		C candidate + bn
	cmp	%edx, %eax		C carry when eax < candidate (unsigned)
	cmovc(	%ebp, %edx)		C on carry use candidate + bn instead
	shr	%cl, %ebx
	mov	%ebx, 8(%esi)		C store B1modb
	shr	%cl, %edx
	mov	%edx, 12(%esi)		C store B2modb
	pop	%ebx
	pop	%esi
	pop	%ebp
	ret
EPILOGUE()