dnl  Source listing: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mod_1_1.asm

dnl  x86-32 mpn_mod_1_1p, requiring cmov.

dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                           cycles/limb
C P5                             ?
C P6 model 0-8,10-12             ?
C P6 model 9  (Banias)           ?
C P6 model 13 (Dothan)           ?
C P4 model 0  (Willamette)       ?
C P4 model 1  (?)                ?
C P4 model 2  (Northwood)        ?
C P4 model 3  (Prescott)         ?
C P4 model 4  (Nocona)           ?
C AMD K6                         ?
C AMD K7                         7
C AMD K8                         ?

C Register roles inside mpn_mod_1_1p.  %eax and %edx are additionally used as
C the implicit operands of mull and carry the upper residue limb and the
C product halves.
define(`B2mb',   `%ebx')        C B2modb - b, computed once before the loop
define(`r0',     `%esi')        C low limb of the running residue
define(`r2',     `%ebp')        C carry mask, 0 or -1, produced by sbb r2, r2
define(`t0',     `%edi')        C scratch
define(`ap',     `%ecx')        C Also shift count

C Stack frame
C       pre     36(%esp)
C       b       32(%esp)
C       n       28(%esp)
C       ap      24(%esp)
C       return  20(%esp)
C       %ebp    16(%esp)
C       %edi    12(%esp)
C       %esi     8(%esp)
C       %ebx     4(%esp)
C       B2mod     (%esp)
C
C These offsets are valid only after the fifth push (the B2modb copy) in the
C prologue below.  The argument slot n is also used as the loop counter and is
C modified in place.

define(`B2modb', `(%esp)')
define(`n',      `28(%esp)')
define(`b',      `32(%esp)')
define(`pre',    `36(%esp)')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
C
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
C
C Reduces the n-limb number at ap, consuming limbs from the most significant
C end, and returns the result in %eax.  All arguments are read from the stack;
C %ebx, %esi, %edi and %ebp are saved on entry and restored before ret.
C
C The final result is shifted right by cnt, and the unnormalized path shifts
C the two-limb residue left by cnt first, so b is presumably expected to be
C already shifted left by cnt by the caller -- TODO confirm against callers.
C
C NOTE(review): the n < 3 path loads -8(ap) after ap has been advanced by
C 4*n bytes, so with n = 1 it would read the limb below the operand.
C Presumably callers guarantee n >= 2 -- confirm.

ASM_START()
        TEXT
        ALIGN(8)
PROLOGUE(mpn_mod_1_1p)
        push    %ebp
        push    %edi
        push    %esi
        push    %ebx
        mov     32(%esp), %ebp          C pre[]

        mov     12(%ebp), %eax          C B2modb
        push    %eax                    C Put it on stack

        mov     n, %edx
        mov     24(%esp), ap

        lea     (ap, %edx, 4), ap       C ap now points just past the top limb
        mov     -4(ap), %eax            C most significant limb
        cmp     $3, %edx
        jnc     L(first)                C unsigned n >= 3
        mov     -8(ap), r0              C short operand: second limb from the top
        jmp     L(reduce_two)

L(first):
        C First iteration, no r2
        C Fold the top limb into the next two: edx:eax = top * B2modb is added
        C to the limbs at -8(ap) and -12(ap).  The carry out becomes the r2 mask.
        mull    B2modb
        mov     -12(ap), r0
        add     %eax, r0
        mov     -8(ap), %eax
        adc     %edx, %eax
        sbb     r2, r2
        subl    $3, n                   C three limbs consumed; sets ZF for the jz
        lea     -16(ap), ap             C lea leaves the flags alone
        jz      L(reduce_three)         C n was exactly 3

        mov     B2modb, B2mb
        sub     b, B2mb                 C B2mb = B2modb - b
        lea     (B2mb, r0), t0          C t0 = r0 + B2modb - b, as done at L(top)
        jmp     L(mid)

        ALIGN(16)
L(top): C Loopmixed to 7 c/l on k7
        C The second half of one iteration is interleaved with the first half
        C of the next, so instruction order and flag liveness matter here.
        add     %eax, r0                C new limb + low half of the product
        lea     (B2mb, r0), t0          C candidate for the cmovc below; no flags
        mov     r2, %eax
        adc     %edx, %eax              C + high half of the product + carry
        sbb     r2, r2                  C r2 = 0 or -1 from the carry out
L(mid): mull    B2modb                  C edx:eax = upper residue limb * B2modb
        and     B2modb, r2              C r2 = B2modb if the carry mask was set
        add     r0, r2                  C CF from this add is consumed by cmovc
        decl    n                       C dec keeps CF; its ZF feeds the jnz
        mov     (ap), r0                C fetch the next lower limb
        cmovc(  t0, r2)                 C on carry use r0 + B2modb - b instead
        lea     -4(ap), ap
        jnz     L(top)

        C Loop tail: same accumulation as at L(top), without the next multiply.
        add     %eax, r0
        mov     r2, %eax
        adc     %edx, %eax
        sbb     r2, r2

L(reduce_three):
        C Eliminate r2
        and     b, r2                   C r2 = b if the carry mask was set, else 0
        sub     r2, %eax

L(reduce_two):
        C Here the residue is the limb pair eax (high) and r0 (low).
        mov     pre, %ebp
        movb    4(%ebp), %cl            C cnt; overwrites ap, no longer needed
        test    %cl, %cl
        jz      L(normalized)

        C Unnormalized, use B1modb to reduce to size < B b
        mull    8(%ebp)                 C edx:eax = high limb * B1modb
        xor     t0, t0
        add     %eax, r0
        adc     %edx, t0
        mov     t0, %eax

        C Left-shift to normalize
        shld    %cl, r0, %eax C Always use shld?

        shl     %cl, r0
        jmp     L(udiv)

L(normalized):
        C cnt is zero: one conditional subtraction of b from the high limb.
        mov     %eax, t0
        sub     b, t0
        cmovnc( t0, %eax)               C keep eax - b when it did not borrow

L(udiv):
        C Two-limb by one-limb remainder using the precomputed inverse bi.
        lea     1(%eax), t0             C high limb + 1
        mull    (%ebp)                  C edx:eax = high limb * bi
        mov     b, %ebx         C Needed in register for lea
        add     r0, %eax                C low product half + r0
        adc     t0, %edx                C quotient estimate
        imul    %ebx, %edx              C low 32 bits of estimate * b
        sub     %edx, r0                C candidate remainder
        cmp     r0, %eax                C CF set when eax is below the candidate
        lea     (%ebx, r0), %eax        C candidate + b; flags untouched
        cmovnc( r0, %eax)               C no borrow: take the plain candidate
        cmp     %ebx, %eax
        jnc     L(fix)                  C still >= b: subtract b once more
L(ok):  shr     %cl, %eax               C undo the normalization shift

        add     $4, %esp                C drop the stacked B2modb copy
        pop     %ebx
        pop     %esi
        pop     %edi
        pop     %ebp

        ret
L(fix): sub     %ebx, %eax
        jmp     L(ok)
EPILOGUE()

C mpn_mod_1_1p_cps
C
C Fills the four-limb table read by mpn_mod_1_1p.  As read by the code below,
C the first stack argument is a pointer to the table and the second is b.
C Stores, in order: bi at offset 0, cnt at 4, B1modb at 8, B2modb at 12.
C
C cnt is the number of leading zero bits of b, obtained as bsr xor 31.  bsr
C leaves its destination undefined for a zero source, so b is presumably
C required to be nonzero -- TODO confirm.
C
C NOTE(review): %ebx is pushed and popped but never written in this routine.
C The 16(%esp) offset used for the table pointer counts that push, so the
C two must be changed together if the save is ever removed.

PROLOGUE(mpn_mod_1_1p_cps)
        push    %ebp
        mov     12(%esp), %ebp          C b, second argument after one push
        push    %esi
        bsr     %ebp, %ecx              C index of the highest set bit
        push    %ebx
        xor     $31, %ecx               C cnt = 31 - index
        mov     16(%esp), %esi          C table pointer, after three pushes
        sal     %cl, %ebp               C normalize: top bit of b now set
        mov     %ebp, %edx
        not     %edx                    C dividend high limb = complement of b
        mov     $-1, %eax               C dividend low limb = all ones
        div     %ebp                    C On K7, invert_limb would be a few cycles faster.
        mov     %eax, (%esi)            C store bi
        mov     %ecx, 4(%esi)           C store cnt
        neg     %ebp                    C -b, using the normalized b
        mov     $1, %edx
        shld    %cl, %eax, %edx         C edx = 1 shifted left by cnt, top bits of bi shifted in
        imul    %ebp, %edx
        shr     %cl, %edx
        imul    %ebp, %eax              C low 32 bits of -b * bi
        mov     %edx, 8(%esi)           C store B1modb
        mov     %eax, 12(%esi)          C store B2modb
        pop     %ebx
        pop     %esi
        pop     %ebp
        ret
EPILOGUE()