github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mode1o.asm (about) 1 dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder. 2 3 dnl Copyright 2000-2002, 2014 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C P5: 23.0 cycles/limb 35 36 37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 38 C mp_limb_t divisor); 39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 40 C mp_limb_t divisor, mp_limb_t carry); 41 C 42 C There seems no way to pair up the two lone instructions in the main loop. 43 C 44 C The special case for size==1 saves about 20 cycles (non-PIC), making it 45 C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at 46 C all sizes. 47 C 48 C Alternatives: 49 C 50 C Using mmx for the multiplies might be possible, with pmullw and pmulhw 51 C having just 3 cycle latencies, but carry bit handling would probably be 52 C complicated. 53 54 defframe(PARAM_CARRY, 16) 55 defframe(PARAM_DIVISOR,12) 56 defframe(PARAM_SIZE, 8) 57 defframe(PARAM_SRC, 4) 58 59 dnl re-using parameter space 60 define(VAR_INVERSE,`PARAM_SIZE') 61 62 TEXT 63 64 ALIGN(16) 65 PROLOGUE(mpn_modexact_1c_odd) 66 deflit(`FRAME',0) 67 68 movl PARAM_DIVISOR, %eax 69 movl PARAM_CARRY, %edx 70 71 jmp L(start_1c) 72 73 EPILOGUE() 74 75 ALIGN(16) 76 PROLOGUE(mpn_modexact_1_odd) 77 deflit(`FRAME',0) 78 79 movl PARAM_DIVISOR, %eax 80 xorl %edx, %edx C carry 81 82 L(start_1c): 83 84 ifdef(`PIC',` 85 ifdef(`DARWIN',` 86 shrl %eax C d/2 87 LEA( binvert_limb_table, %ecx) 88 pushl %ebx FRAME_pushl() 89 movl PARAM_SIZE, %ebx 90 91 andl $127, %eax 92 subl $2, %ebx 93 94 movb (%eax,%ecx), %cl 95 jc L(one_limb) 96 ',` 97 call L(here) FRAME_pushl() 98 L(here): 99 100 shrl %eax C d/2 101 movl (%esp), %ecx C eip 102 103 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx 104 movl %ebx, (%esp) C push ebx 105 106 andl $127, %eax 107 movl PARAM_SIZE, %ebx 108 109 movl binvert_limb_table@GOT(%ecx), %ecx 110 subl $2, %ebx 111 112 movb (%eax,%ecx), %cl C inv 8 bits 113 jc L(one_limb) 114 ') 115 ',` 116 dnl non-PIC 117 shrl %eax C d/2 118 pushl %ebx FRAME_pushl() 119 120 movl PARAM_SIZE, %ebx 121 andl $127, %eax 122 123 subl $2, %ebx 124 jc L(one_limb) 125 126 movb binvert_limb_table(%eax), %cl C inv 8 bits 127 ') 128 129 movl %ecx, %eax 130 addl %ecx, %ecx C 2*inv 131 132 imull %eax, %eax C inv*inv 133 134 imull PARAM_DIVISOR, %eax C inv*inv*d 135 136 subl %eax, %ecx C inv = 2*inv - inv*inv*d 137 138 movl %ecx, %eax 139 addl %ecx, %ecx C 2*inv 140 141 imull %eax, %eax C inv*inv 142 143 imull PARAM_DIVISOR, %eax C inv*inv*d 144 145 subl %eax, %ecx C inv = 2*inv - inv*inv*d 146 pushl %esi FRAME_pushl() 147 148 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 149 movl %ecx, %eax 150 imull PARAM_DIVISOR, %eax 151 cmpl $1, %eax') 152 153 movl PARAM_SRC, %esi 154 movl %ecx, VAR_INVERSE 155 156 movl (%esi), %eax C src[0] 157 leal 4(%esi,%ebx,4), %esi C &src[size-1] 158 159 xorl $-1, %ebx C -(size-1) 160 ASSERT(nz) 161 jmp L(entry) 162 163 164 C The use of VAR_INVERSE means only a store is needed for that value, rather 165 C than a push and pop of say %edi. 166 167 ALIGN(16) 168 L(top): 169 C eax scratch, low product 170 C ebx counter, limbs, negative 171 C ecx carry bit 172 C edx scratch, high product 173 C esi &src[size-1] 174 C edi 175 C ebp 176 177 mull PARAM_DIVISOR C h:dummy = q*d 178 179 movl (%esi,%ebx,4), %eax C src[i] 180 subl %ecx, %edx C h -= -c 181 182 L(entry): 183 subl %edx, %eax C s = src[i] - h 184 185 sbbl %ecx, %ecx C new -c (0 or -1) 186 187 imull VAR_INVERSE, %eax C q = s*i 188 189 incl %ebx 190 jnz L(top) 191 192 193 mull PARAM_DIVISOR 194 195 movl (%esi), %eax C src high 196 subl %ecx, %edx C h -= -c 197 198 cmpl PARAM_DIVISOR, %eax 199 200 jbe L(skip_last) 201 deflit(FRAME_LAST,FRAME) 202 203 204 subl %edx, %eax C s = src[i] - h 205 popl %esi FRAME_popl() 206 207 sbbl %ecx, %ecx C c (0 or -1) 208 popl %ebx FRAME_popl() 209 210 imull VAR_INVERSE, %eax C q = s*i 211 212 mull PARAM_DIVISOR C h:dummy = q*d 213 214 movl %edx, %eax 215 216 subl %ecx, %eax 217 218 ret 219 220 221 C When high<divisor can skip last step. 222 223 L(skip_last): 224 deflit(`FRAME',FRAME_LAST) 225 C eax src high 226 C ebx 227 C ecx 228 C edx r 229 C esi 230 231 subl %eax, %edx C r-s 232 popl %esi FRAME_popl() 233 234 sbbl %eax, %eax C -1 if underflow 235 movl PARAM_DIVISOR, %ebx 236 237 andl %ebx, %eax C divisor if underflow 238 popl %ebx FRAME_popl() 239 240 addl %edx, %eax C addback if underflow 241 242 ret 243 244 245 C Special case for size==1 using a division for r = c-a mod d. 246 C Could look for a-c<d and save a division sometimes, but that doesn't seem 247 C worth bothering about. 248 249 L(one_limb): 250 deflit(`FRAME',4) 251 C eax 252 C ebx size-2 (==-1) 253 C ecx 254 C edx carry 255 C esi src end 256 C edi 257 C ebp 258 259 movl %edx, %eax 260 movl PARAM_SRC, %edx 261 262 movl PARAM_DIVISOR, %ecx 263 popl %ebx FRAME_popl() 264 265 subl (%edx), %eax C c-a 266 267 sbbl %edx, %edx 268 decl %ecx C d-1 269 270 andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a 271 272 divl PARAM_DIVISOR 273 274 movl %edx, %eax 275 276 ret 277 278 EPILOGUE() 279 ASM_END()