dnl  Intel P6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- division by a single limb,
dnl  quotient only, computed from the low limb upward with a modular inverse.

dnl  Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C       odd  even  divisor
C P6:  10.0  12.0 cycles/limb

C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)

C The odd case is basically the same as mpn_modexact_1_odd, just with an
C extra store, and it runs at the same 10 cycles which is the dependent
C chain.
C
C The shifts for the even case aren't on the dependent chain so in principle
C it could run the same too, but nothing running at 10 has been found.
C Perhaps there's too many uops (an extra 4 over the odd case).

C Incoming arguments.  Three of these slots are overwritten while running:
C   PARAM_DIVISOR  mpn_bdiv_q_1 stores the divisor with its trailing zero
C                  bits shifted out, so the shared loops always multiply
C                  by that value.
C   PARAM_DST      the even path stores the dst end pointer here.
C   PARAM_SRC      the even path stores the inverse here (VAR_INVERSE).
C PARAM_INVERSE and PARAM_SHIFT are read only by mpn_pi1_bdiv_q_1.

defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

C Save area for the four registers restored before each ret.

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
deflit(STACK_SPACE, 16)

dnl  re-use parameter space
dnl  (PARAM_SRC has already been loaded into a register by the time the
dnl  even path writes the inverse into this slot)
define(VAR_INVERSE,`PARAM_SRC')

	TEXT

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Writes size limbs to dst.  Each step forms
C
C	q     = (s - carry) * inverse	(low 32 bits)
C	carry = borrow from the subtraction + high 32 bits of q * divisor
C
C where s is the next source limb.  With shift == 0 (odd loop) s is the
C source limb itself.  With shift != 0 (even loop) s is the source limb
C shifted right by shift bits with the low bits of the following limb
C brought in at the top, and zeros brought in for the last limb.
C
C Assumptions not checked here (confirm against callers): size >= 1, divisor
C is odd, and inverse * divisor == 1 mod 2^32.  mpn_bdiv_q_1 below sets up
C exactly that before joining at L(common).
C
C NOTE(review): no separate return value is computed.  Both exits leave the
C last quotient limb in eax.  Confirm against the prototype in use whether
C callers rely on the mp_limb_t result.

	ALIGN(16)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	movl	%ebp, SAVE_EBP
	movl	PARAM_INVERSE, %ebp

	movl	PARAM_SHIFT, %ecx	C trailing twos

C mpn_bdiv_q_1 jumps in here with esi=src, ebx=size, ebp=inverse, ecx=shift,
C the odd part of the divisor in PARAM_DIVISOR, and esi/ebx/ebp already
C saved.  edi is saved below, so it is common to both entry points.

L(common):
	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	leal	(%esi,%ebx,4), %esi	C src end

	leal	(%edi,%ebx,4), %edi	C dst end
	negl	%ebx			C -size

	movl	(%esi,%ebx,4), %eax	C src[0]

	orl	%ecx, %ecx		C shift == 0 means odd divisor
	jz	L(odd_entry)

	C The even loop needs edi and ebp as scratch, so the dst end pointer
	C and the inverse are kept in memory from here on.

	movl	%edi, PARAM_DST
	movl	%ebp, VAR_INVERSE

L(even):
	C eax	src[0]
	C ebx	counter, limbs, negative
	C ecx	shift
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	inverse (also in VAR_INVERSE), about to be cleared

	xorl	%ebp, %ebp		C initial carry bit
	xorl	%edx, %edx		C initial carry limb (for size==1)

	incl	%ebx
	jz	L(even_one)

	movl	(%esi,%ebx,4), %edi	C src[1]

	shrdl(	%cl, %edi, %eax)

	jmp	L(even_entry)


L(even_top):
	C eax	scratch
	C ebx	counter, limbs, negative
	C ecx	shift
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size] and scratch
	C ebp	carry bit

	movl	(%esi,%ebx,4), %edi

	mull	PARAM_DIVISOR		C edx = carry limb, high of q*d

	movl	-4(%esi,%ebx,4), %eax
	shrdl(	%cl, %edi, %eax)

	subl	%ebp, %eax		C subtract carry bit

	sbbl	%ebp, %ebp		C ebp = -borrow
	subl	%edx, %eax		C subtract carry limb

	sbbl	$0, %ebp		C ebp = -(both borrows)

L(even_entry):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	PARAM_DST, %edi		C reload &dst[size], edi was scratch
	negl	%ebp			C carry bit made positive

	movl	%eax, -4(%edi,%ebx,4)
	incl	%ebx
	jnz	L(even_top)

	mull	PARAM_DIVISOR

	movl	-4(%esi), %eax		C last source limb

L(even_one):
	C Last limb, or the only limb when size==1.  There is no following
	C limb, so a plain shrl brings in zeros at the top.  The register
	C restores are interleaved with the arithmetic.

	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebp, %eax
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

C The dependent chain here is
C
C	subl	%edx, %eax       1
C	imull	%ebp, %eax       4
C	mull	PARAM_DIVISOR    5
C			       ----
C	total			10
C
C and this is the measured speed.  No special scheduling is necessary, out
C of order execution hides the load latency.

L(odd_top):
	C eax	scratch (src limb)
	C ebx	counter, limbs, negative
	C ecx	carry bit
	C edx	carry limb, high of last product
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	inverse

	mull	PARAM_DIVISOR

	movl	(%esi,%ebx,4), %eax
	subl	%ecx, %eax		C subtract carry bit

	sbbl	%ecx, %ecx		C ecx = -borrow
	subl	%edx, %eax		C subtract carry limb

	sbbl	$0, %ecx		C ecx = -(both borrows)

L(odd_entry):
	C On the first pass ecx still holds the zero shift from L(common),
	C which serves as a zero initial carry bit.

	imull	%ebp, %eax		C quotient limb

	movl	%eax, (%edi,%ebx,4)
	negl	%ecx			C carry bit made positive

	incl	%ebx
	jnz	L(odd_top)


	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()

C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C Front end for the loops above.  It counts the trailing zero bits of the
C divisor into ecx, shifts them out, stores the resulting odd divisor back
C into PARAM_DIVISOR, computes its inverse mod 2^32 into ebp, then jumps to
C L(common) inside mpn_pi1_bdiv_q_1, which also performs the return.
C
C divisor is presumably required to be nonzero (bsfl leaves its destination
C undefined for a zero source) -- confirm against callers.

	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	bsfl	%eax, %ecx		C trailing twos

	movl	%ebp, SAVE_EBP

	shrl	%cl, %eax		C d without twos

	movl	%eax, %edx
	shrl	%eax			C d/2 without twos

	movl	%edx, PARAM_DIVISOR
	andl	$127, %eax		C 7-bit table index

ifdef(`PIC',`
	LEA(	binvert_limb_table, %ebp)
	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
')

	C Two Newton steps, each doubling the number of correct low bits of
	C the inverse (8 to 16, then 16 to 32).

	leal	(%ebp,%ebp), %eax	C 2*inv

	imull	%ebp, %ebp		C inv*inv
	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	leal	(%eax,%eax), %ebp	C 2*inv

	imull	%eax, %eax		C inv*inv
	imull	%edx, %eax		C inv*inv*d

	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d

	jmp	L(common)

EPILOGUE()
ASM_END()