dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/dive_1.asm

dnl  Intel P6 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C         odd   even  divisor
C P6:    10.0   12.0  cycles/limb


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C The odd case is basically the same as mpn_modexact_1_odd, just with an
C extra store, and it runs at the same 10 cycles which is the dependent
C chain.
C
C The shifts for the even case aren't on the dependent chain so in principle
C it could run the same too, but nothing running at 10 has been found.
C Perhaps there are too many uops (an extra 4 over the odd case).

C -----------------------------------------------------------------------
C mpn_divexact_1 (dst, src, size, divisor)
C
C Stores size limbs at dst, computed from the size limbs at src by
C multiplying with the inverse of the divisor modulo 2^32 instead of
C dividing.  The result is the true quotient only when src is an exact
C multiple of divisor.
C
C Method, as implemented below:
C   1. ecx = number of trailing zero bits of divisor (bsfl).  The odd part
C      d = divisor >> ecx replaces the value in the PARAM_DIVISOR slot.
C   2. inv = inverse of d mod 2^32.  An 8 bit seed is read from
C      binvert_limb_table at index (d/2) & 127 and refined by two steps
C      of inv = 2*inv - inv*inv*d.
C   3. For each limb, low to high:
C        s = source limb (even case: shifted right by ecx, with the low
C            bits of the next limb shifted in from the top)
C        q = (s - carry bit - carry limb) * inv, stored to dst
C        carry limb = high half of q*d
C        carry bit  = borrow count from the two subtractions
C
C Register and stack effects:
C   - ebx, esi, edi, ebp are saved in a 20 byte local frame and restored.
C     eax, ecx, edx are left modified.
C   - The PARAM_DIVISOR slot is overwritten with d and the PARAM_DST slot
C     with &dst[size].
C   - size must be at least 1, both loops process one limb before the
C     counter is tested.
C   - NOTE(review): bsfl leaves its destination undefined for a zero
C     source, so divisor is presumably required to be nonzero.  Confirm
C     against the caller contract.
C   - NOTE(review): defframe, FRAME_subl_esp, PROLOGUE, LEA, ASSERT, L and
C     shrdl are macros from the included m4 definitions.  The PARAM_
C     offsets presumably count from esp at function entry.  Confirm in
C     the m4 definitions.
C -----------------------------------------------------------------------

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
defframe(VAR_INVERSE, -20)
deflit(STACK_SPACE, 20)

        TEXT

        ALIGN(16)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

        movl    PARAM_DIVISOR, %eax
        subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi

        movl    %ebx, SAVE_EBX
        movl    PARAM_SIZE, %ebx

        bsfl    %eax, %ecx              C trailing twos

        movl    %ebp, SAVE_EBP

        shrl    %cl, %eax               C d without twos

        movl    %eax, %edx
        shrl    %eax                    C d/2 without twos

        movl    %edx, PARAM_DIVISOR     C arg slot now holds odd d
        andl    $127, %eax              C table index, 7 bits

ifdef(`PIC',`
        LEA(    binvert_limb_table, %ebp)
        movzbl  (%eax,%ebp), %ebp               C inv 8 bits
',`
        movzbl  binvert_limb_table(%eax), %ebp  C inv 8 bits
')

        C First refinement step, result in eax.

        leal    (%ebp,%ebp), %eax       C 2*inv

        imull   %ebp, %ebp              C inv*inv

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        leal    (%esi,%ebx,4), %esi     C src end

        imull   PARAM_DIVISOR, %ebp     C inv*inv*d

        subl    %ebp, %eax              C inv = 2*inv - inv*inv*d

        C Second refinement step, result in ebp.

        leal    (%eax,%eax), %ebp       C 2*inv

        imull   %eax, %eax              C inv*inv

        leal    (%edi,%ebx,4), %edi     C dst end
        negl    %ebx                    C -size

        movl    %edi, PARAM_DST         C arg slot now holds &dst[size]

        imull   PARAM_DIVISOR, %eax     C inv*inv*d

        subl    %eax, %ebp              C inv = 2*inv - inv*inv*d

        ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
        movl    PARAM_DIVISOR, %eax
        imull   %ebp, %eax
        cmpl    $1, %eax')

        movl    %ebp, VAR_INVERSE
        movl    (%esi,%ebx,4), %eax     C src[0]

        orl     %ecx, %ecx              C any trailing twos?
        jnz     L(even)

        C ecx initial carry is zero
        jmp     L(odd_entry)


C The dependent chain here is
C
C       subl    %edx, %eax       1
C       imull   %ebp, %eax       4
C       mull    PARAM_DIVISOR    5
C                              ----
C       total                   10
C
C and this is the measured speed.  No special scheduling is necessary, out
C of order execution hides the load latency.
C
C Note the loop below takes the inverse from VAR_INVERSE, not from ebp as
C the chain above is written.

L(odd_top):
        C eax   scratch (src limb)
        C ebx   counter, limbs, negative
        C ecx   carry bit
        C edx   carry limb, high of last product
        C esi   &src[size]
        C edi   &dst[size]
        C ebp   (not referenced in this loop)

        mull    PARAM_DIVISOR           C edx = high of q*d

        movl    (%esi,%ebx,4), %eax
        subl    %ecx, %eax              C minus carry bit

        sbbl    %ecx, %ecx              C ecx = -borrow
        subl    %edx, %eax              C minus carry limb

        sbbl    $0, %ecx                C accumulate second borrow

L(odd_entry):
        imull   VAR_INVERSE, %eax       C q

        movl    %eax, (%edi,%ebx,4)
        negl    %ecx                    C borrow count made positive

        incl    %ebx
        jnz     L(odd_top)


        movl    SAVE_ESI, %esi

        movl    SAVE_EDI, %edi

        movl    SAVE_EBP, %ebp

        movl    SAVE_EBX, %ebx
        addl    $STACK_SPACE, %esp

        ret


L(even):
        C eax   src[0]
        C ebx   counter, limbs, negative
        C ecx   shift
        C edx
        C esi   &src[size]
        C edi   &dst[size]
        C ebp

        xorl    %ebp, %ebp              C initial carry bit
        xorl    %edx, %edx              C initial carry limb (for size==1)

        incl    %ebx
        jz      L(even_one)

        movl    (%esi,%ebx,4), %edi     C src[1]

        shrdl(  %cl, %edi, %eax)

        jmp     L(even_entry)


L(even_top):
        C eax   scratch
        C ebx   counter, limbs, negative
        C ecx   shift
        C edx   scratch
        C esi   &src[size]
        C edi   &dst[size] and scratch
        C ebp   carry bit

        movl    (%esi,%ebx,4), %edi     C next limb, supplies high bits

        mull    PARAM_DIVISOR           C edx = high of q*d

        movl    -4(%esi,%ebx,4), %eax
        shrdl(  %cl, %edi, %eax)

        subl    %ebp, %eax              C minus carry bit

        sbbl    %ebp, %ebp              C ebp = -borrow
        subl    %edx, %eax              C minus carry limb

        sbbl    $0, %ebp                C accumulate second borrow

L(even_entry):
        imull   VAR_INVERSE, %eax       C q

        movl    PARAM_DST, %edi         C reload &dst[size], edi was scratch
        negl    %ebp                    C borrow count made positive

        movl    %eax, -4(%edi,%ebx,4)
        incl    %ebx
        jnz     L(even_top)


        C Last limb: there is no higher limb to shift in, so a plain shrl
        C is used.

        mull    PARAM_DIVISOR

        movl    -4(%esi), %eax          C src[size-1]

L(even_one):
        shrl    %cl, %eax
        movl    SAVE_ESI, %esi

        subl    %ebp, %eax
        movl    SAVE_EBP, %ebp

        subl    %edx, %eax
        movl    SAVE_EBX, %ebx

        imull   VAR_INVERSE, %eax

        movl    %eax, -4(%edi)          C dst[size-1]
        movl    SAVE_EDI, %edi          C restored last, needed by the store
        addl    $STACK_SPACE, %esp

        ret

EPILOGUE()
ASM_END()