github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/bdiv_q_1.asm (about) 1 dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. 2 3 dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato. 4 5 dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. 6 7 dnl This file is part of the GNU MP Library. 8 dnl 9 dnl The GNU MP Library is free software; you can redistribute it and/or modify 10 dnl it under the terms of either: 11 dnl 12 dnl * the GNU Lesser General Public License as published by the Free 13 dnl Software Foundation; either version 3 of the License, or (at your 14 dnl option) any later version. 15 dnl 16 dnl or 17 dnl 18 dnl * the GNU General Public License as published by the Free Software 19 dnl Foundation; either version 2 of the License, or (at your option) any 20 dnl later version. 21 dnl 22 dnl or both in parallel, as here. 23 dnl 24 dnl The GNU MP Library is distributed in the hope that it will be useful, but 25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27 dnl for more details. 28 dnl 29 dnl You should have received copies of the GNU General Public License and the 30 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31 dnl see https://www.gnu.org/licenses/. 32 33 include(`../config.m4') 34 35 36 C P4: 19.0 cycles/limb 37 38 C Pairs of movd's are used to avoid unaligned loads. Despite the loads not 39 C being on the dependent chain and there being plenty of cycles available, 40 C using an unaligned movq on every second iteration measured about 23 c/l. 41 C 42 43 defframe(PARAM_SHIFT, 24) 44 defframe(PARAM_INVERSE,20) 45 defframe(PARAM_DIVISOR,16) 46 defframe(PARAM_SIZE, 12) 47 defframe(PARAM_SRC, 8) 48 defframe(PARAM_DST, 4) 49 50 TEXT 51 52 C mp_limb_t 53 C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, 54 C mp_limb_t inverse, int shift) 55 ALIGN(32) 56 PROLOGUE(mpn_pi1_bdiv_q_1) 57 deflit(`FRAME',0) 58 59 movl PARAM_SIZE, %edx 60 61 movl PARAM_SRC, %eax 62 63 movl PARAM_DIVISOR, %ecx 64 65 movd %ecx, %mm6 66 movl PARAM_SHIFT, %ecx 67 68 movd %ecx, %mm7 C shift 69 70 C 71 72 movl PARAM_INVERSE, %ecx 73 movd %ecx, %mm5 C inv 74 75 movl PARAM_DST, %ecx 76 pxor %mm1, %mm1 C initial carry limb 77 pxor %mm0, %mm0 C initial carry bit 78 79 subl $1, %edx 80 jz L(done) 81 82 pcmpeqd %mm4, %mm4 83 psrlq $32, %mm4 C 0x00000000FFFFFFFF 84 85 C The dependent chain here is as follows. 86 C 87 C latency 88 C psubq s = (src-cbit) - climb 2 89 C pmuludq q = s*inverse 8 90 C pmuludq prod = q*divisor 8 91 C psrlq climb = high(prod) 2 92 C -- 93 C 20 94 C 95 C Yet the loop measures 19.0 c/l, so obviously there's something gained 96 C there over a straight reading of the chip documentation. 97 98 L(top): 99 C eax src, incrementing 100 C ebx 101 C ecx dst, incrementing 102 C edx counter, size-1 iterations 103 C 104 C mm0 carry bit 105 C mm1 carry limb 106 C mm4 0x00000000FFFFFFFF 107 C mm5 inverse 108 C mm6 divisor 109 C mm7 shift 110 111 movd (%eax), %mm2 112 movd 4(%eax), %mm3 113 addl $4, %eax 114 punpckldq %mm3, %mm2 115 116 psrlq %mm7, %mm2 117 pand %mm4, %mm2 C src 118 psubq %mm0, %mm2 C src - cbit 119 120 psubq %mm1, %mm2 C src - cbit - climb 121 movq %mm2, %mm0 122 psrlq $63, %mm0 C new cbit 123 124 pmuludq %mm5, %mm2 C s*inverse 125 movd %mm2, (%ecx) C q 126 addl $4, %ecx 127 128 movq %mm6, %mm1 129 pmuludq %mm2, %mm1 C q*divisor 130 psrlq $32, %mm1 C new climb 131 132 L(entry): 133 subl $1, %edx 134 jnz L(top) 135 136 L(done): 137 movd (%eax), %mm2 138 psrlq %mm7, %mm2 C src 139 psubq %mm0, %mm2 C src - cbit 140 141 psubq %mm1, %mm2 C src - cbit - climb 142 143 pmuludq %mm5, %mm2 C s*inverse 144 movd %mm2, (%ecx) C q 145 146 emms 147 ret 148 149 EPILOGUE() 150 151 ALIGN(16) 152 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 153 C mp_limb_t divisor); 154 C 155 PROLOGUE(mpn_bdiv_q_1) 156 deflit(`FRAME',0) 157 158 movl PARAM_SIZE, %edx 159 160 movl PARAM_DIVISOR, %ecx 161 162 C eax src 163 C ebx 164 C ecx divisor 165 C edx size-1 166 167 movl %ecx, %eax 168 bsfl %ecx, %ecx C trailing twos 169 170 shrl %cl, %eax C d = divisor without twos 171 movd %eax, %mm6 172 movd %ecx, %mm7 C shift 173 174 shrl %eax C d/2 175 176 andl $127, %eax C d/2, 7 bits 177 178 ifdef(`PIC',` 179 LEA( binvert_limb_table, %ecx) 180 movzbl (%eax,%ecx), %eax C inv 8 bits 181 ',` 182 movzbl binvert_limb_table(%eax), %eax C inv 8 bits 183 ') 184 185 C 186 187 movd %eax, %mm5 C inv 188 189 movd %eax, %mm0 C inv 190 191 pmuludq %mm5, %mm5 C inv*inv 192 193 C 194 195 pmuludq %mm6, %mm5 C inv*inv*d 196 paddd %mm0, %mm0 C 2*inv 197 198 C 199 200 psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d 201 pxor %mm5, %mm5 202 203 paddd %mm0, %mm5 204 pmuludq %mm0, %mm0 C inv*inv 205 206 pcmpeqd %mm4, %mm4 207 psrlq $32, %mm4 C 0x00000000FFFFFFFF 208 209 C 210 211 pmuludq %mm6, %mm0 C inv*inv*d 212 paddd %mm5, %mm5 C 2*inv 213 214 movl PARAM_SRC, %eax 215 movl PARAM_DST, %ecx 216 pxor %mm1, %mm1 C initial carry limb 217 218 C 219 220 psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d 221 222 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS 223 pushl %eax FRAME_pushl() 224 movq %mm6, %mm0 225 pmuludq %mm5, %mm0 226 movd %mm0, %eax 227 cmpl $1, %eax 228 popl %eax FRAME_popl()') 229 230 pxor %mm0, %mm0 C initial carry bit 231 jmp L(entry) 232 233 EPILOGUE() 234 ASM_END()