dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/dive_1.asm

dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C Athlon:     11.0
C Hammer:      9.0


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C The dependent chain is mul+imul+sub for 11 cycles and that speed is
C achieved with no special effort.  The load and shrdl latencies are hidden
C by out of order execution.
C
C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C Stack frame.  Positive offsets name the four incoming arguments, negative
C offsets name the register save slots and the two locals that live in the
C STACK_SPACE bytes reserved by the prologue.

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,     -12)
defframe(SAVE_EBP,     -16)
defframe(VAR_INVERSE,  -20)
defframe(VAR_DST_END,  -24)

deflit(STACK_SPACE, 24)

	TEXT

	ALIGN(16)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

C Outline:
C   1. Count the trailing zero bits of the divisor (shift count in ecx) and
C      form the odd part d of the divisor.
C   2. Build inv with d*inv == 1 mod 2^GMP_LIMB_BITS: an 8 bit table lookup
C      refined by two steps of inv = 2*inv - inv*inv*d.
C   3. Walk the source from low to high limb.  Each source limb is shifted
C      right by ecx (pulling bits in from the next limb), reduced by the
C      carry from the previous limb, and multiplied by inv to give the
C      quotient limb.
C
C The PARAM_DIVISOR argument slot is overwritten with the odd divisor d so
C that it can be used as a memory operand for mull.

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C becomes 0 on the first loop pass

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C Shift the divisor right one bit at a time until a set bit falls
	C into the carry flag.  On exit ecx is the number of zero bits
	C removed and eax is the odd divisor shifted right by one.
L(count_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(count_twos)

	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C ebx = odd divisor d
	andl	$127, %eax		C table index, low 7 bits of d/2

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C starting inverse from table
',`
	movzbl	binvert_limb_table(%eax), %eax	C starting inverse from table
')

	C First refinement step, interleaved with the pointer loads.
	leal	(%eax,%eax), %edx	C edx = 2*inv
	movl	%ebx, PARAM_DIVISOR	C argument slot now holds odd d

	imull	%eax, %eax		C eax = inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C eax = inv*inv*d

	subl	%eax, %edx		C edx = 2*inv - inv*inv*d

	C Second refinement step, interleaved with the pointer setup.
	leal	(%edx,%edx), %eax	C eax = 2*inv

	imull	%edx, %edx		C edx = inv*inv

	leal	(%esi,%ebp,4), %esi	C esi = src + 4*size
	leal	(%edi,%ebp,4), %edi	C edi = dst + 4*size
	negl	%ebp			C ebp = -size

	imull	%ebx, %edx		C edx = inv*inv*d

	subl	%edx, %eax		C eax = 2*inv - inv*inv*d

	ASSERT(e,`	C the inverse must satisfy d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C eax = src[0]

	incl	%ebp
	jz	L(single_limb)		C size was 1

	movl	(%esi,%ebp,4), %edx	C edx = src[1]

	shrdl(	%cl, %edx, %eax)	C low limb with bits from src[1]

	movl	%edi, VAR_DST_END	C edi is reused as scratch in the loop
	xorl	%ebx, %ebx		C no carry bit yet
	jmp	L(loop_entry)

	ALIGN(8)
L(loop):
	C Register roles at this point:
	C   eax  quotient limb just stored
	C   ebx  carry bit from the previous limb, 0 or 1
	C   ecx  shift count
	C   edx  scratch
	C   esi  one past the last source limb
	C   edi  one past the last destination limb
	C   ebp  negative limb counter, rising towards zero

	mull	PARAM_DIVISOR		C edx = high limb of q*d

	movl	-4(%esi,%ebp,4), %eax	C limb being divided
	movl	(%esi,%ebp,4), %edi	C next higher limb, edi as scratch

	shrdl(	%cl, %edi, %eax)

	subl	%ebx, %eax		C subtract the carry bit
	setc	%bl			C borrow out of that subtract
	movl	VAR_DST_END, %edi	C put the dst end pointer back

	subl	%edx, %eax		C subtract the carry limb
	adcl	$0, %ebx		C add in the borrow from this subtract

L(loop_entry):
	imull	VAR_INVERSE, %eax	C eax = quotient limb

	movl	%eax, -4(%edi,%ebp,4)
	incl	%ebp
	jnz	L(loop)


	C Most significant limb.  There is no higher limb to shift bits in
	C from, so a plain shrl is used, and no carry out is recorded.
	C Callee saved registers are restored along the way.

	mull	PARAM_DIVISOR		C edx = high limb of q*d

	movl	-4(%esi), %eax		C last source limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C subtract the carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C subtract the carry limb

	imull	VAR_INVERSE, %eax	C eax = top quotient limb

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret


	C size == 1: eax holds src[0] and edi points just past dst[0].
L(single_limb):
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax	C eax = the only quotient limb

	movl	SAVE_EBP, %ebp
	movl	%eax, -4(%edi)

	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()
ASM_END()