dnl  Intel Pentium mpn_lshift -- mpn left shift.

dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C P5,P54:    6.0
C P55:       5.375


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
C mpn_lshift: shift the size-limb operand at src left by the shift argument,
C store the size-limb result at dst, and return in %eax the bits shifted out
C of the most significant limb.
C
C Register use in both code paths:
C	%edi	destination pointer
C	%esi	source pointer
C	%ebp	limb counter
C	%cl	shift count
C	%eax, %ebx, %edx	limbs in flight
C
C Two code paths:
C	L(normal)	any shift count; walks from the most significant limb
C			downwards with shldl, eight limbs per pass of L(oop)
C			and then one limb per pass of L(oop2).
C	L(special)	shift count 1 only; walks from the least significant
C			limb upwards, doubling each limb with adcl so the
C			carry flag carries the bit from limb to limb.
C
C The first source limb is loaded unconditionally, so size must be at least 1.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_lshift)

C Save the four registers used besides %eax, %ecx and %edx (16 bytes).
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%ebp
deflit(`FRAME',16)

	movl	PARAM_DST,%edi
	movl	PARAM_SRC,%esi
	movl	PARAM_SIZE,%ebp
	movl	PARAM_SHIFT,%ecx

C We can use faster code for shift-by-1 under certain conditions.
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%esi),%eax
	cmpl	%edi,%eax
	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
	leal	(%esi,%ebp,4),%eax
	cmpl	%eax,%edi
	jnc	L(special)		C jump if res_ptr >= s_ptr + size

L(normal):
C Point both pointers at the most significant limb.
	leal	-4(%edi,%ebp,4),%edi
	leal	-4(%esi,%ebp,4),%esi

	movl	(%esi),%edx
	subl	$4,%esi
	xorl	%eax,%eax
	shldl(	%cl, %edx, %eax)	C compute carry limb
	pushl	%eax			C push carry limb onto stack

C size-1 limbs remain to be combined; keep that count on the stack for the
C tail loop and run the unrolled loop (size-1)/8 times.
	decl	%ebp
	pushl	%ebp
	shrl	$3,%ebp
	jz	L(end)

	movl	(%edi),%eax		C fetch destination cache line

	ALIGN(4)
C On entry to each pass %edx holds the not yet shifted limb just above the
C eight limbs about to be loaded.
L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	shldl(	%cl, %eax, %ebx)
	shldl(	%cl, %edx, %eax)
	movl	%ebx,(%edi)
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebx
	movl	-12(%esi),%eax
	shldl(	%cl, %ebx, %edx)
	shldl(	%cl, %eax, %ebx)
	movl	%edx,-8(%edi)
	movl	%ebx,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebx
	shldl(	%cl, %edx, %eax)
	shldl(	%cl, %ebx, %edx)
	movl	%eax,-16(%edi)
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	shldl(	%cl, %eax, %ebx)
	shldl(	%cl, %edx, %eax)
	movl	%ebx,-24(%edi)
	movl	%eax,-28(%edi)

	subl	$32,%esi
	subl	$32,%edi
	decl	%ebp
	jnz	L(oop)

C Tail: the remaining (size-1) mod 8 limbs, one per pass.
L(end):	popl	%ebp
	andl	$7,%ebp
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shldl(	%cl,%eax,%edx)
	movl	%edx,(%edi)
	movl	%eax,%edx
	subl	$4,%esi
	subl	$4,%edi
	decl	%ebp
	jnz	L(oop2)

L(end2):
	shll	%cl,%edx		C compute least significant limb
	movl	%edx,(%edi)		C store it

	popl	%eax			C pop carry limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret


C We loop from the least significant end of the arrays, which is only
C permissible when the operands do not overlap in a harmful way (the entry
C test admits res_ptr <= s_ptr + 1 or res_ptr >= s_ptr + size), since the
C function is documented to work for overlapping source and destination.

L(special):
	movl	(%esi),%edx
	addl	$4,%esi

	decl	%ebp
	pushl	%ebp
	shrl	$3,%ebp

	addl	%edx,%edx		C double lowest limb, top bit goes to carry
C incl/decl leave the carry flag alone, so this tests %ebp for zero
C without losing the carry produced by the addl above.
	incl	%ebp
	decl	%ebp
	jz	L(Lend)

	movl	(%edi),%eax		C fetch destination cache line

	ALIGN(4)
C Each store writes the limb doubled one step earlier, so every source limb
C is loaded before the destination limb one position below it is written.
L(Loop):
	movl	28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	4(%esi),%edx
	adcl	%eax,%eax
	movl	%ebx,(%edi)
	adcl	%edx,%edx
	movl	%eax,4(%edi)

	movl	8(%esi),%ebx
	movl	12(%esi),%eax
	adcl	%ebx,%ebx
	movl	%edx,8(%edi)
	adcl	%eax,%eax
	movl	%ebx,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebx
	adcl	%edx,%edx
	movl	%eax,16(%edi)
	adcl	%ebx,%ebx
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	adcl	%eax,%eax
	movl	%ebx,24(%edi)
	adcl	%edx,%edx
	movl	%eax,28(%edi)

	leal	32(%esi),%esi		C use leal not to clobber carry
	leal	32(%edi),%edi
	decl	%ebp
	jnz	L(Loop)

L(Lend):
	popl	%ebp
	sbbl	%eax,%eax		C save carry in %eax
	andl	$7,%ebp
	jz	L(Lend2)
	addl	%eax,%eax		C restore carry from eax
L(Loop2):
	movl	%edx,%ebx
	movl	(%esi),%edx
	adcl	%edx,%edx
	movl	%ebx,(%edi)

	leal	4(%esi),%esi		C use leal not to clobber carry
	leal	4(%edi),%edi
	decl	%ebp
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		C restore carry from eax
L(L1):	movl	%edx,(%edi)		C store last limb

C Turn the final carry flag into a 0 or 1 return value.
	sbbl	%eax,%eax
	negl	%eax

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

EPILOGUE()