dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/rshift.asm

dnl  Intel Pentium mpn_rshift -- mpn right shift.

dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C			cycles/limb
C P5,P54:		 6.0
C P55:			 5.375


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
C -----------------------------------------------------------------------
C mpn_rshift (dst, src, size, shift)
C
C Shifts the size-limb number at src right by shift bits, stores the
C result at dst and returns the carry limb in %eax.  Limbs are 4 bytes.
C
C Two code paths:
C   L(normal)   any shift count; walks from the least significant limb
C               upwards using double-precision shifts.  The carry limb is
C               computed first and parked on the stack.
C   L(special)  shift == 1 only; walks from the most significant limb
C               downwards using rcrl, passing the shifted-out bit from
C               limb to limb in the carry flag.
C
C Register roles (both paths):
C   %edi  destination pointer       %esi  source pointer
C   %ebp  limb / iteration counter  %ecx  shift count (%cl is used)
C   %eax, %ebx, %edx  rotating limb registers
C
C %edi, %esi, %ebx and %ebp are saved on entry and restored before ret.
C
C NOTE(review): there is no check for size == 0; both paths load one source
C limb unconditionally, so callers presumably guarantee size >= 1.
C The shrdl(...) macro is defined outside this file; the comments below
C assume it expands to the x86 double-precision right shift.
C -----------------------------------------------------------------------

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_rshift)

	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%ebp
deflit(`FRAME',16)

	movl	PARAM_DST,%edi
	movl	PARAM_SRC,%esi
	movl	PARAM_SIZE,%ebp
	movl	PARAM_SHIFT,%ecx

C We can use faster code for shift-by-1 under certain conditions.
C Both comparisons below are unsigned (jnc).
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
	leal	(%edi,%ebp,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)		C jump if s_ptr >= res_ptr + size

C General shift-by-N path, least significant limb first.
L(normal):
	movl	(%esi),%edx		C edx = lowest source limb
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl(	%cl, %edx, %eax)	C compute carry limb
	pushl	%eax			C push carry limb onto stack

	decl	%ebp			C size-1 limbs still to be loaded
	pushl	%ebp			C keep it for the remainder count at L(end)
	shrl	$3,%ebp			C number of 8-limb iterations
	jz	L(end)

	movl	(%edi),%eax		C fetch destination cache line

C Main loop, 8 limbs per iteration.  On entry to each iteration %edx holds
C the most recently loaded source limb whose result is not yet stored.
	ALIGN(4)
L(oop):	movl	28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebx
	movl	12(%esi),%eax
	shrdl(	%cl, %ebx, %edx)
	shrdl(	%cl, %eax, %ebx)
	movl	%edx,8(%edi)
	movl	%ebx,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebx
	shrdl(	%cl, %edx, %eax)
	shrdl(	%cl, %ebx, %edx)
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebp
	jnz	L(oop)

C Remainder loop: (size-1) mod 8 limbs, one per iteration.
L(end):	popl	%ebp
	andl	$7,%ebp
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shrdl(	%cl,%eax,%edx)		C compute result limb
	movl	%edx,(%edi)
	movl	%eax,%edx
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebp
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		C compute most significant limb
	movl	%edx,(%edi)		C store it

	popl	%eax			C pop carry limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret


C Shift-by-1 special case.  We loop from the most significant end of the
C arrays, shifting each limb with rcrl so that the bit shifted out of one
C limb enters the top of the next lower limb through the carry flag.
C Working downwards is only permissible if res_ptr + 1 >= s_ptr or the
C source and destination do not overlap (the two tests made before
C L(normal)); since the function is documented to work for overlapping
C source and destination, every other case uses L(normal) instead.
C
C The carry flag is live across the loop control, so only instructions
C that leave it alone (movl, leal, incl, decl, popl) are used there.

L(special):
	leal	-4(%edi,%ebp,4),%edi	C edi = address of highest destination limb
	leal	-4(%esi,%ebp,4),%esi	C esi = address of highest source limb

	movl	(%esi),%edx
	subl	$4,%esi

	decl	%ebp
	pushl	%ebp			C size-1, for the remainder count at L(Lend)
	shrl	$3,%ebp			C number of 8-limb iterations

	shrl	%edx			C shift top limb, low bit goes to carry
	incl	%ebp			C incl/decl pair sets the zero flag from
	decl	%ebp			C the count without clobbering carry
	jz	L(Lend)

	movl	(%edi),%eax		C fetch destination cache line

C Main loop, 8 limbs per iteration, addresses descending.
	ALIGN(4)
L(Loop):
	movl	-28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	%eax
	movl	%ebx,(%edi)
	rcrl	%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebx
	movl	-12(%esi),%eax
	rcrl	%ebx
	movl	%edx,-8(%edi)
	rcrl	%eax
	movl	%ebx,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebx
	rcrl	%edx
	movl	%eax,-16(%edi)
	rcrl	%ebx
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	%eax
	movl	%ebx,-24(%edi)
	rcrl	%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		C use leal not to clobber carry
	leal	-32(%edi),%edi
	decl	%ebp
	jnz	L(Loop)

C Remainder: (size-1) mod 8 limbs.  andl destroys the carry flag, so it is
C saved in %eax as 0 or -1 first and regenerated by adding %eax to itself.
L(Lend):
	popl	%ebp
	sbbl	%eax,%eax		C save carry in %eax
	andl	$7,%ebp
	jz	L(Lend2)
	addl	%eax,%eax		C restore carry from eax
L(Loop2):
	movl	%edx,%ebx
	movl	(%esi),%edx
	rcrl	%edx
	movl	%ebx,(%edi)

	leal	-4(%esi),%esi		C use leal not to clobber carry
	leal	-4(%edi),%edi
	decl	%ebp
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		C restore carry from eax
L(L1):	movl	%edx,(%edi)		C store last limb

	movl	$0,%eax			C movl rather than xorl, to keep carry
	rcrl	%eax			C return value: shifted-out bit in bit 31

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

EPILOGUE()