dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/k62mmx/lshift.asm
dnl  AMD K6-2 mpn_lshift -- mpn left shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6-2: 1.75 cycles/limb


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb number at src left by shift bits and store the
C size-limb result at dst.  The return value is the high limb of src
C shifted right by 32-shift, i.e. the bits pushed out of the top.
C
C Limbs are produced from the most significant end downwards, and dst[0]
C is the last thing written on every path.
C
C NOTE(review): the code uses size-1 directly as an index and loop count,
C and uses 32-shift as a shrl count, so it appears to rely on size >= 1
C and 1 <= shift <= 31 -- confirm against the documented mpn_lshift
C contract.
C
C Three code paths, selected on size:
C   size == 1         integer shldl/shll only, no MMX
C   2 <= size <= 8    L(simple), one limb per iteration using MMX
C   size >= 9         L(unroll), four limbs per iteration using MMX
C
C Both MMX paths execute femms before returning.  %ebx is pushed on entry
C and popped again on every exit path.

dnl  Argument slots.  NOTE(review): defframe, deflit and FRAME_pushl are
dnl  defined outside this file; FRAME looks like the count of bytes pushed
dnl  since entry, used to adjust these offsets -- confirm in the m4 support.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

dnl  used after src has been fetched
define(VAR_RETVAL,`PARAM_SRC')

dnl  minimum 9, because unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 9)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C size-1, ZF set when size is 1

	movl	PARAM_SHIFT, %ecx	C movl leaves the decl flags intact
	jnz	L(two_or_more)

	C size == 1.  eax is 0 here, so after shldl it holds just the top
	C shift bits of the src limb, which is the return value.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx

	shldl(	%cl, %edx, %eax)	C return value

	shll	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)	C avoid offset 0x1f
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx,%eax,4), %edx	C src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6
	addl	$32, %ecx		C 32-shift

	shrl	%cl, %edx		C bits shifted out of the high limb
	cmpl	$UNROLL_THRESHOLD-1, %eax

	movl	%edx, VAR_RETVAL	C movl keeps the cmpl flags for jae
	jae	L(unroll)		C size-1 >= UNROLL_THRESHOLD-1


	movd	%ecx, %mm7
	movl	%eax, %ecx

	movl	PARAM_DST, %eax

L(simple):
	C eax	dst
	C ebx	src
	C ecx	counter, size-1 to 1
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C mm7	32-shift
	C
	C With i the ecx counter, each pass loads src[i-1] and src[i] as one
	C qword.  Shifting that right by 32-shift leaves dst[i] in the low
	C dword, which the movd stores.
	C
	C NOTE(review): Zdisp is defined outside this file; presumably it
	C forces an explicit zero displacement in the encoding -- confirm.

	movq	-4(%ebx,%ecx,4), %mm0

	psrlq	%mm7, %mm0

Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
	loop	L(simple)


	C Low limb: dst[0] is src[0] << shift, with nothing shifted in
	C from below.

	movd	(%ebx), %mm0
	popl	%ebx

	psllq	%mm6, %mm0

	movd	%mm0, (%eax)
	movl	%edx, %eax		C retval, edx untouched by the loop

	femms
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	src
	C ecx	32-shift
	C edx	retval (but instead VAR_RETVAL is used)
	C
	C mm6	shift

	addl	$32, %ecx			C 64-shift
	movl	PARAM_DST, %edx

	movd	%ecx, %mm7
	subl	$7, %eax			C size-8

	leal	(%edx,%eax,4), %ecx		C alignment of dst

	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
	testb	$4, %cl

	jz	L(dst_aligned)

	C Bit 2 of dst+4*eax is set, and the qword stores in L(top) are at
	C multiples of 8 bytes from that address, so they would all have
	C bit 2 set too.  Write the single high dst limb on its own and step
	C eax down by one so that the remaining qword stores have bit 2
	C clear (8-byte aligned provided dst itself is limb aligned).

	psllq	%mm6, %mm2

	psrlq	$32, %mm2			C high dword of the shifted qword
	decl	%eax

	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
L(dst_aligned):

	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword


	C This loop is the important bit, the rest is just support for it.
	C Four src limbs are held at the start, and four more will be read.
	C Four dst limbs will be written.  This schedule seems necessary for
	C full speed.
	C
	C The use of size-8 lets the loop stop when %eax goes negative and
	C leaves -4 to -1 which can be tested with test $1 and $2.

L(top):
	C eax	counter, size-8 step by -4 until <0
	C ebx	src
	C ecx
	C edx	dst
	C
	C mm0	src next qword
	C mm1	scratch
	C mm2	src prev qword
	C mm6	shift
	C mm7	64-shift
	C
	C Each dst qword is (prev << shift) | (next >> (64-shift)).  The
	C jnc at the bottom tests the carry from the subl $4 near the top;
	C the MMX instructions and movq in between leave the flags alone.

	psllq	%mm6, %mm2
	subl	$4, %eax

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	movq	24(%ebx,%eax,4), %mm0

	psllq	%mm6, %mm1
	movq	%mm2, 40(%edx,%eax,4)

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm0, %mm1
	movq	16(%ebx,%eax,4), %mm0

	movq	%mm1, 32(%edx,%eax,4)
	jnc	L(top)


	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
	C
	C 8(%ebx,%eax,4) is the next source, and 24(%edx,%eax,4) is the next
	C destination.  %eax is between -4 and -1, representing respectively
	C 0 to 3 extra limbs that must be read.
	C
	C Bit 1 of eax selects the two-limb step below, bit 0 selects the
	C final one-limb step.


	testl	$2, %eax	C testl to avoid bad cache line crossing
	jz	L(finish_nottwo)

	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
	C new mm2 and a new mm0 is loaded.

	psllq	%mm6, %mm2
	movq	%mm0, %mm1

	psrlq	%mm7, %mm0
	subl	$2, %eax		C bit 0 of eax is unchanged by this

	por	%mm0, %mm2
	movq	16(%ebx,%eax,4), %mm0

	movq	%mm2, 32(%edx,%eax,4)
	movq	%mm1, %mm2
L(finish_nottwo):


	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
	C
	C The testb result is consumed by the jz at the end of this group;
	C only MMX instructions and a movq store come in between.

	testb	$1, %al
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	psllq	%mm6, %mm1

	movq	%mm2, 24(%edx,%eax,4)
	jz	L(finish_even)


	C Size is odd, so mm1 and one extra limb to process.
	C
	C mm1 holds src[1,2] << shift and still needs the bits carried up
	C from src[0].  ebx is popped here, hence FRAME 0 for the
	C VAR_RETVAL access further down.

	movd	(%ebx), %mm0		C src[0]
	popl	%ebx
deflit(`FRAME',0)

	movq	%mm0, %mm2
	psllq	$32, %mm0		C src[0] in the high dword, zero below

	psrlq	%mm7, %mm0		C src[0] >> (32-shift), carry for dst[1]

	psllq	%mm6, %mm2		C low dword is src[0] << shift
	por	%mm0, %mm1

	movq	%mm1, 4(%edx)		C dst[1,2]
	movd	%mm2, (%edx)		C dst[0]

	movl	VAR_RETVAL, %eax

	femms
	ret


	nop	C avoid bad cache line crossing
L(finish_even):
deflit(`FRAME',4)
	C Size is even, so only mm1 left to process.
	C
	C ebx is still on the stack when VAR_RETVAL is read on this path,
	C hence FRAME 4.

	movq	%mm1, (%edx)		C dst[0,1]
	movl	VAR_RETVAL, %eax

	popl	%ebx
	femms
	ret

EPILOGUE()