dnl  Path: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/k62mmx/rshift.asm

dnl  AMD K6-2 mpn_rshift -- mpn right shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6-2: 1.75 cycles/limb


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb number at src right by shift bits and store the
C size-limb result at dst.  The return value is the bits shifted out of
C src[0], left-justified in a limb, i.e. src[0] << (32-shift).
C
C Syntax is AT&T (source, destination) run through m4; limbs are 32 bits
C and the parameters are read from the stack via the PARAM_ defframes.
C
C Three paths are used, selected on size:
C
C   size == 1                       integer shrdl/shrl only, no MMX
C   2 <= size < UNROLL_THRESHOLD    L(simple), one limb per iteration
C   size >= UNROLL_THRESHOLD        L(unroll), four limbs per iteration
C
C Registers: %ebx is pushed on entry and popped on every exit path.
C %eax, %ecx, %edx, %mm0, %mm1, %mm2, %mm6 and %mm7 are used as scratch.
C Both MMX paths execute femms before returning.
C
C NOTE(review): the code appears to assume size >= 1 and 1 <= shift <= 31
C (size-1 == 0 selects the one-limb path, and 32-shift is used directly
C as a shift count) -- confirm against the documented mpn_rshift contract.
C

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

dnl  Minimum 9, because the unrolled loop can't handle less.
dnl
deflit(UNROLL_THRESHOLD, 9)

        TEXT
        ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

        C The 1 limb case can be done without the push %ebx, but it's then
        C still the same speed.  The push is left as a free helping hand for
        C the two_or_more code.

        movl    PARAM_SIZE, %eax
        pushl   %ebx                    FRAME_pushl()

        movl    PARAM_SRC, %ebx
        decl    %eax                    C size-1, sets ZF for the jnz below

        movl    PARAM_SHIFT, %ecx       C movl leaves the flags from decl intact
        jnz     L(two_or_more)

        C One limb only.  Here %eax is 0, so the shrdl leaves just the bits
        C shifted out of the src limb, at the top of %eax.

        movl    (%ebx), %edx            C src limb
        movl    PARAM_DST, %ebx

        shrdl(  %cl, %edx, %eax)        C return value

        shrl    %cl, %edx

        movl    %edx, (%ebx)            C dst limb
        popl    %ebx

        ret                             C no MMX used on this path, so no femms


C -----------------------------------------------------------------------------
        ALIGN(16)       C avoid offset 0x1f
L(two_or_more):
        C eax   size-1
        C ebx   src
        C ecx   shift
        C edx

        movl    (%ebx), %edx            C src low limb
        negl    %ecx

        addl    $32, %ecx               C 32-shift
        movd    PARAM_SHIFT, %mm6       C shift count for psrlq

        shll    %cl, %edx               C retval = src[0] << (32-shift)
        cmpl    $UNROLL_THRESHOLD-1, %eax

        jae     L(unroll)               C size >= UNROLL_THRESHOLD


        C eax   size-1
        C ebx   src
        C ecx   32-shift
        C edx   retval
        C
        C mm6   shift

        movl    PARAM_DST, %ecx
        leal    (%ebx,%eax,4), %ebx     C &src[size-1]

        leal    -4(%ecx,%eax,4), %ecx   C &dst[size-2]
        negl    %eax                    C -(size-1)

        C This loop runs at about 3 cycles/limb, which is the amount of
        C decoding, and this is despite every second access being unaligned.
        C
        C Each iteration loads the qword src[i],src[i+1] and shifts it right
        C as a 64-bit quantity, so its low dword is the finished dst[i].
        C The index is incremented between the load and the store, which is
        C why the dst base is one limb lower than the src base.

L(simple):
        C eax   counter, -(size-1) to -1
        C ebx   &src[size-1]
        C ecx   &dst[size-2]
        C edx   retval
        C
        C mm0   scratch
        C mm6   shift

Zdisp(  movq,   0,(%ebx,%eax,4), %mm0)
        incl    %eax                    C sets ZF for the jnz below

        psrlq   %mm6, %mm0

Zdisp(  movd,   %mm0, 0,(%ecx,%eax,4))
        jnz     L(simple)               C MMX ops don't alter the flags from incl


        C mm0 still holds the last shifted qword; its high dword is
        C src[size-1] >> shift.  Store both dst[size-2] and dst[size-1].

        movq    %mm0, (%ecx)
        movl    %edx, %eax              C retval

        popl    %ebx

        femms                           C leave MMX state (3DNow! fast emms)
        ret


C -----------------------------------------------------------------------------
        ALIGN(16)
L(unroll):
        C eax   size-1
        C ebx   src
        C ecx   32-shift
        C edx   retval
        C
        C mm6   shift

        addl    $32, %ecx               C 64-shift
        subl    $7, %eax                C size-8

        movd    %ecx, %mm7              C shift count for psllq
        movl    PARAM_DST, %ecx

        movq    (%ebx), %mm2            C src low qword
        leal    (%ebx,%eax,4), %ebx     C src end - 32

        testb   $4, %cl                 C is dst a multiple of 8 bytes?
        leal    (%ecx,%eax,4), %ecx     C dst end - 32

        C leal and notl don't modify the flags, so the jz still acts on the
        C testb of the dst pointer.

        notl    %eax                    C -(size-7)
        jz      L(dst_aligned)

        C dst is at an odd limb address: emit one limb on its own so that
        C the qword stores in the main loop land on 8-byte boundaries.

        psrlq   %mm6, %mm2
        incl    %eax

Zdisp(  movd,   %mm2, 0,(%ecx,%eax,4))  C dst low limb
        movq    4(%ebx,%eax,4), %mm2    C new src low qword
L(dst_aligned):

        movq    12(%ebx,%eax,4), %mm0   C src second lowest qword
        nop     C avoid bad cache line crossing


        C This loop is the important bit, the rest is just support for it.
        C Four src limbs are held at the start, and four more will be read.
        C Four dst limbs will be written.  This schedule seems necessary for
        C full speed.
        C
        C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
        C leaves 0 to 3 which can be tested with test $1 and $2.
        C
        C Each dst qword is (prev >> shift) | (next << (64-shift)), where
        C prev and next are src qwords one limb pair apart.

L(top):
        C eax   counter, -(size-7) step by +4 until >=0
        C ebx   src end - 32
        C ecx   dst end - 32
        C edx   retval
        C
        C mm0   src next qword
        C mm1   scratch
        C mm2   src prev qword
        C mm6   shift
        C mm7   64-shift

        psrlq   %mm6, %mm2
        addl    $4, %eax                C sets CF/ZF for the ja at loop end

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm0, %mm2
        movq    4(%ebx,%eax,4), %mm0

        psrlq   %mm6, %mm1
        movq    %mm2, -12(%ecx,%eax,4)

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm0, %mm1
        movq    12(%ebx,%eax,4), %mm0

        C The MMX instructions above leave the flags from the addl untouched.

        movq    %mm1, -4(%ecx,%eax,4)
        ja      L(top)          C jump if no carry and not zero



        C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
        C to 3 representing respectively 3 to 0 further limbs.

        testl   $2, %eax        C testl to avoid bad cache line crossings
        jnz     L(finish_nottwo)

        C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
        C becomes new mm2 and a new mm0 is loaded.

        psrlq   %mm6, %mm2
        movq    %mm0, %mm1

        psllq   %mm7, %mm0
        addl    $2, %eax

        por     %mm0, %mm2
        movq    12(%ebx,%eax,4), %mm0

        movq    %mm2, -4(%ecx,%eax,4)
        movq    %mm1, %mm2
L(finish_nottwo):

        C %eax is now 2 or 3, meaning one or zero further limbs.  The flags
        C set by the testb are consumed by the jnz after the MMX work.

        testb   $1, %al
        psrlq   %mm6, %mm2

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm0, %mm2
        psrlq   %mm6, %mm1              C mm1 = top qword >> shift

        movq    %mm2, 4(%ecx,%eax,4)
        jnz     L(finish_even)


        C one further extra limb to process

        movd    32-4(%ebx), %mm0        C src[size-1], most significant limb
        popl    %ebx                    C src pointer no longer needed

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm0, %mm1
        psrlq   %mm6, %mm2

        movq    %mm1, 32-12(%ecx)       C dst[size-3,size-2]
        movd    %mm2, 32-4(%ecx)        C dst[size-1]

        movl    %edx, %eax              C retval

        femms                           C leave MMX state (3DNow! fast emms)
        ret


        nop     C avoid bad cache line crossing
L(finish_even):
        C no further extra limbs

        movq    %mm1, 32-8(%ecx)        C dst[size-2,size-1]
        movl    %edx, %eax              C retval

        popl    %ebx

        femms                           C leave MMX state (3DNow! fast emms)
        ret

EPILOGUE()