dnl  Intel P5 mpn_lshift -- mpn left shift.

dnl  Copyright 2000-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.75 cycles/limb.


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  Return the bits shifted out at the
C left.
C
C The comments in mpn_rshift apply here too.
C
C Calling details visible in this file:
C
C   - The four parameters are read from the stack through the PARAM_*
C     names declared with defframe below; the result is returned in eax.
C   - ebx and edi are pushed on entry and popped before every ret.
C   - The size==1 path uses integer registers only.  Every other path uses
C     MMX registers (from among mm0-mm3 and mm5-mm7) and executes emms
C     before returning.
C   - dst limbs are produced from the most significant end downwards.
C
C Assumptions to confirm against the documented mpn_lshift contract:
C
C   - size >= 1.  There is no size==0 handling; the first load on the
C     small-size path is of src[size-1].
C   - 1 <= shift <= 31.  The finish code tests bit 5 of ecx (64-shift, or
C     64-(shift+32) when dst was unaligned) to tell the two dst alignment
C     cases apart, which only works for shift in that range.
C   - Limbs are 4 bytes and 4-byte aligned, so testing bit 2 of an address
C     is enough to decide 8-byte alignment.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

dnl  minimum 5, because the unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(8)

PROLOGUE(mpn_lshift)

	pushl	%ebx
	pushl	%edi
deflit(`FRAME',8)

	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edx

	movl	PARAM_SRC, %ebx
	movl	PARAM_SHIFT, %ecx

	cmp	$UNROLL_THRESHOLD, %eax
	jae	L(unroll)

	movl	-4(%ebx,%eax,4), %edi	C src high limb
	decl	%eax

	jnz	L(simple)		C size >= 2

	C size==1, integer registers only.  eax is zero here, so after the
	C double shift it holds just the bits shifted out of the top of edi,
	C which is the return value.  (shldl() is a macro from the included
	C m4 definitions; presumably it emits a plain shld -- confirm.)

	shldl(	%cl, %edi, %eax)	C eax was decremented to zero

	shll	%cl, %edi

	movl	%edi, (%edx)		C dst low limb
	popl	%edi			C risk of data cache bank clash

	popl	%ebx

	ret


C -----------------------------------------------------------------------------
C Simple loop for 2 <= size < UNROLL_THRESHOLD, one limb per iteration.

L(simple):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	(%ebx,%eax,4), %mm5	C src high limb

	movd	%ecx, %mm6		C lshift
	negl	%ecx

	psllq	%mm6, %mm5
	addl	$32, %ecx		C 32-shift

	movd	%ecx, %mm7
	psrlq	$32, %mm5		C retval


L(simple_top):
	C eax	counter, limbs, counting down from size-1 to 0
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0	scratch
	C mm5	return value
	C mm6	shift
	C mm7	32-shift

	C Load two adjacent src limbs as one qword and shift right by
	C 32-shift; the low dword is then the finished dst limb for the
	C higher of the two positions.

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	C

	C MMX instructions leave EFLAGS alone, so the jnz below still tests
	C the result of the decl.

	movd	%mm0, 4(%edx,%eax,4)
	jnz	L(simple_top)


	C Lowest limb: nothing below it, so it is just shifted left.

	movd	(%ebx), %mm0

	movd	%mm5, %eax		C retval
	psllq	%mm6, %mm0

	popl	%edi
	popl	%ebx

	movd	%mm0, (%edx)

	emms

	ret


C -----------------------------------------------------------------------------
C Unrolled code for size >= UNROLL_THRESHOLD, working in qwords.

	ALIGN(8)
L(unroll):
	C eax	size
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	-4(%ebx,%eax,4), %mm5	C src high limb
	leal	(%ebx,%eax,4), %edi	C address just past the end of src

	movd	%ecx, %mm6		C lshift
	andl	$4, %edi

	C psllq does not touch EFLAGS, the jz tests the andl above.

	psllq	%mm6, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process high limb separately (marked xxx) to
	C make it so.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

	movq	-8(%ebx,%eax,4), %mm0	C unaligned load

	psllq	%mm6, %mm0
	decl	%eax

	psrlq	$32, %mm0

	C

	movd	%mm0, (%edx,%eax,4)
L(start_src_aligned):

	movq	-8(%ebx,%eax,4), %mm1	C src high qword
	leal	(%edx,%eax,4), %edi	C address just past the end of dst

	andl	$4, %edi
	psrlq	$32, %mm5		C return value

	C The jz tests the andl above; the MMX instructions in between do
	C not change EFLAGS.

	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
	jz	L(start_dst_aligned)

	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
	C is 32 bits extra.  High limb of dst (marked xxx) handled here
	C separately.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C                0mod8   4mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+-------+--
	C  |  xxx  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	addl	$32, %ecx		C new shift

	psllq	%mm6, %mm0		C still the original shift here

	movd	%ecx, %mm6		C mm6 = shift+32 from now on
	psrlq	$32, %mm0

	C wasted cycle here waiting for %mm0

	movd	%mm0, -4(%edx,%eax,4)
	subl	$4, %edx
L(start_dst_aligned):


	psllq	%mm6, %mm1
	negl	%ecx			C -shift

	addl	$64, %ecx		C 64-shift
	movq	%mm3, %mm2

	movd	%ecx, %mm7
	subl	$8, %eax		C size-8

	C The jc below tests the borrow from the subl above, i.e. it is
	C taken when fewer than 8 limbs were left.

	psrlq	%mm7, %mm3

	por	%mm1, %mm3		C mm3 ready to store
	jc	L(finish)


	C The comments in mpn_rshift apply here too.

	ALIGN(8)
L(unroll_loop):
	C eax	counter, limbs
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0
	C mm1
	C mm2	src qword from 16(%ebx,%eax,4)
	C mm3	dst qword ready to store to 24(%edx,%eax,4)
	C
	C mm5	return value
	C mm6	lshift
	C mm7	rshift

	C Two qwords (four limbs) per iteration.  The loop continues while
	C the subl $4 does not borrow.

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	(%ebx,%eax,4), %mm3	C
	psllq	%mm6, %mm1		C

	movq	%mm0, 16(%edx,%eax,4)
	movq	%mm3, %mm2		C

	psrlq	%mm7, %mm3		C
	subl	$4, %eax

	por	%mm1, %mm3		C
	jnc	L(unroll_loop)



L(finish):
	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining

	C Bit 1 of eax is set for -2 and -1, i.e. when at least two limbs
	C (one qword) remain.

	testb	$2, %al

	jz	L(finish_no_two)

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	subl	$2, %eax
L(finish_no_two):


	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
	C
	C mm2	src prev qword, from 16(%ebx,%eax,4)
	C mm3	dst qword, for 24(%edx,%eax,4)

	C The testb sets ZF for the jz; the movd and popl in between do not
	C change flags.  eax is no longer needed as a counter after the
	C testb, so it is loaded with the return value here.

	testb	$1, %al
	movd	%mm5, %eax	C retval

	popl	%edi
	jz	L(finish_zero)


	C One extra src limb, destination was aligned.
	C
	C                 source                  ebx
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest         edx+12           edx+4     edx
	C --+---------------+---------------+-------+
	C   |      mm3      |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C One extra src limb, destination was unaligned.
	C
	C                 source                  ebx
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C         dest         edx+12           edx+4
	C         --+---------------+---------------+
	C           |      mm3      |               |
	C         --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at 4(%edx), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.


	movd	(%ebx), %mm0
	psllq	%mm6, %mm2

	movq	%mm3, 12(%edx)
	psllq	$32, %mm0

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	psllq	%mm6, %mm1

	movq	%mm0, 4(%edx)
	psrlq	$32, %mm1

	C Bit 5 of ecx is set for 64-shift (dst aligned) and clear for
	C 64-(shift+32) (dst unaligned), given 1 <= shift <= 31.  popl does
	C not change flags, so the jz tests the andl.

	andl	$32, %ecx
	popl	%ebx

	jz	L(finish_one_unaligned)

	movd	%mm1, (%edx)
L(finish_one_unaligned):

	emms

	ret


L(finish_zero):

	C No extra src limbs, destination was aligned.
	C
	C                 source          ebx
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C dest          edx+8             edx
	C --+---------------+---------------+
	C   |      mm3      |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C No extra src limbs, destination was unaligned.
	C
	C               source            ebx
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C         dest          edx+8   edx+4
	C         --+---------------+-------+
	C           |      mm3      |       |
	C         --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movd for the unaligned case writes the same data to 4(%edx)
	C that the movq does for the aligned case.


	movq	%mm3, 8(%edx)
	andl	$32, %ecx		C bit 5 set only if dst was aligned

	C psllq does not change flags, the jz tests the andl above.

	psllq	%mm6, %mm2
	jz	L(finish_zero_unaligned)

	movq	%mm2, (%edx)
L(finish_zero_unaligned):

	psrlq	$32, %mm2
	popl	%ebx

	C eax already holds the return value, loaded before the branch to
	C L(finish_zero); this reload is redundant but harmless.

	movd	%mm5, %eax	C retval

	movd	%mm2, 4(%edx)

	emms

	ret

EPILOGUE()