dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl  loop; the loop body is generated in chunks of CHUNK_COUNT limbs, so the
dnl  value should be a multiple of that.  The table above gives the measured
dnl  speed for each setting.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.
C Operands of at least UNROLL_THRESHOLD limbs take the unrolled MMX loop,
C smaller ones (above one limb) take the simple loop.  The PIC and non-PIC
C builds currently use the same threshold; the ifdef is kept so the two can
C be tuned separately.

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

C Stack frame.  PARAM_* are the incoming arguments and SAVE_* are the slots
C inside the SAVE_SIZE bytes reserved below the return address for the
C registers this routine preserves.
C NOTE(review): defframe and FRAME come from the included m4 definitions;
C the offsets look like they are relative to %esp at entry -- confirm there.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

C mpn_lshift entry.
C
C Arguments are read from the stack through the PARAM_* slots and the result
C (the bits shifted out of the top limb) is returned in %eax.  %edi, %esi and
C %ebx are stored in the SAVE_* slots before being used and are reloaded
C before returning.  Every path that touches an MMX register executes emms
C before ret.
C
C Three paths:
C   size == 1                     integer shldl/shll only
C   1 < size < UNROLL_THRESHOLD   L(simple_top), one limb per iteration
C   size >= UNROLL_THRESHOLD      L(unroll), qword-aligned unrolled loop
C
C NOTE(review): size and shift are not validated here; presumably callers
C guarantee size >= 1 and 1 <= shift <= 31 -- confirm against the mpn_lshift
C documentation.

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax			C eax = size-1, zero means a single limb
	jnz	L(more_than_one_limb)

	C Single limb: plain integer code, no MMX state to clean up.

	movl	(%edx), %edx

	shldl(	%cl, %edx, %eax)	C eax was decremented to zero

	shll	%cl, %edx

	movl	%edx, (%edi)
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6
	movd	(%edx,%eax,4), %mm5	C src high limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	negl	%ecx
	movd	(%edx), %mm4		C src low limb

	addl	$32, %ecx		C ecx = 32-shift

	movd	%ecx, %mm7

L(simple_top):
	C eax	loop counter, limbs
	C ebx
	C ecx
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm4	src low limb
	C mm5	src high limb
	C mm6	shift
	C mm7	32-shift
	C
	C Each pass loads the limb pair src[eax-1],src[eax] as one qword and
	C shifts it right by 32-shift; the low dword is then the finished
	C dst[eax].  The jnz tests the flags left by decl, the MMX
	C instructions in between do not modify them.

	movq	-4(%edx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%edi,%eax,4)
	jnz	L(simple_top)


	C The low limb has nothing shifted in from below; the upper dword of
	C the shifted high limb is the return value.

	psllq	%mm6, %mm5
	psllq	%mm6, %mm4

	psrlq	$32, %mm5
	movd	%mm4, (%edi)		C dst low limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	(saved)
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src high limb, for return value
	C mm6	lshift

	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX
	leal	-4(%edx,%eax,4), %edx   C &src[size-2]

	testb	$4, %dl
	movq	(%edx), %mm1		C src high qword

	jz	L(start_src_aligned)


	C src isn't aligned, process high limb (marked xxx) separately to
	C make it so
	C
	C  source        -4(edx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |  xxx          |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest          -4(edi,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--
	C
	C The PARAM_SIZE slot is overwritten with the remaining limb count so
	C that the odd/even test at L(end) sees the size left after this limb.

	psllq	%mm6, %mm1
	subl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	psrlq	$32, %mm1
	decl	%eax			C size-2 is new size-1

	movd	%mm1, 4(%edi,%eax,4)
	movq	(%edx), %mm1		C new src high qword
L(start_src_aligned):


	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
	psllq	%mm6, %mm5

	testl	$4, %edi
	psrlq	$32, %mm5		C return value

	jz	L(start_dst_aligned)


	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
	C here separately.
	C
	C  source        %edx
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C                0mod8   4mod8
	C
	C  dest          %edi
	C  +-------+-------+-------+--
	C  |  xxx  |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	psllq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	psrlq	$32, %mm1

	movd	%mm1, 4(%edi)
	movq	%mm0, %mm1
	subl	$4, %edi

	movd	%ecx, %mm6		C new lshift
L(start_dst_aligned):

	decl	%eax			C size-2, two last limbs handled at end
	movq	%mm1, %mm2		C copy of src high qword
	negl	%ecx

	andl	$-2, %eax		C round size down to even
	addl	$64, %ecx

	movl	%eax, %ebx
	negl	%eax

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		C rshift = 64-lshift

	C Computed jump into the unrolled loop.  eax is twice the number of
	C limbs to skip in the first pass, so eax*5 is that number times the
	C 10 code bytes each limb occupies in the loop body.  In PIC builds
	C the address is formed by L(pic_calc) from its own return address.

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end
	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	size (for use at end)
	C ebx	loop counter
	C ecx	rshift
	C edx	src
	C esi	computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating, mm2 first)
	C mm2	/
	C mm6	lshift
	C mm7	rshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 - 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psllq	%mm6, %mm1

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	subl	$UNROLL_BYTES, %edx
	subl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)



define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
	C eax holds the limb count reloaded from PARAM_SIZE; its low bit
	C selects the odd or even tail.  The flags set by testb stay valid
	C through the mov/MMX instructions up to the jz.

	testb	$1, %al
	movl	SAVE_EBX, %ebx
	psllq	%mm6, %mm2	C wanted left shifted in all cases below

	movd	%mm5, %eax	C return value

	movl	SAVE_ESI, %esi
	jz	L(end_even)


L(end_odd):

	C Size odd, destination was aligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest                            edi
	C --+---------------+---------------+-------+
	C   |   written     |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C         dest                            edi
	C         --+---------------+---------------+
	C           |   written     |               |
	C         --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at (%edi), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.
	C
	C Bit 5 of cl tells the two cases apart: it is set when ecx is
	C 64-shift and clear when ecx is 64-(shift+32).

	movd	disp(4) (%edx), %mm0
	testb	$32, %cl

	movq	%mm0, %mm1
	psllq	$32, %mm0

	psrlq	%mm7, %mm0
	psllq	%mm6, %mm1

	por	%mm2, %mm0

	movq	%mm0, disp(0) (%edi)
	jz	L(end_odd_unaligned)
	movd	%mm1, disp(-4) (%edi)
L(end_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C                 source        edx+8
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C dest                            edi
	C --+---------------+---------------+
	C   |   written     |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C                 source        edx+8
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C         dest                  edi+4
	C         --+---------------+-------+
	C           |   written     |       |
	C         --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movq for the aligned case overwrites the movd for the
	C unaligned case.

	movq	%mm2, %mm0
	psrlq	$32, %mm2

	testb	$32, %cl
	movd	%mm2, disp(4) (%edi)

	jz	L(end_even_unaligned)
	movq	%mm0, disp(0) (%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()