dnl  AMD K7 mpn_rshift -- mpn right shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C          K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT  cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  The bits shifted out at the right are
C the return value.
C
C This code uses 64-bit MMX operations, which makes it possible to handle
C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
C code, on the other hand, suffers from shrd being a vector path decode and
C running at 3 cycles back-to-back.
C
C Full speed depends on source and destination being aligned, and some hairy
C setups and finish-ups are done to arrange this for the loop.

C Sizes below UNROLL_THRESHOLD limbs use the simple one-limb-at-a-time loop;
C at or above it, the computed-jump unrolled loop is used.  The PIC and
C non-PIC thresholds happen to coincide here.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	C size==1: plain integer code, no MMX (and hence no emms) needed

	movl	(%edx), %edx		C src limb

	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero

	shrl	%cl, %edx

	movl	%edx, (%edi)		C dst limb
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret				C return value in eax = bits shifted out


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6	C rshift
	movd	(%edx), %mm5		C src low limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	leal	(%edx,%eax,4), %edx	C &src[size-1]
	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]

	movd	(%edx), %mm4		C src high limb
	negl	%eax			C loop counter, counts up to zero


L(simple_top):
	C eax	loop counter, limbs, negative
	C ebx
	C ecx	shift
	C edx	carry
	C edx	&src[size-1]
	C edi	&dst[size-2]
	C ebp
	C
	C mm0	scratch
	C mm4	src high limb
	C mm5	src low limb
	C mm6	shift

	movq	(%edx,%eax,4), %mm0	C two adjacent src limbs
	incl	%eax

	psrlq	%mm6, %mm0		C 64-bit shift brings in the next limb's bits

	movd	%mm0, (%edi,%eax,4)	C store low 32 bits only
	jnz	L(simple_top)


	C high limb gets zeros shifted in; low limb forms the return value
	psllq	$32, %mm5
	psrlq	%mm6, %mm4

	psrlq	%mm6, %mm5
	movd	%mm4, 4(%edi)		C dst high limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src low limb
	C mm6	rshift

	testb	$4, %dl			C is src 8-byte aligned?
	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX

	psllq	$32, %mm5		C position low limb for return-value extract
	jz	L(start_src_aligned)


	C src isn't aligned, process low limb separately (marked xxx) and
	C step src and dst by one limb, making src aligned.
	C
	C source                  edx
	C --+-------+-------+-------+
	C           |          xxx  |
	C --+-------+-------+-------+
	C         4mod8   0mod8   4mod8
	C
	C         dest            edi
	C         --+-------+-------+
	C           |       |  xxx  |
	C         --+-------+-------+

	movq	(%edx), %mm0		C src low two limbs
	addl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	addl	$4, %edi
	decl	%eax			C size-2 is new size-1

	psrlq	%mm6, %mm0
	movl	%edi, PARAM_DST		C new dst

	movd	%mm0, -4(%edi)
L(start_src_aligned):


	movq	(%edx), %mm1		C src low two limbs
	decl	%eax			C size-2, two last limbs handled at end
	testl	$4, %edi		C is dst 8-byte aligned?

	psrlq	%mm6, %mm5
	jz	L(start_dst_aligned)


	C dst isn't aligned, add 4 to make it so, and pretend the shift is
	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
	C
	C source          edx
	C --+-------+-------+
	C           |  mm1  |
	C --+-------+-------+
	C         4mod8   0mod8
	C
	C dest                  edi
	C --+-------+-------+-------+
	C                   |  xxx  |
	C --+-------+-------+-------+
	C         4mod8   0mod8   4mod8

	movq	%mm1, %mm0
	psrlq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	movd	%mm1, (%edi)
	movq	%mm0, %mm1
	addl	$4, %edi		C new dst

	movd	%ecx, %mm6
L(start_dst_aligned):


	movq	%mm1, %mm2		C copy of src low two limbs
	negl	%ecx
	andl	$-2, %eax		C round size down to even

	movl	%eax, %ebx
	negl	%eax
	addl	$64, %ecx		C ecx = 64-shift (or 64-(shift+32))

	andl	$UNROLL_MASK, %eax	C limbs to handle before the aligned loop
	decl	%ebx

	shll	%eax			C eax*2, entry stride is (eax,eax,4)*2
					C = 10 code bytes per limb in L(top)

	movd	%ecx, %mm7		C lshift = 64-rshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
	negl	%eax
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end

	jmp	*%esi			C computed jump into the unrolled loop


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C esi = L(entry) + eax*5, PC-relative via the return address on the stack
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	negl	%eax

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(64)
L(top):
	C eax	size, for use at end
	C ebx	loop counter
	C ecx	lshift
	C edx	src
	C esi	was computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating)
	C mm2	/
	C mm6	rshift
	C mm7	lshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psrlq	%mm6, %mm1

	movq	%mm0, %mm2
	psllq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	addl	$UNROLL_BYTES, %edx
	addl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)


deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

	testb	$1, %al			C low bit selects odd/even finish-up below
	psrlq	%mm6, %mm2	C wanted rshifted in all cases below
	movl	SAVE_ESI, %esi

	movd	%mm5, %eax		C return value

	movl	SAVE_EBX, %ebx
	jz	L(end_even)


	C Size odd, destination was aligned.
	C
	C                 source
	C                 edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest                    edi
	C +-------+---------------+---------------+--
	C |       |               |    written    |
	C +-------+---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C                 source
	C                 edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest            edi
	C +---------------+---------------+--
	C |               |    written    |
	C +---------------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword to store, and in the aligned case there's
	C a further extra limb of dst to be formed.


	movd	disp0(%edx), %mm0
	movq	%mm0, %mm1

	psllq	%mm7, %mm0
	testb	$32, %cl		C bit 32 of ecx distinguishes the two cases

	por	%mm2, %mm0
	psrlq	%mm6, %mm1

	movq	%mm0, disp0(%edi)
	jz	L(finish_odd_unaligned)

	movd	%mm1, disp1(%edi)	C aligned case: one further dst limb
L(finish_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest            edi
	C +---------------+---------------+--
	C |               |      mm3      |
	C +---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest    edi
	C +-------+---------------+--
	C |       |      mm3      |
	C +-------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = 64-(shift+32)


	C The movd for the unaligned case is the same data as the movq for
	C the aligned case, it's just a choice between whether one or two
	C limbs should be written.


	testb	$32, %cl
	movd	%mm2, disp0(%edi)	C unaligned case: one limb

	jz	L(end_even_unaligned)

	movq	%mm2, disp0(%edi)	C aligned case: overwrite with both limbs
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()