dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mmx/rshift.asm

dnl  Intel P5 mpn_rshift -- mpn right shift.

dnl  Copyright 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.75 cycles/limb.


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  Return the bits shifted out at the
C right.
C
C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
C
C Full speed depends on source and destination being aligned.  Unaligned mmx
C loads and stores on P5 don't pair and have a 2 cycle penalty.
C Some hairy setups and finish-ups are done to ensure alignment for the loop.
C
C MMX shifts work out a bit faster even for the simple loop.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

dnl  Minimum 5, because the unrolled loop can't handle less.
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(8)

C -----------------------------------------------------------------------------
C mpn_rshift entry.
C
C Stack arguments (see the defframe lines): dst, src, size, shift.
C Result: eax = the bits shifted out at the right.
C ebx and edi are pushed on entry and popped again on every exit path.
C
C Three routes are taken depending on size:
C   size == 1                      plain integer shrdl/shrl, no mmx
C   1 < size < UNROLL_THRESHOLD    one-limb-per-pass mmx loop, L(small)
C   size >= UNROLL_THRESHOLD       aligned 4-limb unrolled mmx loop, L(big)

PROLOGUE(mpn_rshift)

	pushl	%ebx
	pushl	%edi
deflit(`FRAME',8)

	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edx

	movl	PARAM_SRC, %ebx
	movl	PARAM_SHIFT, %ecx

	cmpl	$UNROLL_THRESHOLD, %eax
	jae	L(big)

	decl	%eax			C size-1, sets ZF for the jnz below
	movl	(%ebx), %edi		C src low limb (movl leaves flags alone)

	jnz	L(small)		C more than one limb


	C Exactly one limb.  eax is zero after the decl, so the shrdl
	C leaves just the bits leaving the bottom of src[0], at the top of
	C eax, which is the return value.

	shrdl(	%cl, %edi, %eax)

	shrl	%cl, %edi

	movl	%edi, (%edx)		C dst low limb
	popl	%edi			C risk of data cache bank clash

	popl	%ebx

	ret


C -----------------------------------------------------------------------------
C Short operands: sizes 2 up to UNROLL_THRESHOLD-1.

	ALIGN(8)
L(small):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi	(scratch, saved value still on the stack)
	C ebp
deflit(`FRAME',8)

	movd	(%ebx), %mm5		C src[0], kept for the return value
	leal	(%ebx,%eax,4), %ebx	C &src[size-1]

	movd	%ecx, %mm6		C shift count for psrlq
	leal	-4(%edx,%eax,4), %edx	C &dst[size-2]

	psllq	$32, %mm5		C src[0] into the high dword
	negl	%eax			C -(size-1), counts up to zero


	C Each pass loads the qword made of src[i] and src[i+1], shifts it
	C right and stores the low dword as dst[i].
	C
	C This loop is 5 or 8 cycles, with every second load unaligned and a
	C wasted cycle waiting for the mm0 result to be ready.  For
	C comparison a shrdl is 4 cycles and would be 8 in a simple loop.
	C Using mmx helps the return value and last limb calculations too.

L(small_loop):
	C eax	counter, limbs, negative
	C ebx	&src[size-1]
	C ecx	shift (not used inside the loop)
	C edx	&dst[size-2]
	C
	C mm0	scratch
	C mm5	src[0] << 32, becomes the return value after the loop
	C mm6	shift

	movq	(%ebx,%eax,4), %mm0
	incl	%eax			C sets ZF for the jnz, mmx ops keep flags

	psrlq	%mm6, %mm0

	movd	%mm0, (%edx,%eax,4)
	jnz	L(small_loop)


	C Top limb has nothing above it, so zeros come in from the left.

	movd	(%ebx), %mm0		C src[size-1]
	psrlq	%mm6, %mm5		C return value now in the low dword

	psrlq	%mm6, %mm0
	popl	%edi

	movd	%mm5, %eax
	popl	%ebx

	movd	%mm0, 4(%edx)		C dst[size-1]

	emms

	ret


C -----------------------------------------------------------------------------
C Long operands: size >= UNROLL_THRESHOLD.
C
C First src and then dst are brought to 8-byte alignment, each by handling
C one low limb separately, then the main loop moves aligned qwords.

	ALIGN(8)
L(big):
	C eax	size
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	(%ebx), %mm5		C src[0], kept for the return value
	movl	$4, %edi		C alignment test mask

	movd	%ecx, %mm6		C shift count for psrlq
	testl	%edi, %ebx		C is src 4 mod 8 ?

	psllq	$32, %mm5
	jz	L(src_aligned)


	C src is at 4 mod 8.  Produce dst[0] here from the (unaligned)
	C qword at src, then step src and dst on by one limb and reduce the
	C count, leaving src at 0 mod 8.

	movq	(%ebx), %mm0		C unaligned load

	psrlq	%mm6, %mm0
	addl	$4, %ebx

	decl	%eax

	movd	%mm0, (%edx)
	addl	$4, %edx
L(src_aligned):


	movq	(%ebx), %mm1		C first aligned src qword
	testl	%edi, %edx		C is dst 4 mod 8 ?

	psrlq	%mm6, %mm5		C retval, finished
	jz	L(dst_aligned)


	C dst is at 4 mod 8.  Store its low limb here from mm1 shifted by
	C the plain count, then step dst on by 4 and from now on treat the
	C shift as 32 bits bigger, so that aligned src qwords yield aligned
	C dst qwords.  The count in eax is deliberately not changed.

	movq	%mm1, %mm0
	addl	$32, %ecx		C new shift

	psrlq	%mm6, %mm0		C still the old shift in mm6

	movd	%ecx, %mm6		C now shift+32

	movd	%mm0, (%edx)
	addl	$4, %edx
L(dst_aligned):


	C Prime the pipeline: mm3 gets the first complete dst qword, built
	C from the first two src qwords, and mm2 keeps a copy of the second
	C src qword for the next combine.

	movq	8(%ebx), %mm3
	negl	%ecx

	movq	%mm3, %mm2		C mm2 src qword
	addl	$64, %ecx		C 64 - (effective right shift)

	movd	%ecx, %mm7		C left shift count
	psrlq	%mm6, %mm1

	leal	-12(%ebx,%eax,4), %ebx	C src end biased by -12 bytes
	leal	-20(%edx,%eax,4), %edx	C dst end biased by -20 bytes

	psllq	%mm7, %mm3
	subl	$7, %eax		C size-7

	por	%mm1, %mm3		C mm3 ready to store
	negl	%eax			C -(size-7), sets SF for the jns

	jns	L(tail)			C too few limbs left for a loop pass


	C This loop is the important bit, the rest is just support.  Careful
	C instruction scheduling achieves the claimed 1.75 c/l.  The
	C relevant parts of the pairing rules are:
	C
	C - mmx loads and stores execute only in the U pipe
	C - only one mmx shift in a pair
	C - wait one cycle before storing an mmx register result
	C - the usual address generation interlock
	C
	C Two qword calculations are slightly interleaved.  The instructions
	C marked "C" belong to the second qword, and the "C prev" one is for
	C the second qword from the previous iteration.

	ALIGN(8)
L(big_loop):
	C eax	counter, limbs, negative
	C ebx	src pointer, biased (byte offset, see the leal above)
	C ecx	left shift count (same value as mm7)
	C edx	dst pointer, biased (byte offset, see the leal above)
	C esi
	C edi
	C
	C mm0
	C mm1
	C mm2	src qword from -8(%ebx,%eax,4)
	C mm3	dst qword ready to store to -8(%edx,%eax,4)
	C
	C mm5	return value
	C mm6	rshift
	C mm7	lshift

	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	8(%ebx,%eax,4), %mm3	C
	psrlq	%mm6, %mm1		C

	movq	%mm0, (%edx,%eax,4)
	movq	%mm3, %mm2		C

	psllq	%mm7, %mm3		C
	addl	$4, %eax		C sets SF for the js, por keeps flags

	por	%mm1, %mm3		C
	js	L(big_loop)


L(tail):
	C eax	0 to 3 representing respectively 3 to 0 limbs remaining

	testb	$2, %al

	jnz	L(tail_no_qword)

	C Bit 1 clear: do one more qword step, same pattern as the first
	C half of a loop pass, and leave mm2/mm3 set up as the loop would.

	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	addl	$2, %eax
L(tail_no_qword):


	C eax	2 or 3 representing respectively 1 or 0 limbs remaining
	C
	C mm2	src prev qword, from -8(%ebx,%eax,4)
	C mm3	dst qword, for -8(%edx,%eax,4)

	testb	$1, %al
	popl	%edi			C popl and movd leave the testb flags

	movd	%mm5, %eax		C retval
	jnz	L(tail_zero)


	C One extra limb.
	C
	C mm2 is the src qword at (%ebx) and mm3 is the dst qword for (%edx).
	C
	C If dst was aligned at the start:	mm6 = shift
	C					mm7 = ecx = 64-shift
	C If dst was unaligned at the start:	mm6 = shift+32
	C					mm7 = ecx = 64-(shift+32)
	C
	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at 8(%edx), and in the aligned case
	C there's a further extra limb of dst to be formed.
	C
	C Bit 5 of ecx tells the cases apart (this assumes shift is in the
	C range 1 to 31 -- not checked here, TODO confirm against callers).

	movd	8(%ebx), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, (%edx)
	por	%mm2, %mm0

	psrlq	%mm6, %mm1
	andl	$32, %ecx		C sets ZF for the jz, popl keeps flags

	popl	%ebx
	jz	L(tail_one_store)

	C dst was aligned, must store one extra limb
	movd	%mm1, 16(%edx)
L(tail_one_store):

	movq	%mm0, 8(%edx)

	emms

	ret


L(tail_zero):

	C No extra limbs.
	C
	C mm2 is the src qword at 4(%ebx) and mm3 is the dst qword for
	C 4(%edx).
	C
	C If dst was aligned at the start:	mm6 = shift
	C					mm7 = ecx = 64-shift
	C If dst was unaligned at the start:	mm6 = shift+32
	C					mm7 = 64-(shift+32)
	C
	C The movd for the unaligned case is clearly the same data as the
	C movq for the aligned case, it's just a choice between whether one
	C or two limbs should be written.  Bit 5 of ecx selects, as in the
	C one-extra-limb code.

	movq	%mm3, 4(%edx)
	psrlq	%mm6, %mm2

	movd	%mm2, 12(%edx)
	andl	$32, %ecx		C sets ZF for the jz, popl keeps flags

	popl	%ebx
	jz	L(tail_zero_done)

	movq	%mm2, 12(%edx)		C aligned case, two limbs wanted
L(tail_zero_done):

	emms

	ret

EPILOGUE()