dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/lshift.asm
dnl
dnl  AMD K6 mpn_lshift -- mpn left shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 3.0 cycles/limb


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb number at src left by shift bits, store the size
C result limbs at dst, and return the bits shifted out of the most
C significant limb (they end up in the low bits of the return value).
C
C Limbs are handled as 32-bit quantities (all limb indexing below uses a
C scale of 4).  Result limbs are written from the highest index down to
C index 0.
C
C NOTE(review): the code appears to assume size >= 1 and 1 <= shift <= 31.
C size is only tested for being exactly 1, and 32-shift is used directly as
C a shift count.  Confirm against the documented mpn_lshift contract.
C
C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
C instructions.  This is despite every second fetch being unaligned.


C Stack arguments.  defframe() is defined outside this file; NOTE(review):
C the second argument is presumably the byte offset of the parameter from the
C stack pointer at function entry -- confirm in the m4 definitions.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	C ebx is saved here and popped again on both exit paths.
	C NOTE(review): FRAME_pushl() presumably records the push so that the
	C PARAM_ offsets used afterwards remain correct -- defined elsewhere.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C eax = size-1, zero flag set when size is 1

	movl	PARAM_SHIFT, %ecx
	jnz	L(two_or_more)		C tests the decl result, movl leaves flags alone

	C Single limb.  eax is 0 at this point.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx		C src pointer is finished with, ebx becomes dst

	C NOTE(review): shldl() is an m4 macro defined outside this file,
	C presumably wrapping the shld instruction.  With eax zero, a double
	C shift by cl moves the top shift bits of edx into the bottom of eax.

	shldl(	%cl, %edx, %eax)	C return value

	shll	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


	ALIGN(16)	C avoid offset 0x1f
	nop		C avoid bad cache line crossing
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx	(not yet loaded on this path)

	C Compute the return value from the top limb, and put shift and
	C 32-shift in mm6 and mm7 so that ecx is free to hold dst.

	movl	(%ebx,%eax,4), %edx	C src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6
	addl	$32, %ecx		C 32-shift

	shrl	%cl, %edx		C bits shifted out of the high limb, the return value

	movd	%ecx, %mm7
	movl	PARAM_DST, %ecx

L(top):
	C eax	counter, size-1 to 1
	C ebx	src
	C ecx	dst
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C mm7	32-shift

	C Each pass fetches the limb pair src[eax-1] (low half) and src[eax]
	C (high half) as one 64-bit value and shifts it right by 32-shift.  The
	C low 32 bits are then (src[eax] << shift) | (src[eax-1] >> (32-shift)),
	C which is stored as dst[eax].  The store comes after the decl, so it
	C addresses with a +4 displacement to reach the same index.

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%ecx,%eax,4)
	jnz	L(top)			C flags are still those of the decl


	C Lowest limb has nothing below it: dst[0] = src[0] << shift.

	movd	(%ebx), %mm0
	popl	%ebx			C src pointer no longer needed, restore ebx

	psllq	%mm6, %mm0
	movl	%edx, %eax		C return value

	movd	%mm0, (%ecx)		C low 32 bits of the shifted limb

	emms				C leave mmx state before returning
	ret

EPILOGUE()