github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/lshiftc-movdqu2.asm

dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C               cycles/limb   cycles/limb   cycles/limb   good
C               aligned       unaligned     best seen     for cpu?
C AMD K8,K9      3             3             ?            no, use shl/shr
C AMD K10        1.8-2.0       1.8-2.0       ?            yes
C AMD bd1        1.9           1.9           ?            yes
C AMD bobcat     3.67          3.67                       yes, bad for n < 20
C Intel P4       4.75          4.75          ?            no, slow movdqu
C Intel core2    2.27          2.27          ?            no, use shld/shrd
C Intel NHM      2.15          2.15          ?            no, use shld/shrd
C Intel SBR      1.45          1.45          ?            yes, bad for n = 4-6
C Intel atom    12.9          12.9           ?            no
C VIA nano       6.18          6.44          ?            no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.
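C The following is a minimal C-level sketch of what this routine computes, to
C make the SSE code below easier to follow.  It assumes GMP's usual mpn
C conventions (least significant limb first, 1 <= cnt < limb width) and uses
C GMP_LIMB_BITS for the limb width (64 on x86_64).  The name ref_lshiftc is
C illustrative only and is not part of this file or of GMP's API.
C
C   mp_limb_t
C   ref_lshiftc (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C   {
C     /* bits shifted out of the top limb; returned uncomplemented */
C     mp_limb_t ret = ap[n - 1] >> (GMP_LIMB_BITS - cnt);
C     /* work from the most significant limb down, as the asm does */
C     for (mp_size_t i = n - 1; i > 0; i--)
C       rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (GMP_LIMB_BITS - cnt)));
C     rp[0] = ~(ap[0] << cnt);
C     return ret;
C   }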

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_lshiftc)
        FUNC_ENTRY(4)
        movd    R32(%rcx), %xmm4
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5

        neg     R32(%rcx)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax

        pcmpeqb %xmm3, %xmm3            C set to 111...111

        cmp     $3, n
        jle     L(bc)

        lea     (rp,n,8), R32(%rcx)
        test    $8, R8(%rcx)
        jz      L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n

L(rp_aligned):
        lea     1(n), %r8d

        and     $6, R32(%r8)
        jz      L(ba0)
        cmp     $4, R32(%r8)
        jz      L(ba4)
        jc      L(ba2)
L(ba6): add     $-4, n
        jmp     L(i56)
L(ba0): add     $-6, n
        jmp     L(i70)
L(ba4): add     $-2, n
        jmp     L(i34)
L(ba2): add     $-8, n
        jle     L(end)

        ALIGN(16)
L(top): movdqu  40(ap,n,8), %xmm1
        movdqu  48(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, 48(rp,n,8)
L(i70):
        movdqu  24(ap,n,8), %xmm1
        movdqu  32(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, 32(rp,n,8)
L(i56):
        movdqu  8(ap,n,8), %xmm1
        movdqu  16(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, 16(rp,n,8)
L(i34):
        movdqu  -8(ap,n,8), %xmm1
        movdqu  (ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, (rp,n,8)
        sub     $8, n
        jg      L(top)

L(end): test    $1, R8(n)
        jnz     L(end8)

        movdqu  (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq  %xmm1, %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret

C Basecase
        ALIGN(16)
L(bc):  dec     R32(n)
        jz      L(end8)

        movq    (ap,n,8), %xmm1
        movq    -8(ap,n,8), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, (rp,n,8)
        sub     $2, R32(n)
        jl      L(end8)
        movq    8(ap), %xmm1
        movq    (ap), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, 8(rp)

L(end8):movq    (ap), %xmm0
        psllq   %xmm4, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret
EPILOGUE()