github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/lshift-movdqu2.asm

dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl Contributed to the GNU project by Torbjorn Granlund.

dnl Copyright 2010-2012 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb   cycles/limb   cycles/limb   good
C                  aligned       unaligned     best seen     for cpu?
C AMD K8,K9         3             3             2.35         no, use shl/shr
C AMD K10           1.5-1.8       1.5-1.8       1.33         yes
C AMD bd1           1.7-1.9       1.7-1.9       1.33         yes
C AMD bobcat        3.17          3.17                       yes, bad for n < 20
C Intel P4          4.67          4.67          2.7          no, slow movdqu
C Intel core2       2.15          2.15          1.25         no, use shld/shrd
C Intel NHM         1.66          1.66          1.25         no, use shld/shrd
C Intel SBR         1.3           1.3           1.25         yes, bad for n = 4-6
C Intel atom       11.7          11.7           4.5          no
C VIA nano          5.7           5.95          2.0          no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.
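
C The reference sketch below is not part of the original file; it restates in
C plain C what this routine computes, assuming 64-bit limbs as on x86_64.  The
C helper name ref_lshift is hypothetical; the real entry point provided here
C is mpn_lshift, with the same argument order.  Types come from gmp.h.
C
C   #include <gmp.h>                         /* mp_limb_t, mp_size_t */
C
C   /* Left-shift {ap,n} by cnt bits (1 <= cnt <= 63), store the low 64*n
C      result bits at {rp,n}, and return the bits shifted out of the top
C      limb.  Works from the top limb down, so rp >= ap overlap is fine.  */
C   mp_limb_t
C   ref_lshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t retval = ap[n - 1] >> (64 - cnt);
C     for (mp_size_t i = n - 1; i > 0; i--)
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }
C
C In the unrolled loop below, each psllq/psrlq/por group produces two result
C limbs at a time from 16-byte movdqu loads.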

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
C Set up the SSE shift counts: xmm4 = cnt, xmm5 = 64-cnt
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5

C Function value: the bits shifted out of the top limb
	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax

	cmp	$3, n
	jle	L(bc)

	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
C Select the unrolled-loop entry point from n mod 8
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

L(end):	test	$1, R8(n)
	jnz	L(end8)

	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()
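
C The block below is likewise an addition, not part of the original file: a
C minimal caller sketch using the documented mpn interface from gmp.h, which
C this file provides as mpn_lshift on x86_64 CPUs with fast movdqu.  The
C sample values are arbitrary; build with a C compiler and link against GMP.
C
C   #include <stdio.h>
C   #include <gmp.h>
C
C   int
C   main (void)
C   {
C     mp_limb_t a[2] = { 0x8000000000000001UL, 0x1UL };   /* low limb first */
C     mp_limb_t r[2];
C     mp_limb_t out = mpn_lshift (r, a, 2, 4);            /* left by 4 bits */
C     printf ("out=%lx r1=%lx r0=%lx\n",
C             (unsigned long) out, (unsigned long) r[1], (unsigned long) r[0]);
C     return 0;                                /* prints out=0 r1=18 r0=10 */
C   }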