github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/rshift-movdqu2.asm

dnl  AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C               cycles/limb   cycles/limb   cycles/limb   good
C                aligned      unaligned     best seen     for cpu?
C AMD K8,K9        3             3             2.35       no, use shl/shr
C AMD K10          1.5-1.8       1.5-1.8       1.33       yes
C AMD bd1          1.7-1.9       1.7-1.9       1.33       yes
C AMD bobcat       3.17          3.17                     yes, bad for n < 20
C Intel P4         4.67          4.67          2.7        no, slow movdqu
C Intel core2      2.15          2.15          1.25       no, use shld/shrd
C Intel NHM        1.66          1.66          1.25       no, use shld/shrd
C Intel SBR        1.3           1.3           1.25       yes, bad for n = 4-6
C Intel atom      11.7          11.7           4.5        no
C VIA nano         5.7           5.95          2.0        no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast load movdqu, and uses it even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.
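
C Reference sketch (documentation only, not part of the build): assuming
C 64-bit limbs and 1 <= cnt <= 63, the routine below computes, in C terms,
C roughly the following; the helper name ref_rshift is illustrative:
C
C   #include <gmp.h>
C
C   mp_limb_t
C   ref_rshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t retval = ap[0] << (64 - cnt);   /* bits shifted out, kept in the high end */
C     for (mp_size_t i = 0; i < n - 1; i++)     /* each result limb combines two source limbs */
C       rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
C     rp[n - 1] = ap[n - 1] >> cnt;             /* top limb is zero-filled from above */
C     return retval;
C   }
C
C The SSE code keeps cnt in %xmm4 and 64-cnt in %xmm5, so each
C psrlq/psllq/por group performs the same per-limb combination on two
C limbs (128 bits) at a time.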

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_rshift)
        FUNC_ENTRY(4)
        movd    R32(%rcx), %xmm4
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5

        neg     R32(%rcx)
        mov     (ap), %rax
        shl     R8(%rcx), %rax

        cmp     $3, n
        jle     L(bc)

        test    $8, R8(rp)
        jz      L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    (ap), %xmm0
        movq    8(ap), %xmm1
        psrlq   %xmm4, %xmm0
        psllq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movq    %xmm0, (rp)
        lea     8(ap), ap
        lea     8(rp), rp
        dec     n

L(rp_aligned):
        lea     1(n), %r8d
        lea     (ap,n,8), ap
        lea     (rp,n,8), rp
        neg     n

        and     $6, R32(%r8)
        jz      L(bu0)
        cmp     $4, R32(%r8)
        jz      L(bu4)
        jc      L(bu2)
L(bu6): add     $4, n
        jmp     L(i56)
L(bu0): add     $6, n
        jmp     L(i70)
L(bu4): add     $2, n
        jmp     L(i34)
L(bu2): add     $8, n
        jge     L(end)

        ALIGN(16)
L(top): movdqu  -64(ap,n,8), %xmm1
        movdqu  -56(ap,n,8), %xmm0
        psllq   %xmm5, %xmm0
        psrlq   %xmm4, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, -64(rp,n,8)
L(i70):
        movdqu  -48(ap,n,8), %xmm1
        movdqu  -40(ap,n,8), %xmm0
        psllq   %xmm5, %xmm0
        psrlq   %xmm4, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, -48(rp,n,8)
L(i56):
        movdqu  -32(ap,n,8), %xmm1
        movdqu  -24(ap,n,8), %xmm0
        psllq   %xmm5, %xmm0
        psrlq   %xmm4, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, -32(rp,n,8)
L(i34):
        movdqu  -16(ap,n,8), %xmm1
        movdqu  -8(ap,n,8), %xmm0
        psllq   %xmm5, %xmm0
        psrlq   %xmm4, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, -16(rp,n,8)
        add     $8, n
        jl      L(top)

L(end): test    $1, R8(n)
        jnz     L(e1)

        movdqu  -16(ap), %xmm1
        movq    -8(ap), %xmm0
        psrlq   %xmm4, %xmm1
        psllq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, -16(rp)
        FUNC_EXIT()
        ret

L(e1):  movq    -8(ap), %xmm0
        psrlq   %xmm4, %xmm0
        movq    %xmm0, -8(rp)
        FUNC_EXIT()
        ret

C Basecase
        ALIGN(16)
L(bc):  dec     R32(n)
        jnz     1f
        movq    (ap), %xmm0
        psrlq   %xmm4, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret

1:      movq    (ap), %xmm1
        movq    8(ap), %xmm0
        psrlq   %xmm4, %xmm1
        psllq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, (rp)
        dec     R32(n)
        jnz     1f
        movq    8(ap), %xmm0
        psrlq   %xmm4, %xmm0
        movq    %xmm0, 8(rp)
        FUNC_EXIT()
        ret

1:      movq    8(ap), %xmm1
        movq    16(ap), %xmm0
        psrlq   %xmm4, %xmm1
        psllq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, 8(rp)
        movq    16(ap), %xmm0
        psrlq   %xmm4, %xmm0
        movq    %xmm0, 16(rp)
        FUNC_EXIT()
        ret
EPILOGUE()
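
C Usage sketch (illustrative, not part of the build): callers reach this
C code through the generic mpn_rshift entry point; rp, ap, n and k below
C are assumed to be supplied by the caller, with 1 <= k <= 63:
C
C   /* Shift the n-limb number {ap, n} right by k bits into {rp, n}.
C      The bits shifted out are returned in the high k bits of `lost'.  */
C   mp_limb_t lost = mpn_rshift (rp, ap, n, k);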