github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/copyd-palignr.asm

dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb     cycles/limb     cycles/limb      good
C                  aligned        unaligned       best seen     for cpu?
C AMD K8,K9          2.0            illop          1.0/1.0         N
C AMD K10            0.85           illop                          Y/N
C AMD bull           0.70           0.70                           Y
C AMD pile           0.68           0.68                           Y
C AMD steam
C AMD excavator
C AMD bobcat         1.97           8.24           1.5/1.5         N
C AMD jaguar         0.77           0.89           0.65/opt        N/Y
C Intel P4           2.26           illop                          Y/N
C Intel core         0.52           0.80           opt/opt         Y
C Intel NHM          0.52           0.64           opt/opt         Y
C Intel SBR          0.51           0.51           opt/opt         Y
C Intel IBR          0.50           0.50           opt/opt         Y
C Intel HWL          0.50           0.51           opt/opt         Y
C Intel BWL          0.55           0.55           opt/opt         Y
C Intel atom         1.16           1.66           opt/opt         Y
C Intel SLM          1.02           1.04           opt/opt         Y
C VIA nano           1.08           1.06           opt/opt         Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
C
C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')

ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
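
C Reference sketch (added commentary, not part of the original GMP source):
C mpn_copyd copies n limbs from up to rp starting at the top limb, so the
C copy also works for overlapping operands with rp >= up, like a backward
C memmove.  In C, assuming GMP's mp_limb_t and mp_size_t types:
C
C       void copyd_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C       {
C         mp_size_t i;
C         for (i = n - 1; i >= 0; i--)  /* highest address first */
C           rp[i] = up[i];
C       }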

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyd)
        FUNC_ENTRY(3)

        lea     -8(up,n,8), up          C point up at the top (highest) limb
        lea     -8(rp,n,8), rp          C point rp at the top limb

        cmp     $COPYD_SSE_THRESHOLD, n
        jbe     L(bc)                   C small n: use 64-bit basecase loop

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jnz     L(rp_aligned)           C jump if rp aligned

        mov     (up), %rax              C copy one limb
        mov     %rax, (rp)
        lea     -8(up), up
        lea     -8(rp), rp
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jz      L(uent)                 C rp - up = 8 (mod 16): palignr path

ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

C Aligned loop: copy 8 limbs (64 bytes) per iteration with aligned 16-byte ops.
        ALIGN(16)
L(atop):movdqa  -8(up), %xmm0
        movdqa  -24(up), %xmm1
        movdqa  -40(up), %xmm2
        movdqa  -56(up), %xmm3
        lea     -64(up), up
        movdqa  %xmm0, -8(rp)
        movdqa  %xmm1, -24(rp)
        movdqa  %xmm2, -40(rp)
        movdqa  %xmm3, -56(rp)
        lea     -64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        test    $4, R8(n)               C wind down: 4, 2, then 1 leftover limbs
        jz      1f
        movdqa  -8(up), %xmm0
        movdqa  -24(up), %xmm1
        lea     -32(up), up
        movdqa  %xmm0, -8(rp)
        movdqa  %xmm1, -24(rp)
        lea     -32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  -8(up), %xmm0
        lea     -16(up), up
        movdqa  %xmm0, -8(rp)
        lea     -16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Unaligned path: each aligned load straddles two aligned destination blocks;
C palignr splices adjacent loads so that all stores can still be aligned.
L(uent):sub     $16, n
        movdqa  (up), %xmm0             C top limb in low half; the high half
                                        C reads past the top, but stays within
                                        C the same aligned 16-byte block
        jc      L(uend)

        ALIGN(16)
L(utop):sub     $16, n                  C main loop: 16 limbs (128 bytes)/iter
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)       C pair xmm1's high limb, xmm0's low limb
        movdqa  %xmm0, -8(rp)
        movdqa  -32(up), %xmm2
        palignr($8, %xmm2, %xmm1)
        movdqa  %xmm1, -24(rp)
        movdqa  -48(up), %xmm3
        palignr($8, %xmm3, %xmm2)
        movdqa  %xmm2, -40(rp)
        movdqa  -64(up), %xmm0
        palignr($8, %xmm0, %xmm3)
        movdqa  %xmm3, -56(rp)
        movdqa  -80(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -72(rp)
        movdqa  -96(up), %xmm2
        palignr($8, %xmm2, %xmm1)
        movdqa  %xmm1, -88(rp)
        movdqa  -112(up), %xmm3
        palignr($8, %xmm3, %xmm2)
        movdqa  %xmm2, -104(rp)
        movdqa  -128(up), %xmm0
        palignr($8, %xmm0, %xmm3)
        movdqa  %xmm3, -120(rp)
        lea     -128(up), up
        lea     -128(rp), rp
        jnc     L(utop)

L(uend):test    $8, R8(n)               C wind down: 8, 4, 2, then 1 limbs
        jz      1f
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -8(rp)
        movdqa  -32(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, -24(rp)
        movdqa  -48(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -40(rp)
        movdqa  -64(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, -56(rp)
        lea     -64(up), up
        lea     -64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -8(rp)
        movdqa  -32(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, -24(rp)
        lea     -32(up), up
        lea     -32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -8(rp)
        lea     -16(up), up
        lea     -16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good speed on small operands, not for
C correctness as the above code is currently written.

L(bc):  sub     $4, R32(n)
        jc      L(end)

C With the default threshold (7), a single pass plus the fixups at L(end)
C suffices, so the loop back-edge below is compiled out.
        ALIGN(16)
L(top): mov     (up), %r8               C copy 4 limbs per iteration
        mov     -8(up), %r9
        lea     -32(rp), rp
        mov     -16(up), %r10
        mov     -24(up), %r11
        lea     -32(up), up
        mov     %r8, 32(rp)
        mov     %r9, 24(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, 16(rp)
        mov     %r11, 8(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)
        lea     -8(rp), rp
        lea     -8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     -8(up), %r9
        mov     %r8, (rp)
        mov     %r9, -8(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
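
C Note on the unaligned loop above (added commentary, not part of the original
C GMP source).  palignr($8, %xmmB, %xmmA) shifts the 32-byte concatenation
C xmmA:xmmB right by 8 bytes and keeps the low 16 bytes in xmmA, i.e. it pairs
C xmmB's high limb with xmmA's low limb.  When rp - up = 8 (mod 16), the limb
C pair needed by each aligned store sits half in one aligned source block and
C half in the block below it, and palignr splices exactly those two halves.
C With SSSE3 intrinsics, one such step looks roughly like this (illustrative
C sketch; "higher" and "lower" are adjacent aligned 16-byte source blocks):
C
C       #include <tmmintrin.h>
C
C       static __m128i
C       splice (__m128i higher, __m128i lower)
C       {
C         /* { lower's high limb, higher's low limb },
C            as palignr($8, lower, higher) */
C         return _mm_alignr_epi8 (higher, lower, 8);
C       }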