github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/copyi-palignr.asm

dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb   cycles/limb   cycles/limb     good
C                  aligned       unaligned     best seen     for cpu?
C AMD K8,K9         2.0           illop         1.0/1.0         N
C AMD K10           0.85          illop                         Y/N
C AMD bull          0.70          0.66                          Y
C AMD pile          0.68          0.66                          Y
C AMD steam         ?             ?
C AMD excavator     ?             ?
C AMD bobcat        1.97          8.16          1.5/1.5         N
C AMD jaguar        0.77          0.93          0.65/opt        N/Y
C Intel P4          2.26          illop                         Y/N
C Intel core        0.52          0.64          opt/opt         Y
C Intel NHM         0.52          0.71          opt/opt         Y
C Intel SBR         0.51          0.54          opt/0.51        Y
C Intel IBR         0.50          0.54          opt/opt         Y
C Intel HWL         0.50          0.51          opt/opt         Y
C Intel BWL         0.55          0.55          opt/opt         Y
C Intel atom        1.16          1.61          opt/opt         Y
C Intel SLM         1.02          1.07          opt/opt         Y
C VIA nano          1.09          1.08          opt/opt         Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code to use it here.
C
C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.
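
C A sketch of the 8-byte palignr step used in the unaligned path below, for
C two generic xmm registers a and b (AT&T operand order, b is the
C destination):
C
C       palignr($8, a, b)       C concatenate b:a into a 32-byte value,
C                               C shift right by 8 bytes, keep the low 16:
C                               C   new low  qword of b = old high qword of a
C                               C   new high qword of b = old low  qword of b
C
C Two source limbs that straddle a 16-byte boundary at up are thereby merged
C into one 16-byte value that an aligned movdqa can store at rp.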

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyi)
        FUNC_ENTRY(3)

        cmp     $COPYI_SSE_THRESHOLD, n
        jbe     L(bc)

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned

        movsq                           C copy one limb
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

L(uent):
C Code handling up - rp = 8 (mod 16)

        cmp     $16, n
        jc      L(ued0)

IFDOS(` add     $-56, %rsp      ')
IFDOS(` movdqa  %xmm6, (%rsp)   ')
IFDOS(` movdqa  %xmm7, 16(%rsp) ')
IFDOS(` movdqa  %xmm8, 32(%rsp) ')

        movaps  120(up), %xmm7
        movaps  104(up), %xmm6
        movaps  88(up), %xmm5
        movaps  72(up), %xmm4
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        lea     128(up), up
        sub     $32, n
        jc      L(ued1)

        ALIGN(16)
L(utop):movaps  -104(up), %xmm1
        sub     $16, n
        movaps  -120(up), %xmm0
        palignr($8, %xmm6, %xmm7)
        movaps  -136(up), %xmm8
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movaps  120(up), %xmm7
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movaps  104(up), %xmm6
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movaps  88(up), %xmm5
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movaps  72(up), %xmm4
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movaps  56(up), %xmm3
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movaps  40(up), %xmm2
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        lea     128(up), up
        movdqa  %xmm0, (rp)
        lea     128(rp), rp
        jnc     L(utop)

L(ued1):movaps  -104(up), %xmm1
        movaps  -120(up), %xmm0
        movaps  -136(up), %xmm8
        palignr($8, %xmm6, %xmm7)
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        movdqa  %xmm0, (rp)
        lea     128(rp), rp

IFDOS(` movdqa  (%rsp), %xmm6   ')
IFDOS(` movdqa  16(%rsp), %xmm7 ')
IFDOS(` movdqa  32(%rsp), %xmm8 ')
IFDOS(` add     $56, %rsp       ')

L(ued0):test    $8, R8(n)
        jz      1f
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        movaps  -8(up), %xmm4
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm4, %xmm0)
        lea     64(up), up
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movaps  -8(up), %xmm3
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

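C Copy a remaining 2-limb chunk, again recombined with palignr, and then a
C possible final odd limb with a plain 64-bit mov.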
1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small operands speed, not for
C correctness as the above code is currently written.

L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

        ALIGN(16)
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
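
C All of the paths above implement the same limb copy.  As a minimal
C reference sketch in C (using GMP's mp_limb_t and mp_size_t types), the
C operation performed by mpn_copyi is:
C
C       void
C       mpn_copyi (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C       {
C         mp_size_t i;
C         for (i = 0; i < n; i++)       /* low limbs copied first */
C           rp[i] = up[i];
C       }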