github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/com-palignr.asm

dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb     cycles/limb     cycles/limb     good
C                 aligned         unaligned       best seen       for cpu?
C AMD K8,K9          2.0             illop           1.0/1.0        N
C AMD K10            0.85            illop                          Y/N
C AMD bull           1.39           ? 1.45                          Y/N
C AMD pile           0.8-1.4         0.7-1.4                        Y
C AMD steam
C AMD excavator
C AMD bobcat         1.97           ? 8.17           1.5/1.5        N
C AMD jaguar         1.02            1.02            0.91/0.91      N
C Intel P4           2.26            illop                          Y/N
C Intel core         0.52            0.95            opt/0.74       Y
C Intel NHM          0.52            0.65            opt/opt        Y
C Intel SBR          0.51            0.65            opt/opt        Y
C Intel IBR          0.50            0.64            opt/0.57       Y
C Intel HWL          0.51            0.58            opt/opt        Y
C Intel BWL          0.57            0.69            opt/0.65       Y
C Intel atom         1.16            1.70            opt/opt        Y
C Intel SLM          1.02            1.52                           N
C VIA nano           1.09            1.10            opt/opt        Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code to use it here.
C
C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
C from the x86_64 default code.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')

ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        cmp     $COM_SSE_THRESHOLD, n
        jbe     L(bc)

        pcmpeqb %xmm7, %xmm7            C set to 111...111

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned

        mov     (up), %r8
        lea     8(up), up
        not     %r8
        mov     %r8, (rp)
        lea     8(rp), rp
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
        lea     -40(up), %rax           C 40 = 5 * GMP_LIMB_BYTES
        sub     rp, %rax
        cmp     $80, %rax               C 80 = (15-5) * GMP_LIMB_BYTES
        jbe     L(bc)                   C deflect to plain loop

        sub     $16, n
        jc      L(uend)

        movdqa  120(up), %xmm3

        sub     $16, n
        jmp     L(um)

        ALIGN(16)
L(utop):movdqa  120(up), %xmm3
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, -128(rp)
        sub     $16, n
L(um):  movdqa  104(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  88(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 112(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  72(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 96(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  56(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 80(rp)
        palignr($8, %xmm3, %xmm0)
        movdqa  40(up), %xmm2
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     128(up), up
        lea     128(rp), rp
        jnc     L(utop)

        pxor    %xmm7, %xmm0
        movdqa  %xmm0, -128(rp)

L(uend):test    $8, R8(n)
        jz      1f
        movdqa  56(up), %xmm3
        movdqa  40(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     64(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movdqa  24(up), %xmm1
        movdqa  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small operands speed, not for
C correctness, as the above code is currently written.

L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       ALIGN(16)')
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        not     %r8
        not     %r9
        not     %r10
        not     %r11
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        not     %r8
        not     %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
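
For orientation, a minimal C sketch of the operation this routine implements: mpn_com writes the bitwise one's complement of an n-limb operand at up to rp. The names limb_t and ref_com below are illustrative stand-ins rather than GMP identifiers; on x86_64 a limb is 64 bits.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;        /* hypothetical stand-in for mp_limb_t on x86_64 */

/* Reference semantics of mpn_com: complement n limbs from up into rp.
   The assembly above computes exactly this, but 16 bytes at a time
   (pxor against an all-ones %xmm7), with scalar fixups for unaligned
   end limbs and a plain 64-bit loop below COM_SSE_THRESHOLD limbs. */
static void ref_com(limb_t *rp, const limb_t *up, size_t n)
{
    for (size_t i = 0; i < n; i++)
        rp[i] = ~up[i];
}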
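
The aligned inner loop L(atop) corresponds, in spirit, to the SSE2 intrinsics below. This is a hedged sketch, not GMP code: it assumes both pointers are 16-byte aligned and that n is a multiple of two limbs, whereas the real routine also peels an odd leading limb, unrolls four vectors per iteration, and mops up one, two, or four trailing limbs.

#include <emmintrin.h>          /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch of the aligned path: complement two limbs (16 bytes)
   per step with aligned loads and stores, mirroring movdqa/pxor/movdqa. */
static void com_aligned_sse2(uint64_t *rp, const uint64_t *up, size_t n)
{
    __m128i ones = _mm_set1_epi8(-1);               /* like pcmpeqb %xmm7, %xmm7 */
    for (size_t i = 0; i < n; i += 2) {
        __m128i x = _mm_load_si128((const __m128i *)(up + i));
        x = _mm_xor_si128(x, ones);                 /* pxor = bitwise complement */
        _mm_store_si128((__m128i *)(rp + i), x);
    }
}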
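
The L(uent) path covers the case where rp is 16-byte aligned but up is only 8-byte aligned (up - rp = 8 mod 16), so every wanted limb pair straddles two aligned 16-byte blocks of up. The sketch below shows the realignment idea with the SSSE3 _mm_alignr_epi8 intrinsic (the palignr instruction). It is an assumption-laden illustration, not the routine's structure: it takes n even and requires that up[-1] and up[n] be readable (for example a padded buffer), whereas the assembly relies on those extra 8-byte reads landing inside aligned 16-byte blocks that already contain valid operand bytes, so they cannot fault, a guarantee plain C does not give.

#include <tmmintrin.h>          /* SSSE3 intrinsics: _mm_alignr_epi8 */
#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch: rp 16-byte aligned, up only 8-byte aligned, n even,
   up[-1]..up[n] readable.  palignr concatenates two consecutive aligned
   loads and shifts right by 8 bytes so the wanted limb pair lands in one
   register, ready for pxor and an aligned store. */
static void com_realign_ssse3(uint64_t *rp, const uint64_t *up, size_t n)
{
    __m128i ones = _mm_set1_epi8(-1);
    __m128i lo = _mm_load_si128((const __m128i *)(up - 1));         /* {up[-1], up[0]} */
    for (size_t i = 0; i < n; i += 2) {
        __m128i hi = _mm_load_si128((const __m128i *)(up + i + 1)); /* {up[i+1], up[i+2]} */
        __m128i x  = _mm_alignr_epi8(hi, lo, 8);                    /* {up[i], up[i+1]} */
        _mm_store_si128((__m128i *)(rp + i), _mm_xor_si128(x, ones));
        lo = hi;
    }
}

The real loop walks each 128-byte chunk from high addresses to low (loads from 120(up) down to -8(up)), which is why the file's comment notes that palignr is better adapted to mpn_copyd; the ascending form above is simpler to read but is not how the assembly orders its accesses.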