dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/sec_tabselect.asm

dnl  AMD64 SSE mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb     cycles/limb     cycles/limb
C                  ali,evn n       unal,evn n      other cases
C AMD K8,K9          1.65            1.65            1.8
C AMD K10            0.78            0.78            0.85
C AMD bd1            0.80            0.91            1.25
C AMD bobcat         2.15            2.15            2.37
C Intel P4           2.5             2.5             2.95
C Intel core2        1.17            1.25            1.25
C Intel NHM          0.87            0.90            0.90
C Intel SBR          0.63            0.79            0.77
C Intel atom         4.3             4.3             4.3     slower than plain code
C VIA nano           1.4             5.1             3.14    too alignment dependent

C NOTES
C  * We only honour the least significant 32 bits of the `which' and `nents'
C    arguments to allow efficient code using just SSE2.  We would need to
C    either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence.
C  * We use movd for copying between xmm and plain registers, since old gas
C    rejects movq.  But gas assembles movd as movq when given a 64-bit greg.
C  * Under DOS64 the registers xmm6-xmm15 are callee-saved.  The body keeps
C    its long-lived values in xmm6-xmm9, so those four are spilled to the
C    stack on entry and reloaded before returning.  All other vector
C    registers used here, xmm0-xmm5, are volatile in both supported ABIs.

C ---------------------------------------------------------------------------
C mpn_sec_tabselect (rp, tab, n, nents, which)
C
C Copies entry number `which' of a table into rp.  The table starts at tab
C and holds nents entries of n limbs each, with consecutive entries 8*n bytes
C apart.
C
C Every entry of the table is read for every output chunk.  Each entry is
C ANDed with an all-ones or all-zeros mask, obtained by comparing a running
C entry counter against `which', and ORed into an accumulator.  The sequence
C of loads therefore does not depend on the value of `which'.
C
C The columns are processed in chunks of 8 limbs, followed by optional tail
C chunks of 4, 2 and 1 limbs selected by the low three bits of n.
C
C Assumptions visible in the code:
C  * nents >= 1, since the inner loops test their counter at the bottom.
C  * If no counter value equals the low 32 bits of `which', the result is
C    all zeros.
C ---------------------------------------------------------------------------

define(`rp',	`%rdi')
define(`tp',	`%rsi')
define(`n',	`%rdx')
define(`nents',	`%rcx')
define(`which',	`%r8')

define(`i',	`%r10')
define(`j',	`%r9')

C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
C          nents  n   rp   tab       which j    i   temp  *    *    *    *
C
C xmm0       mask for the current entry, all ones when counter = which
C xmm1       running entry counter, 4 copies
C xmm2,xmm3  limbs loaded from the current entry
C xmm4-xmm7  accumulators for the selected limbs
C xmm8       4 copies of the low 32 bits of which
C xmm9       4 copies of 1

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sec_tabselect)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')

C Spill the callee-saved vector registers we are about to overwrite.  The
C fifth argument has already been fetched above, so moving rsp is safe.
IFDOS(`	add	$-72, %rsp	')
IFDOS(`	movdqu	%xmm6, (%rsp)	')
IFDOS(`	movdqu	%xmm7, 16(%rsp)	')
IFDOS(`	movdqu	%xmm8, 32(%rsp)	')
IFDOS(`	movdqu	%xmm9, 48(%rsp)	')

	movd	which, %xmm8
	pshufd	$0, %xmm8, %xmm8	C 4 `which' copies
	mov	$1, R32(%rax)
	movd	%rax, %xmm9
	pshufd	$0, %xmm9, %xmm9	C 4 copies of 1

	mov	n, j
	add	$-8, j			C j = n - 8, negative when n < 8
	js	L(outer_end)

C Main loop: one iteration per 8-limb (64-byte) chunk of the entry.
L(outer_top):
	mov	nents, i
	mov	tp, %r11		C remember chunk start in entry 0
	pxor	%xmm1, %xmm1		C entry counter = 0
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	ALIGN(16)
L(top):
	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0		C mask = (which == counter)
	paddd	%xmm9, %xmm1		C counter++
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	movdqu	32(tp), %xmm2
	movdqu	48(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm6
	por	%xmm3, %xmm7
	lea	(tp,n,8), tp		C same chunk in the next entry
	add	$-1, i
	jne	L(top)

	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	movdqu	%xmm6, 32(rp)
	movdqu	%xmm7, 48(rp)

	lea	64(%r11), tp		C next chunk, back in entry 0
	lea	64(rp), rp
	add	$-8, j
	jns	L(outer_top)
L(outer_end):

C Tail: 4 limbs when bit 2 of n is set.
	test	$4, R8(n)
	je	L(b0xx)
L(b1xx):
	mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	ALIGN(16)
L(tp4):
	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp4)
	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	lea	32(%r11), tp
	lea	32(rp), rp

C Tail: 2 limbs when bit 1 of n is set.
L(b0xx):
	test	$2, R8(n)
	je	L(b00x)
L(b01x):
	mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp2):
	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movdqu	0(tp), %xmm2
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp2)
	movdqu	%xmm4, 0(rp)
	lea	16(%r11), tp
	lea	16(rp), rp

C Tail: 1 limb when bit 0 of n is set.  Uses 64-bit loads and stores so
C that nothing beyond the final limb is touched.
L(b00x):
	test	$1, R8(n)
	je	L(b000)
L(b001):
	mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp1):
	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movq	0(tp), %xmm2
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp1)
	movq	%xmm4, 0(rp)

L(b000):
C Reload the callee-saved vector registers and release the spill area.
IFDOS(`	movdqu	(%rsp), %xmm6	')
IFDOS(`	movdqu	16(%rsp), %xmm7	')
IFDOS(`	movdqu	32(%rsp), %xmm8	')
IFDOS(`	movdqu	48(%rsp), %xmm9	')
IFDOS(`	add	$72, %rsp	')
	FUNC_EXIT()
	ret
EPILOGUE()