github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/sec_tabselect.asm (about) 1 dnl AMD64 mpn_sec_tabselect. 2 3 dnl Contributed to the GNU project by Torbjörn Granlund. 4 5 dnl Copyright 2011-2013 Free Software Foundation, Inc. 6 7 dnl This file is part of the GNU MP Library. 8 dnl 9 dnl The GNU MP Library is free software; you can redistribute it and/or modify 10 dnl it under the terms of either: 11 dnl 12 dnl * the GNU Lesser General Public License as published by the Free 13 dnl Software Foundation; either version 3 of the License, or (at your 14 dnl option) any later version. 15 dnl 16 dnl or 17 dnl 18 dnl * the GNU General Public License as published by the Free Software 19 dnl Foundation; either version 2 of the License, or (at your option) any 20 dnl later version. 21 dnl 22 dnl or both in parallel, as here. 23 dnl 24 dnl The GNU MP Library is distributed in the hope that it will be useful, but 25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27 dnl for more details. 28 dnl 29 dnl You should have received copies of the GNU General Public License and the 30 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31 dnl see https://www.gnu.org/licenses/. 32 33 include(`../config.m4') 34 35 36 C cycles/limb good for cpu 37 C AMD K8,K9 1.5 Y 38 C AMD K10 1.4 39 C AMD bd1 2.64 40 C AMD bobcat 2.15 Y 41 C Intel P4 4 42 C Intel core2 1.38 43 C Intel NHM 1.75 44 C Intel SBR 1.25 45 C Intel atom 2.5 Y 46 C VIA nano 1.75 Y 47 48 C NOTES 49 C * This has not been tuned for any specific processor. Its speed should not 50 C be too bad, though. 51 C * Using SSE2/AVX2 could result in many-fold speedup. 52 C * Works for any n: the main loop handles 4 limbs at a time, and tail loops handle a remaining 2 and/or 1 limbs. 
C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
C
C Copy entry number which out of a table of nents entries into rp.  Each
C entry is n limbs of 8 bytes, and consecutive entries are n*8 bytes apart.
C
C Every entry is loaded and combined under a mask that is all ones for the
C wanted entry and zero for all others, so the sequence of loads is the same
C for every value of which.  If which never matches an entry index, the limbs
C written to rp are zero.
C
C The entry loops decrement and test their counter after the first pass, so
C nents is expected to be at least 1.

define(`rp',	`%rdi')
define(`tp',	`%rsi')
define(`n',	`%rdx')
define(`nents',	`%rcx')
define(`which',	`%r8')

C ecnt counts the entries still to visit in the current pass over the table.
C lcnt holds the limbs still to produce minus 4 and drives the 4-limb loop.
define(`ecnt',	`%rbp')
define(`lcnt',	`%r9')

C Register use
C   rdi  rp		rsi  tp		rdx  n
C   rcx  nents		r8   which	rbp  ecnt
C   r9   lcnt		rax  mask	rbx  countdown to the wanted entry
C   r10 r11  scratch for loaded limbs
C   r12 r13 r14 r15  result accumulators

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sec_tabselect)
	FUNC_ENTRY(4)
C DOS64 only: the fifth argument is loaded from the stack as 32 bits.
IFDOS(`	mov	56(%rsp), %r8d	')

C Save the callee-saved registers used below.
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	n, lcnt
	sub	$4, lcnt
	js	L(tail)			C fewer than 4 limbs, skip the main loop

C Main loop: produce 4 result limbs per pass over all table entries.
L(grp4):
	push	tp			C keep the start of this 4-limb column
	mov	which, %rbx
	mov	nents, ecnt
	xor	R32(%r12), R32(%r12)
	xor	R32(%r13), R32(%r13)
	xor	R32(%r14), R32(%r14)
	xor	R32(%r15), R32(%r15)

	ALIGN(16)
L(scan4):
	sub	$1, %rbx		C borrows only at the wanted entry
	sbb	%rax, %rax		C mask is all ones on borrow, else zero
	mov	0(tp), %r10
	mov	8(tp), %r11
	and	%rax, %r10
	and	%rax, %r11
	or	%r10, %r12
	or	%r11, %r13
	mov	16(tp), %r10
	mov	24(tp), %r11
	and	%rax, %r10
	and	%rax, %r11
	or	%r10, %r14
	or	%r11, %r15
	lea	(tp,n,8), tp		C same limbs of the next entry
	sub	$1, ecnt
	jnz	L(scan4)

	mov	%r12, 0(rp)
	mov	%r13, 8(rp)
	mov	%r14, 16(rp)
	mov	%r15, 24(rp)
	pop	tp			C back to the first entry
	lea	32(tp), tp		C move on to the next 4 limbs
	lea	32(rp), rp
	sub	$4, lcnt
	jns	L(grp4)

C Tail: bit 1 of n set means 2 more limbs are due.
L(tail):
	test	$2, R8(n)
	jz	L(chk1)
L(tail2):
	push	tp
	mov	which, %rbx
	mov	nents, ecnt
	xor	R32(%r12), R32(%r12)
	xor	R32(%r13), R32(%r13)
	ALIGN(16)
L(scan2):
	sub	$1, %rbx
	sbb	%rax, %rax
	mov	0(tp), %r10
	mov	8(tp), %r11
	and	%rax, %r10
	and	%rax, %r11
	or	%r10, %r12
	or	%r11, %r13
	lea	(tp,n,8), tp
	sub	$1, ecnt
	jnz	L(scan2)
	mov	%r12, 0(rp)
	mov	%r13, 8(rp)
	pop	tp
	lea	16(tp), tp
	lea	16(rp), rp

C Tail: bit 0 of n set means 1 final limb is due.
L(chk1):
	test	$1, R8(n)
	jz	L(done)
L(tail1):
C tp is not saved here because nothing reads it after this loop.
	mov	which, %rbx
	mov	nents, ecnt
	xor	R32(%r12), R32(%r12)
	ALIGN(16)
L(scan1):
	sub	$1, %rbx
	sbb	%rax, %rax
	mov	0(tp), %r10
	and	%rax, %r10
	or	%r10, %r12
	lea	(tp,n,8), tp
	sub	$1, ecnt
	jnz	L(scan1)
	mov	%r12, 0(rp)

C Restore the callee-saved registers in reverse push order.
L(done):
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()