dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C	    aligned	    unaligned	     best seen	     for cpu?
C AMD K8,K9
C AMD K10	 0.85		 1.64				Y/N
C AMD bull	 1.4		 1.4				N
C AMD pile	 0.77		 0.93				N
C AMD steam	 ?		 ?
C AMD excavator	 ?		 ?
C AMD bobcat
C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
C Intel P4	 2.3		 2.3				Y
C Intel core	 1.0		 1.0		0.52/0.64	N
C Intel NHM	 0.5		 0.67				Y
C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
C Intel IBR	 0.50		 0.57		opt/0.54	Y
C Intel HWL	 0.50		 0.57		opt/0.51	Y
C Intel BWL	 0.55		 0.62		opt/0.55	Y
C Intel atom
C Intel SLM	 1.02		 1.27		opt/1.07	Y/N
C VIA nano	 1.16		 5.16				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

C void mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C Copy n 8-byte limbs from {up,n} to {rp,n}, walking addresses upward.
C Stores to rp always use aligned movdqa once rp has been 16-byte aligned;
C loads from up use movdqu throughout (see comment above).

C INPUT PARAMETERS
define(`rp', `%rdi')		C destination pointer
define(`up', `%rsi')		C source pointer
define(`n',  `%rdx')		C limb count

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$3, n		C NB: bc code below assumes this limit
	jc	L(bc)		C n < 3: tiny-operand basecase

	test	$8, R8(rp)	C is rp 16-byte aligned?
	jz	L(ali)		C jump if rp aligned
	movsq			C copy single limb (rp,up += 8) to align rp
	dec	n

	sub	$16, n		C n -= 16; borrow means < 16 limbs remain
	jc	L(sma)

C Main loop: 16 limbs (128 bytes) per iteration; unaligned loads,
C aligned stores (rp is 16-byte aligned here).
	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	movdqu	112(up), %xmm7
	lea	128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	movdqa	%xmm7, 112(rp)
	lea	128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

C Tail: fewer than 16 limbs left.  The low 4 bits of n select copies of
C 8, 4, 2 and finally 1 limb (binary decomposition of the remainder).
L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
	ALIGN(16)
1:
L(end):	test	$1, R8(n)	C odd limb left?  (bit 0 of n is unchanged
	jz	1f		C by the sub $16 / sub $2 adjustments above)
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small operands speed, not for correctness as
C the above code is currently written.  The commented-out lines need to be
C reinstated if this code is to be used for n > 3, and then the post loop
C offsets need fixing.

L(bc):	sub	$2, n		C borrow (n < 2) skips the 2-limb copy
	jc	L(end)
	ALIGN(16)
1:	mov	(up), %rax
	mov	8(up), %rcx
dnl	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
dnl	lea	16(rp), rp
dnl	sub	$2, n
dnl	jnc	1b

	test	$1, R8(n)	C n was 2 or 3: copy the possible odd limb
	jz	L(ret)
	mov	16(up), %rax
	mov	%rax, 16(rp)
L(ret):	FUNC_EXIT()
	ret
EPILOGUE()