github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/copyd.asm

dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb     cycles/limb     cycles/limb     good
C                aligned         unaligned       best seen       for cpu?
C AMD K8,K9
C AMD K10            0.85            1.64                        Y/N
C AMD bull           1.4             1.4                         Y
C AMD pile           0.68            0.98                        Y/N
C AMD steam
C AMD excavator
C AMD bobcat
C AMD jaguar         0.65            1.02         opt/0.93       Y/N
C Intel P4           2.3             2.3                         Y
C Intel core         1.0             1.0          0.52/0.80      N
C Intel NHM          0.5             0.67                        Y
C Intel SBR           0.51            0.75         opt/0.54      Y/N
C Intel IBR           0.50            0.57         opt/0.50      Y
C Intel HWL           0.50            0.57         opt/0.51      Y
C Intel BWL           0.55            0.62         opt/0.55      Y
C Intel atom
C Intel SLM           1.02            1.27         opt/1.04      Y/N
C VIA nano            1.16            5.16                       Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_copyd)
        FUNC_ENTRY(3)

        test    n, n
        jz      L(don)

        lea     -16(rp,n,8), rp
        lea     -16(up,n,8), up

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        mov     8(up), %rax
        lea     -8(up), up
        mov     %rax, 8(rp)
        lea     -8(rp), rp
        dec     n

        sub     $16, n
        jc      L(sma)

        ALIGN(16)
L(top): movdqu  (up), %xmm0
        movdqu  -16(up), %xmm1
        movdqu  -32(up), %xmm2
        movdqu  -48(up), %xmm3
        movdqu  -64(up), %xmm4
        movdqu  -80(up), %xmm5
        movdqu  -96(up), %xmm6
        movdqu  -112(up), %xmm7
        lea     -128(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, -16(rp)
        movdqa  %xmm2, -32(rp)
        movdqa  %xmm3, -48(rp)
        movdqa  %xmm4, -64(rp)
        movdqa  %xmm5, -80(rp)
        movdqa  %xmm6, -96(rp)
        movdqa  %xmm7, -112(rp)
        lea     -128(rp), rp
L(ali): sub     $16, n
        jnc     L(top)

L(sma): test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  -16(up), %xmm1
        movdqu  -32(up), %xmm2
        movdqu  -48(up), %xmm3
        lea     -64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, -16(rp)
        movdqa  %xmm2, -32(rp)
        movdqa  %xmm3, -48(rp)
        lea     -64(rp), rp
1:
        test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  -16(up), %xmm1
        lea     -32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, -16(rp)
        lea     -32(rp), rp
1:
        test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        lea     -16(up), up
        movdqa  %xmm0, (rp)
        lea     -16(rp), rp
1:
        test    $1, R8(n)
        jz      1f
        mov     8(up), %r8
        mov     %r8, 8(rp)
1:
L(don): FUNC_EXIT()
        ret
EPILOGUE()
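
C The reference sketch below is an editorial addition, not part of the GMP
C sources.  Assuming GMP's mp_limb_t and mp_size_t types from gmp.h, it shows
C in plain C what the routine above does: copy n limbs from up[] to rp[] in
C decreasing address order, which keeps the copy correct for overlapping
C operands with rp >= up.  The name copyd_ref is hypothetical.
C
C       #include <gmp.h>
C
C       static void
C       copyd_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C       {
C         mp_size_t i;
C         for (i = n; i-- > 0;)         /* top limb first, then downwards */
C           rp[i] = up[i];
C       }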