dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastavx/copyd.asm

dnl  AMD64 mpn_copyd optimised for CPUs with fast AVX.

dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb   cycles/limb   cycles/limb     good
C                  aligned      unaligned     best seen    for cpu?
C AMD K8,K9          n/a
C AMD K10            n/a
C AMD bull           n/a
C AMD pile           4.87          4.87                       N
C AMD steam           ?             ?
C AMD bobcat         n/a
C AMD jaguar         n/a
C Intel P4           n/a
C Intel core         n/a
C Intel NHM          n/a
C Intel SBR          0.50          0.91                       N
C Intel IBR          0.50          0.65                       N
C Intel HWL          0.25          0.30                       Y
C Intel BWL          0.28          0.37                       Y
C Intel atom         n/a
C VIA nano           n/a

C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.
C For the bulk copying, we write using aligned 32-byte operations, but we read
C with both aligned and unaligned 32-byte operations.

C mpn_copyd (rp, up, n) -- copy n 8-byte limbs from up to rp, starting with
C the highest-addressed limb and working towards the lowest.
C
C Register roles (after FUNC_ENTRY):
C   rp = rdi   destination; biased below so that 24(rp) is the highest limb
C              that has not been written yet
C   up = rsi   source; biased the same way
C   n  = rdx   number of limbs still to copy
C Scratch registers used: rax, rcx, r8, r9, ymm0-ymm3, flags.
C
C Structure:
C   n <= 7  : scalar basecase, 8-byte moves only
C   n >= 8  : peel 1 and/or 2 limbs until rp is 32-byte aligned, copy 16 limbs
C             per loop iteration with aligned 32-byte stores, then finish the
C             remaining 0..15 limbs by testing the low four bits of n

define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`vmovdqu', vlddqu)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

C Point both operands 32 bytes below their ends, so offset 24 addresses the
C top limb and offset 0 addresses the top 32-byte chunk.
	lea	-32(rp,n,8), rp
	lea	-32(up,n,8), up

	cmp	$7, n			C basecase needed for correctness
	jbe	L(bc)

C Alignment prologue: move one limb, then two limbs, as needed to make the
C destination 32-byte aligned.  This consumes at most 3 of the n >= 8 limbs.
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	24(up), %rax
	lea	-8(up), up
	mov	%rax, 24(rp)
	lea	-8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n			C at least 16 limbs left?
	jc	L(sma)			C no, go straight to the tail

C Main loop: 4 x 32 bytes = 16 limbs per iteration.  Loads may be unaligned,
C stores are aligned.
	ALIGN(16)
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	vmovdqu	-64(up), %ymm2
	vmovdqu	-96(up), %ymm3
	lea	-128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	vmovdqa	%ymm2, -64(rp)
	vmovdqa	%ymm3, -96(rp)
	lea	-128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

C Tail: n has gone negative, but only multiples of 16 were subtracted, so its
C low four bits still hold the number of limbs left (0..15).
L(sma):	test	$8, R8(n)		C 8 limbs
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	lea	-64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)		C 4 limbs
	jz	1f
	vmovdqu	(up), %ymm0
	lea	-32(up), up
	vmovdqa	%ymm0, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)		C 2 limbs
	jz	1f
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)		C 1 limb
	jz	1f
	mov	24(up), %r8
	mov	%r8, 24(rp)
1:
C Every path through the n >= 8 code has executed a 256-bit instruction: at
C least 5 limbs remain after the alignment prologue, so the loop, the 8-limb
C step or the 4-limb step has run.  Clear the upper ymm halves before
C returning so that callers running legacy SSE code are not hit by AVX-SSE
C transition penalties.
	vzeroupper
	FUNC_EXIT()
	ret

C Basecase for n <= 7: plain 8-byte moves, selected by the bits of n.  No
C vector registers are touched here, so no vzeroupper is needed.
	ALIGN(16)
L(bc):	test	$4, R8(n)		C 4 limbs
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	mov	8(up), %r8
	mov	(up), %r9
	lea	-32(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	mov	%r8, 8(rp)
	mov	%r9, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)		C 2 limbs
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	lea	-16(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)		C 1 limb
	jz	1f
	mov	24(up), %rax
	mov	%rax, 24(rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()