dnl  AMD64 mpn_copyi optimised for CPUs with fast AVX.

dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	 4.87		 4.87				N
C AMD steam	 ?		 ?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	 0.50		 0.91				N
C Intel IBR	 0.50		 0.65				N
C Intel HWL	 0.25		 0.30				Y
C Intel BWL	 0.28		 0.37				Y
C Intel atom	n/a
C VIA nano	n/a
C
C NOTE: the figures above predate the vzeroupper added at the AVX exit path
C and have not been re-measured.

C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  For the bulk copying, we
C write using aligned 32-byte operations, but we read with both aligned and
C unaligned 32-byte operations.

C mpn_copyi -- copy n limbs from up to rp, working from the lowest address
C upwards.
C
C Register roles (after FUNC_ENTRY):
C   rp (%rdi)	destination pointer, advanced as limbs are stored
C   up (%rsi)	source pointer, advanced as limbs are loaded
C   n  (%rdx)	number of 8-byte limbs still to copy
C
C Scratch: %rax, %rcx, %r8, %r9, %ymm0-%ymm3, flags.
C
C The aligned vector stores below rely on rp being 8-byte aligned on entry;
C only bits 3 and 4 of rp are examined before vmovdqa is used.

define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`vmovdqu', vlddqu)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

C Operands of 7 limbs or fewer are handled with plain 64-bit moves.
	cmp	$7, n
	jbe	L(bc)

C Bring rp up to a 32-byte boundary: first one limb, then one 16-byte chunk.
C With n >= 8 here, at least 5 limbs remain after these (at most 3) limbs.
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	(up), %rax
	lea	8(up), up
	mov	%rax, (rp)
	lea	8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	(up), %xmm0
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
	sub	$2, n

C Bias n by -16 so that the borrow from the subtraction doubles as the loop
C test.  The low four bits of n are unaffected by subtracting 16, so they
C still give the tail length once the loop is done.
L(a3):	sub	$16, n
	jc	L(sma)

C Main loop: 16 limbs (128 bytes) per iteration, unaligned loads, aligned
C stores.
	ALIGN(16)
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	32(up), %ymm1
	vmovdqu	64(up), %ymm2
	vmovdqu	96(up), %ymm3
	lea	128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	vmovdqa	%ymm2, 64(rp)
	vmovdqa	%ymm3, 96(rp)
	lea	128(rp), rp
	sub	$16, n
	jnc	L(top)

C Tail: between 0 and 15 limbs remain, selected bit by bit from n.
L(sma):	test	$8, R8(n)		C 8 limbs
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	32(up), %ymm1
	lea	64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)		C 4 limbs
	jz	1f
	vmovdqu	(up), %ymm0
	lea	32(up), up
	vmovdqa	%ymm0, (rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)		C 2 limbs
	jz	1f
	vmovdqu	(up), %xmm0
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
1:
	test	$1, R8(n)		C final limb
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
C Every route to this exit has executed at least one 256-bit move, so the
C upper ymm halves are dirty.  Clear them so that legacy-encoded SSE code in
C the caller does not pay an AVX/SSE transition penalty.  vzeroupper leaves
C the low 128 bits of every vector register untouched.
	vzeroupper
	FUNC_EXIT()
	ret

C Basecase, n <= 7 (n = 0 copies nothing): 64-bit moves only, no vector
C state is touched, hence no vzeroupper on this exit.
	ALIGN(16)
L(bc):	test	$4, R8(n)		C 4 limbs
	jz	1f
	mov	(up), %rax
	mov	8(up), %rcx
	mov	16(up), %r8
	mov	24(up), %r9
	lea	32(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	mov	%r8, 16(rp)
	mov	%r9, 24(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)		C 2 limbs
	jz	1f
	mov	(up), %rax
	mov	8(up), %rcx
	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	lea	16(rp), rp
1:
	test	$1, R8(n)		C final limb
	jz	1f
	mov	(up), %rax
	mov	%rax, (rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()