github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/com.asm

dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb   cycles/limb   cycles/limb   good
C                   aligned       unaligned     best seen     for cpu?
C AMD K8,K9         2.0           2.0                         N
C AMD K10           0.85          1.3                         Y/N
C AMD bull          1.40          1.40                        Y
C AMD pile          0.9-1.4       0.9-1.4                     Y
C AMD steam
C AMD excavator
C AMD bobcat        3.1           3.1                         N
C AMD jaguar        0.91          0.91          opt/opt       Y
C Intel P4          2.28          illop                       Y
C Intel core2       1.02          1.02                        N
C Intel NHM         0.53          0.68                        Y
C Intel SBR         0.51          0.75          opt/0.65      Y/N
C Intel IBR         0.50          0.57          opt/opt       Y
C Intel HWL         0.51          0.64          opt/0.58      Y
C Intel BWL         0.61          0.65          0.57/opt      Y
C Intel atom        3.68          3.68                        N
C Intel SLM         1.09          1.35                        N
C VIA nano          1.17          5.09                        Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there, performance
C doubles when reading using MOVDQA (for an aligned source).  It is unclear how
C to best handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        pcmpeqb %xmm7, %xmm7            C set to 111...111

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        mov     (up), %rax              C complement one limb to align rp
        lea     8(up), up
        not     %rax
        mov     %rax, (rp)
        lea     8(rp), rp
        dec     n

        sub     $14, n
        jc      L(sma)

        ALIGN(16)
L(top): movdqu  (up), %xmm0             C main loop: 14 limbs (112 bytes)
        movdqu  16(up), %xmm1           C per iteration
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        movdqu  64(up), %xmm4
        movdqu  80(up), %xmm5
        movdqu  96(up), %xmm6
        lea     112(up), up
        pxor    %xmm7, %xmm0            C xor with all-ones = complement
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        pxor    %xmm7, %xmm4
        pxor    %xmm7, %xmm5
        pxor    %xmm7, %xmm6
        movdqa  %xmm0, (rp)             C aligned 16-byte stores
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        movdqa  %xmm4, 64(rp)
        movdqa  %xmm5, 80(rp)
        movdqa  %xmm6, 96(rp)
        lea     112(rp), rp
L(ali): sub     $14, n
        jnc     L(top)

L(sma): add     $14, n                  C 0 <= n <= 13; peel 8/4/2/1 limbs
        test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C 8-limb block
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
1:
        test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C 4-limb block
        movdqu  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp
1:
        test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C 2-limb block
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp
1:
        test    $1, R8(n)
        jz      1f
        mov     (up), %rax              C final odd limb, done scalar
        not     %rax
        mov     %rax, (rp)
1:
L(don): FUNC_EXIT()
        ret
EPILOGUE()
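
The strategy described in the comments above can be summarised in portable C. Below is a minimal sketch using SSE2 intrinsics, not GMP code: com_sse_sketch and LIMB are hypothetical names, and it assumes the usual mpn preconditions (n >= 1, rp and up limb-aligned, non-overlapping operands). It aligns rp to 16 bytes with one scalar limb when needed, complements 14 limbs (112 bytes) per main-loop pass with unaligned loads and aligned stores, then peels the remainder in 8-, 4-, 2- and 1-limb chunks, mirroring L(top) and L(sma) above.

    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t LIMB;   /* stand-in for mp_limb_t on a 64-bit limb build */

    void com_sse_sketch(LIMB *rp, const LIMB *up, size_t n)
    {
        /* all-ones mask, as pcmpeqb %xmm7,%xmm7 produces above */
        const __m128i ones = _mm_set1_epi32(-1);

        if (((uintptr_t)rp & 8) != 0) {     /* rp is 8 mod 16: one scalar limb */
            *rp++ = ~*up++;
            n--;
        }
        while (n >= 14) {                   /* main loop, 14 limbs per pass */
            for (int i = 0; i < 7; i++) {
                __m128i x = _mm_loadu_si128((const __m128i *)(up + 2 * i));
                _mm_store_si128((__m128i *)(rp + 2 * i), _mm_xor_si128(x, ones));
            }
            up += 14;
            rp += 14;
            n -= 14;
        }
        for (size_t step = 8; step >= 2; step /= 2) {   /* 8-, 4-, 2-limb tails */
            if (n & step) {
                for (size_t i = 0; i < step; i += 2) {
                    __m128i x = _mm_loadu_si128((const __m128i *)(up + i));
                    _mm_store_si128((__m128i *)(rp + i), _mm_xor_si128(x, ones));
                }
                up += step;
                rp += step;
            }
        }
        if (n & 1)                          /* final odd limb, scalar */
            *rp = ~*up;
    }

Swapping _mm_loadu_si128 for _mm_load_si128 when up is known to be 16-byte aligned corresponds to the MOVDQA variant mentioned in the core2 note above; the asm deliberately avoids that extra loop and pays for it only on core2.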
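
For completeness, mpn_com is part of GMP's documented low-level mpn interface, so the routine above is reachable from ordinary C. A small usage sketch (the build command, e.g. `cc test.c -lgmp`, may vary per system):

    #include <stdio.h>
    #include <gmp.h>

    int main(void)
    {
        mp_limb_t up[3] = { 1, 2, 3 };
        mp_limb_t rp[3];

        mpn_com(rp, up, 3);               /* rp[i] = ~up[i] for i in [0, 3) */

        for (int i = 0; i < 3; i++)
            gmp_printf("%Mx\n", rp[i]);   /* 'M' is gmp_printf's mp_limb_t modifier */
        return 0;
    }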