github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bd1/aorsmul_1.asm (about) 1 dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer. 2 3 dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C AMD K8,K9 35 C AMD K10 36 C AMD bd1 4.5-4.7 37 C AMD bobcat 38 C Intel P4 39 C Intel core2 40 C Intel NHM 41 C Intel SBR 42 C Intel atom 43 C VIA nano 44 45 C The loop of this code is the result of running a code generation and 46 C optimisation tool suite written by David Harvey and Torbjorn Granlund. 47 48 C TODO 49 C * Try to make loop run closer to 4 c/l. 
C Operand registers.  Each define gives the STD64 register; the trailing
C comment names the register that carries the same argument under DOS64.
define(`rp',      `%rdi')   C DOS64: rcx
define(`up',      `%rsi')   C DOS64: rdx
define(`n_param', `%rdx')   C DOS64: r8
define(`v0',      `%rcx')   C DOS64: r9

define(`n',       `%r11')

C Operation selection.  ADDSUB is the instruction that applies each finished
C limb to the destination; func is the name given to PROLOGUE below.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C DOS64 remapping.  IFDOS and IFSTD come from config.m4 (not visible here);
C presumably each emits its argument only for the matching ABI.
C rp, v0 and n are pointed at the registers the arguments arrive in.  The bare
C tokens r9 and r8 are then redefined, so every r9 or r8 written in the
C function body becomes rdi or r11 under DOS64.  The doubled quotes on up, rp,
C v0 and n stop their expansions from being rescanned, so v0 and n keep
C naming the real r9 and r8.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`v0', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``r11'')	') dnl

C mp_limb_t func (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
C
C Multiplies the n limbs at up by v0 and adds the product to (addmul_1) or
C subtracts it from (submul_1) the n limbs at rp, updating rp in place.
C Returns in rax the limb that did not fit: the high half of the last product
C plus the carries (or borrows) propagated into it.
C
C Assumes n >= 1: u[0] is loaded and multiplied before n is examined, and
C there is no path for n = 0.
C
C Register use once setup is done (STD64 names; see the DOS64 remapping above):
C   rax, rdx          low and high limb of the current u[i] * v0
C   r8, r9, r10, rbx  rotating accumulators for partial sums not yet applied
C   n                 negative limb index, raised by 4 per loop pass
C   up                original up + 8*n, so (up,n,8) walks the source
C   rp                original rp + 8*n - 16, so (rp) and 8(rp) are the last
C                     two destination limbs
C
C The loop has four stages and four entry points, one per value of n mod 4.
C Every stage is one self-contained carry chain: ADDSUB into memory, adc of
C the new low limb, adc of the new high limb.  The accumulators are cleared
C with mov rather than xor because in two stages the clearing sits between
C ADDSUB and the adc that consumes its carry, and mov leaves the flags alone.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
C DOS64 only: rsi and rdi are callee-saved there and both are overwritten
C below, so save them, then move the source pointer from rdx into rsi.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C rax = u[0], fetched before anything else
	push	%rbx			C callee-saved; used as count copy and accumulator
IFSTD(`	mov	n_param, %rbx	')	C mul overwrites rdx, so copy n out of it first
IFDOS(`	mov	n, %rbx		')
	mul	v0			C rdx:rax = u[0] * v0

IFSTD(`	mov	%rbx, n		')

	and	$3, R32(%rbx)		C rbx = n mod 4; ZF set when that is 0
	lea	-16(rp,n,8), rp		C rp += 8*n - 16; lea keeps ZF for the jz
	jz	L(rem0)
	cmp	$2, R32(%rbx)
	jb	L(rem1)
	jz	L(rem2)

C n mod 4 = 3: enter the loop at its fourth stage.
L(rem3):	mov	$0, R32(%r8)
	mov	%rax, %rbx		C low limb of u[0]*v0, applied to rp[0] at ent3
	mov	$0, R32(%r9)
	mov	8(up), %rax		C rax = u[1]
	mov	%rdx, %r10		C high limb of u[0]*v0
	lea	(up,n,8), up		C up = one past the last source limb
	not	n			C n = -n - 1
	jmp	L(ent3)

C n mod 4 = 0: enter the loop at its third stage.
L(rem0):	mov	$0, R32(%r10)
	mov	%rax, %r8		C low limb of u[0]*v0, applied to rp[0] at ent0
	mov	%rdx, %rbx		C high limb of u[0]*v0
	mov	8(up), %rax		C rax = u[1]
	lea	(up,n,8), up		C up = one past the last source limb
	neg	n			C n = -n
	jmp	L(ent0)

C n mod 4 = 1: either a single limb, or enter the loop at its second stage.
L(rem1):	cmp	$1, n
	jz	L(one)			C n = 1: the only product is already in rdx:rax
	mov	%rax, %r9		C low limb of u[0]*v0, applied to rp[0] at ent1
	mov	8(up), %rax		C rax = u[1]
	mov	%rdx, %r8		C high limb of u[0]*v0
	mov	$0, R32(%rbx)
	lea	(up,n,8), up		C up = one past the last source limb
	neg	n
	inc	n			C n = 1 - n
	jmp	L(ent1)

C n mod 4 = 2: falls straight into the first loop stage, unless n is exactly 2.
L(rem2):	mov	$0, R32(%rbx)
	mov	%rax, %r10		C low limb of u[0]*v0, applied to rp[0] at loop
	mov	%rdx, %r9		C high limb of u[0]*v0
	mov	8(up), %rax		C rax = u[1]
	mov	$0, R32(%r8)
	lea	(up,n,8), up		C up = one past the last source limb
	neg	n
	add	$2, n			C n = 2 - n
	jns	L(tail)			C n was 2: only the two-limb tail remains

	ALIGN(32)
C Stage 1: apply r10, fold the new product into r9 and r8.
L(loop):	mul	v0
	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax		C next source limb; mov keeps CF for the adc
	adc	%rdx, %r8
C Stage 2: apply r9, fold the new product into r8 and rbx.
L(ent1):	mul	v0
	mov	$0, R32(%r10)		C r10 was consumed in stage 1
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
	mov	8(up,n,8), %rax
C Stage 3: apply r8, fold the new product into rbx and r10.
L(ent0):	mul	v0
	ADDSUB	%r8, 16(rp,n,8)
	mov	$0, R32(%r8)		C between ADDSUB and adc: must not touch CF
	adc	%rax, %rbx
	mov	$0, R32(%r9)		C r9 was consumed in stage 2
	mov	16(up,n,8), %rax
	adc	%rdx, %r10
C Stage 4: apply rbx, fold the new product into r10 and r9.
L(ent3):	mul	v0
	ADDSUB	%rbx, 24(rp,n,8)
	mov	$0, R32(%rbx)		C between ADDSUB and adc: must not touch CF
	adc	%rax, %r10
	adc	%rdx, %r9
	mov	24(up,n,8), %rax
	add	$4, n
	js	L(loop)			C keep going while the index is negative

C Wind down: rax holds the final source limb, r10/r9/r8 the pending sums.
L(tail):	mul	v0
	ADDSUB	%r10, (rp)		C second-to-last destination limb
	adc	%r9, %rax
	adc	%r8, %rdx
L(one):	ADDSUB	%rax, 8(rp)		C last destination limb
	adc	$0, %rdx		C fold the final carry or borrow into the top limb
	mov	%rdx, %rax		C return value

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()
ASM_END()