github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/aorrlsh_n.asm (about) 1 dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. 2 3 dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C AMD K8,K9 2.87 < 3.85 for lshift + add_n 35 C AMD K10 2.75 < 3.85 for lshift + add_n 36 C Intel P4 22 > 7.33 for lshift + add_n 37 C Intel core2 4.1 > 3.27 for lshift + add_n 38 C Intel NHM 4.4 > 3.75 for lshift + add_n 39 C Intel SBR 3.17 < 3.46 for lshift + add_n 40 C Intel atom ? ? 8.75 for lshift + add_n 41 C VIA nano 4.7 < 6.25 for lshift + add_n 42 43 C TODO 44 C * Can we propagate carry into rdx instead of using a special carry register? 45 C That could save enough insns to get to 10 cycles/iteration. 
C Combined shift-and-add / shift-and-reverse-subtract over n limbs:
C
C   mpn_addlsh_n:  {rp,n} = ({vp,n} << cnt) + {up,n}
C   mpn_rsblsh_n:  {rp,n} = ({vp,n} << cnt) - {up,n}
C
C The value returned in rax is the high part of the last shifted V limb,
C adjusted by the final carry (addlsh) or borrow (rsblsh).
C
C The shift is done by multiplying every V limb by 2^cnt.  Each mul leaves the
C shifted limb in rax and the bits that spilled out of it in rdx; the spill of
C one limb is or-ed into the shifted value of the next limb.
C
C NOTE(review): presumably 0 < cnt < 64 as for other GMP shift routines, and
C n >= 1 -- confirm against callers.

C Incoming arguments (STD64 register assignment).
define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n_param',  `%rcx')
define(`cnt',      `%r8')

C Working registers.  rdx and rcx are needed for mul and shl, so vp and the
C limb index are moved to callee-saved registers.
define(`vp',    `%r12')
define(`n',     `%rbp')		C negative limb index, counts up towards zero

C Aliases used in the body below.
define(`pow2',  `%r8')		C 2^cnt, replaces cnt once computed
define(`cy',    `%rcx')		C saved carry: 0 or -1 (after cnt was consumed)
define(`nmod4', `%r9')		C n mod 4, only live during dispatch
define(`w0',    `%rbx')		C four result limbs in flight
define(`w1',    `%r9')
define(`w2',    `%r10')
define(`w3',    `%r11')

ifdef(`OPERATION_addlsh_n',`
  define(ADDSUB,       `add')
  define(ADCSBB,       `adc')
  define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
  define(ADDSUB,       `sub')
  define(ADCSBB,       `sbb')
  define(func, mpn_rsblsh_n)
')

MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
C On DOS64 the fifth argument (cnt) is fetched from the stack.
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%r12
	push	%rbp
	push	%rbx

	mov	(vp_param), %rax	C fetch V[0] before rdx is clobbered

	mov	$0, R32(n)
	sub	n_param, n		C n = -n_param

C Bias the pointers so that (ptr,n,8) style operands walk the arrays while n
C runs from -n_param up to zero.
	lea	-16(up,n_param,8), up
	lea	-16(rp,n_param,8), rp
	lea	16(vp_param,n_param,8), vp

	mov	n_param, nmod4		C keep a copy, rcx is reused next

	mov	cnt, %rcx		C shift count into cl
	mov	$1, R32(pow2)
	shl	R8(%rcx), pow2		C pow2 = 1 << cnt

	mul	pow2			C rdx:rax = V[0] * 2^cnt

C Dispatch on n mod 4 into the matching entry of the 4-way unrolled loop.
	and	$3, R32(nmod4)
	jz	L(mod0)
	cmp	$2, R32(nmod4)
	jc	L(mod1)			C below 2, so n mod 4 = 1
	jz	L(mod2)
					C fall through: n mod 4 = 3

L(mod3):	mov	%rax, w3
	ADDSUB	16(up,n,8), w3		C first limb, starts the carry chain
	mov	-8(vp,n,8), %rax
	sbb	R32(cy), R32(cy)	C park carry in cy as 0 or -1
	mov	%rdx, w0
	mul	pow2
	or	%rax, w0
	mov	(vp,n,8), %rax
	mov	%rdx, w1
	mul	pow2
	or	%rax, w1
	add	$3, n
	jnz	L(in3)
	jmp	L(fin3)			C n was exactly 3

L(mod2):	mov	%rax, w0
	mov	-8(vp,n,8), %rax
	mov	%rdx, w1
	mul	pow2
	or	%rax, w1
	add	$2, n
C When n was exactly 2 we leave with cy still holding cnt.  The add of cy to
C itself at fin2 then yields a clear carry, provided cnt is small.
	jz	L(fin2)
	mov	%rdx, w2
	mov	-16(vp,n,8), %rax
	mul	pow2
	or	%rax, w2
	xor	R32(cy), R32(cy)	C no carry pending yet
	jmp	L(in2)

L(mod1):	mov	%rax, w1
	mov	%rdx, w2
	add	$1, n
	jnz	L(mod1big)
	ADDSUB	8(up,n,8), w1		C n was exactly 1: single limb
	jmp	L(fin1)			C carry flag is consumed at fin1
L(mod1big):	mov	-16(vp,n,8), %rax
	mul	pow2
	or	%rax, w2
	mov	%rdx, w3
	mov	-8(vp,n,8), %rax
	mul	pow2
	or	%rax, w3
	ADDSUB	8(up,n,8), w1
	ADCSBB	16(up,n,8), w2
	ADCSBB	24(up,n,8), w3
	mov	(vp,n,8), %rax
	sbb	R32(cy), R32(cy)	C park carry
	jmp	L(in1)

L(mod0):	mov	%rax, w2
	mov	%rdx, w3
	mov	-8(vp,n,8), %rax
	mul	pow2
	or	%rax, w3
	ADDSUB	16(up,n,8), w2
	ADCSBB	24(up,n,8), w3
	mov	(vp,n,8), %rax
	sbb	R32(cy), R32(cy)	C park carry
	mov	%rdx, w0
	mul	pow2
	or	%rax, w0
	mov	8(vp,n,8), %rax
	add	$4, n
	jz	L(done)			C n was exactly 4

C Main loop, four limbs per pass.  Stores of the previous group are interleaved
C with the multiplies of the current group.  The carry lives in cy between the
C adc/sbb runs because add $4 on the index destroys the flags.
	ALIGN(8)
L(loop):	mov	%rdx, w1
	mul	pow2
	or	%rax, w1
	mov	w2, -16(rp,n,8)
L(in3):	mov	%rdx, w2
	mov	-16(vp,n,8), %rax
	mul	pow2
	or	%rax, w2
	mov	w3, -8(rp,n,8)
L(in2):	mov	%rdx, w3
	mov	-8(vp,n,8), %rax
	mul	pow2
	or	%rax, w3
	add	R32(cy), R32(cy)	C reload carry flag from cy
	ADCSBB	(up,n,8), w0
	ADCSBB	8(up,n,8), w1
	ADCSBB	16(up,n,8), w2
	ADCSBB	24(up,n,8), w3
	mov	(vp,n,8), %rax
	sbb	R32(cy), R32(cy)	C park carry again
	mov	w0, (rp,n,8)
L(in1):	mov	%rdx, w0
	mul	pow2
	or	%rax, w0
	mov	w1, 8(rp,n,8)
L(in0):	mov	8(vp,n,8), %rax
	add	$4, n
	jnz	L(loop)

C Wind down: flush the limbs still in flight and fold the last carry into the
C spill of the top limb to form the return value.
L(done):	mov	%rdx, w1
	mul	pow2
	or	%rax, w1
	mov	w2, -16(rp,n,8)
L(fin3):	mov	w3, -8(rp,n,8)
L(fin2):	add	R32(cy), R32(cy)	C reload carry flag from cy
	ADCSBB	(up,n,8), w0
	ADCSBB	8(up,n,8), w1
	mov	w0, (rp,n,8)
L(fin1):	mov	w1, 8(rp,n,8)
	mov	%rdx, %rax		C high part of the top limb
	ADCSBB	$0, %rax		C plus carry or minus borrow
	pop	%rbx
	pop	%rbp
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()