dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/aorrlsh1_n.asm

dnl  AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom.
dnl  Used also for AMD bd1.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * This code is slightly large at 433 bytes.
C  * sublsh1_n.asm and this file use the same basic pattern.

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bd1	 2.3
C AMD bobcat	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel atom	 4.875	(4.75 is probably possible)
C VIA nano	 ?

C INPUT PARAMETERS
C   rp  pointer the result limbs are stored through
C   up  pointer to the operand combined via ADCSBB
C   vp  pointer to the operand that is doubled
C   n   limb count
C   cy  carry-in, read only by the func_nc entry point
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cy',	`%r8')

C ADDSUB is defined for both operations but is not referenced by the code
C below; only ADCSBB is used.
ifdef(`OPERATION_addlsh1_n', `
  define(ADDSUB,	add)
  define(ADCSBB,	adc)
  define(func_n,	mpn_addlsh1_n)
  define(func_nc,	mpn_addlsh1_nc)')
ifdef(`OPERATION_rsblsh1_n', `
  define(ADDSUB,	sub)
  define(ADCSBB,	sbb)
  define(func_n,	mpn_rsblsh1_n)
  define(func_nc,	mpn_rsblsh1_nc)')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)

C Operation per 64-bit limb i, with carries propagated across all n limbs:
C   addlsh1:  rp[i] = 2*vp[i] + up[i]
C   rsblsh1:  rp[i] = 2*vp[i] - up[i]
C
C Two carry chains are interleaved.  Between uses each one is parked in a
C register as 0 or -1 by "sbb reg,reg" and moved back into CF by
C "add reg,reg":
C   %rax  scy  carry out of the doubling of the vp limbs
C   %rbp  acy  carry or borrow of the ADCSBB chain against the up limbs
C The lea and mov instructions in between leave the flags untouched, which
C is what makes the exact instruction order significant.
C
C Work split:
C   L(b1), L(b2), L(b3)  the first n mod 4 limbs
C   L(b0)                one 4-limb block when bit 2 of n is set
C   L(top)               8 limbs per iteration while n is at least 8
C n is not decremented by the leading blocks; its low three bits only make
C the final "sub" in the loop go negative.
C
C %rbp is saved on entry.  %r12, %r13, %r14 and %rbx are saved only on the
C path that enters the 8-limb loop.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func_n)
	FUNC_ENTRY(4)
	push	%rbp
	xor	R32(%rbp), R32(%rbp)	C acy = 0 for the entry without carry-in
L(ent):	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)		C n mod 4; leaves scy = 0 when jumping to b0
	jz	L(b0)
	cmp	$2, R32(%rax)
	jz	L(b2)
	jg	L(b3)

L(b1):	mov	(vp), %r8
	add	%r8, %r8
	lea	8(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	lea	8(up), up
	lea	8(rp), rp
	jmp	L(b0)

L(b2):	mov	(vp), %r8
	add	%r8, %r8
	mov	8(vp), %r9
	adc	%r9, %r9
	lea	16(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	ADCSBB	8(up), %r9
	mov	%r9, 8(rp)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	lea	16(up), up
	lea	16(rp), rp
	jmp	L(b0)

L(b3):	mov	(vp), %r8
	add	%r8, %r8
	mov	8(vp), %r9
	adc	%r9, %r9
	mov	16(vp), %r10
	adc	%r10, %r10
	lea	24(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	ADCSBB	8(up), %r9
	mov	%r9, 8(rp)
	ADCSBB	16(up), %r10
	mov	%r10, 16(rp)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	lea	24(up), up
	lea	24(rp), rp

L(b0):	test	$4, R8(n)		C bit 2 of n: is a 4-limb block needed
	jz	L(skp)
	add	R32(%rax), R32(%rax)	C restore scy
	mov	(vp), %r8
	adc	%r8, %r8
	mov	8(vp), %r9
	adc	%r9, %r9
	mov	16(vp), %r10
	adc	%r10, %r10
	mov	24(vp), %r11
	adc	%r11, %r11
	lea	32(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	ADCSBB	8(up), %r9
	mov	%r9, 8(rp)
	ADCSBB	16(up), %r10
	mov	%r10, 16(rp)
	ADCSBB	24(up), %r11
	mov	%r11, 24(rp)
	lea	32(up), up
	lea	32(rp), rp
	sbb	R32(%rbp), R32(%rbp)	C save acy

L(skp):	cmp	$8, n
	jl	L(rtn)			C fewer than 8 limbs: no loop, no extra pushes

	push	%r12
	push	%r13
	push	%r14
	push	%rbx
	lea	-64(rp), rp		C pre-bias rp; L(top) adds 64 back first
	jmp	L(x)

	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy
	lea	64(rp), rp
	mov	(vp), %r8
	adc	%r8, %r8
	mov	8(vp), %r9
	adc	%r9, %r9
	mov	16(vp), %r10
	adc	%r10, %r10
	mov	24(vp), %r11
	adc	%r11, %r11
	mov	32(vp), %r12
	adc	%r12, %r12
	mov	40(vp), %r13
	adc	%r13, %r13
	mov	48(vp), %r14
	adc	%r14, %r14
	mov	56(vp), %rbx
	adc	%rbx, %rbx
	lea	64(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	ADCSBB	8(up), %r9
	mov	%r9, 8(rp)
	ADCSBB	16(up), %r10
	mov	%r10, 16(rp)
	ADCSBB	24(up), %r11
	mov	%r11, 24(rp)
	ADCSBB	32(up), %r12
	mov	%r12, 32(rp)
	ADCSBB	40(up), %r13
	mov	%r13, 40(rp)
	ADCSBB	48(up), %r14
	mov	%r14, 48(rp)
	ADCSBB	56(up), %rbx
	mov	%rbx, 56(rp)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	lea	64(up), up
L(x):	sub	$8, n
	jge	L(top)

L(end):	pop	%rbx
	pop	%r14
	pop	%r13
	pop	%r12
L(rtn):
C Return value, built from the parked carries (each 0 or -1):
C   addlsh1:  scy + acy, written through the 32-bit register
C   rsblsh1:  scy - acy, sign-extended to 64 bits by movslq
ifdef(`OPERATION_addlsh1_n',`
	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)')
ifdef(`OPERATION_rsblsh1_n',`
	sub	R32(%rax), R32(%rbp)
	movslq	R32(%rbp), %rax')

	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()

C Entry point with carry-in.  neg sets CF exactly when cy is non-zero, and
C that flag is parked in %rbp as the initial acy before joining the common
C code at L(ent).  Under IFDOS, cy is first loaded from 56(%rsp).
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbp
	neg	%r8			C set CF
	sbb	R32(%rbp), R32(%rbp)	C save acy
	jmp	L(ent)
EPILOGUE()