dnl  AMD64 mpn_sublsh1_n optimised for Intel Atom.
dnl  Used also for AMD bd1.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * This code is slightly large at 501 bytes.
C  * aorrlsh1_n.asm and this file use the same basic pattern.

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bd1	 2.3
C AMD bobcat	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel atom	 5	(4.875 is probably possible)
C VIA nano	 ?

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cy',	`%r8')

C OPERATION
C  For each of the n limbs, the limb read from vp is doubled (the bit shifted
C  out of one limb is carried into the next), subtracted with borrow from the
C  limb read from up, and the difference is stored at rp.
C
C  Return value in %eax, range 0..2: the bit shifted out of the last doubled
C  limb plus the final subtract borrow.
C
C  mpn_sublsh1_n  starts with no incoming borrow.
C  mpn_sublsh1_nc starts with an incoming borrow taken from cy.
C
C REGISTER USE
C  %rax      n mod 4 for the dispatch at L(ent); afterwards "scy", the carry
C            of the doubling chain kept as a 0 / -1 mask
C  %rbp      "acy", the borrow of the subtract chain kept as a 0 / -1 mask;
C            inside L(top) it is also a data temporary once acy is back in CF
C  %r8-%r15  doubled vp limbs; %r15 is also the up/rp temporary outside the
C            main loop, while %rbx, %rbp, %r8, %r9 play that part inside it
C
C CARRY SAVE / RESTORE IDIOM
C  "sbb reg, reg" turns CF into a 0 / -1 mask in reg.
C  "add reg, reg" on such a mask sets CF again (-1 + -1 carries, 0 + 0 does
C  not).  The two chains are interleaved this way because they both need CF.
C  mov and lea do not touch the flags, which is why they may sit between the
C  adc / sbb instructions of one chain.
C
C  FUNC_ENTRY, FUNC_EXIT, IFDOS, R32, R8 and L come from config.m4 and are not
C  visible in this file; presumably FUNC_ENTRY(4) maps the DOS64 argument
C  registers onto the STD64 ones named above -- confirm against config.m4.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
	FUNC_ENTRY(4)
	push	%rbp
	push	%r15
	xor	R32(%rbp), R32(%rbp)		C acy = 0, no incoming borrow

C Common entry, also reached from mpn_sublsh1_nc with acy already set.
C Dispatch on n mod 4: 0 -> L(b0), 1 -> L(b1), 2 -> L(b2), 3 -> L(b3).
C When n mod 4 is 0, %eax is left at 0, which is a valid "no carry" scy.
L(ent):	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	jz	L(b0)
	cmp	$2, R32(%rax)
	jz	L(b2)
	jg	L(b3)

C One leading limb.  The chain starts with add (not adc): no carry-in.
L(b1):	mov	(vp), %r8
	add	%r8, %r8
	lea	8(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	mov	(up), %r15
	sbb	%r8, %r15
	mov	%r15, (rp)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	lea	8(up), up
	lea	8(rp), rp
	jmp	L(b0)

C Two leading limbs.
L(b2):	mov	(vp), %r8
	add	%r8, %r8
	mov	8(vp), %r9
	adc	%r9, %r9
	lea	16(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	mov	(up), %r15
	sbb	%r8, %r15
	mov	%r15, (rp)
	mov	8(up), %r15
	sbb	%r9, %r15
	mov	%r15, 8(rp)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	lea	16(up), up
	lea	16(rp), rp
	jmp	L(b0)

C Three leading limbs; falls through into L(b0).
L(b3):	mov	(vp), %r8
	add	%r8, %r8
	mov	8(vp), %r9
	adc	%r9, %r9
	mov	16(vp), %r10
	adc	%r10, %r10
	lea	24(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	mov	(up), %r15
	sbb	%r8, %r15
	mov	%r15, (rp)
	mov	8(up), %r15
	sbb	%r9, %r15
	mov	%r15, 8(rp)
	mov	16(up), %r15
	sbb	%r10, %r15
	mov	%r15, 16(rp)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	lea	24(up), up
	lea	24(rp), rp

C If bit 2 of n is set, handle a group of four limbs.  From here on the
C doubling chain continues from scy, so it starts with adc.
L(b0):	test	$4, R8(n)
	jz	L(skp)
	add	R32(%rax), R32(%rax)	C restore scy
	mov	(vp), %r8
	adc	%r8, %r8
	mov	8(vp), %r9
	adc	%r9, %r9
	mov	16(vp), %r10
	adc	%r10, %r10
	mov	24(vp), %r11
	adc	%r11, %r11
	lea	32(vp), vp
	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy
	mov	(up), %r15
	sbb	%r8, %r15
	mov	%r15, (rp)
	mov	8(up), %r15
	sbb	%r9, %r15
	mov	%r15, 8(rp)
	mov	16(up), %r15
	sbb	%r10, %r15
	mov	%r15, 16(rp)
	mov	24(up), %r15
	sbb	%r11, %r15
	mov	%r15, 24(rp)
	lea	32(up), up
	lea	32(rp), rp
	sbb	R32(%rbp), R32(%rbp)	C save acy

C n has not been decremented so far.  Its low three bits are already dealt
C with and simply ride along: L(x) subtracts 8 per pass and the loop ends
C when the result goes negative, so L(top) runs floor(n/8) times.
L(skp):	cmp	$8, n
	jl	L(rtn)

C The extra callee-saved registers are pushed only when the loop is entered.
	push	%r12
	push	%r13
	push	%r14
	push	%rbx
	lea	-64(rp), rp			C L(top) advances rp before storing
	jmp	L(x)

C Main loop, eight limbs per pass: first the whole doubling chain into
C %r8-%r15, then the whole subtract chain with loads and stores interleaved.
	ALIGN(16)
L(top):	mov	(vp), %r8
	add	R32(%rax), R32(%rax)		C restore scy
	lea	64(vp), vp
	adc	%r8, %r8
	mov	-56(vp), %r9
	adc	%r9, %r9
	mov	-48(vp), %r10
	adc	%r10, %r10
	mov	-40(vp), %r11
	adc	%r11, %r11
	mov	-32(vp), %r12
	adc	%r12, %r12
	mov	-24(vp), %r13
	adc	%r13, %r13
	mov	-16(vp), %r14
	adc	%r14, %r14
	mov	-8(vp), %r15
	adc	%r15, %r15
	sbb	R32(%rax), R32(%rax)		C save scy
	add	R32(%rbp), R32(%rbp)		C restore acy; %rbp is free after this
	mov	(up), %rbp
	lea	64(rp), rp
	mov	8(up), %rbx
	sbb	%r8, %rbp
	mov	32(up), %r8			C %r8 already consumed, reuse as temp
	mov	%rbp, (rp)
	sbb	%r9, %rbx
	mov	16(up), %rbp
	mov	%rbx, 8(rp)
	sbb	%r10, %rbp
	mov	24(up), %rbx
	mov	%rbp, 16(rp)
	sbb	%r11, %rbx
	mov	%rbx, 24(rp)
	sbb	%r12, %r8
	mov	40(up), %r9			C %r9 already consumed, reuse as temp
	mov	%r8, 32(rp)
	sbb	%r13, %r9
	mov	48(up), %rbp
	mov	%r9, 40(rp)
	sbb	%r14, %rbp
	mov	56(up), %rbx
	mov	%rbp, 48(rp)
	sbb	%r15, %rbx
	lea	64(up), up
	mov	%rbx, 56(rp)
	sbb	R32(%rbp), R32(%rbp)		C save acy
L(x):	sub	$8, n				C clobbers CF; both carries are saved
	jge	L(top)

L(end):	pop	%rbx
	pop	%r14
	pop	%r13
	pop	%r12

C scy and acy are each 0 or -1, so their negated sum is 0, 1 or 2.
L(rtn):
	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)

	pop	%r15
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()

C Entry point with an incoming borrow.  cy is read from %r8; on DOS64 it is
C first loaded from 56(%rsp).  The pushes mirror those of mpn_sublsh1_n
C because control joins that function at L(ent) and leaves through its
C epilogue.
C
C neg sets CF exactly when %r8 is nonzero, so any nonzero cy becomes a single
C incoming borrow.  NOTE(review): confirm that callers never pass cy > 1.
PROLOGUE(mpn_sublsh1_nc)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbp
	push	%r15
	neg	%r8			C set CF
	sbb	R32(%rbp), R32(%rbp)	C save acy
	jmp	L(ent)
EPILOGUE()