dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/rsh1aors_n.asm

dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Schedule loop less.  It is now almost surely overscheduled, resulting in
C    large feed-in and wind-down code.

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel NMH	 ?
C Intel SBR	 ?
C Intel atom	 5.25
C VIA nano	 ?
C ---------------------------------------------------------------------------
C func_n (mpn_rsh1add_n or mpn_rsh1sub_n, chosen by the OPERATION_* symbol)
C
C Adds (or subtracts) the n-limb operands at up and vp, shifts the result
C right by one bit and stores n limbs at rp.  The carry (or borrow) out of
C the most significant limb becomes the top bit of the most significant
C result limb.  The bit shifted out at the bottom is returned in %rax.
C
C Register roles, as used by the code below:
C   rp, up, vp, n    the four arguments (see INPUT PARAMETERS)
C   %rax             return value, 0 or 1
C   %rbx             carry/borrow parked between ADCSBB groups as a 0 or
C                    all-ones mask: "sbb rbx,rbx" saves CF, "add rbx,rbx"
C                    puts it back into CF
C   %r8-%r11         raw sum/difference limbs
C   %rbp, %r12-%r15  limbs being shifted and merged before they are stored
C
C Each stored limb is (limb >> 1) | (next limb << 63); the "shl $63" / "shr"
C / "or" triples below build exactly that.
C
C NOTE: the first limb is loaded unconditionally and n = 0 takes the L(b0)
C path, which reads further limbs, so the code requires n >= 1.
C
C FUNC_ENTRY, FUNC_EXIT, R32, L, PROLOGUE and friends come from the included
C m4 configuration; presumably FUNC_ENTRY(4) adapts the four arguments for
C the DOS64 ABI -- confirm against the x86_64 m4 definitions.
C ---------------------------------------------------------------------------

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',`%rcx')

ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func_n,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func_n,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func_n)
	FUNC_ENTRY(4)
C Save every callee-saved register used as scratch; restored at L(cj1).
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

C Least significant limb: compute it, park the carry in rbx, and capture
C the bit that the right shift drops as the return value.
	mov	(up), %r15
	ADDSUB	(vp), %r15
	sbb	R32(%rbx), R32(%rbx)
	xor	R32(%rax), R32(%rax)
	shr	%r15
	adc	R32(%rax), R32(%rax)	C return value

C Dispatch on n mod 4 to the matching feed-in sequence.
	mov	R32(n), R32(%rbp)
	and	$3, R32(%rbp)
	jz	L(b0)
	cmp	$2, R32(%rbp)
	jae	L(b23)

C n mod 4 = 1.  For n = 1 only the carry has to be merged into the top bit.
L(b1):	dec	n
	jnz	L(gt1)
	shl	$63, %rbx
	add	%rbx, %r15
	mov	%r15, (rp)
	jmp	L(cj1)
L(gt1):	lea	24(up), up
	lea	24(vp), vp
	mov	-16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	-8(up), %r10
	lea	24(rp), rp
	mov	(up), %r11
	ADCSBB	-16(vp), %r9
	ADCSBB	-8(vp), %r10
	mov	%r15, %r12
	ADCSBB	(vp), %r11
	mov	%r9, %r13
	sbb	R32(%rbx), R32(%rbx)
	mov	%r11, %r15
	mov	%r10, %r14
	shl	$63, %r11
	shl	$63, %r10
	shl	$63, %r9
	or	%r9, %r12
	shr	%r13
	mov	8(up), %r8
	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	sub	$4, n
	jz	L(cj5)
L(gt5):	mov	16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	24(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	32(up), %r11
	jmp	L(lo1)

C n mod 4 = 2 or 3.  The flags are still those of the "cmp $2" above:
C not-equal means the remainder is 3.
L(b23):	jnz	L(b3)
	mov	8(up), %r8
	sub	$2, n
	jnz	L(gt2)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	mov	%r8, %r12
	jmp	L(cj2)
L(gt2):	mov	16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	24(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	32(up), %r11
	ADCSBB	16(vp), %r9
	lea	32(up), up
	ADCSBB	24(vp), %r10
	mov	%r9, %r13
	ADCSBB	32(vp), %r11
	mov	%r8, %r12
	jmp	L(lo2)

C n mod 4 = 3.
L(b3):	lea	40(up), up
	lea	8(vp), vp
	mov	%r15, %r14
	add	R32(%rbx), R32(%rbx)
	mov	-32(up), %r11
	ADCSBB	0(vp), %r11
	lea	8(rp), rp
	sbb	R32(%rbx), R32(%rbx)
	mov	%r11, %r15
	shl	$63, %r11
	mov	-24(up), %r8
	shr	%r15
	or	%r11, %r14
	sub	$3, n
	jnz	L(gt3)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	jmp	L(cj3)
L(gt3):	mov	-16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	-8(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	(up), %r11
	ADCSBB	16(vp), %r9
	ADCSBB	24(vp), %r10
	mov	%r8, %r12
	jmp	L(lo3)

C n mod 4 = 0.
L(b0):	lea	48(up), up
	lea	16(vp), vp
	add	R32(%rbx), R32(%rbx)
	mov	-40(up), %r10
	lea	16(rp), rp
	mov	-32(up), %r11
	ADCSBB	-8(vp), %r10
	mov	%r15, %r13
	ADCSBB	(vp), %r11
	sbb	R32(%rbx), R32(%rbx)
	mov	%r11, %r15
	mov	%r10, %r14
	shl	$63, %r11
	shl	$63, %r10
	mov	-24(up), %r8
	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	sub	$4, n
	jnz	L(gt4)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	jmp	L(cj4)
L(gt4):	mov	-16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	-8(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	(up), %r11
	ADCSBB	16(vp), %r9
	jmp	L(lo0)

C Main loop: four limbs per iteration.  The feed-in code enters at
C L(lo0)..L(lo3).  The mov and lea instructions mixed into the ADCSBB chain
C leave CF untouched, so the chain runs unbroken from the "add rbx,rbx"
C that restores the carry to the "sbb rbx,rbx" at L(lo2) that saves it.
	ALIGN(8)
L(top):	mov	16(up), %r9
	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	add	R32(%rbx), R32(%rbx)
	mov	24(up), %r10
	mov	%rbp, (rp)
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	lea	32(rp), rp
	mov	32(up), %r11
L(lo1):	ADCSBB	16(vp), %r9
	lea	32(up), up
	mov	%r12, -24(rp)
L(lo0):	ADCSBB	24(vp), %r10
	mov	%r8, %r12
	mov	%r13, -16(rp)
L(lo3):	ADCSBB	32(vp), %r11
	mov	%r9, %r13
	mov	%r14, -8(rp)
L(lo2):	sbb	R32(%rbx), R32(%rbx)
	shl	$63, %r8
	mov	%r11, %r15
	shr	%r12
	mov	%r10, %r14
	shl	$63, %r9
	lea	32(vp), vp
	shl	$63, %r10
	or	%r8, %rbp
	shl	$63, %r11
	or	%r9, %r12
	shr	%r13
	mov	8(up), %r8
	sub	$4, n
	jg	L(top)

C Wind-down: finish and store the limbs still in flight.  The short-operand
C paths join at L(cj5)..L(cj1).  Between the last ADCSBB and the sbb at
C L(cj2) there are only mov instructions, so CF survives.  The final carry
C is then moved to bit 63 and added into the most significant result limb.
L(end):	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	mov	%rbp, (rp)
	lea	32(rp), rp
L(cj5):	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	mov	%r12, -24(rp)
L(cj4):	mov	%r13, -16(rp)
L(cj3):	mov	%r8, %r12
	mov	%r14, -8(rp)
L(cj2):	sbb	R32(%rbx), R32(%rbx)
	shl	$63, %r8
	shr	%r12
	or	%r8, %r15
	shl	$63, %rbx
	add	%rbx, %r12
	mov	%r15, (rp)
	mov	%r12, 8(rp)
C Restore callee-saved registers in reverse push order.
L(cj1):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()