dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 4.13
C AMD K10	 4.13
C Intel P4	 5.70
C Intel core2	 4.75
C Intel corei	 5
C Intel atom	 8.75
C VIA nano	 5.25

C TODO
C  * Try to make this smaller, 746 bytes seem excessive for this 2nd class
C    function.  Less sw pipelining would help, and since we now probably
C    pipeline somewhat too deeply, it might not affect performance too much.
C  * A separate small-n loop might speed things as well as make things smaller.
C    That loop should be selected before pushing registers.

C OVERVIEW
C  Syntax is AT&T (source operand first), run through m4 before assembly.
C
C  For each limb i the code forms s[i] = up[i] ADDSUB vp[i] ADDSUB carry, where
C  ADDSUB is add or sub depending on which OPERATION_* symbol is defined, and
C  stores rp[i] = (s[i] >> 1) | (s[i+1] << 63).  The top limb stored gets the
C  final carry/borrow in bit 63.  The value returned in eax is bit 0 of s[0],
C  i.e. the bit shifted out at the low end.
C
C  Entry points:
C    func     clears r8 (the carry-in) and continues in the func_nc body.
C    func_nc  takes the carry-in in cy.  Only the low 32 bits of cy are read.
C             NOTE(review): presumably cy must be 0 or 1; confirm with callers.
C
C  up[0] and vp[0] are loaded before n is examined, so n = 0 is not handled.
C
C  Register roles inside the body:
C    r9                 most recently loaded vp limb
C    r8 r10 r11 r12     rotating set of limb sums s[i]
C    r13 r14            copies of limb sums that get shifted right by one
C    rax rbx            alternate as the saved carry/borrow (0 or 1) of a limb
C    r15                s[0], kept until the end for the return bit
C  rbx and r12-r15 are pushed on entry and popped before returning.
C
C  Carry handling: the carry of the up/vp ADDSUB is captured with setc, then
C  the previous carry is applied with a second ADDSUB.  If that second
C  operation carries as well, the saved carry byte is forced to 1.

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cy',	`%r8')

ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(func,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(func,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)

ASM_START()
	TEXT
C func: zero the carry-in.  Under DOS64 jump past the second FUNC_ENTRY;
C otherwise execution presumably falls through into func_nc (PROLOGUE and
C EPILOGUE are defined outside this file).
PROLOGUE(func)
	FUNC_ENTRY(4)
	xor	%r8, %r8
IFDOS(`	jmp	L(ent)		')
EPILOGUE()
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
C Under DOS64 the carry-in is fetched from the stack.  NOTE(review): the
C offset 56 depends on what FUNC_ENTRY pushes; confirm against its definition.
IFDOS(`	mov	56(%rsp), %r8	')
L(ent):	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(vp), %r9
	mov	(up), %r15

C Dispatch on n mod 4.  Each feed-in sequence below computes the first limbs,
C biases rp so the fixed store offsets of the loop hit the right limb, and
C enters the 4-way unrolled loop at the matching label.
	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	jne	L(n00)

	mov	R32(%r8), R32(%rbx)	C n = 0, 4, 8, ...
	mov	8(up), %r10
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rax)
	ADDSUB	%rbx, %r15		C return bit
	jnc	1f
	mov	$1, R8(%rax)
1:	mov	16(up), %r12
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	setc	R8(%rbx)
	mov	%r15, %r13
	ADDSUB	%rax, %r10
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	24(up), %r11
	ADDSUB	%r9, %r12
	lea	32(up), up
	mov	24(vp), %r9
	lea	32(vp), vp
	setc	R8(%rax)
	mov	%r10, %r14
	shl	$63, %r10
	shr	%r13
	jmp	L(L00)

C Flags from the cmp below are still live at L(n01): lea, mov and the taken
C jnc do not modify them, so the jne there separates n mod 4 = 2 from 3.
L(n00):	cmp	$2, R32(%rax)
	jnc	L(n01)
	xor	R32(%rbx), R32(%rbx)	C n = 1, 5, 9, ...
	lea	-24(rp), rp
	mov	R32(%r8), R32(%rax)
	dec	n
	jnz	L(gt1)
	ADDSUB	%r9, %r15		C n = 1: single limb, no loop
	setc	R8(%rbx)
	ADDSUB	%rax, %r15
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	%r15, %r14
	shl	$63, %rbx
	shr	%r14
	jmp	L(cj1)
L(gt1):	mov	8(up), %r8
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rbx)
	ADDSUB	%rax, %r15
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	16(up), %r10
	ADDSUB	%r9, %r8
	mov	16(vp), %r9
	setc	R8(%rax)
	mov	%r15, %r14
	ADDSUB	%rbx, %r8
	jnc	1f
	mov	$1, R8(%rax)
1:	mov	24(up), %r12
	ADDSUB	%r9, %r10
	mov	24(vp), %r9
	setc	R8(%rbx)
	mov	%r8, %r13
	shl	$63, %r8
	shr	%r14
	lea	8(up), up
	lea	8(vp), vp
	jmp	L(L01)

L(n01):	jne	L(n10)
	lea	-16(rp), rp		C n = 2, 6, 10, ...
	mov	R32(%r8), R32(%rbx)
	mov	8(up), %r11
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rax)
	ADDSUB	%rbx, %r15
	jnc	1f
	mov	$1, R8(%rax)
1:	sub	$2, n
	jnz	L(gt2)
	ADDSUB	%r9, %r11		C n = 2: two limbs, no loop
	setc	R8(%rbx)
	mov	%r15, %r13
	ADDSUB	%rax, %r11
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	jmp	L(cj2)
L(gt2):	mov	16(up), %r8
	ADDSUB	%r9, %r11
	mov	16(vp), %r9
	setc	R8(%rbx)
	mov	%r15, %r13
	ADDSUB	%rax, %r11
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	24(up), %r10
	ADDSUB	%r9, %r8
	mov	24(vp), %r9
	setc	R8(%rax)
	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	lea	16(up), up
	lea	16(vp), vp
	jmp	L(L10)

L(n10):	xor	R32(%rbx), R32(%rbx)	C n = 3, 7, 11, ...
	lea	-8(rp), rp
	mov	R32(%r8), R32(%rax)
	mov	8(up), %r12
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rbx)
	ADDSUB	%rax, %r15
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	16(up), %r11
	ADDSUB	%r9, %r12
	mov	16(vp), %r9
	setc	R8(%rax)
	mov	%r15, %r14
	ADDSUB	%rbx, %r12
	jnc	1f
	mov	$1, R8(%rax)
1:	sub	$3, n
	jnz	L(gt3)
	ADDSUB	%r9, %r11		C n = 3: three limbs, no loop
	setc	R8(%rbx)
	mov	%r12, %r13
	shl	$63, %r12
	shr	%r14
	jmp	L(cj3)
L(gt3):	mov	24(up), %r8
	ADDSUB	%r9, %r11
	mov	24(vp), %r9
	setc	R8(%rbx)
	mov	%r12, %r13
	shl	$63, %r12
	shr	%r14
	lea	24(up), up
	lea	24(vp), vp
	jmp	L(L11)

C Out-of-line carry fixups for the loop: applying the previous carry produced
C a carry of its own, so force the saved carry byte to 1 and resume.
L(c0):	mov	$1, R8(%rbx)
	jmp	L(rc0)
L(c1):	mov	$1, R8(%rax)
	jmp	L(rc1)
L(c2):	mov	$1, R8(%rbx)
	jmp	L(rc2)

C Main loop, four limbs per pass.  n is reduced by 4 at the bottom and the
C loop repeats while the result is above zero (unsigned).
	ALIGN(16)
L(top):	mov	(up), %r8		C not on critical path
	or	%r13, %r10
	ADDSUB	%r9, %r11		C not on critical path
	mov	(vp), %r9		C not on critical path
	setc	R8(%rbx)		C save carry out
	mov	%r12, %r13		C new for later
	shl	$63, %r12		C move bit 0 of new limb up to bit 63
	shr	%r14			C shift old limb right by one
	mov	%r10, (rp)
L(L11):	ADDSUB	%rax, %r11		C apply previous carry out
	jc	L(c0)			C jump if ripple
L(rc0):	mov	8(up), %r10
	or	%r14, %r12
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	setc	R8(%rax)
	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	mov	%r12, 8(rp)
L(L10):	ADDSUB	%rbx, %r8
	jc	L(c1)
L(rc1):	mov	16(up), %r12
	or	%r13, %r11
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	setc	R8(%rbx)
	mov	%r8, %r13
	shl	$63, %r8
	shr	%r14
	mov	%r11, 16(rp)
L(L01):	ADDSUB	%rax, %r10
	jc	L(c2)
L(rc2):	mov	24(up), %r11
	or	%r14, %r8
	ADDSUB	%r9, %r12
	lea	32(up), up
	mov	24(vp), %r9
	lea	32(vp), vp
	setc	R8(%rax)
	mov	%r10, %r14
	shl	$63, %r10
	shr	%r13
	mov	%r8, 24(rp)
	lea	32(rp), rp
L(L00):	ADDSUB	%rbx, %r12
	jc	L(c3)
L(rc3):	sub	$4, n
	ja	L(top)

C Wind-down: finish the limbs still held in registers.  L(cj3), L(cj2) and
C L(cj1) are also the targets of the short n = 3, 2 and 1 paths above.
L(end):	or	%r13, %r10
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	mov	%r12, %r13
	shl	$63, %r12
	shr	%r14
	mov	%r10, (rp)
L(cj3):	ADDSUB	%rax, %r11
	jnc	1f
	mov	$1, R8(%rbx)
1:	or	%r14, %r12
	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	mov	%r12, 8(rp)
L(cj2):	or	%r13, %r11
	shl	$63, %rbx		C final carry becomes bit 63 of top limb
	shr	%r14
	mov	%r11, 16(rp)
L(cj1):	or	%r14, %rbx
	mov	%rbx, 24(rp)

	mov	R32(%r15), R32(%rax)	C return value: bit 0 of s[0]
	and	$1, R32(%rax)
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
	ret
C Carry fixup for the last loop stage, placed after the return.
L(c3):	mov	$1, R8(%rax)
	jmp	L(rc3)
EPILOGUE()