dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		c/l
C AMD K8,K9	11
C AMD K10	11
C AMD bull	16
C AMD pile	14.25
C AMD steam	 ?
C AMD bobcat	16
C AMD jaguar	 ?
C Intel P4	47.5	poor
C Intel core	28.5	very poor
C Intel NHM	29	very poor
C Intel SBR	16	poor
C Intel IBR	13.5
C Intel HWL	12
C Intel BWL	 ?
C Intel atom	53	very poor
C VIA nano	19


C INPUT Parameters
define(`QP', `%rdi')
define(`UP', `%rsi')
define(`UN_INPUT', `%rdx')
define(`U1', `%rcx')	C Also in %rax
define(`D', `%r8')
define(`DINV', `%r9')

C Invariants
define(`B2', `%rbp')
define(`B2md', `%rbx')

C Variables
define(`UN', `%r8')	C Overlaps D input
define(`T', `%r10')
define(`U0', `%r11')
define(`U2', `%r12')
define(`Q0', `%r13')
define(`Q1', `%r14')
define(`Q2', `%r15')

C mpn_div_qr_1n_pi1
C
C Register interface, as fixed by the defines above:
C   QP       (%rdi)  destination of the quotient limbs
C   UP       (%rsi)  limbs of the dividend
C   UN_INPUT (%rdx)  limb count.  Not checked here; a count of zero would
C                    fall into the multi-limb path, so callers are assumed
C                    to pass at least 1.
C   U1       (%rcx)  incoming high limb
C   D        (%r8)   divisor limb (normalized, per the file header)
C   DINV     (%r9)   single-limb inverse of the divisor
C
C Result: UN_INPUT quotient limbs are stored through QP and the final
C remainder is left in %rax at each ret.
C
C The single-limb path uses only %rax, %rdx, %r10 and %r11 besides the
C inputs.  The multi-limb path additionally saves and restores %r12-%r15,
C %rbx and %rbp, and parks the divisor on the stack because its register
C is reused as the loop counter.

ABI_SUPPORT(STD64)

	ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
	FUNC_ENTRY(6)
	C NOTE(review): the two loads below presumably fetch the fifth and sixth
	C arguments from the stack for the DOS64 ABI; IFDOS is defined elsewhere.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	dec	UN_INPUT
	jnz	L(first)

	C Just a single 2/1 division.
	C T, U0 are allocated in scratch registers
	lea	1(U1), T		C T = U1 + 1, added into the high product word
	mov	U1, %rax
	mul	DINV			C rdx:rax = U1 * DINV
	mov	(UP), U0
	add	U0, %rax		C rdx:rax += {U1 + 1, U0}
	adc	T, %rdx
	mov	%rdx, T			C T = candidate quotient limb
	imul	D, %rdx
	sub	%rdx, U0		C U0 = candidate remainder (mod 2^64)
	cmp	U0, %rax		C carry set iff low product word < candidate remainder
	lea	(U0, D), %rax		C lea and cmov leave the flags from cmp intact
	cmovnc	U0, %rax		C no carry: keep U0, else use U0 + D
	sbb	$0, T			C carry: quotient candidate was one too large
	cmp	D, %rax
	jc	L(single_div_done)	C remainder already below the divisor
	sub	D, %rax			C final correction step
	add	$1, T
L(single_div_done):
	mov	T, (QP)
	FUNC_EXIT
	ret
L(first):
	C FIXME: Could delay some of these until we enter the loop.
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	push	%rbp

	C B2 = -(D * DINV) mod 2^64, B2md = B2 - D.
	C NOTE(review): B2 is presumably B^2 mod D for B = 2^64, given how the
	C inverse is defined by the callers; confirm against the C reference.
	mov	D, B2
	imul	DINV, B2
	neg	B2
	mov	B2, B2md
	sub	D, B2md

	C D not needed until final reduction
	push	D
	mov	UN_INPUT, UN	C Clobbers D

	mov	DINV, %rax
	mul	U1
	mov	%rax, Q0
	add	U1, %rdx
	mov	%rdx, T

	mov	B2, %rax
	mul	U1
	mov	-8(UP, UN, 8), U0
	mov	(UP, UN, 8), U1
	mov	T, (QP, UN, 8)
	add	%rax, U0
	adc	%rdx, U1
	sbb	U2, U2			C U2 = all ones if the addition carried, else 0
	dec	UN			C dec leaves the carry flag untouched
	mov	U1, %rax
	jz	L(final)
	C mov instead of xor: the carry flag must survive until the cmovc at
	C the top of the loop.
	mov	$0, R32(Q1)

	ALIGN(16)

	C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
	C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
	C is zero, and carry holds an extra copy of U2.
L(loop):
	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
	C Remains to add in B (U1 + c)
	cmovc	DINV, Q1		C Q1 = DINV when the U2 mask is set, else stays 0
	mov	U2, Q2
	neg	Q2			C mask 0 or all ones becomes 0 or 1
	mul	DINV
	add	%rdx, Q1
	adc	$0, Q2
	add	Q0, Q1
	mov	%rax, Q0
	mov	B2, %rax
	lea	(B2md, U0), T		C T = U0 + B2 - D, flags unchanged
	adc	$0, Q2			C picks up the carry from add Q0, Q1

	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
	mul	U1
	and	B2, U2			C U2 = B2 when the mask is set, else 0
	add	U2, U0
	cmovnc	U0, T			C no carry: use the plain sum, else the sum minus D

	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
	adc	U1, Q1
	mov	-8(UP, UN, 8), U0
	adc	Q2, 8(QP, UN, 8)
	jc	L(q_incr)		C rare: carry ripples into higher quotient limbs
L(q_incr_done):
	add	%rax, U0
	mov	T, %rax
	adc	%rdx, %rax
	mov	Q1, (QP, UN, 8)
	mov	$0, R32(Q1)		C mov, not xor, so the carry from adc is kept
	sbb	U2, U2			C refresh the U2 mask from that carry
	dec	UN			C dec keeps the carry for the next cmovc
	mov	%rax, U1
	jnz	L(loop)

L(final):
	pop	D			C divisor is needed again for the final reduction

	C Fold the pending U2 mask: subtract D once and count it in Q1.
	mov	U2, Q1
	and	D, U2
	sub	U2, %rax
	neg	Q1

	C Conditional subtraction of D; Q1 is bumped only when it is kept.
	mov	%rax, U1
	sub	D, %rax
	cmovc	U1, %rax		C borrow: undo the subtraction
	sbb	$-1, Q1			C adds 1 when there was no borrow, 0 otherwise

	C Last 2/1 division step, same sequence as the single-limb path.
	lea	1(%rax), T
	mul	DINV
	add	U0, %rax
	adc	T, %rdx
	mov	%rdx, T
	imul	D, %rdx
	sub	%rdx, U0
	cmp	U0, %rax
	lea	(U0, D), %rax
	cmovnc	U0, %rax
	sbb	$0, T
	cmp	D, %rax
	jc	L(div_done)
	sub	D, %rax
	add	$1, T
L(div_done):
	add	T, Q0
	mov	Q0, (QP)
	adc	Q1, 8(QP)
	jnc	L(done)
	C Propagate the carry upwards through the quotient.  lea does not touch
	C the flags, so jc still tests the carry produced by addq.
L(final_q_incr):
	addq	$1, 16(QP)
	lea	8(QP), QP
	jc	L(final_q_incr)

L(done):
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT
	ret

	C Out-of-line carry propagation for the main loop; rejoins the loop at
	C q_incr_done as soon as an increment does not carry.
L(q_incr):
	C U1 is not live, so use it for indexing
	lea	16(QP, UN, 8), U1
L(q_incr_loop):
	addq	$1, (U1)
	jnc	L(q_incr_done)
	lea	8(U1), U1
	jmp	L(q_incr_loop)
EPILOGUE()