dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library. If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		c/l
C AMD K8,K9	13
C AMD K10	13
C AMD bull	16.5
C AMD pile	15
C AMD steam	 ?
C AMD bobcat	16
C AMD jaguar	 ?
C Intel P4	47	poor
C Intel core	19.25
C Intel NHM	18
C Intel SBR	15	poor
C Intel IBR	13
C Intel HWL	11.7
C Intel BWL	 ?
C Intel atom	52	very poor
C VIA nano	19


C mpn_div_qr_1n_pi1
C
C Divides the limbs at UP, extended at the high end by the register operand
C U1, by the single limb D, using the single-limb inverse DINV.  Quotient
C limbs are stored through QP.
C
C Register roles (see the defines below):
C   QP        base address of the limbs stored as output
C   UP        base address of the limbs read as input
C   UN_INPUT  limb count; the value 1 takes the short single-division path
C   U1        initial high limb, passed in a register instead of read from UP
C   D         divisor limb; its register is reused for UN once D is pushed
C   DINV      inverse limb used by the mul-based division steps
C
C At each ret, %rax holds the value produced by the last reduction against D.
C NOTE(review): presumably this is the remainder returned to the caller; the
C C prototype is not part of this file, so confirm against the declaration.
C
C The multi-limb path saves and restores %r12-%r15, %rbx and %rbp, and keeps
C D on the stack while %r8 serves as the loop index UN.

C INPUT Parameters
define(`QP', `%rdi')
define(`UP', `%rsi')
define(`UN_INPUT', `%rdx')
define(`U1', `%rcx')	C Also in %rax
define(`D', `%r8')
define(`DINV', `%r9')

C Invariants
define(`B2', `%rbp')
define(`B2md', `%rbx')

C Variables
define(`UN', `%r8')	C Overlaps D input
define(`T', `%r10')
define(`U0', `%r11')
define(`U2', `%r12')
define(`Q0', `%r13')
define(`Q1', `%r14')
define(`Q2', `%r15')

ABI_SUPPORT(STD64)

	ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
	FUNC_ENTRY(6)
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	dec	UN_INPUT		C UN_INPUT - 1 is the index of the top limb
	jnz	L(first)		C more than one limb: take the general path

	C Just a single 2/1 division.
	C T, U0 are allocated in scratch registers
	C NOTE(review): that the two corrections coded below are sufficient
	C depends on D being normalized and on how DINV was computed; neither
	C is checked here.
	lea	1(U1), T		C T = U1 + 1
	mov	U1, %rax
	mul	DINV			C {%rdx, %rax} = U1 * DINV
	mov	(UP), U0		C U0 = up[0]
	add	U0, %rax
	adc	T, %rdx			C {%rdx, %rax} += {U1 + 1, U0}
	mov	%rdx, T			C T = quotient candidate
	imul	D, %rdx
	sub	%rdx, U0		C U0 = up[0] - T * D (mod 2^64)
	cmp	U0, %rax		C carry set iff low word of the sum < U0
	lea	(U0, D), %rax		C %rax = U0 + D; lea leaves the flags alone
	cmovnc	U0, %rax		C no carry: take U0, carry: keep U0 + D
	sbb	$0, T			C T -= carry, matching the D just added back
	cmp	D, %rax
	jc	L(single_div_done)	C %rax < D: nothing more to do
	sub	D, %rax			C otherwise subtract D once more
	add	$1, T			C and count it in the quotient
L(single_div_done):
	mov	T, (QP)			C qp[0] = T
	FUNC_EXIT
	ret
L(first):
	C FIXME: Could delay some of these until we enter the loop.
	C Save the callee-saved registers that back U2, Q0, Q1, Q2, B2md, B2.
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	push	%rbp

	mov	D, B2
	imul	DINV, B2
	neg	B2			C B2 = -(D * DINV) mod 2^64
	mov	B2, B2md
	sub	D, B2md			C B2md = B2 - D
	C NOTE(review): going by the name, B2 is meant to be congruent to
	C B^2 mod D with B = 2^64; that follows from the definition of DINV,
	C which is not visible in this file.

	C D not needed until final reduction
	push	D
	mov	UN_INPUT, UN	C Clobbers D

	mov	DINV, %rax
	mul	U1			C {%rdx, %rax} = DINV * U1
	mov	%rax, Q0		C Q0 = low word, carried into the next step
	add	U1, %rdx
	mov	%rdx, T			C T = high word + U1

	mov	B2, %rax
	mul	U1			C {%rdx, %rax} = B2 * U1
	mov	-8(UP, UN, 8), U0	C U0 = up[UN-1]
	mov	(UP, UN, 8), U1		C U1 = up[UN]
	mov	T, (QP, UN, 8)		C qp[UN] = T
	add	%rax, U0
	adc	%rdx, U1		C {U1, U0} += B2 * (previous U1)
	sbb	U2, U2			C U2 = 0 - carry, i.e. 0 or all ones
	dec	UN
	mov	U1, %rax		C extra copy of U1; mov keeps the dec flags
	jz	L(final)		C UN reached 0: only the last step remains

	ALIGN(16)

	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
	C At entry, %rax holds an extra copy of U1
L(loop):
	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
	C Remains to add in B (U1 + c)
	mov	DINV, Q1
	mov	U2, Q2
	and	U2, Q1			C Q1 = DINV if the U2 mask is set, else 0
	neg	Q2			C Q2 = 1 if the U2 mask is set, else 0
	mul	DINV			C {%rdx, %rax} = U1 * DINV (U1 copy in %rax)
	add	%rdx, Q1
	adc	$0, Q2			C {Q2, Q1} += high word
	add	Q0, Q1			C add previous low word; carry consumed below
	mov	%rax, Q0		C Q0 = new low word
	mov	B2, %rax
	lea	(B2md, U0), T		C T = U0 + B2 - D, used if the add below carries
	adc	$0, Q2			C carry from add Q0, Q1 (mov/lea keep flags)

	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
	mul	U1			C {%rdx, %rax} = B2 * U1
	and	B2, U2			C U2 = B2 if the mask was set, else 0
	add	U2, U0			C U0 += U2, carry = c
	cmovnc	U0, T			C no carry: T = U0; carry: T stays U0 + B2 - D

	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
	adc	U1, Q1			C Q1 += U1 + c (cmov left the carry intact)
	mov	-8(UP, UN, 8), U0	C U0 = up[UN-1], the next input limb
	adc	Q2, 8(QP, UN, 8)	C qp[UN+1] += Q2 + carry
	jc	L(q_incr)		C carry out of qp[UN+1]: ripple it upwards
L(q_incr_done):
	add	%rax, U0		C U0 += low word of B2 * U1
	mov	T, %rax
	adc	%rdx, %rax		C %rax = T + high word + carry
	mov	Q1, (QP, UN, 8)		C qp[UN] = Q1; mov keeps the carry for sbb
	sbb	U2, U2			C U2 = 0 - carry, mask for the next round
	dec	UN
	mov	%rax, U1		C U1 = new high limb; %rax keeps the copy
	jnz	L(loop)

L(final):
	pop	D			C divisor back in %r8; UN is finished with

	mov	U2, Q1
	and	D, U2			C U2 = D if the carry mask is set, else 0
	sub	U2, %rax		C %rax = U1 - U2
	neg	Q1			C Q1 = 1 if the carry mask was set, else 0

	mov	%rax, U1
	sub	D, %rax
	cmovc	U1, %rax		C borrow: undo the subtraction of D
	sbb	$-1, Q1			C Q1 += 1 - borrow

	C Same 2/1 division sequence as the single-limb path, with %rax as the
	C high word and U0 as the low word.
	lea	1(%rax), T		C T = %rax + 1
	mul	DINV			C {%rdx, %rax} = %rax * DINV
	add	U0, %rax
	adc	T, %rdx			C add in {high word + 1, U0}
	mov	%rdx, T			C T = quotient candidate
	imul	D, %rdx
	sub	%rdx, U0		C U0 -= T * D (mod 2^64)
	cmp	U0, %rax		C carry set iff low word of the sum < U0
	lea	(U0, D), %rax		C %rax = U0 + D; flags untouched
	cmovnc	U0, %rax		C no carry: take U0, carry: keep U0 + D
	sbb	$0, T			C T -= carry
	cmp	D, %rax
	jc	L(div_done)		C %rax < D: already reduced
	sub	D, %rax
	add	$1, T
L(div_done):
	add	T, Q0
	mov	Q0, (QP)		C qp[0] = Q0 + T; mov keeps the carry
	adc	Q1, 8(QP)		C qp[1] += Q1 + carry
	jnc	L(done)
L(final_q_incr):
	C Ripple a carry through the higher limbs, starting at qp[2].
	addq	$1, 16(QP)
	lea	8(QP), QP		C step to the next limb; lea keeps the carry
	jc	L(final_q_incr)

L(done):
	C Restore in the reverse order of the pushes at L(first).
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT
	ret

L(q_incr):
	C U1 is not live, so use it for indexing
	lea	16(QP, UN, 8), U1	C U1 = address of qp[UN+2]
L(q_incr_loop):
	addq	$1, (U1)
	jnc	L(q_incr_done)		C carry absorbed: resume the main loop
	lea	8(U1), U1		C otherwise move on to the next higher limb
	jmp	L(q_incr_loop)
EPILOGUE()