dnl  Listing origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/div_qr_2u_pi1.asm (about)

dnl  x86-64 mpn_div_qr_2u_pi1
dnl  -- Divide an mpn number by an unnormalized 2-limb number,
dnl     using a single-limb inverse and shifting the dividend on the fly.

dnl  Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		c/l
C INPUT PARAMETERS
C
C The six register parameters below follow the System V AMD64 integer
C argument order (rdi, rsi, rdx, rcx, r8, r9); the last two parameters are
C read from the stack at FRAME+8 and FRAME+16 relative to %rsp.
C
C   qp           quotient limbs are stored at (qp, un, 8) inside the loop
C   rp           the two remainder limbs are stored at (rp) and 8(rp)
C   up_param     dividend limb pointer (copied to up)
C   un_param     dividend limb count; un is set to un_param - 2
C   d1, d0       high and low divisor limbs
C   shift_param  shift count, loaded as a 32-bit value into %ecx
C   di_param     single-limb inverse (copied to di)
C
C The value popped into %rax just before the final ret is the high quotient
C limb qh computed by the first division step.
C NOTE(review): d1/d0 are presumably passed already shifted left by
C shift_param (normalized) -- confirm against the C caller.
define(`qp',		`%rdi')
define(`rp',		`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx') dnl %rcx needed for shift count
define(`d1',		`%r8')
define(`d0',		`%r9')
define(`shift_param',	`FRAME+8(%rsp)')
define(`di_param',	`FRAME+16(%rsp)')

C WORKING REGISTERS
C   di    inverse                    up    dividend pointer
C   un    limb index / loop counter  u2,u1 running two-limb remainder
C   u0    most recently loaded dividend limb
C   t1    quotient limb candidate    t0    q0 (low word of the quotient estimate)
C   md1   -d1
define(`di',		`%r10')
define(`up',		`%r11')
define(`un',		`%rbp')
define(`u2',		`%rbx')
define(`u1',		`%r12')
define(`u0',		`%rsi') dnl Same as rp, which is saved and restored.
define(`t1',		`%r13')
define(`t0',		`%r14')
define(`md1',		`%r15')

ASM_START()
	TEXT
	ALIGN(16)
deflit(`FRAME', 0)
PROLOGUE(mpn_div_qr_2u_pi1)
	C FRAME is still 0 here, so di_param resolves to 16(%rsp).
	mov	di_param, di
	mov	up_param, up		C free %rdx, which mul overwrites
	C Save the registers used as working registers below, plus rp, whose
	C register (%rsi) is reused as u0.
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	push	%rbp
	push	rp
deflit(`FRAME', 56)		C 7 pushes of 8 bytes each
	lea	-2(un_param), un	C un = un_param - 2
	mov	d1, md1
	neg	md1			C md1 = -d1

	C int parameter, 32 bits only
	C This overwrites un_param (%rcx); un was already derived from it.
	movl	shift_param, R32(%rcx)

	C FIXME: Different code for SHLD_SLOW

	C First division step, producing qh.  u2 receives the top cl bits
	C of the limb at 8(up, un, 8).
	xor	R32(u2), R32(u2)
	mov	8(up, un, 8), u1
	shld	%cl, u1, u2
	C Remains to read (up, un, 8) and shift u1, u0
	C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di)
	mov	di, %rax
	mul	u2			C rdx:rax = di * u2
	mov	(up, un, 8), u0
	shld	%cl, u0, u1		C u1 = (u1 << cl) | top cl bits of u0
	mov	u1, t0
	add	%rax, t0	C q0 in t0
	adc	u2, %rdx
	mov	%rdx, t1	C q in t1
	imul	md1, %rdx		C rdx = -d1 * q (low 64 bits)
	mov	d0, %rax
	lea	(%rdx, u1), u2		C u2 = u1 - q * d1, flags untouched
	mul	t1			C rdx:rax = d0 * q
	mov	u0, u1
	shl	%cl, u1			C u1 = u0 << cl
	sub	d0, u1
	sbb	d1, u2			C <u2,u1> -= <d1,d0>
	sub	%rax, u1
	sbb	%rdx, u2		C <u2,u1> -= d0 * q
	C Conditional adjustment: rax and rdx are cleared before the cmp so
	C that CF from the cmp reaches both cmovnc and the adc unchanged.
	xor	R32(%rax), R32(%rax)
	xor	R32(%rdx), R32(%rdx)
	cmp	t0, u2			C CF = (u2 < t0), unsigned
	cmovnc	d0, %rax		C if u2 >= t0, <rdx,rax> = <d1,d0>,
	cmovnc	d1, %rdx		C otherwise it stays 0
	adc	$0, t1			C q += CF
	nop				C NOTE(review): presumably padding -- confirm
	add	%rax, u1
	adc	%rdx, u2		C <u2,u1> += <rdx,rax>
	cmp	d1, u2
	jae	L(fix_qh)		C u2 >= d1: may need one more correction
L(bck_qh):
	push	t1	C push qh on stack

	jmp	L(next)

	ALIGN(16)
L(loop):
	C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
	C Based on the optimized divrem_2.asm code.
	C
	C Same step as above, except that the top cl bits of the newly loaded
	C limb are ORed into the existing u1, and the quotient limb is stored
	C at (qp, un, 8) instead of being pushed.

	mov	di, %rax
	mul	u2			C rdx:rax = di * u2
	mov	(up, un, 8), u0
	xor	R32(t1), R32(t1)
	shld	%cl, u0, t1		C t1 = top cl bits of u0
	or	t1, u1
	mov	u1, t0
	add	%rax, t0	C q0 in t0
	adc	u2, %rdx
	mov	%rdx, t1	C q in t1
	imul	md1, %rdx		C rdx = -d1 * q (low 64 bits)
	mov	d0, %rax
	lea	(%rdx, u1), u2		C u2 = u1 - q * d1, flags untouched
	mul	t1			C rdx:rax = d0 * q
	mov	u0, u1
	shl	%cl, u1			C u1 = u0 << cl
	sub	d0, u1
	sbb	d1, u2			C <u2,u1> -= <d1,d0>
	sub	%rax, u1
	sbb	%rdx, u2		C <u2,u1> -= d0 * q
	xor	R32(%rax), R32(%rax)
	xor	R32(%rdx), R32(%rdx)
	cmp	t0, u2			C CF = (u2 < t0), unsigned
	cmovnc	d0, %rax		C if u2 >= t0, <rdx,rax> = <d1,d0>,
	cmovnc	d1, %rdx		C otherwise it stays 0
	adc	$0, t1			C q += CF
	nop				C NOTE(review): presumably padding -- confirm
	add	%rax, u1
	adc	%rdx, u2		C <u2,u1> += <rdx,rax>
	cmp	d1, u2
	jae	L(fix)			C u2 >= d1: may need one more correction
L(bck):
	mov	t1, (qp, un, 8)		C store quotient limb at index un
L(next):
	sub	$1, un
	jnc	L(loop)			C repeat until un borrows below zero
L(end):
	C qh on stack
	pop	%rax			C qh; still in %rax at the ret below
	pop	rp			C restore rp, whose register served as u0
	C Shift the two-limb remainder right by cl before storing it.
	shrd	%cl, u2, u1
	shr	%cl, u2
	mov	u2, 8(rp)
	mov	u1, (rp)

	C Restore saved registers in reverse push order.
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	ret

L(fix):	C Unlikely update. u2 >= d1
	C seta still sees the flags of the cmp d1, u2 that preceded the jae.
	seta	%dl			C dl = (u2 > d1)
	cmp	d0, u1
	setae	%al			C al = (u1 >= d0)
	orb	%dl, %al		C "orb" form to placate Sun tools
	je	L(bck)			C neither holds: no correction needed
	inc	t1			C q += 1
	sub	d0, u1
	sbb	d1, u2			C <u2,u1> -= <d1,d0>
	jmp	L(bck)

C Duplicated, just jumping back to a different address.
L(fix_qh):	C Unlikely update. u2 >= d1
	C seta still sees the flags of the cmp d1, u2 that preceded the jae.
	seta	%dl			C dl = (u2 > d1)
	cmp	d0, u1
	setae	%al			C al = (u1 >= d0)
	orb	%dl, %al		C "orb" form to placate Sun tools
	je	L(bck_qh)		C neither holds: no correction needed
	inc	t1			C qh += 1
	sub	d0, u1
	sbb	d1, u2			C <u2,u1> -= <d1,d0>
	jmp	L(bck_qh)
EPILOGUE()