github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/div_qr_2.c

/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned.  */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa, but propagating the
   carry-out into an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum
   and a carry out.  A little like a "full adder".
 */
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0) \
	   : "0"  ((USItype)(s2)), \
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)), \
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0" \
	   : "=r" (co), "=r" (s) \
	   : "rm" ((USItype)(ci)), "0" (CNST_LIMB(0)), \
	     "%1" ((USItype)(a)), "g" ((USItype)(b)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0) \
	   : "0"  ((UDItype)(s2)), \
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0" \
	   : "=r" (co), "=r" (s) \
	   : "rm" ((UDItype)(ci)), "0" (CNST_LIMB(0)), \
	     "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0) \
	   : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (s2) += __c1 + (__s1 < __c0); \
  } while (0)
#endif

#ifndef add_csaac
#define add_csaac(co, s, a, b, ci) \
  do { \
    UWtype __s, __c; \
    __s = (a) + (b); \
    __c = __s < (a); \
    __s = __s + (ci); \
    (s) = __s; \
    (co) = __c + (__s < (ci)); \
  } while (0)
#endif
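
/* Illustrative sketch (not part of the original GMP source): the semantics
   of the two helper macros, shown on small values.  add_sssaaaa adds the
   two-limb numbers <a1,a0> and <b1,b0>, stores the low two limbs of the sum
   in (s1,s0), and propagates the carry-out into s2; add_csaac is a limb-wide
   full adder, s = a + b + ci with the carry-out in co.  The #if 0 guard
   keeps this sketch out of the build.  */
#if 0
static void
example_wide_add_macros (void)
{
  mp_limb_t s2, s1, s0, co, s;

  /* <B-1, B-1> + <1, 1> = <1, 1, 0>: the low limb wraps to 0, the high
     limb wraps to 1, and the carry-out is accumulated into s2.  */
  s2 = 0;
  add_sssaaaa (s2, s1, s0, MP_LIMB_T_MAX, MP_LIMB_T_MAX,
	       CNST_LIMB(1), CNST_LIMB(1));
  /* Now s2 == 1, s1 == 1, s0 == 0.  */

  /* Full adder: (B-1) + 0 + carry-in 1 wraps to 0 with carry-out 1.  */
  add_csaac (co, s, MP_LIMB_T_MAX, CNST_LIMB(0), CNST_LIMB(1));
  /* Now co == 1, s == 0.  */
}
#endif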

/* Typically used with r1, r0 same as n3, n2.  Other types of overlap
   between inputs and outputs are not supported.  */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
  do { \
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
    mp_limb_t _t1, _t0; \
    mp_limb_t _c, _mask; \
 \
    umul_ppmm (_q3,_q2a, n3, di1); \
    umul_ppmm (_q2,_q1, n2, di1); \
    umul_ppmm (_q2c,_q1c, n3, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \
    umul_ppmm (_q1d,_q0, n2, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \
 \
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1)); \
 \
    /* [q3,q2,q1,q0] += [n3,n2,n1,n0] + [0,1,0,0] = [r1,r0,n1,n0] */ \
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0)); \
    add_csaac (_c, _q1, _q1, n1, _c); \
    add_csaac (_c, _q2, _q2, r0, _c); \
    _q3 = _q3 + r1 + _c; \
 \
    umul_ppmm (_t1,_t0, _q2, d0); \
    _t1 += _q2 * d1 + _q3 * d0; \
 \
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
 \
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \
 \
    if (UNLIKELY (r1 >= d1)) \
      { \
	if (r1 > d1 || r0 >= d0) \
	  { \
	    sub_ddmmss (r1, r0, r1, r0, d1, d0); \
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1)); \
	  } \
      } \
    (q1) = _q3; \
    (q0) = _q2; \
  } while (0)
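
/* Illustrative sketch (not part of the original GMP source): how a single
   udiv_qr_4by2 step can be checked.  It assumes d1 has its high bit set,
   <n3,n2> < <d1,d0> so the quotient fits in two limbs, and <di1,di0> was
   produced by invert_4by2 below.  The function name is hypothetical; the
   #if 0 guard keeps the sketch out of the build.  */
#if 0
static void
example_udiv_qr_4by2_check (mp_limb_t n3, mp_limb_t n2, mp_limb_t n1,
			    mp_limb_t n0, mp_limb_t d1, mp_limb_t d0,
			    mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t q1, q0, r1, r0;
  mp_limb_t qq[2], dd[2], rr[2], pp[4];

  udiv_qr_4by2 (q1, q0, r1, r0, n3, n2, n1, n0, d1, d0, di1, di0);

  /* Verify <n3,n2,n1,n0> == <q1,q0> * <d1,d0> + <r1,r0>.  */
  qq[0] = q0; qq[1] = q1;
  dd[0] = d0; dd[1] = d1;
  mpn_mul_n (pp, qq, dd, 2);				/* pp = q * d, 4 limbs */
  rr[0] = r0; rr[1] = r1;
  ASSERT_ALWAYS (mpn_add (pp, pp, 4, rr, 2) == 0);	/* pp += r, no carry out */
  ASSERT_ALWAYS (pp[0] == n0 && pp[1] == n1 && pp[2] == n2 && pp[3] == n3);
}
#endif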

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>.  The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}

static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2 long remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the input operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  gmp_pi1_t dinv;
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
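
/* Illustrative sketch (not part of the original GMP source): a typical call
   to mpn_div_qr_2, dividing a 4-limb numerator by a 2-limb divisor.  This is
   an internal GMP entry point (declared in gmp-impl.h), so its interface may
   change; the values below are arbitrary and the #if 0 guard keeps the
   sketch out of the build.  */
#if 0
static void
example_mpn_div_qr_2_usage (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };	/* numerator, least significant limb first */
  mp_limb_t dp[2] = { 5, 6 };		/* divisor; dp[1] must be nonzero */
  mp_limb_t qp[2];			/* receives the nn-2 low quotient limbs */
  mp_limb_t rp[2];			/* receives the 2-limb remainder */
  mp_limb_t qh;

  /* qp does not overlap np, so the overlap precondition holds.  */
  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);

  /* The full quotient is <qh, qp[1], qp[0]>, the remainder <rp[1], rp[0]>.  */
  (void) qh;
}
#endif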