github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/sqrtrem.c

/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)       /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
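  /* Annotation (not in the original GMP source): the classical Newton step
     for 1/sqrt(a) is x_{k+1} = x_k * (3 - a*x_k^2) / 2, which needs only
     multiplications and shifts; the fixed-point updates below are truncated,
     rescaled forms of that step, each one roughly doubling the number of
     correct bits in x0.  */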
  abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}


#define Prec (GMP_NUMB_BITS >> 1)

/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.  */
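/* Annotation (not in the original GMP source): the recursion below follows
   the "Karatsuba Square Root" algorithm cited above.  Splitting the operand
   as N = a3*b^3 + a2*b^2 + a1*b + a0 with b = 2^(l*GMP_NUMB_BITS), it
   computes (s', r') = SqrtRem (a3*b + a2) recursively, then
   (q, u) = DivRem (r'*b + a1, 2*s'), and takes s = s'*b + q and
   r = u*b + a0 - q^2, with a final adjustment of (s, r) when r turns out
   negative.  */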
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
      if (q != 0)
        ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
      TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
      mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
      q += scratch[l];
      c = scratch[0] & 1;
      mpn_rshift (sp, scratch, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
        return 1; /* Remainder is non-zero */
      q >>= 1;
      if (c != 0)
        c = mpn_add_n (np + l, np + l, sp + l, h);
      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

      if (c < 0)
        {
          q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
          c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
          c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
          c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
          q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
        }
    }

  return c;
}

#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough?  */
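/* Annotation (not in the original GMP source): the operand is conceptually
   scaled by 4^nsh (plus one dummy low limb when the input size is odd) so
   that its leading limb is normalized; the root computed below is therefore
   sqrt({np, 2n-odd}) times 2^nsh (times 2^(GMP_NUMB_BITS/2) in the odd
   case), and the final mpn_rshift at the end of the function removes that
   factor.  */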
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
           (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
        {
          mp_limb_t cy;
          /* Approximation is not good enough, the extra limb(+ nsh bits)
             is smaller than needed to absorb the possible error. */
          /* {qp + 1, l + 1} equals 2*{sp, l} */
          /* FIXME: use mullo or wrap-around, or directly evaluate
             remainder with a single sqrmod_bnm1. */
          TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
          ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
          /* Compute the remainder of the previous mpn_div(appr)_q. */
          cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
          MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
          ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
          if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
            {
              /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
              cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
              cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
              ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
              MPN_DECR_U (sp, l, 1);
            }
          /* Can the root be exact when a correction was needed?  We
             did not find an example, but it depends on divappr
             internals, and we can not assume it true in general... */
          /* else */
#else /* WANT_ASSERT */
          ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
          if (mpn_zero_p (tp + l + 1, h - l))
            {
              TRACE(printf("sqr(,,%u)\n", (unsigned) l));
              mpn_sqr (scratch, sp, l);
              c = mpn_cmp (tp + 1, scratch + l, l);
              if (c == 0)
                {
                  if (nsh != 0)
                    {
                      mpn_lshift (tp, np, l, 2 * nsh);
                      np = tp;
                    }
                  c = mpn_cmp (np, scratch + odd, l - odd);
                }
              if (c < 0)
                {
                  MPN_DECR_U (sp, l, 1);
                  c = 1;
                }
            }
        }
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}


mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
        sp[0] = mpn_sqrtrem1 (&rl, high);
        if (rp != NULL)
          rp[0] = rl;
      }
    else
      {
        cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
        sp[0] = cc;
        if (rp != NULL)
          rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t mask;
      mp_ptr scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
        mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
        MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0; /* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
         thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask; /* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
        rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
        tn++;
      else
        {
          tp++;
          c -= GMP_NUMB_BITS;
        }
      if (c != 0)
        mpn_rshift (rp, tp, tn, c);
      else
        MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
        {
          if (rp == NULL) /* nn <= 8 */
            rp = TMP_SALLOC_LIMBS (nn);
          MPN_COPY (rp, np, nn);
        }
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
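/* Illustrative usage sketch (not part of the original GMP source), assuming
   only the documented mpn_sqrtrem interface: sp needs ceil(nn/2) limbs, rp
   needs nn limbs, np[nn-1] must be non-zero, and the return value is the
   limb count of the remainder (zero iff the operand is a perfect square).
   Kept under #if 0 so it does not affect the build.  */
#if 0
static void
example_sqrtrem (void)
{
  mp_limb_t np[2] = { 7, 1 };   /* operand N = B + 7, with B = 2^GMP_NUMB_BITS */
  mp_limb_t sp[1];              /* root: ceil(2/2) = 1 limb */
  mp_limb_t rp[2];              /* remainder: at most nn = 2 limbs */
  mp_size_t rn;

  rn = mpn_sqrtrem (sp, rp, np, 2);  /* sp[0] = floor(sqrt(N)), {rp, rn} = N - sp[0]^2 */
  /* Here rn != 0, since B + 7 is not a perfect square. */
  (void) rn;
}
#endif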