github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/sqrtrem.c (about)

     1  /* mpn_sqrtrem -- square root and remainder
     2  
     3     Contributed to the GNU project by Paul Zimmermann (most code),
     4     Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
     5  
     6     THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
     7     MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
     8     INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
     9     DISAPPEAR IN A FUTURE GMP RELEASE.
    10  
    11  Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015 Free Software
    12  Foundation, Inc.
    13  
    14  This file is part of the GNU MP Library.
    15  
    16  The GNU MP Library is free software; you can redistribute it and/or modify
    17  it under the terms of either:
    18  
    19    * the GNU Lesser General Public License as published by the Free
    20      Software Foundation; either version 3 of the License, or (at your
    21      option) any later version.
    22  
    23  or
    24  
    25    * the GNU General Public License as published by the Free Software
    26      Foundation; either version 2 of the License, or (at your option) any
    27      later version.
    28  
    29  or both in parallel, as here.
    30  
    31  The GNU MP Library is distributed in the hope that it will be useful, but
    32  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    33  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    34  for more details.
    35  
    36  You should have received copies of the GNU General Public License and the
    37  GNU Lesser General Public License along with the GNU MP Library.  If not,
    38  see https://www.gnu.org/licenses/.  */
    39  
    40  
    41  /* See "Karatsuba Square Root", reference in gmp.texi.  */
    42  
    43  
    44  #include <stdio.h>
    45  #include <stdlib.h>
    46  
    47  #include "gmp.h"
    48  #include "gmp-impl.h"
    49  #include "longlong.h"
/* When non-zero, mpn_dc_sqrt uses the approximate division mpn_divappr_q
   (with a correction step) instead of the exact mpn_div_q.  */
#define USE_DIVAPPR_Q 1
/* Debug-tracing hook; expands to nothing in normal builds.  */
#define TRACE(x)
    52  
/* Table of 8-bit approximations of 1/sqrt(x), used as the starting value
   for the Newton iteration in mpn_sqrtrem1.  The per-entry comments give
   the argument as a hex fraction: entry i approximates sqrt(1/((0x80+i)/0x100)).
   Each stored value is the low 8 bits of a 9-bit result whose common high
   bit 0x100 was removed; the lookup site ORs it back in.  */
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
   104  
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

/* MAGIC tunes the last correction step of the Newton iteration; the
   bracketed bounds in the comments give its admissible range.  */
#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif

/* Single-limb square root with remainder.
   a0 must be normalized (a0 >= B/4, asserted below), as guaranteed by the
   callers, which pre-shift the operand.  Returns s = floor(sqrt(a0)) and
   stores the remainder a0 - s^2 in *rp.  */
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);	/* normalized: a0 >= B/4 */

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  /* One Newton step on 1/sqrt doubles the precision to 16 bits ...  */
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  /* ... then the final step multiplies by (a truncation of) a0, converting
     the 1/sqrt approximation into a full-limb sqrt approximation.  */
  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  /* 32-bit limbs: the 8-bit table value is already half the limb width,
     so a single combined step converts directly to sqrt(a0).  */
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  /* The approximation may still be one too small: if (x0+1)^2 <= a0,
     i.e. x0^2 + 2*x0 + 1 <= a0, take x0+1.  */
  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;	/* remainder a0 - s^2 */
  return x0;
}
   167  
   168  
/* Half a limb, in bits: mpn_sqrtrem2 produces the root a half-limb at a
   time (high half from mpn_sqrtrem1, low half from a division).  */
#define Prec (GMP_NUMB_BITS >> 1)

/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;			/* high limb of the remainder, adjusted below */

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);	/* input normalized */

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);	/* root of the high limb */
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;	/* candidate low half of the root */
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;	/* assemble high and low halves of the root */
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;	/* borrow from the subtraction below */
  rp0 -= q2;
  if (cc < 0)
    {
      /* Remainder went negative: the root is one too large.  Add back
	 2*sp0 - 1 (as sp0, then sp0 - 1, tracking carries) and decrement
	 the root.  */
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
   212  
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.

   Divide-and-conquer recursion ("Karatsuba Square Root", see reference in
   the file header): take the root of the high 2h limbs recursively, derive
   the low l limbs of the root from one division, then correct with one
   l-limb squaring.

   `approx' is a bit mask used by mpn_dc_sqrt's root-only path: if any
   masked low bit of the root candidate is set, the remainder is known to
   be non-zero and the routine returns 1 early (see the UNLIKELY test
   below).  Callers that need the exact remainder pass approx == 0.  */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);	/* base case: 2-limb operand */
  else
    {
      l = n / 2;
      h = n - l;	/* h >= l; the high part takes the extra limb */
      /* Recursive root of the high 2h limbs into {sp + l, h}.  */
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
      if (q != 0)
	ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
      TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
      /* Divide the partial remainder by the high root; half the quotient
	 (shifted below) is the low part of the root.  */
      mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
      q += scratch[l];
      c = scratch[0] & 1;	/* save the quotient bit lost by the shift */
      mpn_rshift (sp, scratch, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
	return 1; /* Remainder is non-zero */
      q >>= 1;
      if (c != 0)
	c = mpn_add_n (np + l, np + l, sp + l, h);
      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
      /* Subtract the square of the low part to get the true remainder.  */
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

      if (c < 0)
	{
	  /* Remainder went negative: root is one too large; add back
	     2*{sp, n} + 1 and decrement the root.  */
	  q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	  c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
	  c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
	  c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
	  q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
	}
    }

  return c;
}
   268  
#if USE_DIVAPPR_Q
/* Approximate quotient of {np, nn} by {dp, dn}, written to
   {qp, nn - dn + 1} (high limb qh stored last).  Dispatches on dn among
   the schoolbook, divide-and-conquer, and MU divappr variants.
   scratch must hold nn limbs: it receives a copy of np, which the
   sbpi1/dcpi1 variants clobber; the MU variant needs more and allocates
   its own temporary space instead.  The divisor must be normalized
   (high bit set, asserted).  */
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);	/* 2-limb precomputed inverse */
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif
   297  
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough?

   Root-only variant used when the caller does not want the remainder:
   it works on a (shifted) window of the operand, computes the high part
   of the root exactly and the low part from an (approximate) division,
   and only falls into remainder-style corrections when the quotient is
   too close to a power of two to be trusted.  */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;	/* h = l + 1 or l + 2 */
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  /* Build the normalized dividend window {tp, n + h + 1} from np,
     shifting by 2*nsh bits when the operand needs normalization.  */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  /* Exact root of the high 2h limbs into {sp + l, h}.  */
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
  /* Divide the remainder by the high root; the (approximate) quotient,
     halved, gives the low l limbs of the root.  */
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* Quotient overflowed: the low part of the root saturates.  */
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      /* If the low quotient bits are all (nearly) zero, the approximation
	 error may matter; verify and correct explicitly.  */
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* Approximation is not good enough, the extra limb(+ nsh bits)
	     is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed? We
	     did not find an example, but it depends on divappr
	     internals, and we can not assume it true in general...*/
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  /* If the high part of the remainder vanished, compare against
	     the square of the low root half to decide exactness.  */
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  /* Also compare the low limbs that were outside the
		     dividend window.  */
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  /* Root candidate was one too large.  */
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  /* Undo the normalization shift (plus half a limb in the odd case).  */
  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}
   418  
   419  
/* Public entry point: write floor(sqrt({np, nn})) to {sp, ceil(nn/2)} and,
   if rp != NULL, the remainder to {rp, rn}.  The return value is zero iff
   the remainder is zero (input is a perfect square); when rp != NULL it is
   rn, the limb count of the remainder.  {np, nn} must have a non-zero high
   limb; overlap constraints are asserted below.  */
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;			/* half the normalization shift count */
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  /* The algorithms require the operand normalized to >= B/4 in the high
     limb; compute the (even) shift needed.  */
  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  /* Single-limb fast path.  */
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	/* Normalize, take the root, then undo the shift on the root and
	   recompute the remainder from it.  */
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  /* No remainder wanted and operand large enough: use the faster
     root-only algorithm.  */
  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      /* Odd size and/or unnormalized: work on a shifted copy, then undo
	 the k-bit shift on root and remainder afterwards.  */
      mp_limb_t mask;
      mp_ptr scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      /* With rp == NULL (nn <= 8 here) pass an approx mask allowing an
	 early non-zero-remainder exit in mpn_dc_sqrtrem.  */
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);	/* true root S' = S >> k */
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      /* Shift the remainder right by 2k bits, limb part first.  */
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      /* Already normalized and even-sized: compute in place in rp.  */
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);	/* strip high zero limbs of the remainder */

  TMP_FREE;
  return rn;
}