github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpz/n_pow_ui.c

/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
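

/* For orientation, the mpz-level effect of everything below is plain
   square-and-multiply.  The following is only an illustrative sketch,
   kept out of the build with #if 0 as done for refmpn_mul_2 above;
   ref_n_pow_ui is a hypothetical name, and the real code instead works
   at the mpn level and strips factors of 2 first.  */
#if 0
static void
ref_n_pow_ui (mpz_ptr r, mpz_srcptr b, unsigned long e)
{
  mpz_t  base;
  mpz_init_set (base, b);
  mpz_set_ui (r, 1);
  for ( ; e != 0; e >>= 1)
    {
      if ((e & 1) != 0)
        mpz_mul (r, r, base);      /* fold in this bit of the exponent */
      mpz_mul (base, base, base);  /* square for the next bit */
    }
  mpz_clear (base);
}
#endif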


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
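
/* Example: with GMP_NUMB_BITS == 64 and no nails this is 0xFFFFFFFF, and
   0xFFFFFFFF^2 = 0xFFFFFFFE00000001 still fits in one limb, so the small
   powering loop below can square blimb freely while blimb stays at or
   under this bound.  */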


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
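
/* mpn_sqr always writes 2*size limbs to dst; the final subtraction above
   drops the top limb again when the square didn't carry into it, keeping
   the size normalized.  */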

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
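
/* mpn_mul returns the most significant limb of the product, so the update
   above gives size+size2 limbs when that limb is non-zero and size+size2-1
   when it is zero.  */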

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
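
/* mpn_mul_2 multiplies {ptr,size} by the 2-limb value in mult, writing
   size+1 low limbs in place and returning the top limb as carry, hence the
   result above is size+1 or size+2 limbs.  */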

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
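
/* The only caller below shifts out bits already known to be zero (b's
   stripped low zero bits), so no result bits are lost; at most the top
   limb goes to zero, which the size adjustment accounts for.  */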


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
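
  /* In effect b = 2^t * m with m odd, where t counts the stripped zero
     limbs and bits, so b^e = 2^(t*e) * m^e: each stripped limb of b
     contributes e zero limbs to the result, and the e*btwos stripped bits
     have just been split into whole zero limbs plus a final sub-limb
     lshift of rtwos_bits.  */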

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

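      /* Right-to-left binary powering: rl * blimb^e is invariant across
         each pass, so when e reaches 0 the whole power is in rl.  The
         HALFMAX guard ensures blimb*blimb cannot overflow a limb.  */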
      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
         probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
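
  /* Worked example of the estimate (assuming 32-bit limbs, no nails):
     b=0x10001 and e=100 give 17 bits per factor, so
     ralloc = 17*100/32 + 5 = 58 limbs, while b^100 really needs 51 limbs,
     the roughly 6% worst-case overestimate described above.  */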

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
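
      /* All three loops below are left-to-right binary powering: at the
         top of each pass rp holds base^(e >> (i+1)), each pass squares it
         and multiplies in the base when bit i of e is set (the single-limb
         leftover rl is multiplied in afterwards in the mul_1/mul_2
         cases).  */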

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange that the final result ends up in r, not in the temp
             space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;
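
          /* The loop below makes one swap per squaring plus one per set
             bit of e below the top one, i + popcount(e) in total; starting
             swapped when that count is odd leaves the final rp back at
             PTR(r).  */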

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}