github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/sqrlo.c (about)

     1  /* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
     2     of the result.
     3  
     4     Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     5  
     6     THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
     7     FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
     8     THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     9  
    10  Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
    11  
    12  This file is part of the GNU MP Library.
    13  
    14  The GNU MP Library is free software; you can redistribute it and/or modify
    15  it under the terms of either:
    16  
    17    * the GNU Lesser General Public License as published by the Free
    18      Software Foundation; either version 3 of the License, or (at your
    19      option) any later version.
    20  
    21  or
    22  
    23    * the GNU General Public License as published by the Free Software
    24      Foundation; either version 2 of the License, or (at your option) any
    25      later version.
    26  
    27  or both in parallel, as here.
    28  
    29  The GNU MP Library is distributed in the hope that it will be useful, but
    30  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    31  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    32  for more details.
    33  
    34  You should have received copies of the GNU General Public License and the
    35  GNU Lesser General Public License along with the GNU MP Library.  If not,
    36  see https://www.gnu.org/licenses/.  */
    37  
    38  #include "gmp.h"
    39  #include "gmp-impl.h"
    40  
    41  #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
    42  #define MAYBE_range_basecase 1
    43  #define MAYBE_range_toom22   1
    44  #else
    45  #define MAYBE_range_basecase                                           \
    46    ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
    47  #define MAYBE_range_toom22                                             \
    48    ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
    49  #endif
    50  
    51  /*  THINK: The DC strategy uses different constants in different Toom's
    52  	 ranges. Something smoother?
    53  */
    54  
    55  /*
    56    Compute the least significant half of the product {xy,n}*{yp,n}, or
    57    formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
    58  
    59    Above the given threshold, the Divide and Conquer strategy is used.
    60    The operand is split in two, and a full square plus a mullo
    61    is used to obtain the final result. The more natural strategy is to
    62    split in two halves, but this is far from optimal when a
    63    sub-quadratic multiplication is used.
    64  
    65    Mulders suggests an unbalanced split in favour of the full product,
    66    split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
    67  
    68    To compute the value of a, we assume that the cost of mullo for a
    69    given size ML(n) is a fraction of the cost of a full product with
    70    same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
    71    then we can write:
    72  
    73    ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
    74  
    75    Given a value for e, want to minimise the value of k, i.e. the
    76    function k=(1-a)^e/(1-2*a^e).
    77  
    78    With e=2, the exponent for schoolbook multiplication, the minimum is
    79    given by the values a=1-a=1/2.
    80  
    81    With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
    82    Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
    83  
    84    Other possible approximations follow:
    85    e=log(5)/log(3) [Toom-3] -> a ~= 9/40
    86    e=log(7)/log(4) [Toom-4] -> a ~= 7/39
    87    e=log(11)/log(6) [Toom-6] -> a ~= 1/8
    88    e=log(15)/log(8) [Toom-8] -> a ~= 1/10
    89  
    90    The values above where obtained with the following trivial commands
    91    in the gp-pari shell:
    92  
    93  fun(e,a)=(1-a)^e/(1-2*a^e)
    94  mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
    95  contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
    96  contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
    97  contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
    98  contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
    99  contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
   100  
   101    ,
   102    |\
   103    | \
   104    +----,
   105    |    |
   106    |    |
   107    |    |\
   108    |    | \
   109    +----+--`
   110    ^ n2 ^n1^
   111  
   112    For an actual implementation, the assumption that M(n)=n^e is
   113    incorrect, as a consequence also the assumption that ML(n)=k*M(n)
   114    with a constant k is wrong.
   115  
   116    But theory suggest us two things:
   117    - the best the multiplication product is (lower e), the more k
   118      approaches 1, and a approaches 0.
   119  
   120    - A value for a smaller than optimal is probably less bad than a
   121      bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
   122      value, and k(a)=0.808_ the mul/mullo speed ratio. We get
   123      k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
   124  */
   125  
   126  static mp_size_t
   127  mpn_sqrlo_itch (mp_size_t n)
   128  {
   129    return 2*n;
   130  }
   131  
   132  /*
   133      mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
   134      It accepts tp == rp.
   135  */
   136  static void
   137  mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
   138  {
   139    mp_size_t n2, n1;
   140    ASSERT (n >= 2);
   141    ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
   142    ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
   143  
   144    /* Divide-and-conquer */
   145  
   146    /* We need fractional approximation of the value 0 < a <= 1/2
   147       giving the minimum in the function k=(1-a)^e/(1-2*a^e).
   148    */
   149    if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
   150      n1 = n >> 1;
   151    else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
   152      n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
   153    else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
   154      n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
   155    else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
   156      n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
   157    /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
   158    else
   159      n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
   160  
   161    n2 = n - n1;
   162  
   163    /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
   164  
   165    /* x0 ^ 2 */
   166    mpn_sqr (tp, xp, n2);
   167    MPN_COPY (rp, tp, n2);
   168  
   169    /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
   170    if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
   171      mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
   172    else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
   173      mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
   174    else
   175      mpn_mullo_n (tp + n, xp + n2, xp, n1);
   176    /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
   177  #if HAVE_NATIVE_mpn_addlsh1_n
   178    mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
   179  #else
   180    mpn_lshift (rp + n2, tp + n, n1, 1);
   181    mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
   182  #endif
   183  }
   184  
   185  /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
   186  #define SQR_BASECASE_ALLOC \
   187   (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
   188  
   189  /* FIXME: This function should accept a temporary area; dc_sqrlo
   190     accepts a pointer tp, and handle the case tp == rp, do the same here.
   191  */
   192  
   193  void
   194  mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
   195  {
   196    ASSERT (n >= 1);
   197    ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
   198  
   199    if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
   200      {
   201        /* FIXME: smarter criteria? */
   202  #if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
   203        /* mullo computes as many products as sqr, but directly writes
   204  	 on the result area. */
   205        mpn_mullo_basecase (rp, xp, xp, n);
   206  #else
   207        /* Allocate workspace of fixed size on stack: fast! */
   208        mp_limb_t tp[SQR_BASECASE_ALLOC];
   209        mpn_sqr_basecase (tp, xp, n);
   210        MPN_COPY (rp, tp, n);
   211  #endif
   212      }
   213    else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
   214      {
   215        mpn_sqrlo_basecase (rp, xp, n);
   216      }
   217    else
   218      {
   219        mp_ptr tp;
   220        TMP_DECL;
   221        TMP_MARK;
   222        tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
   223        if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
   224  	{
   225  	  mpn_dc_sqrlo (rp, xp, n, tp);
   226  	}
   227        else
   228  	{
   229  	  /* For really large operands, use plain mpn_mul_n but throw away upper n
   230  	     limbs of result.  */
   231  #if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
   232  	  mpn_fft_mul (tp, xp, n, xp, n);
   233  #else
   234  	  mpn_sqr (tp, xp, n);
   235  #endif
   236  	  MPN_COPY (rp, tp, n);
   237  	}
   238        TMP_FREE;
   239      }
   240  }