github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/mullo_n.c (about)

     1  /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
     2     of their products.
     3  
     4     Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     5  
     6     THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
     7     FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
     8     THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     9  
    10  Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
    11  
    12  This file is part of the GNU MP Library.
    13  
    14  The GNU MP Library is free software; you can redistribute it and/or modify
    15  it under the terms of either:
    16  
    17    * the GNU Lesser General Public License as published by the Free
    18      Software Foundation; either version 3 of the License, or (at your
    19      option) any later version.
    20  
    21  or
    22  
    23    * the GNU General Public License as published by the Free Software
    24      Foundation; either version 2 of the License, or (at your option) any
    25      later version.
    26  
    27  or both in parallel, as here.
    28  
    29  The GNU MP Library is distributed in the hope that it will be useful, but
    30  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    31  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    32  for more details.
    33  
    34  You should have received copies of the GNU General Public License and the
    35  GNU Lesser General Public License along with the GNU MP Library.  If not,
    36  see https://www.gnu.org/licenses/.  */
    37  
    38  #include "gmp.h"
    39  #include "gmp-impl.h"
    40  
    41  
    42  #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
    43  #define MAYBE_range_basecase 1
    44  #define MAYBE_range_toom22   1
    45  #else
    46  #define MAYBE_range_basecase                                           \
    47    ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
    48  #define MAYBE_range_toom22                                             \
    49    ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
    50  #endif
    51  
    52  /*  THINK: The DC strategy uses different constants in different Toom's
    53  	 ranges. Something smoother?
    54  */
    55  
    56  /*
    57    Compute the least significant half of the product {xy,n}*{yp,n}, or
    58    formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
    59  
    60    Above the given threshold, the Divide and Conquer strategy is used.
    61    The operands are split in two, and a full product plus two mullo
    62    are used to obtain the final result. The more natural strategy is to
    63    split in two halves, but this is far from optimal when a
    64    sub-quadratic multiplication is used.
    65  
    66    Mulders suggests an unbalanced split in favour of the full product,
    67    split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
    68  
    69    To compute the value of a, we assume that the cost of mullo for a
    70    given size ML(n) is a fraction of the cost of a full product with
    71    same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
    72    then we can write:
    73  
    74    ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
    75  
    76    Given a value for e, want to minimise the value of k, i.e. the
    77    function k=(1-a)^e/(1-2*a^e).
    78  
    79    With e=2, the exponent for schoolbook multiplication, the minimum is
    80    given by the values a=1-a=1/2.
    81  
    82    With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
    83    Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
    84  
    85    Other possible approximations follow:
    86    e=log(5)/log(3) [Toom-3] -> a ~= 9/40
    87    e=log(7)/log(4) [Toom-4] -> a ~= 7/39
    88    e=log(11)/log(6) [Toom-6] -> a ~= 1/8
    89    e=log(15)/log(8) [Toom-8] -> a ~= 1/10
    90  
    91    The values above where obtained with the following trivial commands
    92    in the gp-pari shell:
    93  
    94  fun(e,a)=(1-a)^e/(1-2*a^e)
    95  mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
    96  contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
    97  contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
    98  contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
    99  contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
   100  contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
   101  
   102    ,
   103    |\
   104    | \
   105    +----,
   106    |    |
   107    |    |
   108    |    |\
   109    |    | \
   110    +----+--`
   111    ^ n2 ^n1^
   112  
   113    For an actual implementation, the assumption that M(n)=n^e is
   114    incorrect, as a consequence also the assumption that ML(n)=k*M(n)
   115    with a constant k is wrong.
   116  
   117    But theory suggest us two things:
   118    - the best the multiplication product is (lower e), the more k
   119      approaches 1, and a approaches 0.
   120  
   121    - A value for a smaller than optimal is probably less bad than a
   122      bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
   123      value, and k(a)=0.808_ the mul/mullo speed ratio. We get
   124      k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
   125  */
   126  
   127  static mp_size_t
   128  mpn_mullo_n_itch (mp_size_t n)
   129  {
   130    return 2*n;
   131  }
   132  
   133  /*
   134      mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
   135      It accepts tp == rp.
   136  */
   137  static void
   138  mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
   139  {
   140    mp_size_t n2, n1;
   141    ASSERT (n >= 2);
   142    ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
   143    ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
   144    ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
   145  
   146    /* Divide-and-conquer */
   147  
   148    /* We need fractional approximation of the value 0 < a <= 1/2
   149       giving the minimum in the function k=(1-a)^e/(1-2*a^e).
   150    */
   151    if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
   152      n1 = n >> 1;
   153    else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
   154      n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
   155    else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
   156      n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
   157    else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
   158      n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
   159    /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
   160    else
   161      n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
   162  
   163    n2 = n - n1;
   164  
   165    /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
   166  	      y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
   167  
   168    /* x0 * y0 */
   169    mpn_mul_n (tp, xp, yp, n2);
   170    MPN_COPY (rp, tp, n2);
   171  
   172    /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
   173    if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
   174      mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
   175    else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
   176      mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
   177    else
   178      mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
   179    mpn_add_n (rp + n2, tp + n2, tp + n, n1);
   180  
   181    /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
   182    if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
   183      mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
   184    else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
   185      mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
   186    else
   187      mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
   188    mpn_add_n (rp + n2, rp + n2, tp + n, n1);
   189  }
   190  
   191  /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
   192  #define MUL_BASECASE_ALLOC \
   193   (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
   194  
   195  /* FIXME: This function should accept a temporary area; dc_mullow_n
   196     accepts a pointer tp, and handle the case tp == rp, do the same here.
   197     Maybe recombine the two functions.
   198     THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
   199  	  (typically thanks to mpn_addmul_2) should we unconditionally use
   200  	  mpn_mul_n?
   201  */
   202  
   203  void
   204  mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
   205  {
   206    ASSERT (n >= 1);
   207    ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
   208    ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
   209  
   210    if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
   211      {
   212        /* Allocate workspace of fixed size on stack: fast! */
   213        mp_limb_t tp[MUL_BASECASE_ALLOC];
   214        mpn_mul_basecase (tp, xp, n, yp, n);
   215        MPN_COPY (rp, tp, n);
   216      }
   217    else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
   218      {
   219        mpn_mullo_basecase (rp, xp, yp, n);
   220      }
   221    else
   222      {
   223        mp_ptr tp;
   224        TMP_DECL;
   225        TMP_MARK;
   226        tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
   227        if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
   228  	{
   229  	  mpn_dc_mullo_n (rp, xp, yp, n, tp);
   230  	}
   231        else
   232  	{
   233  	  /* For really large operands, use plain mpn_mul_n but throw away upper n
   234  	     limbs of result.  */
   235  #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
   236  	  mpn_fft_mul (tp, xp, n, yp, n);
   237  #else
   238  	  mpn_mul_n (tp, xp, yp, n);
   239  #endif
   240  	  MPN_COPY (rp, tp, n);
   241  	}
   242        TMP_FREE;
   243      }
   244  }