github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/mul.c (about)

     1  /* mpn_mul -- Multiply two natural numbers.
     2  
     3     Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
     6  2014 Free Software Foundation, Inc.
     7  
     8  This file is part of the GNU MP Library.
     9  
    10  The GNU MP Library is free software; you can redistribute it and/or modify
    11  it under the terms of either:
    12  
    13    * the GNU Lesser General Public License as published by the Free
    14      Software Foundation; either version 3 of the License, or (at your
    15      option) any later version.
    16  
    17  or
    18  
    19    * the GNU General Public License as published by the Free Software
    20      Foundation; either version 2 of the License, or (at your option) any
    21      later version.
    22  
    23  or both in parallel, as here.
    24  
    25  The GNU MP Library is distributed in the hope that it will be useful, but
    26  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  for more details.
    29  
    30  You should have received copies of the GNU General Public License and the
    31  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  see https://www.gnu.org/licenses/.  */
    33  
    34  #include "gmp.h"
    35  #include "gmp-impl.h"
    36  
    37  
    38  #ifndef MUL_BASECASE_MAX_UN
    39  #define MUL_BASECASE_MAX_UN 500
    40  #endif
    41  
    42  /* Areas where the different toom algorithms can be called (extracted
    43     from the t-toom*.c files, and ignoring small constant offsets):
    44  
    45     1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
    46                                          4/7              6/7
    47  				       6/11
    48                                         |--------------------| toom22 (small)
    49                                                             || toom22 (large)
    50                                                         |xxxx| toom22 called
    51                        |-------------------------------------| toom32
    52                                           |xxxxxxxxxxxxxxxx| | toom32 called
    53                                                 |------------| toom33
    54                                                            |x| toom33 called
    55               |---------------------------------|            | toom42
    56  	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
    57                                         |--------------------| toom43
    58                                                 |xxxxxxxxxx|   toom43 called
    59           |-----------------------------|                      toom52 (unused)
    60                                                     |--------| toom44
    61  						   |xxxxxxxx| toom44 called
    62                                |--------------------|        | toom53
    63                                          |xxxxxx|              toom53 called
    64      |-------------------------|                               toom62 (unused)
    65                                             |----------------| toom54 (unused)
    66                        |--------------------|                  toom63
    67  	                      |xxxxxxxxx|                   | toom63 called
    68                            |---------------------------------| toom6h
    69  						   |xxxxxxxx| toom6h called
    70                                    |-------------------------| toom8h (32 bit)
    71                   |------------------------------------------| toom8h (64 bit)
    72  						   |xxxxxxxx| toom8h called
    73  */
    74  
    75  #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
    76  #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
    77  
    78  /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
    79     (pointed to by VP, with VN limbs), and store the result at PRODP.  The
    80     result is UN + VN limbs.  Return the most significant limb of the result.
    81  
    82     NOTE: The space pointed to by PRODP is overwritten before finished with U
    83     and V, so overlap is an error.
    84  
    85     Argument constraints:
    86     1. UN >= VN.
    87     2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
    88        the multiplier and the multiplicand.  */
    89  
    90  /*
    91    * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
    92      ideal lines of the surrounding algorithms.  Is that optimal?
    93  
    94    * The toomX3 code now uses a structure similar to the one of toomX2, except
    95      that it loops longer in the unbalanced case.  The result is that the
    96      remaining area might have un < vn.  Should we fix the toomX2 code in a
    97      similar way?
    98  
    99    * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
   100      therefore calls mpn_mul recursively for certain cases.
   101  
   102    * Allocate static temp space using THRESHOLD variables (except for toom44
   103      when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
   104  
   105    * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
   106      have the same vn threshold.  This is not true, we should actually use
   107      mul_basecase for slightly larger operands for toom32 than for toom22, and
   108      even larger for toom42.
   109  
   110    * That problem is even more prevalent for toomX3.  We therefore use special
   111      THRESHOLD variables there.
   112  
   113    * Is our ITCH allocation correct?
   114  */
   115  
   116  #define ITCH (16*vn + 100)
   117  
   118  mp_limb_t
   119  mpn_mul (mp_ptr prodp,
   120  	 mp_srcptr up, mp_size_t un,
   121  	 mp_srcptr vp, mp_size_t vn)
   122  {
   123    ASSERT (un >= vn);
   124    ASSERT (vn >= 1);
   125    ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
   126    ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
   127  
   128    if (un == vn)
   129      {
   130        if (up == vp)
   131  	mpn_sqr (prodp, up, un);
   132        else
   133  	mpn_mul_n (prodp, up, vp, un);
   134      }
   135    else if (vn < MUL_TOOM22_THRESHOLD)
   136      { /* plain schoolbook multiplication */
   137  
   138        /* Unless un is very large, or else if have an applicable mpn_mul_N,
   139  	 perform basecase multiply directly.  */
   140        if (un <= MUL_BASECASE_MAX_UN
   141  #if HAVE_NATIVE_mpn_mul_2
   142  	  || vn <= 2
   143  #else
   144  	  || vn == 1
   145  #endif
   146  	  )
   147  	mpn_mul_basecase (prodp, up, un, vp, vn);
   148        else
   149  	{
   150  	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
   151  	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
   152  	     these pieces with the vp[] operand.  After each such partial
   153  	     multiplication (but the last) we copy the most significant vn
   154  	     limbs into a temporary buffer since that part would otherwise be
   155  	     overwritten by the next multiplication.  After the next
   156  	     multiplication, we add it back.  This illustrates the situation:
   157  
   158                                                      -->vn<--
   159                                                        |  |<------- un ------->|
   160                                                           _____________________|
   161                                                          X                    /|
   162                                                        /XX__________________/  |
   163                                      _____________________                     |
   164                                     X                    /                     |
   165                                   /XX__________________/                       |
   166                 _____________________                                          |
   167                /                    /                                          |
   168              /____________________/                                            |
   169  	    ==================================================================
   170  
   171  	    The parts marked with X are the parts whose sums are copied into
   172  	    the temporary buffer.  */
   173  
   174  	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
   175  	  mp_limb_t cy;
   176  	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
   177  
   178  	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
   179  	  prodp += MUL_BASECASE_MAX_UN;
   180  	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
   181  	  up += MUL_BASECASE_MAX_UN;
   182  	  un -= MUL_BASECASE_MAX_UN;
   183  	  while (un > MUL_BASECASE_MAX_UN)
   184  	    {
   185  	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
   186  	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
   187  	      mpn_incr_u (prodp + vn, cy);
   188  	      prodp += MUL_BASECASE_MAX_UN;
   189  	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
   190  	      up += MUL_BASECASE_MAX_UN;
   191  	      un -= MUL_BASECASE_MAX_UN;
   192  	    }
   193  	  if (un > vn)
   194  	    {
   195  	      mpn_mul_basecase (prodp, up, un, vp, vn);
   196  	    }
   197  	  else
   198  	    {
   199  	      ASSERT (un > 0);
   200  	      mpn_mul_basecase (prodp, vp, vn, up, un);
   201  	    }
   202  	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
   203  	  mpn_incr_u (prodp + vn, cy);
   204  	}
   205      }
   206    else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
   207      {
   208        /* Use ToomX2 variants */
   209        mp_ptr scratch;
   210        TMP_SDECL; TMP_SMARK;
   211  
   212  #define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
   213        scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
   214        ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
   215        ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
   216        ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
   217  #undef ITCH_TOOMX2
   218  
   219        /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
   220  	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
   221  	 wise; we would get better balance by slightly moving the bound.  We
   222  	 will sometimes end up with un < vn, like in the X3 arm below.  */
   223        if (un >= 3 * vn)
   224  	{
   225  	  mp_limb_t cy;
   226  	  mp_ptr ws;
   227  
   228  	  /* The maximum ws usage is for the mpn_mul result.  */
   229  	  ws = TMP_SALLOC_LIMBS (4 * vn);
   230  
   231  	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
   232  	  un -= 2 * vn;
   233  	  up += 2 * vn;
   234  	  prodp += 2 * vn;
   235  
   236  	  while (un >= 3 * vn)
   237  	    {
   238  	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
   239  	      un -= 2 * vn;
   240  	      up += 2 * vn;
   241  	      cy = mpn_add_n (prodp, prodp, ws, vn);
   242  	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
   243  	      mpn_incr_u (prodp + vn, cy);
   244  	      prodp += 2 * vn;
   245  	    }
   246  
   247  	  /* vn <= un < 3vn */
   248  
   249  	  if (4 * un < 5 * vn)
   250  	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
   251  	  else if (4 * un < 7 * vn)
   252  	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
   253  	  else
   254  	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
   255  
   256  	  cy = mpn_add_n (prodp, prodp, ws, vn);
   257  	  MPN_COPY (prodp + vn, ws + vn, un);
   258  	  mpn_incr_u (prodp + vn, cy);
   259  	}
   260        else
   261  	{
   262  	  if (4 * un < 5 * vn)
   263  	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
   264  	  else if (4 * un < 7 * vn)
   265  	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
   266  	  else
   267  	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
   268  	}
   269        TMP_SFREE;
   270      }
   271    else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
   272  	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
   273      {
   274        /* Handle the largest operands that are not in the FFT range.  The 2nd
   275  	 condition makes very unbalanced operands avoid the FFT code (except
   276  	 perhaps as coefficient products of the Toom code.  */
   277  
   278        if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
   279  	{
   280  	  /* Use ToomX3 variants */
   281  	  mp_ptr scratch;
   282  	  TMP_DECL; TMP_MARK;
   283  
   284  #define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
   285  	  scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
   286  	  ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
   287  	  ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
   288  	  ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
   289  	  ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
   290  	  ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
   291  	  ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
   292  #undef ITCH_TOOMX3
   293  
   294  	  if (2 * un >= 5 * vn)
   295  	    {
   296  	      mp_limb_t cy;
   297  	      mp_ptr ws;
   298  
   299  	      /* The maximum ws usage is for the mpn_mul result.  */
   300  	      ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
   301  
   302  	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
   303  		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
   304  	      else
   305  		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
   306  	      un -= 2 * vn;
   307  	      up += 2 * vn;
   308  	      prodp += 2 * vn;
   309  
   310  	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
   311  		{
   312  		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
   313  		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
   314  		  else
   315  		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
   316  		  un -= 2 * vn;
   317  		  up += 2 * vn;
   318  		  cy = mpn_add_n (prodp, prodp, ws, vn);
   319  		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
   320  		  mpn_incr_u (prodp + vn, cy);
   321  		  prodp += 2 * vn;
   322  		}
   323  
   324  	      /* vn / 2 <= un < 2.5vn */
   325  
   326  	      if (un < vn)
   327  		mpn_mul (ws, vp, vn, up, un);
   328  	      else
   329  		mpn_mul (ws, up, un, vp, vn);
   330  
   331  	      cy = mpn_add_n (prodp, prodp, ws, vn);
   332  	      MPN_COPY (prodp + vn, ws + vn, un);
   333  	      mpn_incr_u (prodp + vn, cy);
   334  	    }
   335  	  else
   336  	    {
   337  	      if (6 * un < 7 * vn)
   338  		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
   339  	      else if (2 * un < 3 * vn)
   340  		{
   341  		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
   342  		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
   343  		  else
   344  		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
   345  		}
   346  	      else if (6 * un < 11 * vn)
   347  		{
   348  		  if (4 * un < 7 * vn)
   349  		    {
   350  		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
   351  			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
   352  		      else
   353  			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
   354  		    }
   355  		  else
   356  		    {
   357  		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
   358  			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
   359  		      else
   360  			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
   361  		    }
   362  		}
   363  	      else
   364  		{
   365  		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
   366  		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
   367  		  else
   368  		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
   369  		}
   370  	    }
   371  	  TMP_FREE;
   372  	}
   373        else
   374  	{
   375  	  mp_ptr scratch;
   376  	  TMP_DECL; TMP_MARK;
   377  
   378  	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
   379  	    {
   380  	      scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
   381  	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
   382  	    }
   383  	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
   384  	    {
   385  	      scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
   386  	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
   387  	    }
   388  	  else
   389  	    {
   390  	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
   391  	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
   392  	    }
   393  	  TMP_FREE;
   394  	}
   395      }
   396    else
   397      {
   398        if (un >= 8 * vn)
   399  	{
   400  	  mp_limb_t cy;
   401  	  mp_ptr ws;
   402  	  TMP_DECL; TMP_MARK;
   403  
   404  	  /* The maximum ws usage is for the mpn_mul result.  */
   405  	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
   406  
   407  	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
   408  	  un -= 3 * vn;
   409  	  up += 3 * vn;
   410  	  prodp += 3 * vn;
   411  
   412  	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
   413  	    {
   414  	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
   415  	      un -= 3 * vn;
   416  	      up += 3 * vn;
   417  	      cy = mpn_add_n (prodp, prodp, ws, vn);
   418  	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
   419  	      mpn_incr_u (prodp + vn, cy);
   420  	      prodp += 3 * vn;
   421  	    }
   422  
   423  	  /* vn / 2 <= un < 3.5vn */
   424  
   425  	  if (un < vn)
   426  	    mpn_mul (ws, vp, vn, up, un);
   427  	  else
   428  	    mpn_mul (ws, up, un, vp, vn);
   429  
   430  	  cy = mpn_add_n (prodp, prodp, ws, vn);
   431  	  MPN_COPY (prodp + vn, ws + vn, un);
   432  	  mpn_incr_u (prodp + vn, cy);
   433  
   434  	  TMP_FREE;
   435  	}
   436        else
   437  	mpn_fft_mul (prodp, up, un, vp, vn);
   438      }
   439  
   440    return prodp[un + vn - 1];	/* historic */
   441  }