github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom32_mul.c

/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
        |_b1_|___b0_|
        <-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
  vinf=          a2 *     b1  # A(inf)*B(inf)
*/
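
/* A worked example of these evaluation points (editorial illustration,
   not part of the original source): with single-limb coefficients,
   take A(x) = 3x^2 + 2x + 1 and B(x) = 5x + 4.  Then

     v0   = A(0)*B(0)   = 1 * 4    =  4
     v1   = A(1)*B(1)   = 6 * 9    = 54
     vm1  = A(-1)*B(-1) = 2 * (-1) = -2
     vinf = a2 * b1     = 3 * 5    = 15

   The product is C(x) = 15x^3 + 22x^2 + 13x + 4, with coefficients
   x0 = 4, x1 = 13, x2 = 22, x3 = 15, and indeed
   (v1 + vm1)/2 = 26 = x0 + x2 and (v1 - vm1)/2 = 28 = x1 + x3,
   which is exactly how the code below recovers the coefficient sums.  */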

#define TOOM32_MUL_N_REC(p, a, b, n, ws)                                \
  do {                                                                  \
    mpn_mul_n (p, a, b, n);                                             \
  } while (0)

void
mpn_toom32_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy;
  mp_limb_signed_t hi;
  mp_limb_t ap1_hi, bp1_hi;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define b0  bp
#define b1  (bp + n)

  /* Required, to ensure that s + t >= n. */
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);

  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);

  s = an - 2 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s + t >= n);
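
  /* Editorial example, not in the original source: an = 10, bn = 7
     passes the assertion above; 2*an < 3*bn, so n = 1 + (bn - 1)/2 = 4,
     giving s = an - 2*n = 2, t = bn - n = 3, and s + t = 5 >= n.  */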

  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
#define ap1 (pp)                /* n, most significant limb in ap1_hi */
#define bp1 (pp + n)            /* n, most significant bit in bp1_hi */
#define am1 (pp + 2*n)          /* n, most significant bit in hi */
#define bm1 (pp + 3*n)          /* n */
#define v1 (scratch)            /* 2n + 1 */
#define vm1 (pp)                /* 2n + 1 */
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */

  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */

  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
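
  /* An illustrative map of the two areas implied by the #defines above
     (editorial sketch):

       pp:      | ap1 (n) | bp1 (n) | am1 (n) | bm1 (n) | ...
       scratch: | v1 (2n+1) | scratch_out ...

     vm1 (2n+1 limbs) later reuses pp, overwriting ap1, bp1 and the
     first limb of am1, which is safe because those values have already
     been consumed by the recursive products by then.  */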

  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
      hi = ap1_hi - (cy & 1);
      ap1_hi += (cy >> 1);
      vm1_neg = 0;
    }
#else
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
      vm1_neg = 0;
    }
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
#endif
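
  /* Editorial note: am1 holds the absolute value |a0 - a1 + a2|; when
     a0 + a2 < a1 the true value is negative and vm1_neg records the
     sign instead of storing a negative number.  The same convention is
     used for bm1 below, the two signs being combined into vm1_neg by
     XOR.  */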

  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
        }
      bp1_hi = cy >> 1;
#else
      bp1_hi = mpn_add_n (bp1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
        }
#endif
    }
  else
    {
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
      bp1_hi = mpn_add (bp1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
          MPN_ZERO (bm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
        }
    }

  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
  if (ap1_hi == 1)
    {
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
    }
  else if (ap1_hi == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
#else
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bp1_hi != 0)
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
  v1[2 * n] = cy;
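
  /* Editorial note: the full product is
       (ap1 + ap1_hi * B^n) * (bp1 + bp1_hi * B^n),
     so after the recursive n-limb product the cross terms ap1_hi * bp1
     and bp1_hi * ap1 are added in at offset n, and the contribution
     ap1_hi * bp1_hi (at most 2) ends up in the top limb v1[2*n].  Only
     ap1_hi in {0, 1, 2} and bp1_hi in {0, 1} can occur here.  */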

  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
  if (hi)
    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);

  vm1[2*n] = hi;

  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
  if (vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
#else
      mpn_sub_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
#else
      mpn_add_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
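
  /* Editorial note: since vm1 stores |A(-1)*B(-1)|, adding it when
     vm1_neg is clear and subtracting it when vm1_neg is set both give
     v1 + A(-1)*B(-1) = 2*(x0 + x2), an even value, so the 1-bit right
     shift is exact (ASSERT_NOCARRY verifies that no bit is lost).  */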

  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence

     y = x1 + x3 + (x0 + x2) * B
       = (x0 + x2) * B + (x0 + x2) - vm1.

     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
     (already in place, except for carry propagation).

     We thus add

   B^3  B^2   B    1
    |    |    |    |
   +-----+----+
 + |  x0 + x2 |
   +----+-----+----+
 +      |  x0 + x2 |
        +----------+
 -      |  vm1     |
 --+----++----+----+-
   | y2  | y1 | y0 |
   +-----+----+----+

  Since we store y0 at the same location as the low half of x0 + x2, we
  need to do the middle sum first. */

  hi = vm1[2*n];
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);

  /* FIXME: Can we get rid of this second vm1_neg conditional by
     swapping the location of +1 and -1 values? */
  if (vm1_neg)
    {
      cy = mpn_add_n (v1, v1, vm1, n);
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_INCR_U (v1 + n, n+1, hi);
    }
  else
    {
      cy = mpn_sub_n (v1, v1, vm1, n);
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_DECR_U (v1 + n, n+1, hi);
    }

  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
  else        mpn_mul (pp+3*n, b1, t, a2, s);
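
  /* Editorial note: mpn_mul requires its first operand to be at least
     as long as its second, hence the branch on s > t; both orders
     compute the same s+t limb product a2 * b1.  */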

  /* Remaining interpolation.

     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
     = y0 B + y1 B^2 + y2 B^3 + Lx0 + H x0 B
       + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4

     Here L x denotes the low n limbs of x and H x the remaining high
     limbs, so that x = L x + H x B.

          B^4       B^3       B^2        B         1
 |         |         |         |         |         |
   +-------+                   +---------+---------+
   |  Hx3  |                   | Hx0-Lx3 |    Lx0  |
   +------+----------+---------+---------+---------+
          |    y2    |  y1     |   y0    |
          ++---------+---------+---------+
          -| Hx0-Lx3 | - Lx0   |
           +---------+---------+
                      | - Hx3  |
                      +--------+

    We must take into account the carry from Hx0 - Lx3.
  */

  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
  hi = scratch[2*n] + cy;

  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);

  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);

  /* FIXME: Is support for s + t == n needed? */
  if (LIKELY (s + t > n))
    {
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);

      if (hi < 0)
        MPN_DECR_U (pp + 4*n, s+t-n, -hi);
      else
        MPN_INCR_U (pp + 4*n, s+t-n, hi);
    }
  else
    ASSERT (hi == 0);
}
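
/* A minimal usage sketch (editorial addition, not part of the original
   GMP file).  The operand sizes an = 6, bn = 4 satisfy the entry
   assertion bn + 2 <= an && an + 6 <= 3*bn with equality, and the
   scratch array is sized generously above mpn_toom32_mul_itch (6, 4)
   from gmp-impl.h.  Kept inside #if 0 so it does not affect the
   library build.  */
#if 0
#include <stdio.h>

static void
toom32_demo (void)
{
  mp_limb_t a[6] = { 1, 2, 3, 4, 5, 6 };   /* an = 6 limbs */
  mp_limb_t b[4] = { 7, 8, 9, 10 };        /* bn = 4 limbs */
  mp_limb_t p[10];                         /* product, an + bn limbs */
  mp_limb_t q[10];
  mp_limb_t scratch[16];                   /* >= mpn_toom32_mul_itch (6, 4) */
  mp_size_t i;

  mpn_toom32_mul (p, a, 6, b, 4, scratch);

  /* Cross-check against the generic multiplication entry point. */
  mpn_mul (q, a, 6, b, 4);
  for (i = 0; i < 10; i++)
    ASSERT_ALWAYS (p[i] == q[i]);

  printf ("high limb of the product: %lu\n", (unsigned long) p[9]);
}
#endif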