github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom62_mul.c

/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in:
   0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
                             |_b1_|___b0_|
                             <-t--><--n-->

  v0  =    a0                         *  b0        #  A(0)*B(0)
  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1)    #  A(1)*B(1)      ah  <= 5   bh <= 1
  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1)    #  A(-1)*B(-1)   |ah| <= 2   bh  = 0
  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1)    #  A(2)*B(2)      ah  <= 62  bh <= 2
  vm2 = (  a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1)    #  A(-2)*B(-2)   -41<=ah<=20 -1<=bh<=0
  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1)    #  A(1/2)*B(1/2)  ah  <= 62  bh <= 2
  vinf =   a5                         *      b1    #  A(inf)*B(inf)
*/

void
mpn_toom62_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr gp;
  enum toom7_flags aflags, bflags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define a5  (ap + 5*n)
#define b0  bp
#define b1  (bp + n)

  n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);

  s = an - 5 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1 = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2 = TMP_SALLOC_LIMBS (n + 1);
  asm2 = TMP_SALLOC_LIMBS (n + 1);
  ash = TMP_SALLOC_LIMBS (n + 1);

  bs1 = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);
  bs2 = TMP_SALLOC_LIMBS (n + 1);
  bsm2 = TMP_SALLOC_LIMBS (n + 1);
  bsh = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;
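  /* Illustrative note (not part of the original GMP sources): a small worked
     example of the splitting above, under an assumed concrete size pair.
     For an = 17, bn = 5 we have 3*bn = 15 <= an, so
       n = 1 + (an - 1) / 6 = 1 + 2 = 3,
       s = an - 5*n = 17 - 15 = 2,
       t = bn - n   =  5 -  3 = 2,
     which satisfies (5/2)bn = 12.5 < an = 17 < 6bn = 30 as well as the
     ASSERTs 0 < s <= n and 0 < t <= n.  A is then read as six blocks
     a5..a0 of sizes 2,3,3,3,3,3 limbs and B as two blocks b1,b0 of sizes
     2,3 limbs, matching the diagram in the header comment.  */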
  /* Compute as1 and asm1.  */
  aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));

  /* Compute as2 and asm2.  */
  aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp)));

  /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
     = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5  */

#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a5, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a4, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a5, s);
#endif

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
          bflags = toom7_w3_neg;
        }
      else
        {
          cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
          bflags = (enum toom7_flags) 0;
        }
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);
      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          bflags = toom7_w3_neg;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
          bflags = (enum toom7_flags) 0;
        }
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          bflags = toom7_w3_neg;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
          bflags = (enum toom7_flags) 0;
        }
    }

  /* Compute bs2 and bsm2, recycling bs1 and bsm1: bs2 = bs1 + b1,
     bsm2 = bsm1 - b1.  */
  mpn_add (bs2, bs1, n + 1, b1, t);
  if (bflags & toom7_w3_neg)
    {
      bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
    }
  else
    {
      /* FIXME: Simplify this logic? */
      if (t < n)
        {
          if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
            {
              ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
              MPN_ZERO (bsm2 + t, n + 1 - t);
              bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
            }
          else
            {
              ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
              bsm2[n] = 0;
            }
        }
      else
        {
          if (mpn_cmp (bsm1, b1, n) < 0)
            {
              ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
              bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
            }
          else
            {
              ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
            }
          bsm2[n] = 0;
        }
    }
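  /* Illustrative note (not part of the original GMP sources): with n = t = 1
     and assumed single-limb values b0 = 3, b1 = 2, the evaluations above
     come out as
       bs1  = b0 + b1   = 5             (B(1)),
       bsm1 = b0 - b1   = 1             (B(-1), nonnegative, so no w3 flag),
       bs2  = bs1 + b1  = 7 = b0 + 2b1  (B(2), recycling bs1),
       bsm2 = bsm1 - b1 = -1            (B(-2) = b0 - 2b1, stored as
                                         |bsm2| = 1 with toom7_w1_neg
                                         recorded in bflags).
     The bsh value computed next reuses bs1 the same way:
       bsh = bs1 + b0 = 8 = 2b0 + b1, i.e. B(1/2) scaled by 2.  */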
  /* Compute bsh, recycling bs1.  bsh = bs1 + b0.  */
  bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);

  ASSERT (as1[n] <= 5);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 2);
  ASSERT (as2[n] <= 62);
  ASSERT (bs2[n] <= 2);
  ASSERT (asm2[n] <= 41);
  ASSERT (bsm2[n] <= 1);
  ASSERT (ash[n] <= 62);
  ASSERT (bsh[n] <= 2);

#define v0    pp                                /* 2n */
#define v1    (pp + 2 * n)                      /* 2n+1 */
#define vinf  (pp + 6 * n)                      /* s+t */
#define v2    scratch                           /* 2n+1 */
#define vm2   (scratch + 2 * n + 1)             /* 2n+1 */
#define vh    (scratch + 4 * n + 2)             /* 2n+1 */
#define vm1   (scratch + 6 * n + 3)             /* 2n+1 */
#define scratch_out  (scratch + 8 * n + 4)      /* 2n+1 */
  /* Total scratch need: 10*n+5 */

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1. */
  mpn_mul_n (v2, as2, bs2, n + 1);              /* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);           /* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);              /* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n);
  cy = 0;
  if (asm1[n] == 1)
    {
      cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  vm1[2 * n] = cy;

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  mpn_mul_n (v0, a0, b0, n);                    /* v0, 2n limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a5, s, b1, t);
  else        mpn_mul (vinf, b1, t, a5, s);

  mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
                             vm2, vm1, v2, vh, s + t, scratch_out);

  TMP_FREE;
}
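
/* Illustrative usage sketch (not part of the original GMP sources).  As the
   header comment warns, mpn_toom62_mul is internal and should only be reached
   through documented interfaces.  The minimal standalone program below (a
   separate translation unit, not to be compiled into this file) uses the
   public mpn entry point mpn_mul on an unbalanced size pair; whether a build
   actually dispatches such a call to this routine depends on its tuned
   thresholds, so the sizes chosen here (an/bn = 110/40, a ratio of 2.75
   inside the (5/2, 6) window) are only an assumption for illustration.  */

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mp_size_t an = 110, bn = 40, i;
  mp_limb_t ap[110], bp[40];
  mp_limb_t pp[150];                    /* product needs an + bn limbs */

  /* Fill the operands with arbitrary limb values.  */
  for (i = 0; i < an; i++) ap[i] = (mp_limb_t) (i + 1);
  for (i = 0; i < bn; i++) bp[i] = (mp_limb_t) (2 * i + 1);

  /* Documented entry point; GMP chooses the multiplication algorithm
     internally based on the operand sizes.  */
  mpn_mul (pp, ap, an, bp, bn);

  printf ("low limb of product: %lu\n", (unsigned long) pp[0]);
  return 0;
}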