github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom53_mul.c

/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
	       |__b2|___b1_|___b0_|
	       <-t--><--n--><--n-->

  v0  =    a0                  *  b0           #    A(0)*B(0)
  v1  = (  a0+ a1+ a2+ a3+ a4)*( b0+ b1+ b2)   #    A(1)*B(1)    ah  <= 4   bh <= 2
  vm1 = (  a0- a1+ a2- a3+ a4)*( b0- b1+ b2)   #   A(-1)*B(-1)  |ah| <= 2   bh <= 1
  v2  = ( a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2)   #    A(2)*B(2)    ah  <= 30  bh <= 6
  vm2 = ( a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2)   #   A(-2)*B(-2)  -9<=ah<=20 -1<=bh<=4
  vh  = (16a0+8a1+4a2+2a3+ a4)*(4b0+2b1+ b2)   #  A(1/2)*B(1/2)  ah  <= 30  bh <= 6
  vinf=                     a4 *          b2   #  A(inf)*B(inf)
*/

void
mpn_toom53_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr tmp;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  asm2 = tmp; tmp += n + 1;
  ash  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n + 1;
  bs2  = tmp; tmp += n + 1;
  bsm2 = tmp; tmp += n + 1;
  bsh  = tmp; tmp += n + 1;

  gp = pp;
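  /* Point evaluations for A.  mpn_toom_eval_pm1 and mpn_toom_eval_pm2
     store A(1),|A(-1)| resp. A(2),|A(-2)| and return an all-ones mask
     when the negative-point value is negative; ANDing that mask with
     toom7_w3_neg resp. toom7_w1_neg records the sign in FLAGS so that
     mpn_toom_interpolate_7pts can correct for it.  */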
  /* Compute as1 and asm1.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));

  /* Compute as2 and asm2.  */
  flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);		/* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
      bsm1[n] = bs1[n] - (cy & 1);
      bs1[n] += (cy >> 1);
    }
#else
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bs1, n);
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
    }
  bs1[n] += mpn_add_n (bs1, bs1, b1, n);	/* b0+b1+b2 */
#endif

  /* Compute bs2 and bsm2.  */
#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
#if HAVE_NATIVE_mpn_addlsh2_n
  cy = mpn_addlsh2_n (bs2, b0, b2, t);
#else /* HAVE_NATIVE_mpn_addlsh_n */
  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
#endif
  if (t < n)
    cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
  bs2[n] = cy;
#else
  cy = mpn_lshift (gp, b2, t, 2);
  bs2[n] = mpn_add (bs2, b0, n, gp, t);
  MPN_INCR_U (bs2 + t, n+1-t, cy);
#endif

  gp[n] = mpn_lshift (gp, b1, n, 1);

#if HAVE_NATIVE_mpn_add_n_sub_n
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
    }
#else
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
    }
  mpn_add_n (bs2, bs2, gp, n+1);
#endif
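  /* Here bs2 = B(2) = b0 + 2*b1 + 4*b2 and bsm2 = |B(-2)|: gp held 2*b1,
     so the comparison against b0 + 4*b2 decides the sign of B(-2).  The
     XOR with toom7_w1_neg combines that sign with the one already
     recorded for A(-2), giving the sign of vm2 = A(-2)*B(-2).  */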
  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1) + b2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bsh, b1, b0, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
      MPN_INCR_U (bsh + t, n+1-t, cy2);
    }
  else
    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
#else
  cy = mpn_lshift (bsh, b0, n, 1);
  cy += mpn_add_n (bsh, bsh, b1, n);
  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
#endif

  ASSERT (as1[n] <= 4);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 2);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 30);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 20);
  ASSERT (bsm2[n] <= 4);
  ASSERT (ash[n] <= 30);
  ASSERT (bsh[n] <= 6);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define scratch_out (scratch + 8 * n + 4)	/* 2n+1 */
	 /* Total scratch need: 10*n+5 */

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1. */
  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (vm1, asm1, bsm1, n);
  if (asm1[n] == 1)
    {
      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bsm1[n] + mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  vm1[2 * n] = 0;
  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
#endif /* SMALLER_RECURSION */

  /* v1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  v1[2 * n] = 0;
  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
#endif /* SMALLER_RECURSION */

  mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
  else        mpn_mul (vinf, b2, t, a4, s);

  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
			     scratch_out);

  TMP_FREE;
}
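
/* A minimal usage sketch (not part of the original file).  This function
   is internal and is normally reached via mpn_mul's Toom selection; a
   direct call could look like the hypothetical helper below.  It assumes
   the mpn_toom53_mul_itch inline from gmp-impl.h for the scratch bound
   and operands already in the (4/3)bn < an < (5/2)bn window.  */
#if 0
static void
toom53_mul_example (mp_ptr pp,
		    mp_srcptr ap, mp_size_t an,
		    mp_srcptr bp, mp_size_t bn)
{
  mp_ptr scratch;
  TMP_DECL;

  TMP_MARK;
  /* Scratch bound per gmp-impl.h; at least the 10*n+5 limbs used above.  */
  scratch = TMP_ALLOC_LIMBS (mpn_toom53_mul_itch (an, bn));
  mpn_toom53_mul (pp, ap, an, bp, bn, scratch);	/* pp needs an+bn limbs */
  TMP_FREE;
}
#endif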