github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom_interpolate_7pts.c (about)

     1  /* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
     2  
     3     Contributed to the GNU project by Niels Möller.
     4     Improvements by Marco Bodrato.
     5  
     6     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
     7     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     8     GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     9  
    10  Copyright 2006, 2007, 2009, 2014 Free Software Foundation, Inc.
    11  
    12  This file is part of the GNU MP Library.
    13  
    14  The GNU MP Library is free software; you can redistribute it and/or modify
    15  it under the terms of either:
    16  
    17    * the GNU Lesser General Public License as published by the Free
    18      Software Foundation; either version 3 of the License, or (at your
    19      option) any later version.
    20  
    21  or
    22  
    23    * the GNU General Public License as published by the Free Software
    24      Foundation; either version 2 of the License, or (at your option) any
    25      later version.
    26  
    27  or both in parallel, as here.
    28  
    29  The GNU MP Library is distributed in the hope that it will be useful, but
    30  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    31  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    32  for more details.
    33  
    34  You should have received copies of the GNU General Public License and the
    35  GNU Lesser General Public License along with the GNU MP Library.  If not,
    36  see https://www.gnu.org/licenses/.  */
    37  
    38  #include "gmp.h"
    39  #include "gmp-impl.h"
    40  
    41  #define BINVERT_3 MODLIMB_INVERSE_3
    42  
    43  #define BINVERT_9 \
    44    ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
    45  
    46  #define BINVERT_15 \
    47    ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
    48  
    49  /* For the various mpn_divexact_byN here, fall back to using either
    50     mpn_pi1_bdiv_q_1 or mpn_divexact_1.  The former has less overhead and is
    51     many faster if it is native.  For now, since mpn_divexact_1 is native on
    52     several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
    53     mpn_pi1_bdiv_q_1 unconditionally.  FIXME.  */
    54  
    55  /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
    56  #ifndef mpn_divexact_by3
    57  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
    58  #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
    59  #else
    60  #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
    61  #endif
    62  #endif
    63  
    64  #ifndef mpn_divexact_by9
    65  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
    66  #define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
    67  #else
    68  #define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
    69  #endif
    70  #endif
    71  
    72  #ifndef mpn_divexact_by15
    73  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
    74  #define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
    75  #else
    76  #define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
    77  #endif
    78  #endif
    79  
    80  /* Interpolation for toom4, using the evaluation points 0, infinity,
    81     1, -1, 2, -2, 1/2. More precisely, we want to compute
    82     f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
    83     seven values
    84  
    85       w0 = f(0),
    86       w1 = f(-2),
    87       w2 = f(1),
    88       w3 = f(-1),
    89       w4 = f(2)
    90       w5 = 64 * f(1/2)
    91       w6 = limit at infinity of f(x) / x^6,
    92  
    93     The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
    94     w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
    95     w6n }. The other values are 2n + 1 limbs each (with most
    96     significant limbs small). f(-1) and f(-1/2) may be negative, signs
    97     determined by the flag bits. Inputs are destroyed.
    98  
    99     Needs (2*n + 1) limbs of temporary storage.
   100  */
   101  
   102  void
   103  mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
   104  			   mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
   105  			   mp_size_t w6n, mp_ptr tp)
   106  {
   107    mp_size_t m;
   108    mp_limb_t cy;
   109  
   110    m = 2*n + 1;
   111  #define w0 rp
   112  #define w2 (rp + 2*n)
   113  #define w6 (rp + 6*n)
   114  
   115    ASSERT (w6n > 0);
   116    ASSERT (w6n <= 2*n);
   117  
   118    /* Using formulas similar to Marco Bodrato's
   119  
   120       W5 = W5 + W4
   121       W1 =(W4 - W1)/2
   122       W4 = W4 - W0
   123       W4 =(W4 - W1)/4 - W6*16
   124       W3 =(W2 - W3)/2
   125       W2 = W2 - W3
   126  
   127       W5 = W5 - W2*65      May be negative.
   128       W2 = W2 - W6 - W0
   129       W5 =(W5 + W2*45)/2   Now >= 0 again.
   130       W4 =(W4 - W2)/3
   131       W2 = W2 - W4
   132  
   133       W1 = W5 - W1         May be negative.
   134       W5 =(W5 - W3*8)/9
   135       W3 = W3 - W5
   136       W1 =(W1/15 + W5)/2   Now >= 0 again.
   137       W5 = W5 - W1
   138  
   139       where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
   140  	   W4 = f(2), W5 = f(1/2), W6 = f(oo),
   141  
   142       Note that most intermediate results are positive; the ones that
   143       may be negative are represented in two's complement. We must
   144       never shift right a value that may be negative, since that would
   145       invalidate the sign bit. On the other hand, divexact by odd
   146       numbers work fine with two's complement.
   147    */
   148  
   149    mpn_add_n (w5, w5, w4, m);
   150    if (flags & toom7_w1_neg)
   151      {
   152  #ifdef HAVE_NATIVE_mpn_rsh1add_n
   153        mpn_rsh1add_n (w1, w1, w4, m);
   154  #else
   155        mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
   156        mpn_rshift (w1, w1, m, 1);
   157  #endif
   158      }
   159    else
   160      {
   161  #ifdef HAVE_NATIVE_mpn_rsh1sub_n
   162        mpn_rsh1sub_n (w1, w4, w1, m);
   163  #else
   164        mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
   165        mpn_rshift (w1, w1, m, 1);
   166  #endif
   167      }
   168    mpn_sub (w4, w4, m, w0, 2*n);
   169    mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
   170    mpn_rshift (w4, w4, m, 2); /* w4>=0 */
   171  
   172    tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
   173    mpn_sub (w4, w4, m, tp, w6n+1);
   174  
   175    if (flags & toom7_w3_neg)
   176      {
   177  #ifdef HAVE_NATIVE_mpn_rsh1add_n
   178        mpn_rsh1add_n (w3, w3, w2, m);
   179  #else
   180        mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
   181        mpn_rshift (w3, w3, m, 1);
   182  #endif
   183      }
   184    else
   185      {
   186  #ifdef HAVE_NATIVE_mpn_rsh1sub_n
   187        mpn_rsh1sub_n (w3, w2, w3, m);
   188  #else
   189        mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
   190        mpn_rshift (w3, w3, m, 1);
   191  #endif
   192      }
   193  
   194    mpn_sub_n (w2, w2, w3, m);
   195  
   196    mpn_submul_1 (w5, w2, m, 65);
   197    mpn_sub (w2, w2, m, w6, w6n);
   198    mpn_sub (w2, w2, m, w0, 2*n);
   199  
   200    mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
   201    mpn_rshift (w5, w5, m, 1);
   202    mpn_sub_n (w4, w4, w2, m);
   203  
   204    mpn_divexact_by3 (w4, w4, m);
   205    mpn_sub_n (w2, w2, w4, m);
   206  
   207    mpn_sub_n (w1, w5, w1, m);
   208    mpn_lshift (tp, w3, m, 3);
   209    mpn_sub_n (w5, w5, tp, m);
   210    mpn_divexact_by9 (w5, w5, m);
   211    mpn_sub_n (w3, w3, w5, m);
   212  
   213    mpn_divexact_by15 (w1, w1, m);
   214    mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
   215    mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
   216    mpn_sub_n (w5, w5, w1, m);
   217  
   218    /* These bounds are valid for the 4x4 polynomial product of toom44,
   219     * and they are conservative for toom53 and toom62. */
   220    ASSERT (w1[2*n] < 2);
   221    ASSERT (w2[2*n] < 3);
   222    ASSERT (w3[2*n] < 4);
   223    ASSERT (w4[2*n] < 3);
   224    ASSERT (w5[2*n] < 2);
   225  
   226    /* Addition chain. Note carries and the 2n'th limbs that need to be
   227     * added in.
   228     *
   229     * Special care is needed for w2[2n] and the corresponding carry,
   230     * since the "simple" way of adding it all together would overwrite
   231     * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
   232     * the high half of w3 and the low half of w4.
   233     *
   234     *         7    6    5    4    3    2    1    0
   235     *    |    |    |    |    |    |    |    |    |
   236     *                  ||w3 (2n+1)|
   237     *             ||w4 (2n+1)|
   238     *        ||w5 (2n+1)|        ||w1 (2n+1)|
   239     *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
   240     *  -----------------------------------------------
   241     *  r |    |    |    |    |    |    |    |    |
   242     *        c7   c6   c5   c4   c3                 Carries to propagate
   243     */
   244  
   245    cy = mpn_add_n (rp + n, rp + n, w1, m);
   246    MPN_INCR_U (w2 + n + 1, n , cy);
   247    cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
   248    MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
   249    cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
   250    MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
   251    cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
   252    MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
   253    if (w6n > n + 1)
   254      {
   255        cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);
   256        MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy);
   257      }
   258    else
   259      {
   260        ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
   261  #if WANT_ASSERT
   262        {
   263  	mp_size_t i;
   264  	for (i = w6n; i <= n; i++)
   265  	  ASSERT (w5[n + i] == 0);
   266        }
   267  #endif
   268      }
   269  }