github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom_interpolate_12pts.c (about)

     1  /* Interpolation for the algorithm Toom-Cook 6.5-way.
     2  
     3     Contributed to the GNU project by Marco Bodrato.
     4  
     5     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
     6     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     7     GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     8  
     9  Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
    10  
    11  This file is part of the GNU MP Library.
    12  
    13  The GNU MP Library is free software; you can redistribute it and/or modify
    14  it under the terms of either:
    15  
    16    * the GNU Lesser General Public License as published by the Free
    17      Software Foundation; either version 3 of the License, or (at your
    18      option) any later version.
    19  
    20  or
    21  
    22    * the GNU General Public License as published by the Free Software
    23      Foundation; either version 2 of the License, or (at your option) any
    24      later version.
    25  
    26  or both in parallel, as here.
    27  
    28  The GNU MP Library is distributed in the hope that it will be useful, but
    29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    31  for more details.
    32  
    33  You should have received copies of the GNU General Public License and the
    34  GNU Lesser General Public License along with the GNU MP Library.  If not,
    35  see https://www.gnu.org/licenses/.  */
    36  
    37  
    38  #include "gmp.h"
    39  #include "gmp-impl.h"
    40  
    41  
    42  #if HAVE_NATIVE_mpn_sublsh_n
    43  #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
    44  #else
    45  static mp_limb_t
    46  DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
    47  {
    48  #if USE_MUL_1 && 0
    49    return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
    50  #else
    51    mp_limb_t __cy;
    52    __cy = mpn_lshift(ws,src,n,s);
    53    return    __cy + mpn_sub_n(dst,dst,ws,n);
    54  #endif
    55  }
    56  #endif
    57  
    58  #if HAVE_NATIVE_mpn_addlsh_n
    59  #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
    60  #else
    61  static mp_limb_t
    62  DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
    63  {
    64  #if USE_MUL_1 && 0
    65    return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
    66  #else
    67    mp_limb_t __cy;
    68    __cy = mpn_lshift(ws,src,n,s);
    69    return    __cy + mpn_add_n(dst,dst,ws,n);
    70  #endif
    71  }
    72  #endif
    73  
    74  #if HAVE_NATIVE_mpn_subrsh
    75  #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
    76  #else
    77  /* FIXME: This is not a correct definition, it assumes no carry */
    78  #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
    79  do {									\
    80    mp_limb_t __cy;							\
    81    MPN_DECR_U (dst, nd, src[0] >> s);					\
    82    __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
    83    MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
    84  } while (0)
    85  #endif
    86  
    87  
    88  #if GMP_NUMB_BITS < 21
    89  #error Not implemented: Both sublsh_n(,,,20) should be corrected.
    90  #endif
    91  
    92  #if GMP_NUMB_BITS < 16
    93  #error Not implemented: divexact_by42525 needs splitting.
    94  #endif
    95  
    96  #if GMP_NUMB_BITS < 12
    97  #error Not implemented: Hard to adapt...
    98  #endif
    99  
   100  /* FIXME: tuneup should decide the best variant */
   101  #ifndef AORSMUL_FASTER_AORS_AORSLSH
   102  #define AORSMUL_FASTER_AORS_AORSLSH 1
   103  #endif
   104  #ifndef AORSMUL_FASTER_AORS_2AORSLSH
   105  #define AORSMUL_FASTER_AORS_2AORSLSH 1
   106  #endif
   107  #ifndef AORSMUL_FASTER_2AORSLSH
   108  #define AORSMUL_FASTER_2AORSLSH 1
   109  #endif
   110  #ifndef AORSMUL_FASTER_3AORSLSH
   111  #define AORSMUL_FASTER_3AORSLSH 1
   112  #endif
   113  
   114  #define BINVERT_9 \
   115    ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
   116  
   117  #define BINVERT_255 \
   118    (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
   119  
   120    /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
   121  #if GMP_LIMB_BITS == 32
   122  #define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
   123  #define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
   124  #else
   125  #if GMP_LIMB_BITS == 64
   126  #define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
   127  #define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
   128  #endif
   129  #endif
   130  
   131  #ifndef mpn_divexact_by255
   132  #if GMP_NUMB_BITS % 8 == 0
   133  #define mpn_divexact_by255(dst,src,size) \
   134    (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
   135  #else
   136  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
   137  #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
   138  #else
   139  #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
   140  #endif
   141  #endif
   142  #endif
   143  
   144  #ifndef mpn_divexact_by9x4
   145  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
   146  #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
   147  #else
   148  #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
   149  #endif
   150  #endif
   151  
   152  #ifndef mpn_divexact_by42525
   153  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
   154  #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
   155  #else
   156  #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
   157  #endif
   158  #endif
   159  
   160  #ifndef mpn_divexact_by2835x4
   161  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
   162  #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
   163  #else
   164  #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
   165  #endif
   166  #endif
   167  
   168  /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
   169     points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
   170     we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   171     degree 11 (or 10), given the 12 (rsp. 11) values:
   172  
   173       r0 = limit at infinity of f(x) / x^7,
   174       r1 = f(4),f(-4),
   175       r2 = f(2),f(-2),
   176       r3 = f(1),f(-1),
   177       r4 = f(1/4),f(-1/4),
   178       r5 = f(1/2),f(-1/2),
   179       r6 = f(0).
   180  
   181     All couples of the form f(n),f(-n) must be already mixed with
   182     toom_couple_handling(f(n),...,f(-n),...)
   183  
   184     The result is stored in {pp, spt + 7*n (or 6*n)}.
   185     At entry, r6 is stored at {pp, 2n},
   186     r4 is stored at {pp + 3n, 3n + 1}.
   187     r2 is stored at {pp + 7n, 3n + 1}.
   188     r0 is stored at {pp +11n, spt}.
   189  
   190     The other values are 3n+1 limbs each (with most significant limbs small).
   191  
   192     Negative intermediate results are stored two-complemented.
   193     Inputs are destroyed.
   194  */
   195  
   196  void
   197  mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
   198  			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
   199  {
   200    mp_limb_t cy;
   201    mp_size_t n3;
   202    mp_size_t n3p1;
   203    n3 = 3 * n;
   204    n3p1 = n3 + 1;
   205  
   206  #define   r4    (pp + n3)			/* 3n+1 */
   207  #define   r2    (pp + 7 * n)			/* 3n+1 */
   208  #define   r0    (pp +11 * n)			/* s+t <= 2*n */
   209  
   210    /******************************* interpolation *****************************/
   211    if (half != 0) {
   212      cy = mpn_sub_n (r3, r3, r0, spt);
   213      MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
   214  
   215      cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
   216      MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
   217      DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
   218  
   219      cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
   220      MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
   221      DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
   222    };
   223  
   224    r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
   225    DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
   226  
   227  #if HAVE_NATIVE_mpn_add_n_sub_n
   228    mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
   229  #else
   230    ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
   231    mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
   232    MP_PTR_SWAP(r1, wsi);
   233  #endif
   234  
   235    r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
   236    DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
   237  
   238  #if HAVE_NATIVE_mpn_add_n_sub_n
   239    mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
   240  #else
   241    mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
   242    ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
   243    MP_PTR_SWAP(r5, wsi);
   244  #endif
   245  
   246    r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
   247  
   248  #if AORSMUL_FASTER_AORS_AORSLSH
   249    mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
   250  #else
   251    mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
   252    DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
   253  #endif
   254    /* A division by 2835x4 follows. Warning: the operand can be negative! */
   255    mpn_divexact_by2835x4(r4, r4, n3p1);
   256    if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
   257      r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
   258  
   259  #if AORSMUL_FASTER_2AORSLSH
   260    mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
   261  #else
   262    DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
   263    DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
   264  #endif
   265    mpn_divexact_by255(r5, r5, n3p1);
   266  
   267    ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
   268  
   269  #if AORSMUL_FASTER_3AORSLSH
   270    ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
   271  #else
   272    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
   273    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
   274    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
   275  #endif
   276    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
   277    mpn_divexact_by42525(r1, r1, n3p1);
   278  
   279  #if AORSMUL_FASTER_AORS_2AORSLSH
   280    ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
   281  #else
   282    ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
   283    ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
   284    ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
   285  #endif
   286    mpn_divexact_by9x4(r2, r2, n3p1);
   287  
   288    ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
   289  
   290    mpn_sub_n (r4, r2, r4, n3p1);
   291    ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
   292    ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
   293  
   294    mpn_add_n (r5, r5, r1, n3p1);
   295    ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
   296  
   297    /* last interpolation steps... */
   298    ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
   299    ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
   300    /* ... could be mixed with recomposition
   301  	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
   302    */
   303  
   304    /***************************** recomposition *******************************/
   305    /*
   306      pp[] prior to operations:
   307      |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
   308  
   309      summation scheme for remaining operations:
   310      |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
   311      |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
   312  	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
   313    */
   314  
   315    cy = mpn_add_n (pp + n, pp + n, r5, n);
   316    cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
   317  #if HAVE_NATIVE_mpn_add_nc
   318    cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
   319  #else
   320    MPN_INCR_U (r5 + 2 * n, n + 1, cy);
   321    cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
   322  #endif
   323    MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
   324  
   325    pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
   326    cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
   327  #if HAVE_NATIVE_mpn_add_nc
   328    cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
   329  #else
   330    MPN_INCR_U (r3 + 2 * n, n + 1, cy);
   331    cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
   332  #endif
   333    MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
   334  
   335    pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
   336    if (half) {
   337      cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
   338  #if HAVE_NATIVE_mpn_add_nc
   339      if (LIKELY (spt > n)) {
   340        cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
   341        MPN_INCR_U (pp + 4 * n3, spt - n, cy);
   342      } else {
   343        ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
   344      }
   345  #else
   346      MPN_INCR_U (r1 + 2 * n, n + 1, cy);
   347      if (LIKELY (spt > n)) {
   348        cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
   349        MPN_INCR_U (pp + 4 * n3, spt - n, cy);
   350      } else {
   351        ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
   352      }
   353  #endif
   354    } else {
   355      ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
   356    }
   357  
   358  #undef   r0
   359  #undef   r2
   360  #undef   r4
   361  }