github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom42_mulmid.c (about)

     1  /* mpn_toom42_mulmid -- toom42 middle product
     2  
     3     Contributed by David Harvey.
     4  
     5     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
     6     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     7     GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     8  
     9  Copyright 2011 Free Software Foundation, Inc.
    10  
    11  This file is part of the GNU MP Library.
    12  
    13  The GNU MP Library is free software; you can redistribute it and/or modify
    14  it under the terms of either:
    15  
    16    * the GNU Lesser General Public License as published by the Free
    17      Software Foundation; either version 3 of the License, or (at your
    18      option) any later version.
    19  
    20  or
    21  
    22    * the GNU General Public License as published by the Free Software
    23      Foundation; either version 2 of the License, or (at your option) any
    24      later version.
    25  
    26  or both in parallel, as here.
    27  
    28  The GNU MP Library is distributed in the hope that it will be useful, but
    29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    31  for more details.
    32  
    33  You should have received copies of the GNU General Public License and the
    34  GNU Lesser General Public License along with the GNU MP Library.  If not,
    35  see https://www.gnu.org/licenses/.  */
    36  
    37  
    38  #include "gmp.h"
    39  #include "gmp-impl.h"
    40  
    41  
    42  
    43  /*
    44    Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
    45  
    46    Neither ap nor bp may overlap rp.
    47  
    48    Must have n >= 4.
    49  
    50    Amount of scratch space required is given by mpn_toom42_mulmid_itch().
    51  
    52    FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
    53    requirements should be clarified.
    54  */
    55  void
    56  mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
    57                     mp_ptr scratch)
    58  {
    59    mp_limb_t cy, e[12], zh, zl;
    60    mp_size_t m;
    61    int neg;
    62  
    63    ASSERT (n >= 4);
    64    ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
    65    ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
    66  
    67    ap += n & 1;   /* handle odd row and diagonal later */
    68    m = n / 2;
    69  
    70    /* (e0h:e0l) etc are correction terms, in 2's complement */
    71  #define e0l (e[0])
    72  #define e0h (e[1])
    73  #define e1l (e[2])
    74  #define e1h (e[3])
    75  #define e2l (e[4])
    76  #define e2h (e[5])
    77  #define e3l (e[6])
    78  #define e3h (e[7])
    79  #define e4l (e[8])
    80  #define e4h (e[9])
    81  #define e5l (e[10])
    82  #define e5h (e[11])
    83  
    84  #define s (scratch + 2)
    85  #define t (rp + m + 2)
    86  #define p0 rp
    87  #define p1 scratch
    88  #define p2 (rp + m)
    89  #define next_scratch (scratch + 3*m + 1)
    90  
    91    /*
    92              rp                            scratch
    93    |---------|-----------|    |---------|---------|----------|
    94    0         m         2m+2   0         m         2m        3m+1
    95              <----p2---->       <-------------s------------->
    96    <----p0----><---t---->     <----p1---->
    97    */
    98  
    99    /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
   100    cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
   101    cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
   102  		       bp + m, bp, m, cy);
   103    mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
   104  
   105    /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
   106    if (mpn_cmp (bp + m, bp, m) < 0)
   107      {
   108        ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
   109  				      ap + m - 1, ap + 2*m - 1, m, 0));
   110        neg = 1;
   111      }
   112    else
   113      {
   114        ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
   115  				      ap + m - 1, ap + 2*m - 1, m, 0));
   116        neg = 0;
   117      }
   118  
   119    /* recursive middle products. The picture is:
   120  
   121        b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
   122        ...       -   A   A   A   B   B   B   -   -   -   -
   123        b[m]      -   -   A   A   A   B   B   B   -   -   -
   124        b[m-1]    -   -   -   C   C   C   D   D   D   -   -
   125        ...       -   -   -   -   C   C   C   D   D   D   -
   126        b[0]      -   -   -   -   -   C   C   C   D   D   D
   127                 a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
   128    */
   129  
   130    if (m < MULMID_TOOM42_THRESHOLD)
   131      {
   132        /* A + B */
   133        mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
   134        /* accumulate high limbs of p0 into e1 */
   135        ADDC_LIMB (cy, e1l, e1l, p0[m]);
   136        e1h += p0[m + 1] + cy;
   137        /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
   138        mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
   139        /* C + D   (overwrites t) */
   140        mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
   141      }
   142    else
   143      {
   144        /* as above, but use toom42 instead */
   145        mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
   146        ADDC_LIMB (cy, e1l, e1l, p0[m]);
   147        e1h += p0[m + 1] + cy;
   148        mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
   149        mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
   150      }
   151  
   152    /* apply error terms */
   153  
   154    /* -e0 at rp[0] */
   155    SUBC_LIMB (cy, rp[0], rp[0], e0l);
   156    SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
   157    if (UNLIKELY (cy))
   158      {
   159        cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
   160        SUBC_LIMB (cy, e1l, e1l, cy);
   161        e1h -= cy;
   162      }
   163  
   164    /* z = e1 - e2 + high(p0) */
   165    SUBC_LIMB (cy, zl, e1l, e2l);
   166    zh = e1h - e2h - cy;
   167  
   168    /* z at rp[m] */
   169    ADDC_LIMB (cy, rp[m], rp[m], zl);
   170    zh = (zh + cy) & GMP_NUMB_MASK;
   171    ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
   172    cy -= (zh >> (GMP_NUMB_BITS - 1));
   173    if (UNLIKELY (cy))
   174      {
   175        if (cy == 1)
   176  	mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
   177        else /* cy == -1 */
   178  	mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
   179      }
   180  
   181    /* e3 at rp[2*m] */
   182    ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
   183    rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
   184  
   185    /* e4 at p1[0] */
   186    ADDC_LIMB (cy, p1[0], p1[0], e4l);
   187    ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
   188    if (UNLIKELY (cy))
   189      mpn_add_1 (p1 + 2, p1 + 2, m, 1);
   190  
   191    /* -e5 at p1[m] */
   192    SUBC_LIMB (cy, p1[m], p1[m], e5l);
   193    p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
   194  
   195    /* adjustment if p1 ends up negative */
   196    cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
   197  
   198    /* add (-1)^neg * (p1 - B^m * p1) to output */
   199    if (neg)
   200      {
   201        mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
   202        mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
   203        mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
   204      }
   205    else
   206      {
   207        mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
   208        mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
   209        mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
   210      }
   211  
   212    /* odd row and diagonal */
   213    if (n & 1)
   214      {
   215        /*
   216          Products marked E are already done. We need to do products marked O.
   217  
   218          OOOOO----
   219          -EEEEO---
   220          --EEEEO--
   221          ---EEEEO-
   222          ----EEEEO
   223         */
   224  
   225        /* first row of O's */
   226        cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
   227        ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
   228  
   229        /* O's on diagonal */
   230        /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
   231           that can handle the sum below. Currently we're relying on
   232           mulmid_basecase being pretty fast for a diagonal sum like this,
   233  	 which is true at least for the K8 asm version, but surely false
   234  	 for the generic version. */
   235        mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
   236        mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
   237      }
   238  }