github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom4_sqr.c (about)

     1  /* mpn_toom4_sqr -- Square {ap,an}.
     2  
     3     Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     4  
     5     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
     6     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     7     GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     8  
     9  Copyright 2006-2010, 2013 Free Software Foundation, Inc.
    10  
    11  This file is part of the GNU MP Library.
    12  
    13  The GNU MP Library is free software; you can redistribute it and/or modify
    14  it under the terms of either:
    15  
    16    * the GNU Lesser General Public License as published by the Free
    17      Software Foundation; either version 3 of the License, or (at your
    18      option) any later version.
    19  
    20  or
    21  
    22    * the GNU General Public License as published by the Free Software
    23      Foundation; either version 2 of the License, or (at your option) any
    24      later version.
    25  
    26  or both in parallel, as here.
    27  
    28  The GNU MP Library is distributed in the hope that it will be useful, but
    29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    31  for more details.
    32  
    33  You should have received copies of the GNU General Public License and the
    34  GNU Lesser General Public License along with the GNU MP Library.  If not,
    35  see https://www.gnu.org/licenses/.  */
    36  
    37  
    38  #include "gmp.h"
    39  #include "gmp-impl.h"
    40  
    41  /* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
    42  
    43    <-s--><--n--><--n--><--n-->
    44     ____ ______ ______ ______
    45    |_a3_|___a2_|___a1_|___a0_|
    46  
    47    v0  =   a0             ^2 #    A(0)^2
    48    v1  = ( a0+ a1+ a2+ a3)^2 #    A(1)^2   ah  <= 3
    49    vm1 = ( a0- a1+ a2- a3)^2 #   A(-1)^2  |ah| <= 1
    50    v2  = ( a0+2a1+4a2+8a3)^2 #    A(2)^2   ah  <= 14
    51    vh  = (8a0+4a1+2a2+ a3)^2 #  A(1/2)^2   ah  <= 14
    52    vmh = (8a0-4a1+2a2- a3)^2 # A(-1/2)^2  -4<=ah<=9
    53    vinf=               a3 ^2 #  A(inf)^2
    54  */
    55  
    56  #if TUNE_PROGRAM_BUILD
    57  #define MAYBE_sqr_basecase 1
    58  #define MAYBE_sqr_toom2   1
    59  #define MAYBE_sqr_toom4   1
    60  #else
    61  #define MAYBE_sqr_basecase						\
    62    (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD)
    63  #define MAYBE_sqr_toom2							\
    64    (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD)
    65  #define MAYBE_sqr_toom4							\
    66    (SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
    67  #endif
    68  
    69  #define TOOM4_SQR_REC(p, a, n, ws)					\
    70    do {									\
    71      if (MAYBE_sqr_basecase						\
    72  	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
    73        mpn_sqr_basecase (p, a, n);					\
    74      else if (MAYBE_sqr_toom2						\
    75  	     && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
    76        mpn_toom2_sqr (p, a, n, ws);					\
    77      else if (! MAYBE_sqr_toom4						\
    78  	     || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))		\
    79        mpn_toom3_sqr (p, a, n, ws);					\
    80      else								\
    81        mpn_toom4_sqr (p, a, n, ws);					\
    82    } while (0)
    83  
    84  void
    85  mpn_toom4_sqr (mp_ptr pp,
    86  	       mp_srcptr ap, mp_size_t an,
    87  	       mp_ptr scratch)
    88  {
    89    mp_size_t n, s;
    90    mp_limb_t cy;
    91  
    92  #define a0  ap
    93  #define a1  (ap + n)
    94  #define a2  (ap + 2*n)
    95  #define a3  (ap + 3*n)
    96  
    97    n = (an + 3) >> 2;
    98  
    99    s = an - 3 * n;
   100  
   101    ASSERT (0 < s && s <= n);
   102  
   103    /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
   104     * following limb, so these must be computed in order, and we need a
   105     * one limb gap to tp. */
   106  #define v0    pp				/* 2n */
   107  #define v1    (pp + 2 * n)			/* 2n+1 */
   108  #define vinf  (pp + 6 * n)			/* s+t */
   109  #define v2    scratch				/* 2n+1 */
   110  #define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
   111  #define vh    (scratch + 4 * n + 2)		/* 2n+1 */
   112  #define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
   113  #define tp (scratch + 8*n + 5)
   114  
   115    /* No overlap with v1 */
   116  #define apx   pp				/* n+1 */
   117  #define amx   (pp + 4*n + 2)			/* n+1 */
   118  
   119    /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
   120       gives roughly 32 n/3 + log term. */
   121  
   122    /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
   123    mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
   124  
   125    TOOM4_SQR_REC (v2, apx, n + 1, tp);	/* v2,  2n+1 limbs */
   126    TOOM4_SQR_REC (vm2, amx, n + 1, tp);	/* vm2,  2n+1 limbs */
   127  
   128    /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
   129  #if HAVE_NATIVE_mpn_addlsh1_n
   130    cy = mpn_addlsh1_n (apx, a1, a0, n);
   131    cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
   132    if (s < n)
   133      {
   134        mp_limb_t cy2;
   135        cy2 = mpn_addlsh1_n (apx, a3, apx, s);
   136        apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
   137        MPN_INCR_U (apx + s, n+1-s, cy2);
   138      }
   139    else
   140      apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
   141  #else
   142    cy = mpn_lshift (apx, a0, n, 1);
   143    cy += mpn_add_n (apx, apx, a1, n);
   144    cy = 2*cy + mpn_lshift (apx, apx, n, 1);
   145    cy += mpn_add_n (apx, apx, a2, n);
   146    cy = 2*cy + mpn_lshift (apx, apx, n, 1);
   147    apx[n] = cy + mpn_add (apx, apx, n, a3, s);
   148  #endif
   149  
   150    ASSERT (apx[n] < 15);
   151  
   152    TOOM4_SQR_REC (vh, apx, n + 1, tp);	/* vh,  2n+1 limbs */
   153  
   154    /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
   155    mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
   156  
   157    TOOM4_SQR_REC (v1, apx, n + 1, tp);	/* v1,  2n+1 limbs */
   158    TOOM4_SQR_REC (vm1, amx, n + 1, tp);	/* vm1,  2n+1 limbs */
   159  
   160    TOOM4_SQR_REC (v0, a0, n, tp);
   161    TOOM4_SQR_REC (vinf, a3, s, tp);	/* vinf, 2s limbs */
   162  
   163    mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) 0, vm2, vm1, v2, vh, 2*s, tp);
   164  }