github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/div_qr_1n_pi2.c

/* mpn_div_qr_1n_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/
    44  
    45  #include "gmp.h"
    46  #include "gmp-impl.h"
    47  #include "longlong.h"
    48  
    49  /* Define some longlong.h-style macros, but for wider operations.
    50     * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
    51       carry-out into an additional sum operand.
    52  */
    53  #if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
    54  
    55  #if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
    56  #define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
    57    __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
    58  	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
    59  	   : "0"  ((USItype)(s2)),					\
    60  	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
    61  	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
    62  #endif
    63  
    64  #if defined (__amd64__) && W_TYPE_SIZE == 64
    65  #define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
    66    __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
    67  	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
    68  	   : "0"  ((UDItype)(s2)),					\
    69  	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
    70  	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
    71  #endif
    72  
    73  #if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
    74  /* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
    75     processor running in 32-bit mode, since the carry flag then gets the 32-bit
    76     carry.  */
    77  #define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
    78    __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
    79  	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
    80  	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
    81  #endif
    82  
    83  #endif /* __GNUC__ */
    84  
    85  #ifndef add_sssaaaa
    86  #define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
    87    do {									\
    88      UWtype __s0, __s1, __c0, __c1;					\
    89      __s0 = (a0) + (b0);							\
    90      __s1 = (a1) + (b1);							\
    91      __c0 = __s0 < (a0);							\
    92      __c1 = __s1 < (a1);							\
    93      (s0) = __s0;							\
    94      __s1 = __s1 + __c0;							\
    95      (s1) = __s1;							\
    96      (s2) += __c1 + (__s1 < __c0);					\
    97    } while (0)
    98  #endif
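
#if 0
/* Illustrative sketch, not part of GMP: what add_sssaaaa computes.  In
   terms of the limb base B, it performs the two-limb addition
   {s1,s0} = {a1,a0} + {b1,b0} and accumulates the carry-out into s2,
   i.e. {s2,s1,s0} = s2*B^2 + {a1,a0} + {b1,b0}.  */
static void
add_sssaaaa_demo (void)
{
  mp_limb_t s2, s1, s0;

  s2 = 7;
  /* a = B^2 - 1 (all ones), b = 1: the low limbs wrap to zero and the
     carry-out bumps s2.  */
  add_sssaaaa (s2, s1, s0,
	       ~CNST_LIMB (0), ~CNST_LIMB (0),
	       CNST_LIMB (0), CNST_LIMB (1));
  ASSERT_ALWAYS (s0 == 0 && s1 == 0 && s2 == 8);
}
#endif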

struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};
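
#if 0
/* Illustrative sketch, not part of GMP (and untested): one plausible
   way to fill in the structure for an already-normalized divisor d,
   under the assumption that the two-limb inverse is

     {dip[1],dip[0]} = floor ((B^3 - 1) / d) - B^2,

   by analogy with invert_limb's floor ((B^2 - 1) / d) - B.  Under that
   assumption dip[1] coincides with the ordinary single-limb inverse,
   which would answer the ISSUES question about reusing it for
   udiv_qrnnd_preinv.  */
static void
precomp_div_1_pi2_sketch (struct precomp_div_1_pi2 *pd, mp_limb_t d)
{
  mp_limb_t v1, p1, p0, h, r1, q0, r0;

  ASSERT ((d & GMP_NUMB_HIGHBIT) != 0);
  pd->d = d;
  pd->norm_cnt = 0;		/* d is assumed pre-normalized here */

  invert_limb (v1, d);		/* v1 = floor ((B^2 - 1) / d) - B */

  /* Remainder of B^2 - 1 = (B + v1) * d + r1: start from
     {~d, ~0} = B^2 - 1 - d*B and subtract v1*d.  */
  umul_ppmm (p1, p0, v1, d);
  sub_ddmmss (h, r1, ~d, ~CNST_LIMB (0), p1, p0);
  ASSERT (h == 0 && r1 < d);

  /* Last division step: floor ((B^3 - 1) / d) = (B + v1) * B + q0.  */
  udiv_qrnnd_preinv (q0, r0, r1, ~CNST_LIMB (0), d, v1);

  pd->dip[1] = v1;
  pd->dip[0] = q0;
}
#endif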

mp_limb_t
mpn_div_qr_1n_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_limb_t most_significant_q_limb;
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

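/* Aliases for the temporaries that end up holding the quotient
   estimate: {q3,q2} are the two quotient limbs produced per
   iteration, q1 the limb just below them.  */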
#define q3 q3a
#define q2 q2b
#define q1 q1b

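  /* Offset up and qp so the loop can use fixed indices, and peel off
     the most significant quotient limb up front so that the running
     remainder r starts out reduced (r < d0).  */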
  up += un - 3;
  r = up[2];
  d0 = pd->d;

  most_significant_q_limb = (r >= d0);
  r -= d0 & -most_significant_q_limb;

  qp += un - 3;
  qp[2] = most_significant_q_limb;

  di1 = pd->dip[1];
  di0 = pd->dip[0];

  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = up[1];
      u0 = up[0];

      /* Dividend in {r,u1,u0} */
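      /* A sketch of the scheme, hedged, since the invariants are not
         spelled out upstream: the multiply/accumulate steps below sum,
         with limbs aligned,

           {u2,u1,u0,0} + {u2,u1} * {di1,di0} + B^2

         where the B^2 term is the q2b++ below, and r (saved into u2
         above) doubles as the carry accumulator for the top limb
         position.  Assuming B^2 + {di1,di0} approximates B^3/d, the
         top two limbs {q3,q2} of this sum closely estimate the next
         two quotient limbs, and the adjustment steps further down
         correct the estimate by at most one in each direction.  */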

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill: q2b = high (u1 * di1) <= B-2 */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

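      /* Candidate remainder, computed mod B.  If r >= q1 (the low,
         fraction-like limb of the quotient estimate), {q3,q2} is
         treated as one too large: d0 is added back and the quotient
         decremented.  The opposite off-by-one is caught by the
         r >= d0 test below.  */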
      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

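  /* For even un, the two-limb loop leaves the lowest dividend limb
     unprocessed; fold it in with a single 2/1 preinverted division.
     The high inverse limb di1 serves here as an ordinary
     udiv_qrnnd_preinv inverse (see the ISSUES note above).  */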
  if ((un & 1) == 0)
    {
      u2 = r;
      u1 = up[1];

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }

  return r;

#undef q3
#undef q2
#undef q1
}
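
#if 0
/* Illustrative usage sketch, not part of GMP (and untested): dividing a
   two-limb number by a normalized divisor, with the hypothetical
   precomp_div_1_pi2_sketch above standing in for whatever real code
   fills in the precomputed inverse.  */
static void
div_qr_1n_pi2_demo (void)
{
  struct precomp_div_1_pi2 pd;
  mp_limb_t up[2], qp[2], r;

  precomp_div_1_pi2_sketch (&pd, GMP_NUMB_HIGHBIT | 123);

  up[0] = 456;
  up[1] = 789;
  r = mpn_div_qr_1n_pi2 (qp, up, 2, &pd);

  /* Now {qp[1],qp[0]} * pd.d + r recovers {up[1],up[0]}, with r < pd.d.  */
}
#endif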