github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/div_qr_2.c

/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagates the
     carry-out into an additional sum operand.
   * add_csaac accepts two addends and a carry in, and generates a sum
     and a carry out.  A little like a "full adder".
*/
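/* Written out as arithmetic (matching the portable C fallbacks further down),
   the intended semantics are:

     add_sssaaaa:  <s2,s1,s0> <- <s2,a1,a0> + <0,b1,b0>
     add_csaac:    <co,s>     <- a + b + ci

   where <...> denotes a multi-limb number, most significant limb first.  */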
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#define add_csaac(co, s, a, b, ci)					\
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0"		\
	   : "=r" (co), "=r" (s)					\
	   : "rm"  ((USItype)(ci)), "0" (CNST_LIMB(0)),			\
	     "%1" ((USItype)(a)), "g" ((USItype)(b)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#define add_csaac(co, s, a, b, ci)					\
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0"		\
	   : "=r" (co), "=r" (s)					\
	   : "rm"  ((UDItype)(ci)), "0" (CNST_LIMB(0)),			\
	     "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif

#ifndef add_csaac
#define add_csaac(co, s, a, b, ci)					\
  do {									\
    UWtype __s, __c;							\
    __s = (a) + (b);							\
    __c = __s < (a);							\
    __s = __s + (ci);							\
    (s) = __s;								\
    (co) = __c + (__s < (ci));						\
  } while (0)
#endif

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
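/* udiv_qr_4by2 divides <n3,n2,n1,n0> by <d1,d0>, producing the two-limb
   quotient <q1,q0> and remainder <r1,r0>.  As used below (see the assertions
   in mpn_div_qr_2n_pi2), d1 must have its most significant bit set and
   <di1,di0> must be the inverse computed by invert_4by2.  */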
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
  do {									\
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
    mp_limb_t _t1, _t0;							\
    mp_limb_t _c, _mask;						\
									\
    umul_ppmm (_q3,_q2a, n3, di1);					\
    umul_ppmm (_q2,_q1, n2, di1);					\
    umul_ppmm (_q2c,_q1c, n3, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c);			\
    umul_ppmm (_q1d,_q0, n2, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d);			\
									\
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1));		\
									\
    /* [q3,q2,q1,q0] += [r1,r0,n1,n0] = [n3,n2,n1,n0] + [0,1,0,0] */	\
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0));				\
    add_csaac (_c, _q1, _q1, n1, _c);					\
    add_csaac (_c, _q2, _q2, r0, _c);					\
    _q3 = _q3 + r1 + _c;						\
									\
    umul_ppmm (_t1,_t0, _q2, d0);					\
    _t1 += _q2 * d1 + _q3 * d0;						\
									\
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
									\
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */  \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
									\
    if (UNLIKELY (r1 >= d1))						\
      {									\
	if (r1 > d1 || r0 >= d0)					\
	  {								\
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
	  }								\
      }									\
    (q1) = _q3;								\
    (q0) = _q2;								\
  } while (0)

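/* invert_4by2 computes <di[1], di[0]>, the two low limbs of the 4/2 inverse
   <1, v1, v0> = floor((B^4 - 1) / <d1, d0>), for a divisor whose high limb d1
   has its most significant bit set; the comments inside the function and the
   SANITY_CHECK block verify this property.  */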
static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}

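/* Schoolbook division loop that consumes two dividend limbs per iteration
   via udiv_qr_4by2.  Requires a normalized divisor (d1 with its high bit
   set) and the inverse <di1, di0> produced by invert_4by2.  */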
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

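  /* Main loop: fold in two dividend limbs per iteration with udiv_qr_4by2;
     a final odd limb, if any, is handled by udiv_qr_3by2 below.  */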
  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}

/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the input operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

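  /* Dispatch: a divisor whose high limb d1 already has its most significant
     bit set is used directly, normally via the 3/2-inverse routine
     mpn_div_qr_2n_pi1, or via the two-limbs-per-iteration loop above once nn
     reaches DIV_QR_2_PI2_THRESHOLD (disabled by default, see the top of this
     file).  An unnormalized divisor is shifted left first and handled by
     mpn_div_qr_2u_pi1.  */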
  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  gmp_pi1_t dinv;
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
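
/* Usage sketch (not part of GMP, added for illustration; the function name
   example_div_qr_2 is hypothetical): dividing a 4-limb number by a 2-limb
   divisor with the function above.  mpn_div_qr_2 is an internal entry point
   with a mutable interface (see the notice at the top of this file), so
   applications should normally go through mpn_tdiv_qr instead.  */
#if 0
static void
example_div_qr_2 (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };	/* dividend, least significant limb first */
  mp_limb_t dp[2] = { 5, 6 };		/* divisor; dp[1] must be nonzero */
  mp_limb_t qp[2];			/* nn - 2 = 2 low quotient limbs */
  mp_limb_t rp[2];			/* 2-limb remainder */
  mp_limb_t qh;

  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);
  /* The quotient is <qh, qp[1], qp[0]> and the remainder is <rp[1], rp[0]>,
     least significant limbs in qp[0] and rp[0].  */
}
#endif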