github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/div_qr_1u_pi2.c

/* mpn_div_qr_1u_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
     carry-out into an additional sum operand.
*/
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
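
/* In all variants, add_sssaaaa computes, with B = 2^W_TYPE_SIZE,
     (s1,s0) <- (a1,a0) + (b1,b0)  (mod B^2)
     s2      <- s2 + carry-out of that two-limb addition,
   i.e. a two-limb add whose carry is folded into a third sum limb.  For
   example, a1:a0 = B-1:B-1 plus b1:b0 = 0:1 leaves s1:s0 = 0:0 and bumps s2
   by 1, as the generic C fallback above spells out.  */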

struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};
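
/* How the fields are used below: d is the divisor, with its high bit clear
   (asserted in the function); norm_cnt is presumably the count of leading
   zero bits of d, so that d0 = d << norm_cnt is normalized as required by
   udiv_qrnnd_preinv; dip[1] is used as the single-limb inverse for
   udiv_qrnnd_preinv (cf. the first ISSUES item), and dip[0] appears to supply
   an extra low limb of inverse precision for the two-limb quotient estimates
   in the main loop.  The caller fills in this struct beforehand.  */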

mp_limb_t
mpn_div_qr_1u_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;
  int cnt;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

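  /* The names q3/q2/q1 below alias the limbs of the partial products that
     end up accumulating the quotient estimate: q3a from u2 * di1, and
     q2b:q1b from u1 * di1 (the target of the add_sssaaaa chains).  */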
#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  cnt = pd->norm_cnt;
  r = up[2] >> (GMP_NUMB_BITS - cnt);
  d0 = pd->d << cnt;

  qp += un - 2;

  di1 = pd->dip[1];
  di0 = pd->dip[0];

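  /* Main loop: each iteration consumes two dividend limbs, starting from the
     most significant end.  {r,u1,u0} is the current three-limb dividend,
     shifted left by cnt so that the divisor d0 is normalized; each pass
     produces two quotient limbs (q3:q2, stored at qp[1] and qp[0]) and a new
     partial remainder r < d0.  */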
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));

      /* Dividend in {r,u1,u0} */

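      /* Quotient estimation: the umul_ppmm/add_sssaaaa sequence below
	 accumulates the partial products of {u2,u1} times the two-limb
	 inverse {di1,di0}, plus {u1,u0} itself, into q2:q1, collecting
	 carries in r (which starts out holding u2).  After "q3 += r" the
	 estimate lives in q3:q2, and the remainder is recovered from
	 u0 - q2 * d0 with at most two conditional corrections.  */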
      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

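  /* Either one or two dividend limbs remain, depending on the parity of un.
     For odd un a single shifted limb is left and is divided directly with
     udiv_qrnnd_preinv; for even un the last two limbs go through one more
     copy of the loop body, with no lower limb left to shift bits in from.  */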
  if ((un & 1) != 0)
    {
      u2 = r;
      u1 = (up[2] << cnt);

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }
  else
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt);

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;
    }

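  /* r is the remainder of the shifted (normalized) division; shifting it back
     down by cnt gives the remainder with respect to the original divisor
     pd->d.  */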
  return r >> cnt;

#undef q3
#undef q2
#undef q1
}