github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/mod_1_1.c

/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008-2011, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif
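
/* A minimal usage sketch of how the two entry points below are meant to be
   paired by a caller, following the pattern used by mpn_mod_1: the divisor
   handed to mpn_mod_1_1p is pre-shifted by the count stored in cps[1].  The
   wrapper name example_mod_1_1 and the local `pre' are placeholders, not GMP
   interfaces.  */
#if 0
static mp_limb_t
example_mod_1_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  mp_limb_t pre[4];

  ASSERT (n >= 2);			/* mpn_mod_1_1p requires n >= 2 */
  mpn_mod_1_1p_cps (pre, b);		/* inverse, shift count, folding constants */
  return mpn_mod_1_1p (ap, n, b << pre[1], pre);
}
#endif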

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask. */

#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %k2\n\t"					\
	     "adc	%4, %k1\n\t"					\
	     "sbb	%k0, %k0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %q2\n\t"					\
	     "adc	%4, %q1\n\t"					\
	     "sbb	%q0, %q0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addxcc	%r3, %4, %1\n\t"				\
	     "subx	%%g0, %%g0, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
	 __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addccc	%r7, %8, %%g0\n\t"				\
	     "addccc	%r3, %4, %1\n\t"				\
	     "clr	%0\n\t"						\
	     "movcs	%%xcc, -1, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
	     "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	 __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addxccc	%r3, %4, %1\n\t"				\
	     "clr	%0\n\t"						\
	     "movcs	%%xcc, -1, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
	 __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add%I6c	%2, %5, %6\n\t"					\
	     "adde	%1, %3, %4\n\t"					\
	     "subfe	%0, %0, %0\n\t"					\
	     "nor	%0, %0, %0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "algr	%2, %6\n\t"					\
	     "alcgr	%1, %4\n\t"					\
	     "lghi	%0, 0\n\t"					\
	     "alcgr	%0, %0\n\t"					\
	     "lcgr	%0, %0"						\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "r" ((UDItype)(b1)),			\
	     "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "adds	%2, %5, %6\n\t"					\
	     "adcs	%1, %3, %4\n\t"					\
	     "movcc	%0, #0\n\t"					\
	     "movcs	%0, #-1"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (m) = - (__c1 + (__s1 < __c0));					\
  } while (0)
#endif
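
/* In every variant above, add_mssaaaa computes the two-limb sum
   (s1,s0) = (a1,a0) + (b1,b0) and sets m to 0 when there is no carry out,
   and to ~(UWtype) 0 when there is, so that the caller can handle the
   carry without a branch, e.g. with a masked subtraction such as
   r1 -= m & b;  */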

#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);    // NB: equality iff b = B/2
   */
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}
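
/* With this method, cps[0] holds the precomputed inverse of the normalized
   divisor, cps[1] the normalization count, and cps[2] and cps[3] values
   congruent to B and B^2 modulo the divisor (stored shifted right by
   cps[1]).  mpn_mod_1_1p below uses cps[2] and cps[3] to fold limbs and
   cps[0], cps[1] for the final division.  */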

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);
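  /* (rh,rl) now holds ap[n-1] * B1modb + ap[n-2], congruent modulo the
     divisor to ap[n-1] * B + ap[n-2], the two most significant limbs.  */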

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]				< B
	    + LO(rr)  * (B mod b)		<= (B-1)(b-1)
	    + HI(rr)  * (B^2 mod b)		<= (B-1)(b-1)
      */
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }
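
  /* (rh,rl) is now a two-limb value congruent to the whole n-limb input
     modulo the divisor, though not necessarily fully reduced.  */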

  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

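  /* mask is 0 or ~0, so the next two lines subtract b from rh exactly
     when rh >= b, without a branch.  */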
  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 1 */

#if MOD_1_1P_METHOD == 2
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);    // NB: equality iff b = B/2
  cps[3] = B2modb;
}
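
/* Compared with method 1, cps[3] is stored without the right shift, and
   cps[2] is filled in only when the divisor needs normalizing, since the
   mpn_mod_1_1p below multiplies by B1modb only in that case.  */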

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi, B1modb;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);
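      /* (r2, r1, r0), with r2 the carry-out mask, represents
	 ap[n-1] * B2modb + ap[n-2] * B + ap[n-3], which is congruent to the
	 three most significant limbs modulo the divisor.  */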

      for (j = n-4; j >= 0; j--)
	{
	  mp_limb_t cy;
	  /* mp_limb_t t = r0 + B2mb; */
	  umul_ppmm (p1, p0, r1, B2modb);

	  ADDC_LIMB (cy, r0, r0, r2 & B2modb);
	  /* Alternative, for cmov: if (cy) r0 = t; */
	  r0 -= (-cy) & b;
	  add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
	}

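      /* r2 is 0 or ~0, the carry-out mask from the last add_mssaaaa; the
	 masked subtraction below folds that carry into r1 without a
	 branch.  */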
      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);
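      /* Folding r1 * B into (r1,r0) via B1modb leaves the value congruent
	 modulo the divisor, with r1 small enough that the normalization
	 shift below cannot drop significant bits.  */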

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */