github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/diveby3.c (about)

     1  /* mpn_divexact_by3c -- mpn exact division by 3.
     2  
     3  Copyright 2000-2003, 2008 Free Software Foundation, Inc.
     4  
     5  This file is part of the GNU MP Library.
     6  
     7  The GNU MP Library is free software; you can redistribute it and/or modify
     8  it under the terms of either:
     9  
    10    * the GNU Lesser General Public License as published by the Free
    11      Software Foundation; either version 3 of the License, or (at your
    12      option) any later version.
    13  
    14  or
    15  
    16    * the GNU General Public License as published by the Free Software
    17      Foundation; either version 2 of the License, or (at your option) any
    18      later version.
    19  
    20  or both in parallel, as here.
    21  
    22  The GNU MP Library is distributed in the hope that it will be useful, but
    23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  for more details.
    26  
    27  You should have received copies of the GNU General Public License and the
    28  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  see https://www.gnu.org/licenses/.  */
    30  
    31  #include "gmp.h"
    32  #include "gmp-impl.h"
    33  
    34  #if DIVEXACT_BY3_METHOD == 0
    35  
    36  mp_limb_t
    37  mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
    38  {
    39    mp_limb_t r;
    40    r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
    41  
    42    /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
    43       We want to return C.  We compute the remainder mod 4 and notice that the
    44       inverse of (2^(2k)-1)/3 mod 4 is 1.  */
    45    return r & 3;
    46  }
    47  
    48  #endif
    49  
    50  #if DIVEXACT_BY3_METHOD == 1
    51  
    52  /* The algorithm here is basically the same as mpn_divexact_1, as described
    53     in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
    54     borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
    55     high(divisor*q) can be determined with two comparisons instead of a
    56     multiply.
    57  
    58     The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
    59     1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
    60     2.95.2 at least).
    61  
    62     It will be noted that the new c is formed by adding three values each 0
    63     or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
    64     causes a borrow, that leaves a limb value of either 0xFF...FF or
    65     0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
    66     0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
    67     respectively, hence a total of no more than 2.
    68  
    69     Alternatives:
    70  
    71     This implementation has each multiply on the dependent chain, due to
    72     "l=s-c".  See below for alternative code which avoids that.  */
    73  
    74  mp_limb_t
    75  mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
    76  {
    77    mp_limb_t  l, q, s;
    78    mp_size_t  i;
    79  
    80    ASSERT (un >= 1);
    81    ASSERT (c == 0 || c == 1 || c == 2);
    82    ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
    83  
    84    i = 0;
    85    do
    86      {
    87        s = up[i];
    88        SUBC_LIMB (c, l, s, c);
    89  
    90        q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
    91        rp[i] = q;
    92  
    93        c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
    94        c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
    95      }
    96    while (++i < un);
    97  
    98    ASSERT (c == 0 || c == 1 || c == 2);
    99    return c;
   100  }
   101  
   102  
   103  #endif
   104  
   105  #if DIVEXACT_BY3_METHOD == 2
   106  
   107  /* The following alternative code re-arranges the quotient calculation from
   108     (src[i]-c)*inverse to instead
   109  
   110         q = src[i]*inverse - c*inverse
   111  
   112     thereby allowing src[i]*inverse to be scheduled back as far as desired,
   113     making full use of multiplier throughput and leaving just some carry
   114     handing on the dependent chain.
   115  
   116     The carry handling consists of determining the c for the next iteration.
   117     This is the same as described above, namely look for any borrow from
   118     src[i]-c, and at the high of 3*q.
   119  
   120     high(3*q) is done with two comparisons as above (in c2 and c3).  The
   121     borrow from src[i]-c is incorporated into those by noting that if there's
   122     a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
   123     giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
   124     enough to make high(3*q) come out 1 bigger, as required.
   125  
   126     l = -c*inverse is calculated at the same time as c, since for most chips
   127     it can be more conveniently derived from separate c1/c2/c3 values than
   128     from a combined c equal to 0, 1 or 2.
   129  
   130     The net effect is that with good pipelining this loop should be able to
   131     run at perhaps 4 cycles/limb, depending on available execute resources
   132     etc.
   133  
   134     Usage:
   135  
   136     This code is not used by default, since we really can't rely on the
   137     compiler generating a good software pipeline, nor on such an approach
   138     even being worthwhile on all CPUs.
   139  
   140     Itanium is one chip where this algorithm helps though, see
   141     mpn/ia64/diveby3.asm.  */
   142  
   143  mp_limb_t
   144  mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
   145  {
   146    mp_limb_t  s, sm, cl, q, qx, c2, c3;
   147    mp_size_t  i;
   148  
   149    ASSERT (un >= 1);
   150    ASSERT (cy == 0 || cy == 1 || cy == 2);
   151    ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
   152  
   153    cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
   154  
   155    for (i = 0; i < un; i++)
   156      {
   157        s = up[i];
   158        sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
   159  
   160        q = (cl + sm) & GMP_NUMB_MASK;
   161        rp[i] = q;
   162        qx = q + (s < cy);
   163  
   164        c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
   165        c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
   166  
   167        cy = c2 + c3;
   168        cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
   169      }
   170  
   171    return cy;
   172  }
   173  
   174  #endif