github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/mode1o.c (about)

     1  /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
     2  
     3     THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
     4     CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
     5     FUTURE GNU MP RELEASES.
     6  
     7  Copyright 2000-2004 Free Software Foundation, Inc.
     8  
     9  This file is part of the GNU MP Library.
    10  
    11  The GNU MP Library is free software; you can redistribute it and/or modify
    12  it under the terms of either:
    13  
    14    * the GNU Lesser General Public License as published by the Free
    15      Software Foundation; either version 3 of the License, or (at your
    16      option) any later version.
    17  
    18  or
    19  
    20    * the GNU General Public License as published by the Free Software
    21      Foundation; either version 2 of the License, or (at your option) any
    22      later version.
    23  
    24  or both in parallel, as here.
    25  
    26  The GNU MP Library is distributed in the hope that it will be useful, but
    27  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    28  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    29  for more details.
    30  
    31  You should have received copies of the GNU General Public License and the
    32  GNU Lesser General Public License along with the GNU MP Library.  If not,
    33  see https://www.gnu.org/licenses/.  */
    34  
    35  #include "gmp.h"
    36  #include "gmp-impl.h"
    37  #include "longlong.h"
    38  
    39  
    40  /* Calculate an r satisfying
    41  
    42             r*B^k + a - c == q*d
    43  
    44     where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
    45     (the caller won't know which), and q is the quotient (discarded).  d must
    46     be odd, c can be any limb value.
    47  
    48     If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
    49  
    50     This slightly strange function suits the initial Nx1 reduction for GCDs
    51     or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
    52     -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
    53     ignored, or for the Jacobi symbol it can be accounted for.  The function
    54     also suits divisibility and congruence testing since if r=0 (or r=d) is
    55     obtained then a==c mod d.
    56  
    57  
    58     r is a bit like the remainder returned by mpn_divexact_by3c, and is the
    59     sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
    60     represents a borrow, since effectively quotient limbs are chosen so that
    61     subtracting that multiple of d from src at each step will produce a zero
    62     limb.
    63  
    64     A long calculation can be done piece by piece from low to high by passing
    65     the return value from one part as the carry parameter to the next part.
    66     The effective final k becomes anything between size and size-n, if n
    67     pieces are used.
    68  
    69  
    70     A similar sort of routine could be constructed based on adding multiples
    71     of d at each limb, much like redc in mpz_powm does.  Subtracting however
    72     has a small advantage that when subtracting to cancel out l there's never
    73     a borrow into h, whereas using an addition would put a carry into h
    74     depending whether l==0 or l!=0.
    75  
    76  
    77     In terms of efficiency, this function is similar to a mul-by-inverse
    78     mpn_mod_1.  Both are essentially two multiplies and are best suited to
    79     CPUs with low latency multipliers (in comparison to a divide instruction
    80     at least.)  But modexact has a few less supplementary operations, only
    81     needs low part and high part multiplies, and has fewer working quantities
    82     (helping CPUs with few registers).
    83  
    84  
    85     In the main loop it will be noted that the new carry (call it r) is the
    86     sum of the high product h and any borrow from l=s-c.  If c<d then we will
    87     have r<d too, for the following reasons.  Let q=l*inverse be the quotient
    88     limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then
    89  
    90         l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
    91  
    92     But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
    93     never have h=d-1 and so r=h+borrow <= d-1.
    94  
    95     When c>=d, on the other hand, h=d-1 can certainly occur together with a
    96     borrow, thereby giving only r<=d, as per the function definition above.
    97  
    98     As a design decision it's left to the caller to check for r=d if it might
    99     be passing c>=d.  Several applications have c<d initially so the extra
   100     test is often unnecessary, for example the GCDs or a plain divisibility
   101     d|a test will pass c=0.
   102  
   103  
   104     The special case for size==1 is so that it can be assumed c<=d in the
   105     high<=divisor test at the end.  c<=d is only guaranteed after at least
   106     one iteration of the main loop.  There's also a decent chance one % is
   107     faster than a binvert_limb, though that will depend on the processor.
   108  
   109     A CPU specific implementation might want to omit the size==1 code or the
   110     high<divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
   111     useful.  */
   112  
   113  
   114  mp_limb_t
   115  mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
   116                       mp_limb_t orig_c)
   117  {
   118    mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
   119    mp_limb_t  c = orig_c;
   120    mp_size_t  i;
   121  
   122    ASSERT (size >= 1);
   123    ASSERT (d & 1);
   124    ASSERT_MPN (src, size);
   125    ASSERT_LIMB (d);
   126    ASSERT_LIMB (c);
   127  
   128    if (size == 1)
   129      {
   130        s = src[0];
   131        if (s > c)
   132  	{
   133  	  l = s-c;
   134  	  h = l % d;
   135  	  if (h != 0)
   136  	    h = d - h;
   137  	}
   138        else
   139  	{
   140  	  l = c-s;
   141  	  h = l % d;
   142  	}
   143        return h;
   144      }
   145  
   146  
   147    binvert_limb (inverse, d);
   148    dmul = d << GMP_NAIL_BITS;
   149  
   150    i = 0;
   151    do
   152      {
   153        s = src[i];
   154        SUBC_LIMB (c, l, s, c);
   155        l = (l * inverse) & GMP_NUMB_MASK;
   156        umul_ppmm (h, dummy, l, dmul);
   157        c += h;
   158      }
   159    while (++i < size-1);
   160  
   161  
   162    s = src[i];
   163    if (s <= d)
   164      {
   165        /* With high<=d the final step can be a subtract and addback.  If c==0
   166  	 then the addback will restore to l>=0.  If c==d then will get l==d
   167  	 if s==0, but that's ok per the function definition.  */
   168  
   169        l = c - s;
   170        if (c < s)
   171  	l += d;
   172  
   173        ret = l;
   174      }
   175    else
   176      {
   177        /* Can't skip a divide, just do the loop code once more. */
   178  
   179        SUBC_LIMB (c, l, s, c);
   180        l = (l * inverse) & GMP_NUMB_MASK;
   181        umul_ppmm (h, dummy, l, dmul);
   182        c += h;
   183        ret = c;
   184      }
   185  
   186    ASSERT (orig_c < d ? ret < d : ret <= d);
   187    return ret;
   188  }
   189  
   190  
   191  
   192  #if 0
   193  
   194  /* The following is an alternate form that might shave one cycle on a
   195     superscalar processor since it takes c+=h off the dependent chain,
   196     leaving just a low product, high product, and a subtract.
   197  
   198     This is for CPU specific implementations to consider.  A special case for
   199     high<divisor and/or size==1 can be added if desired.
   200  
   201     Notice that c is only ever 0 or 1, since if s-c produces a borrow then
   202     x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
   203     c=(x==0xFF..FF) too, if that helped.  */
   204  
   205  mp_limb_t
   206  mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
   207  {
   208    mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
   209    mp_limb_t  c = 0;
   210    mp_size_t  i;
   211  
   212    ASSERT (size >= 1);
   213    ASSERT (d & 1);
   214  
   215    binvert_limb (inverse, d);
   216    dmul = d << GMP_NAIL_BITS;
   217  
   218    for (i = 0; i < size; i++)
   219      {
   220        ASSERT (c==0 || c==1);
   221  
   222        s = src[i];
   223        SUBC_LIMB (c1, x, s, c);
   224  
   225        SUBC_LIMB (c2, y, x, h);
   226        c = c1 + c2;
   227  
   228        y = (y * inverse) & GMP_NUMB_MASK;
   229        umul_ppmm (h, dummy, y, dmul);
   230      }
   231  
   232    h += c;
   233    return h;
   234  }
   235  
   236  #endif