github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpz/bin_uiui.c

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpz/bin_uiui.c (about)

     1  /* mpz_bin_uiui - compute n over k.
     2  
     3  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     4  
     5  Copyright 2010-2012 Free Software Foundation, Inc.
     6  
     7  This file is part of the GNU MP Library.
     8  
     9  The GNU MP Library is free software; you can redistribute it and/or modify
    10  it under the terms of either:
    11  
    12    * the GNU Lesser General Public License as published by the Free
    13      Software Foundation; either version 3 of the License, or (at your
    14      option) any later version.
    15  
    16  or
    17  
    18    * the GNU General Public License as published by the Free Software
    19      Foundation; either version 2 of the License, or (at your option) any
    20      later version.
    21  
    22  or both in parallel, as here.
    23  
    24  The GNU MP Library is distributed in the hope that it will be useful, but
    25  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  for more details.
    28  
    29  You should have received copies of the GNU General Public License and the
    30  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  see https://www.gnu.org/licenses/.  */
    32  
    33  #include "gmp.h"
    34  #include "gmp-impl.h"
    35  #include "longlong.h"
    36  
    37  #ifndef BIN_GOETGHELUCK_THRESHOLD
    38  #define BIN_GOETGHELUCK_THRESHOLD  1000
    39  #endif
    40  #ifndef BIN_UIUI_ENABLE_SMALLDC
    41  #define BIN_UIUI_ENABLE_SMALLDC    1
    42  #endif
    43  #ifndef BIN_UIUI_RECURSIVE_SMALLDC
    44  #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
    45  #endif
    46  
    47  /* Algorithm:
    48  
    49     Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
    50     which are then accumulated into mpn numbers.  The first inner loop
    51     accumulates divisor factors, the 2nd inner loop accumulates exactly the same
    52     number of dividend factors.  We avoid accumulating more for the divisor,
    53     even with its smaller factors, since we else cannot guarantee divisibility.
    54  
    55     Since we know each division will yield an integer, we compute the quotient
    56     using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
    57     2^t.
    58  
    59     Improvements:
    60  
    61     (1) An obvious improvement to this code would be to compute mod 2^t
    62     everywhere.  Unfortunately, we cannot determine t beforehand, unless we
    63     invoke some approximation, such as Stirling's formula.  Of course, we don't
    64     need t to be tight.  However, it is not clear that this would help much,
    65     our numbers are kept reasonably small already.
    66  
    67     (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
    68     Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
    69     would make it both reasonably accurate and fast.  (We could use a table
    70     stored into a limb, perhaps.)  The table should take the removed factors of
    71     2 into account (those done on-the-fly in mulN).
    72  
    73     (3) The first time in the loop we compute the odd part of a
    74     factorial in kp, we might use oddfac_1 for this task.
    75   */
    76  
    77  /* This threshold determines how large divisor to accumulate before we call
    78     bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
    79     since we are just basecase code anyway?  Presumably, this depends on the
    80     relative speed of the asymptotically fast code and this code.  */
    81  #define SOME_THRESHOLD 20
    82  
    83  /* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
    84     All versions of MAXFACS don't take this 2 removal into account now, meaning
    85     that then, shifting just adds some overhead.  (We remove factors from the
    86     completed limb anyway.)  */
    87  
    88  static mp_limb_t
    89  mul1 (mp_limb_t m)
    90  {
    91    return m;
    92  }
    93  
    94  static mp_limb_t
    95  mul2 (mp_limb_t m)
    96  {
    97    /* We need to shift before multiplying, to avoid an overflow. */
    98    mp_limb_t m01 = (m | 1) * ((m + 1) >> 1);
    99    return m01;
   100  }
   101  
   102  static mp_limb_t
   103  mul3 (mp_limb_t m)
   104  {
   105    mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
   106    mp_limb_t m2 = (m + 2);
   107    return m01 * m2;
   108  }
   109  
   110  static mp_limb_t
   111  mul4 (mp_limb_t m)
   112  {
   113    mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
   114    mp_limb_t m23 = (m + 2) * (m + 3) >> 1;
   115    return m01 * m23;
   116  }
   117  
   118  static mp_limb_t
   119  mul5 (mp_limb_t m)
   120  {
   121    mp_limb_t m012 = (m + 0) * (m + 1) * (m + 2) >> 1;
   122    mp_limb_t m34 = (m + 3) * (m + 4) >> 1;
   123    return m012 * m34;
   124  }
   125  
   126  static mp_limb_t
   127  mul6 (mp_limb_t m)
   128  {
   129    mp_limb_t m01 = (m + 0) * (m + 1);
   130    mp_limb_t m23 = (m + 2) * (m + 3);
   131    mp_limb_t m45 = (m + 4) * (m + 5) >> 1;
   132    mp_limb_t m0123 = m01 * m23 >> 3;
   133    return m0123 * m45;
   134  }
   135  
   136  static mp_limb_t
   137  mul7 (mp_limb_t m)
   138  {
   139    mp_limb_t m01 = (m + 0) * (m + 1);
   140    mp_limb_t m23 = (m + 2) * (m + 3);
   141    mp_limb_t m456 = (m + 4) * (m + 5) * (m + 6) >> 1;
   142    mp_limb_t m0123 = m01 * m23 >> 3;
   143    return m0123 * m456;
   144  }
   145  
   146  static mp_limb_t
   147  mul8 (mp_limb_t m)
   148  {
   149    mp_limb_t m01 = (m + 0) * (m + 1);
   150    mp_limb_t m23 = (m + 2) * (m + 3);
   151    mp_limb_t m45 = (m + 4) * (m + 5);
   152    mp_limb_t m67 = (m + 6) * (m + 7);
   153    mp_limb_t m0123 = m01 * m23 >> 3;
   154    mp_limb_t m4567 = m45 * m67 >> 3;
   155    return m0123 * m4567;
   156  }
   157  
   158  typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
   159  
   160  static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8};
   161  #define M (numberof(mulfunc))
   162  
   163  /* Number of factors-of-2 removed by the corresponding mulN function.  */
   164  static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6};
   165  
   166  #if 1
   167  /* This variant is inaccurate but share the code with other functions.  */
   168  #define MAXFACS(max,l)							\
   169    do {									\
   170      (max) = log_n_max (l);						\
   171    } while (0)
   172  #else
   173  
   174  /* This variant is exact(?) but uses a loop.  It takes the 2 removal
   175   of mulN into account.  */
   176  static const unsigned long ftab[] =
   177  #if GMP_NUMB_BITS == 64
   178    /* 1 to 8 factors per iteration */
   179    {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x100000000),0x32cbfe,0x16a0b,0x24c4,0xa16,0x34b,0x1b2 /*,0xdf,0x8d */};
   180  #endif
   181  #if GMP_NUMB_BITS == 32
   182    /* 1 to 7 factors per iteration */
   183    {0xffffffff,0x10000,0x801,0x16b,0x71,0x42,0x26 /* ,0x1e */};
   184  #endif
   185  
   186  #define MAXFACS(max,l)							\
   187    do {									\
   188      int __i;								\
   189      for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)		\
   190        ;									\
   191      (max) = __i + 1;							\
   192    } while (0)
   193  #endif
   194  
   195  /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
   196     is an odd integer. */
   197  static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE };
   198  
   199  static void
   200  mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
   201  {
   202    int nmax, kmax, nmaxnow, numfac;
   203    mp_ptr np, kp;
   204    mp_size_t nn, kn, alloc;
   205    mp_limb_t i, j, t, iii, jjj, cy, dinv;
   206    mp_bitcnt_t i2cnt, j2cnt;
   207    int cnt;
   208    mp_size_t maxn;
   209    TMP_DECL;
   210  
   211    ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);
   212    TMP_MARK;
   213  
   214    maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */
   215  
   216    /* FIXME: This allocation might be insufficient, but is usually way too
   217       large.  */
   218    alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
   219    alloc = MIN (alloc, k) + 1;
   220    np = TMP_ALLOC_LIMBS (alloc);
   221    kp = TMP_ALLOC_LIMBS (SOME_THRESHOLD + 1);
   222  
   223    MAXFACS (nmax, n);
   224    ASSERT (nmax <= M);
   225    MAXFACS (kmax, k);
   226    ASSERT (kmax <= M);
   227    ASSERT (k >= M);
   228  
   229    i = n - k + 1;
   230  
   231    np[0] = 1; nn = 1;
   232  
   233    i2cnt = 0;				/* total low zeros in dividend */
   234    j2cnt = __gmp_fac2cnt_table[ODD_FACTORIAL_TABLE_LIMIT / 2 - 1];
   235  					/* total low zeros in divisor */
   236  
   237    numfac = 1;
   238    j = ODD_FACTORIAL_TABLE_LIMIT + 1;
   239    jjj = ODD_FACTORIAL_TABLE_MAX;
   240    ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);
   241  
   242    while (1)
   243      {
   244        kp[0] = jjj;				/* store new factors */
   245        kn = 1;
   246        t = k - j + 1;
   247        kmax = MIN (kmax, t);
   248  
   249        while (kmax != 0 && kn < SOME_THRESHOLD)
   250  	{
   251  	  jjj = mulfunc[kmax - 1] (j);
   252  	  j += kmax;				/* number of factors used */
   253  	  count_trailing_zeros (cnt, jjj);	/* count low zeros */
   254  	  jjj >>= cnt;				/* remove remaining low zeros */
   255  	  j2cnt += tcnttab[kmax - 1] + cnt;	/* update low zeros count */
   256  	  cy = mpn_mul_1 (kp, kp, kn, jjj);	/* accumulate new factors */
   257  	  kp[kn] = cy;
   258  	  kn += cy != 0;
   259  	  t = k - j + 1;
   260  	  kmax = MIN (kmax, t);
   261  	}
   262        numfac = j - numfac;
   263  
   264        while (numfac != 0)
   265  	{
   266  	  nmaxnow = MIN (nmax, numfac);
   267  	  iii = mulfunc[nmaxnow - 1] (i);
   268  	  i += nmaxnow;				/* number of factors used */
   269  	  count_trailing_zeros (cnt, iii);	/* count low zeros */
   270  	  iii >>= cnt;				/* remove remaining low zeros */
   271  	  i2cnt += tcnttab[nmaxnow - 1] + cnt;	/* update low zeros count */
   272  	  cy = mpn_mul_1 (np, np, nn, iii);	/* accumulate new factors */
   273  	  np[nn] = cy;
   274  	  nn += cy != 0;
   275  	  numfac -= nmaxnow;
   276  	}
   277  
   278        ASSERT (nn < alloc);
   279  
   280        binvert_limb (dinv, kp[0]);
   281        nn += (np[nn - 1] >= kp[kn - 1]);
   282        nn -= kn;
   283        mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
   284  
   285        if (kmax == 0)
   286  	break;
   287        numfac = j;
   288  
   289        jjj = mulfunc[kmax - 1] (j);
   290        j += kmax;				/* number of factors used */
   291        count_trailing_zeros (cnt, jjj);		/* count low zeros */
   292        jjj >>= cnt;				/* remove remaining low zeros */
   293        j2cnt += tcnttab[kmax - 1] + cnt;		/* update low zeros count */
   294      }
   295  
   296    /* Put back the right number of factors of 2.  */
   297    cnt = i2cnt - j2cnt;
   298    if (cnt != 0)
   299      {
   300        ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
   301        cy = mpn_lshift (np, np, nn, cnt);
   302        np[nn] = cy;
   303        nn += cy != 0;
   304      }
   305  
   306    nn -= np[nn - 1] == 0;	/* normalisation */
   307  
   308    kp = MPZ_NEWALLOC (r, nn);
   309    SIZ(r) = nn;
   310    MPN_COPY (kp, np, nn);
   311    TMP_FREE;
   312  }
   313  
   314  static void
   315  mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
   316  {
   317    int nmax, numfac;
   318    mp_ptr rp;
   319    mp_size_t rn, alloc;
   320    mp_limb_t i, iii, cy;
   321    mp_bitcnt_t i2cnt, cnt;
   322  
   323    count_leading_zeros (cnt, (mp_limb_t) n);
   324    cnt = GMP_LIMB_BITS - cnt;
   325    alloc = cnt * k / GMP_NUMB_BITS + 3;	/* FIXME: ensure rounding is enough. */
   326    rp = MPZ_NEWALLOC (r, alloc);
   327  
   328    MAXFACS (nmax, n);
   329    nmax = MIN (nmax, M);
   330  
   331    i = n - k + 1;
   332  
   333    nmax = MIN (nmax, k);
   334    rp[0] = mulfunc[nmax - 1] (i);
   335    rn = 1;
   336    i += nmax;				/* number of factors used */
   337    i2cnt = tcnttab[nmax - 1];		/* low zeros count */
   338    numfac = k - nmax;
   339    while (numfac != 0)
   340      {
   341        nmax = MIN (nmax, numfac);
   342        iii = mulfunc[nmax - 1] (i);
   343        i += nmax;			/* number of factors used */
   344        i2cnt += tcnttab[nmax - 1];	/* update low zeros count */
   345        cy = mpn_mul_1 (rp, rp, rn, iii);	/* accumulate new factors */
   346        rp[rn] = cy;
   347        rn += cy != 0;
   348        numfac -= nmax;
   349      }
   350  
   351    ASSERT (rn < alloc);
   352  
   353    mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2],
   354  		    __gmp_fac2cnt_table[k / 2 - 1] - i2cnt);
   355    /* A two-fold, branch-free normalisation is possible :*/
   356    /* rn -= rp[rn - 1] == 0; */
   357    /* rn -= rp[rn - 1] == 0; */
   358    MPN_NORMALIZE_NOT_ZERO (rp, rn);
   359  
   360    SIZ(r) = rn;
   361  }
   362  
   363  /* Algorithm:
   364  
   365     Plain and simply multiply things together.
   366  
   367     We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
   368     that k!/2^t is odd).
   369  
   370  */
   371  
   372  static mp_limb_t
   373  bc_bin_uiui (unsigned int n, unsigned int k)
   374  {
   375    return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])
   376      << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))
   377      & GMP_NUMB_MASK;
   378  }
   379  
   380  /* Algorithm:
   381  
   382     Recursively exploit the relation
   383     bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .
   384  
   385     Values for binomial(k,k>>1) that fit in a limb are precomputed
   386     (with inverses).
   387  */
   388  
   389  /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
   390     binomial(i*2,i)/2^t (where t is chosen so that it is odd). */
   391  static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE };
   392  
   393  /* bin2kkinv[i] = bin2kk[i]^-1 mod B */
   394  static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE };
   395  
   396  /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */
   397  static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE };
   398  
   399  static void
   400  mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
   401  {
   402    mp_ptr rp;
   403    mp_size_t rn;
   404    unsigned long int hk;
   405  
   406    hk = k >> 1;
   407  
   408    if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT)
   409      mpz_smallk_bin_uiui (r, n, hk);
   410    else
   411      mpz_smallkdc_bin_uiui (r, n, hk);
   412    k -= hk;
   413    n -= hk;
   414    if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) {
   415      mp_limb_t cy;
   416      rn = SIZ (r);
   417      rp = MPZ_REALLOC (r, rn + 1);
   418      cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));
   419      rp [rn] = cy;
   420      rn += cy != 0;
   421    } else {
   422      mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];
   423      mpz_t t;
   424  
   425      ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;
   426      PTR (t) = buffer;
   427      if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT)
   428        mpz_smallk_bin_uiui (t, n, k);
   429      else
   430        mpz_smallkdc_bin_uiui (t, n, k);
   431      mpz_mul (r, r, t);
   432      rp = PTR (r);
   433      rn = SIZ (r);
   434    }
   435  
   436    mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],
   437  		    bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],
   438  		    fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));
   439    /* A two-fold, branch-free normalisation is possible :*/
   440    /* rn -= rp[rn - 1] == 0; */
   441    /* rn -= rp[rn - 1] == 0; */
   442    MPN_NORMALIZE_NOT_ZERO (rp, rn);
   443  
   444    SIZ(r) = rn;
   445  }
   446  
   447  /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
   448   *
   449   * Contributed to the GNU project by Marco Bodrato.
   450   *
   451   * Implementation of the algorithm by P. Goetgheluck, "Computing
   452   * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
   453   * No. 4 (April 1987), pp. 360-365.
   454   *
   455   * Acknowledgment: Peter Luschny did spot the slowness of the previous
   456   * code and suggested the reference.
   457   */
   458  
   459  /* TODO: Remove duplicated constants / macros / static functions...
   460   */
   461  
   462  /*************************************************************/
   463  /* Section macros: common macros, for swing/fac/bin (&sieve) */
   464  /*************************************************************/
   465  
   466  #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I)			\
   467    if ((PR) > (MAX_PR)) {					\
   468      (VEC)[(I)++] = (PR);					\
   469      (PR) = 1;							\
   470    }
   471  
   472  #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I)		\
   473    do {								\
   474      if ((PR) > (MAX_PR)) {					\
   475        (VEC)[(I)++] = (PR);					\
   476        (PR) = (P);						\
   477      } else							\
   478        (PR) *= (P);						\
   479    } while (0)
   480  
   481  #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)			\
   482      __max_i = (end);						\
   483  								\
   484      do {							\
   485        ++__i;							\
   486        if (((sieve)[__index] & __mask) == 0)			\
   487  	{							\
   488  	  (prime) = id_to_n(__i)
   489  
   490  #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve)		\
   491    do {								\
   492      mp_limb_t __mask, __index, __max_i, __i;			\
   493  								\
   494      __i = (start)-(off);					\
   495      __index = __i / GMP_LIMB_BITS;				\
   496      __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS);		\
   497      __i += (off);						\
   498  								\
   499      LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)
   500  
   501  #define LOOP_ON_SIEVE_STOP					\
   502  	}							\
   503        __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1);	\
   504        __index += __mask & 1;					\
   505      }  while (__i <= __max_i)					\
   506  
   507  #define LOOP_ON_SIEVE_END					\
   508      LOOP_ON_SIEVE_STOP;						\
   509    } while (0)
   510  
   511  /*********************************************************/
   512  /* Section sieve: sieving functions and tools for primes */
   513  /*********************************************************/
   514  
   515  #if WANT_ASSERT
   516  static mp_limb_t
   517  bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; }
   518  #endif
   519  
   520  /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/
   521  static mp_limb_t
   522  id_to_n  (mp_limb_t id)  { return id*3+1+(id&1); }
   523  
   524  /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
   525  static mp_limb_t
   526  n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; }
   527  
   528  static mp_size_t
   529  primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; }
   530  
   531  /*********************************************************/
   532  /* Section binomial: fast binomial implementation        */
   533  /*********************************************************/
   534  
   535  #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
   536    do {							\
   537      mp_limb_t __a, __b, __prime, __ma,__mb;		\
   538      __prime = (P);					\
   539      __a = (N); __b = (K); __mb = 0;			\
   540      FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I);		\
   541      do {						\
   542        __mb += __b % __prime; __b /= __prime;		\
   543        __ma = __a % __prime; __a /= __prime;		\
   544        if (__ma < __mb) {				\
   545          __mb = 1; (PR) *= __prime;			\
   546        } else  __mb = 0;					\
   547      } while (__a >= __prime);				\
   548    } while (0)
   549  
   550  #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
   551    do {							\
   552      mp_limb_t __prime;					\
   553      __prime = (P);					\
   554      if (((N) % __prime) < ((K) % __prime)) {		\
   555        FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I);	\
   556      }							\
   557    } while (0)
   558  
   559  /* Returns an approximation of the sqare root of x.  *
   560   * It gives: x <= limb_apprsqrt (x) ^ 2 < x * 9/4    */
   561  static mp_limb_t
   562  limb_apprsqrt (mp_limb_t x)
   563  {
   564    int s;
   565  
   566    ASSERT (x > 2);
   567    count_leading_zeros (s, x - 1);
   568    s = GMP_LIMB_BITS - 1 - s;
   569    return (CNST_LIMB(1) << (s >> 1)) + (CNST_LIMB(1) << ((s - 1) >> 1));
   570  }
   571  
   572  static void
   573  mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
   574  {
   575    mp_limb_t *sieve, *factors, count;
   576    mp_limb_t prod, max_prod, j;
   577    TMP_DECL;
   578  
   579    ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);
   580    ASSERT (n >= 25);
   581  
   582    TMP_MARK;
   583    sieve = TMP_ALLOC_LIMBS (primesieve_size (n));
   584  
   585    count = gmp_primesieve (sieve, n) + 1;
   586    factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);
   587  
   588    max_prod = GMP_NUMB_MAX / n;
   589  
   590    /* Handle primes = 2, 3 separately. */
   591    popc_limb (count, n - k);
   592    popc_limb (j, k);
   593    count += j;
   594    popc_limb (j, n);
   595    count -= j;
   596    prod = CNST_LIMB(1) << count;
   597  
   598    j = 0;
   599    COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);
   600  
   601    /* Accumulate prime factors from 5 to n/2 */
   602      {
   603        mp_limb_t s;
   604  
   605        {
   606  	mp_limb_t prime;
   607  	s = limb_apprsqrt(n);
   608  	s = n_to_bit (s);
   609  	LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);
   610  	COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
   611  	LOOP_ON_SIEVE_END;
   612  	s++;
   613        }
   614  
   615        ASSERT (max_prod <= GMP_NUMB_MAX / 2);
   616        max_prod <<= 1;
   617        ASSERT (bit_to_n (s) * bit_to_n (s) > n);
   618        ASSERT (s <= n_to_bit (n >> 1));
   619        {
   620  	mp_limb_t prime;
   621  
   622  	LOOP_ON_SIEVE_BEGIN (prime, s, n_to_bit (n >> 1), 0,sieve);
   623  	SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
   624  	LOOP_ON_SIEVE_END;
   625        }
   626        max_prod >>= 1;
   627      }
   628  
   629    /* Store primes from (n-k)+1 to n */
   630    ASSERT (n_to_bit (n - k) < n_to_bit (n));
   631      {
   632        mp_limb_t prime;
   633        LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);
   634        FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);
   635        LOOP_ON_SIEVE_END;
   636      }
   637  
   638    if (LIKELY (j != 0))
   639      {
   640        factors[j++] = prod;
   641        mpz_prodlimbs (r, factors, j);
   642      }
   643    else
   644      {
   645        PTR (r)[0] = prod;
   646        SIZ (r) = 1;
   647      }
   648    TMP_FREE;
   649  }
   650  
   651  #undef COUNT_A_PRIME
   652  #undef SH_COUNT_A_PRIME
   653  #undef LOOP_ON_SIEVE_END
   654  #undef LOOP_ON_SIEVE_STOP
   655  #undef LOOP_ON_SIEVE_BEGIN
   656  #undef LOOP_ON_SIEVE_CONTINUE
   657  
   658  /*********************************************************/
   659  /* End of implementation of Goetgheluck's algorithm      */
   660  /*********************************************************/
   661  
   662  void
   663  mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
   664  {
   665    if (UNLIKELY (n < k)) {
   666      SIZ (r) = 0;
   667  #if BITS_PER_ULONG > GMP_NUMB_BITS
   668    } else if (UNLIKELY (n > GMP_NUMB_MAX)) {
   669      mpz_t tmp;
   670  
   671      mpz_init_set_ui (tmp, n);
   672      mpz_bin_ui (r, tmp, k);
   673      mpz_clear (tmp);
   674  #endif
   675    } else {
   676      ASSERT (n <= GMP_NUMB_MAX);
   677      /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
   678      k = MIN (k, n - k);
   679      if (k < 2) {
   680        PTR(r)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */
   681        SIZ(r) = 1;
   682      } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */
   683        PTR(r)[0] = bc_bin_uiui (n, k);
   684        SIZ(r) = 1;
   685      } else if (k <= ODD_FACTORIAL_TABLE_LIMIT)
   686        mpz_smallk_bin_uiui (r, n, k);
   687      else if (BIN_UIUI_ENABLE_SMALLDC &&
   688  	     k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)
   689        mpz_smallkdc_bin_uiui (r, n, k);
   690      else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&
   691  	     k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
   692        mpz_goetgheluck_bin_uiui (r, n, k);
   693      else
   694        mpz_bdiv_bin_uiui (r, n, k);
   695    }
   696  }