github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/tune/speed.h (about)

     1  /* Header for speed and threshold things.
     2  
     3  Copyright 1999-2003, 2005, 2006, 2008-2015 Free Software Foundation, Inc.
     4  
     5  This file is part of the GNU MP Library.
     6  
     7  The GNU MP Library is free software; you can redistribute it and/or modify
     8  it under the terms of either:
     9  
    10    * the GNU Lesser General Public License as published by the Free
    11      Software Foundation; either version 3 of the License, or (at your
    12      option) any later version.
    13  
    14  or
    15  
    16    * the GNU General Public License as published by the Free Software
    17      Foundation; either version 2 of the License, or (at your option) any
    18      later version.
    19  
    20  or both in parallel, as here.
    21  
    22  The GNU MP Library is distributed in the hope that it will be useful, but
    23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  for more details.
    26  
    27  You should have received copies of the GNU General Public License and the
    28  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  see https://www.gnu.org/licenses/.  */
    30  
    31  #ifndef __SPEED_H__
    32  #define __SPEED_H__
    33  
    34  
    35  /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
    36     newsize long. */
    37  #define MPN_ZERO_EXTEND(ptr, oldsize, newsize)		\
    38    do {							\
    39      ASSERT ((newsize) >= (oldsize));			\
    40      MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));	\
    41    } while (0)
    42  
    43  /* A mask of the least significant n bits.  Note 1<<32 doesn't give zero on
    44     x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */
    45  #define MP_LIMB_T_LOWBITMASK(n)	\
    46    ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
    47  
    48  
    49  /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
    50  
    51  #define TMP_ALLOC_ALIGNED(bytes, align)	\
    52    align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
    53  #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)	\
    54    ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
    55  
    56  /* CACHE_LINE_SIZE is our default alignment for speed operands, and the
    57     limit on what s->align_xp etc and then request for off-alignment.  Maybe
    58     this should be an option of some sort, but in any case here are some line
    59     sizes,
    60  
    61         bytes
    62  	 32   pentium
    63  	 64   athlon
    64  	 64   itanium-2 L1
    65  	128   itanium-2 L2
    66  */
    67  #define CACHE_LINE_SIZE   64 /* bytes */
    68  
    69  #define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1)
    70  
    71  /* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
    72     alignment.  */
    73  #define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
    74    do {									\
    75      mp_ptr     __ptr;							\
    76      mp_size_t  __ptr_align, __ptr_add;					\
    77  									\
    78      ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);		\
    79      __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
    80      __ptr_align = (__ptr - (mp_ptr) NULL);				\
    81      __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
    82      (ptr) = __ptr + __ptr_add;						\
    83    } while (0)
    84  
    85  
    86  /* This is the size for s->xp_block and s->yp_block, used in certain
    87     routines that want to run across many different data values and use
    88     s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
    89  
    90     512 means 2kbytes of data for each of xp_block and yp_block, making 4k
    91     total, which should fit easily in any L1 data cache. */
    92  
    93  #define SPEED_BLOCK_SIZE   512 /* limbs */
    94  
    95  
    96  extern double  speed_unittime;
    97  extern double  speed_cycletime;
    98  extern int     speed_precision;
    99  extern char    speed_time_string[];
   100  void speed_time_init (void);
   101  void speed_cycletime_fail (const char *str);
   102  void speed_cycletime_init (void);
   103  void speed_cycletime_need_cycles (void);
   104  void speed_cycletime_need_seconds (void);
   105  void speed_starttime (void);
   106  double speed_endtime (void);
   107  
   108  
   109  struct speed_params {
   110    unsigned   reps;	/* how many times to run the routine */
   111    mp_ptr     xp;	/* first argument */
   112    mp_ptr     yp;	/* second argument */
   113    mp_size_t  size;	/* size of both arguments */
   114    mp_limb_t  r;		/* user supplied parameter */
   115    mp_size_t  align_xp;	/* alignment of xp */
   116    mp_size_t  align_yp;	/* alignment of yp */
   117    mp_size_t  align_wp;	/* intended alignment of wp */
   118    mp_size_t  align_wp2; /* intended alignment of wp2 */
   119    mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
   120    mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
   121  
   122    double     time_divisor; /* optionally set by the speed routine */
   123  
   124    /* used by the cache priming things */
   125    int	     cache;
   126    unsigned   src_num, dst_num;
   127    struct {
   128      mp_ptr    ptr;
   129      mp_size_t size;
   130    } src[5], dst[4];
   131  };
   132  
   133  typedef double (*speed_function_t) (struct speed_params *);
   134  
   135  double speed_measure (speed_function_t fun, struct speed_params *);
   136  
   137  /* Prototypes for speed measuring routines */
   138  
   139  double speed_back_to_back (struct speed_params *);
   140  double speed_count_leading_zeros (struct speed_params *);
   141  double speed_count_trailing_zeros (struct speed_params *);
   142  double speed_find_a (struct speed_params *);
   143  double speed_gmp_allocate_free (struct speed_params *);
   144  double speed_gmp_allocate_reallocate_free (struct speed_params *);
   145  double speed_invert_limb (struct speed_params *);
   146  double speed_malloc_free (struct speed_params *);
   147  double speed_malloc_realloc_free (struct speed_params *);
   148  double speed_memcpy (struct speed_params *);
   149  double speed_binvert_limb (struct speed_params *);
   150  double speed_binvert_limb_mul1 (struct speed_params *);
   151  double speed_binvert_limb_loop (struct speed_params *);
   152  double speed_binvert_limb_cond (struct speed_params *);
   153  double speed_binvert_limb_arith (struct speed_params *);
   154  
   155  double speed_mpf_init_clear (struct speed_params *);
   156  
   157  double speed_mpn_add_n (struct speed_params *);
   158  double speed_mpn_add_1 (struct speed_params *);
   159  double speed_mpn_add_1_inplace (struct speed_params *);
   160  double speed_mpn_add_err1_n (struct speed_params *);
   161  double speed_mpn_add_err2_n (struct speed_params *);
   162  double speed_mpn_add_err3_n (struct speed_params *);
   163  double speed_mpn_addlsh_n (struct speed_params *);
   164  double speed_mpn_addlsh1_n (struct speed_params *);
   165  double speed_mpn_addlsh2_n (struct speed_params *);
   166  double speed_mpn_addlsh_n_ip1 (struct speed_params *);
   167  double speed_mpn_addlsh1_n_ip1 (struct speed_params *);
   168  double speed_mpn_addlsh2_n_ip1 (struct speed_params *);
   169  double speed_mpn_addlsh_n_ip2 (struct speed_params *);
   170  double speed_mpn_addlsh1_n_ip2 (struct speed_params *);
   171  double speed_mpn_addlsh2_n_ip2 (struct speed_params *);
   172  double speed_mpn_add_n_sub_n (struct speed_params *);
   173  double speed_mpn_and_n (struct speed_params *);
   174  double speed_mpn_andn_n (struct speed_params *);
   175  double speed_mpn_addmul_1 (struct speed_params *);
   176  double speed_mpn_addmul_2 (struct speed_params *);
   177  double speed_mpn_addmul_3 (struct speed_params *);
   178  double speed_mpn_addmul_4 (struct speed_params *);
   179  double speed_mpn_addmul_5 (struct speed_params *);
   180  double speed_mpn_addmul_6 (struct speed_params *);
   181  double speed_mpn_addmul_7 (struct speed_params *);
   182  double speed_mpn_addmul_8 (struct speed_params *);
   183  double speed_mpn_cnd_add_n (struct speed_params *);
   184  double speed_mpn_cnd_sub_n (struct speed_params *);
   185  double speed_mpn_com (struct speed_params *);
   186  double speed_mpn_neg (struct speed_params *);
   187  double speed_mpn_copyd (struct speed_params *);
   188  double speed_mpn_copyi (struct speed_params *);
   189  double speed_MPN_COPY (struct speed_params *);
   190  double speed_MPN_COPY_DECR (struct speed_params *);
   191  double speed_MPN_COPY_INCR (struct speed_params *);
   192  double speed_mpn_sec_tabselect (struct speed_params *);
   193  double speed_mpn_divexact_1 (struct speed_params *);
   194  double speed_mpn_divexact_by3 (struct speed_params *);
   195  double speed_mpn_bdiv_q_1 (struct speed_params *);
   196  double speed_mpn_pi1_bdiv_q_1 (struct speed_params *);
   197  double speed_mpn_bdiv_dbm1c (struct speed_params *);
   198  double speed_mpn_divrem_1 (struct speed_params *);
   199  double speed_mpn_divrem_1f (struct speed_params *);
   200  double speed_mpn_divrem_1c (struct speed_params *);
   201  double speed_mpn_divrem_1cf (struct speed_params *);
   202  double speed_mpn_divrem_1_div (struct speed_params *);
   203  double speed_mpn_divrem_1f_div (struct speed_params *);
   204  double speed_mpn_divrem_1_inv (struct speed_params *);
   205  double speed_mpn_divrem_1f_inv (struct speed_params *);
   206  double speed_mpn_divrem_2 (struct speed_params *);
   207  double speed_mpn_divrem_2_div (struct speed_params *);
   208  double speed_mpn_divrem_2_inv (struct speed_params *);
   209  double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
   210  double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
   211  double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
   212  double speed_mpn_div_qr_1 (struct speed_params *);
   213  double speed_mpn_div_qr_2n (struct speed_params *);
   214  double speed_mpn_div_qr_2u (struct speed_params *);
   215  double speed_mpn_fib2_ui (struct speed_params *);
   216  double speed_mpn_matrix22_mul (struct speed_params *);
   217  double speed_mpn_hgcd (struct speed_params *);
   218  double speed_mpn_hgcd_lehmer (struct speed_params *);
   219  double speed_mpn_hgcd_appr (struct speed_params *);
   220  double speed_mpn_hgcd_appr_lehmer (struct speed_params *);
   221  double speed_mpn_hgcd_reduce (struct speed_params *);
   222  double speed_mpn_hgcd_reduce_1 (struct speed_params *);
   223  double speed_mpn_hgcd_reduce_2 (struct speed_params *);
   224  double speed_mpn_gcd (struct speed_params *);
   225  double speed_mpn_gcd_1 (struct speed_params *);
   226  double speed_mpn_gcd_1N (struct speed_params *);
   227  double speed_mpn_gcdext (struct speed_params *);
   228  double speed_mpn_gcdext_double (struct speed_params *);
   229  double speed_mpn_gcdext_one_double (struct speed_params *);
   230  double speed_mpn_gcdext_one_single (struct speed_params *);
   231  double speed_mpn_gcdext_single (struct speed_params *);
   232  double speed_mpn_get_str (struct speed_params *);
   233  double speed_mpn_hamdist (struct speed_params *);
   234  double speed_mpn_ior_n (struct speed_params *);
   235  double speed_mpn_iorn_n (struct speed_params *);
   236  double speed_mpn_jacobi_base (struct speed_params *);
   237  double speed_mpn_jacobi_base_1 (struct speed_params *);
   238  double speed_mpn_jacobi_base_2 (struct speed_params *);
   239  double speed_mpn_jacobi_base_3 (struct speed_params *);
   240  double speed_mpn_jacobi_base_4 (struct speed_params *);
   241  double speed_mpn_lshift (struct speed_params *);
   242  double speed_mpn_lshiftc (struct speed_params *);
   243  double speed_mpn_mod_1 (struct speed_params *);
   244  double speed_mpn_mod_1c (struct speed_params *);
   245  double speed_mpn_mod_1_div (struct speed_params *);
   246  double speed_mpn_mod_1_inv (struct speed_params *);
   247  double speed_mpn_mod_1_1 (struct speed_params *);
   248  double speed_mpn_mod_1_1_1 (struct speed_params *);
   249  double speed_mpn_mod_1_1_2 (struct speed_params *);
   250  double speed_mpn_mod_1_2 (struct speed_params *);
   251  double speed_mpn_mod_1_3 (struct speed_params *);
   252  double speed_mpn_mod_1_4 (struct speed_params *);
   253  double speed_mpn_mod_34lsub1 (struct speed_params *);
   254  double speed_mpn_modexact_1_odd (struct speed_params *);
   255  double speed_mpn_modexact_1c_odd (struct speed_params *);
   256  double speed_mpn_mul_1 (struct speed_params *);
   257  double speed_mpn_mul_1_inplace (struct speed_params *);
   258  double speed_mpn_mul_2 (struct speed_params *);
   259  double speed_mpn_mul_3 (struct speed_params *);
   260  double speed_mpn_mul_4 (struct speed_params *);
   261  double speed_mpn_mul_5 (struct speed_params *);
   262  double speed_mpn_mul_6 (struct speed_params *);
   263  double speed_mpn_mul (struct speed_params *);
   264  double speed_mpn_mul_basecase (struct speed_params *);
   265  double speed_mpn_mulmid (struct speed_params *);
   266  double speed_mpn_mulmid_basecase (struct speed_params *);
   267  double speed_mpn_mul_fft (struct speed_params *);
   268  double speed_mpn_mul_fft_sqr (struct speed_params *);
   269  double speed_mpn_fft_mul (struct speed_params *);
   270  double speed_mpn_fft_sqr (struct speed_params *);
   271  #if WANT_OLD_FFT_FULL
   272  double speed_mpn_mul_fft_full (struct speed_params *);
   273  double speed_mpn_mul_fft_full_sqr (struct speed_params *);
   274  #endif
   275  double speed_mpn_nussbaumer_mul (struct speed_params *);
   276  double speed_mpn_nussbaumer_mul_sqr (struct speed_params *);
   277  double speed_mpn_mul_n (struct speed_params *);
   278  double speed_mpn_mul_n_sqr (struct speed_params *);
   279  double speed_mpn_mulmid_n (struct speed_params *);
   280  double speed_mpn_sqrlo (struct speed_params *);
   281  double speed_mpn_sqrlo_basecase (struct speed_params *);
   282  double speed_mpn_mullo_n (struct speed_params *);
   283  double speed_mpn_mullo_basecase (struct speed_params *);
   284  double speed_mpn_nand_n (struct speed_params *);
   285  double speed_mpn_nior_n (struct speed_params *);
   286  double speed_mpn_popcount (struct speed_params *);
   287  double speed_mpn_preinv_divrem_1 (struct speed_params *);
   288  double speed_mpn_preinv_divrem_1f (struct speed_params *);
   289  double speed_mpn_preinv_mod_1 (struct speed_params *);
   290  double speed_mpn_sbpi1_div_qr (struct speed_params *);
   291  double speed_mpn_dcpi1_div_qr (struct speed_params *);
   292  double speed_mpn_sbpi1_divappr_q (struct speed_params *);
   293  double speed_mpn_dcpi1_divappr_q (struct speed_params *);
   294  double speed_mpn_mu_div_qr (struct speed_params *);
   295  double speed_mpn_mu_divappr_q (struct speed_params *);
   296  double speed_mpn_mupi_div_qr (struct speed_params *);
   297  double speed_mpn_mu_div_q (struct speed_params *);
   298  double speed_mpn_sbpi1_bdiv_qr (struct speed_params *);
   299  double speed_mpn_dcpi1_bdiv_qr (struct speed_params *);
   300  double speed_mpn_sbpi1_bdiv_q (struct speed_params *);
   301  double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
   302  double speed_mpn_mu_bdiv_q (struct speed_params *);
   303  double speed_mpn_mu_bdiv_qr (struct speed_params *);
   304  double speed_mpn_broot (struct speed_params *);
   305  double speed_mpn_broot_invm1 (struct speed_params *);
   306  double speed_mpn_brootinv (struct speed_params *);
   307  double speed_mpn_invert (struct speed_params *);
   308  double speed_mpn_invertappr (struct speed_params *);
   309  double speed_mpn_ni_invertappr (struct speed_params *);
   310  double speed_mpn_sec_invert (struct speed_params *s);
   311  double speed_mpn_binvert (struct speed_params *);
   312  double speed_mpn_redc_1 (struct speed_params *);
   313  double speed_mpn_redc_2 (struct speed_params *);
   314  double speed_mpn_redc_n (struct speed_params *);
   315  double speed_mpn_rsblsh_n (struct speed_params *);
   316  double speed_mpn_rsblsh1_n (struct speed_params *);
   317  double speed_mpn_rsblsh2_n (struct speed_params *);
   318  double speed_mpn_rsh1add_n (struct speed_params *);
   319  double speed_mpn_rsh1sub_n (struct speed_params *);
   320  double speed_mpn_rshift (struct speed_params *);
   321  double speed_mpn_sb_divrem_m3 (struct speed_params *);
   322  double speed_mpn_sb_divrem_m3_div (struct speed_params *);
   323  double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
   324  double speed_mpn_set_str (struct speed_params *);
   325  double speed_mpn_bc_set_str (struct speed_params *);
   326  double speed_mpn_dc_set_str (struct speed_params *);
   327  double speed_mpn_set_str_pre (struct speed_params *);
   328  double speed_mpn_sqr_basecase (struct speed_params *);
   329  double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
   330  double speed_mpn_sqr_diagonal (struct speed_params *);
   331  double speed_mpn_sqr (struct speed_params *);
   332  double speed_mpn_sqrtrem (struct speed_params *);
   333  double speed_mpn_rootrem (struct speed_params *);
   334  double speed_mpn_sqrt (struct speed_params *);
   335  double speed_mpn_root (struct speed_params *);
   336  double speed_mpn_sub_n (struct speed_params *);
   337  double speed_mpn_sub_1 (struct speed_params *);
   338  double speed_mpn_sub_1_inplace (struct speed_params *);
   339  double speed_mpn_sub_err1_n (struct speed_params *);
   340  double speed_mpn_sub_err2_n (struct speed_params *);
   341  double speed_mpn_sub_err3_n (struct speed_params *);
   342  double speed_mpn_sublsh_n (struct speed_params *);
   343  double speed_mpn_sublsh1_n (struct speed_params *);
   344  double speed_mpn_sublsh2_n (struct speed_params *);
   345  double speed_mpn_sublsh_n_ip1 (struct speed_params *);
   346  double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
   347  double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
   348  double speed_mpn_submul_1 (struct speed_params *);
   349  double speed_mpn_toom2_sqr (struct speed_params *);
   350  double speed_mpn_toom3_sqr (struct speed_params *);
   351  double speed_mpn_toom4_sqr (struct speed_params *);
   352  double speed_mpn_toom6_sqr (struct speed_params *);
   353  double speed_mpn_toom8_sqr (struct speed_params *);
   354  double speed_mpn_toom22_mul (struct speed_params *);
   355  double speed_mpn_toom33_mul (struct speed_params *);
   356  double speed_mpn_toom44_mul (struct speed_params *);
   357  double speed_mpn_toom6h_mul (struct speed_params *);
   358  double speed_mpn_toom8h_mul (struct speed_params *);
   359  double speed_mpn_toom32_mul (struct speed_params *);
   360  double speed_mpn_toom42_mul (struct speed_params *);
   361  double speed_mpn_toom43_mul (struct speed_params *);
   362  double speed_mpn_toom63_mul (struct speed_params *);
   363  double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
   364  double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
   365  double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
   366  double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
   367  double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
   368  double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
   369  double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
   370  double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
   371  double speed_mpn_toom42_mulmid (struct speed_params *);
   372  double speed_mpn_mulmod_bnm1 (struct speed_params *);
   373  double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
   374  double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
   375  double speed_mpn_sqrmod_bnm1 (struct speed_params *);
   376  double speed_mpn_udiv_qrnnd (struct speed_params *);
   377  double speed_mpn_udiv_qrnnd_r (struct speed_params *);
   378  double speed_mpn_umul_ppmm (struct speed_params *);
   379  double speed_mpn_umul_ppmm_r (struct speed_params *);
   380  double speed_mpn_xnor_n (struct speed_params *);
   381  double speed_mpn_xor_n (struct speed_params *);
   382  double speed_MPN_ZERO (struct speed_params *);
   383  
   384  double speed_mpq_init_clear (struct speed_params *);
   385  
   386  double speed_mpz_add (struct speed_params *);
   387  double speed_mpz_bin_uiui (struct speed_params *);
   388  double speed_mpz_bin_ui (struct speed_params *);
   389  double speed_mpz_fac_ui (struct speed_params *);
   390  double speed_mpz_2fac_ui (struct speed_params *);
   391  double speed_mpz_fib_ui (struct speed_params *);
   392  double speed_mpz_fib2_ui (struct speed_params *);
   393  double speed_mpz_init_clear (struct speed_params *);
   394  double speed_mpz_init_realloc_clear (struct speed_params *);
   395  double speed_mpz_jacobi (struct speed_params *);
   396  double speed_mpz_lucnum_ui (struct speed_params *);
   397  double speed_mpz_lucnum2_ui (struct speed_params *);
   398  double speed_mpz_mod (struct speed_params *);
   399  double speed_mpz_powm (struct speed_params *);
   400  double speed_mpz_powm_mod (struct speed_params *);
   401  double speed_mpz_powm_redc (struct speed_params *);
   402  double speed_mpz_powm_sec (struct speed_params *);
   403  double speed_mpz_powm_ui (struct speed_params *);
   404  double speed_mpz_urandomb (struct speed_params *);
   405  
   406  double speed_gmp_randseed (struct speed_params *);
   407  double speed_gmp_randseed_ui (struct speed_params *);
   408  
   409  double speed_noop (struct speed_params *);
   410  double speed_noop_wxs (struct speed_params *);
   411  double speed_noop_wxys (struct speed_params *);
   412  
   413  double speed_operator_div (struct speed_params *);
   414  double speed_operator_mod (struct speed_params *);
   415  
   416  double speed_udiv_qrnnd (struct speed_params *);
   417  double speed_udiv_qrnnd_preinv1 (struct speed_params *);
   418  double speed_udiv_qrnnd_preinv2 (struct speed_params *);
   419  double speed_udiv_qrnnd_preinv3 (struct speed_params *);
   420  double speed_udiv_qrnnd_c (struct speed_params *);
   421  double speed_umul_ppmm (struct speed_params *);
   422  
   423  /* Prototypes for other routines */
   424  
   425  #if defined (__cplusplus)
   426  extern "C" {
   427  #endif
   428  
   429  /* low 32-bits in p[0], high 32-bits in p[1] */
   430  void speed_cyclecounter (unsigned p[2]);
   431  
   432  #if defined (__cplusplus)
   433  }
   434  #endif
   435  
   436  void mftb_function (unsigned p[2]);
   437  
   438  double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
   439  int gettimeofday_microseconds_p (void);
   440  int getrusage_microseconds_p (void);
   441  int cycles_works_p (void);
   442  long clk_tck (void);
   443  double freq_measure (const char *, double (*)(void));
   444  
   445  int double_cmp_ptr (const double *, const double *);
   446  void pentium_wbinvd (void);
   447  typedef int (*qsort_function_t) (const void *, const void *);
   448  
   449  void noop (void);
   450  void noop_1 (mp_limb_t);
   451  void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
   452  void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
   453  void mpn_cache_fill (mp_srcptr, mp_size_t);
   454  void mpn_cache_fill_dummy (mp_limb_t);
   455  void speed_cache_fill (struct speed_params *);
   456  void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
   457  void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
   458  
   459  extern int  speed_option_addrs;
   460  extern int  speed_option_verbose;
   461  extern int  speed_option_cycles_broken;
   462  void speed_option_set (const char *);
   463  
   464  mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
   465  mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
   466  
   467  mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
   468  mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
   469  mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
   470  mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
   471  
   472  int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
   473  int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
   474  int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
   475  int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
   476  
   477  mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
   478  mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
   479  
   480  mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
   481  mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
   482  
   483  void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
   484  void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
   485  
   486  mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
   487  mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
   488  mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
   489  mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
   490  mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
   491  mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
   492  
   493  mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
   494  mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
   495  
   496  mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
   497  mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
   498  
   499  mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
   500  mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
   501  
   502  mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
   503  mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
   504  
   505  mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
   506  void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
   507  
   508  void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
   509  void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
   510  
   511  int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
   512  
   513  
   514  /* "get" is called repeatedly until it ticks over, just in case on a fast
   515     processor it takes less than a microsecond, though this is probably
   516     unlikely if it's a system call.
   517  
   518     speed_cyclecounter is called on the same side of the "get" for the start
   519     and end measurements.  It doesn't matter how long it takes from the "get"
   520     sample to the cycles sample, since that period will cancel out in the
   521     difference calculation (assuming it's the same each time).
   522  
   523     Letting the test run for more than a process time slice is probably only
   524     going to reduce accuracy, especially for getrusage when the cycle counter
   525     is real time, or for gettimeofday if the cycle counter is in fact process
   526     time.  Use CLK_TCK/2 as a reasonable stop.
   527  
   528     It'd be desirable to be quite accurate here.  The default speed_precision
   529     for a cycle counter is 10000 cycles, so to mix that with getrusage or
   530     gettimeofday the frequency should be at least that accurate.  But running
   531     measurements for 10000 microseconds (or more) is too long.  Be satisfied
   532     with just a half clock tick (5000 microseconds usually).  */
   533  
   534  #define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
   535    do {									\
   536      type      st1, st, et1, et;						\
   537      unsigned  sc[2], ec[2];						\
   538      long      dt, half_tick;						\
   539      double    dc, cyc;							\
   540  									\
   541      half_tick = (1000000L / clk_tck()) / 2;				\
   542  									\
   543      get (st1);								\
   544      do {								\
   545        get (st);								\
   546      } while (usec(st) == usec(st1) && sec(st) == sec(st1));		\
   547  									\
   548      getc (sc);								\
   549  									\
   550      for (;;)								\
   551        {									\
   552  	get (et1);							\
   553  	do {								\
   554  	  get (et);							\
   555  	} while (usec(et) == usec(et1) && sec(et) == sec(et1));		\
   556  									\
   557  	getc (ec);							\
   558  									\
   559  	dc = speed_cyclecounter_diff (ec, sc);				\
   560  									\
   561  	/* allow secs to cancel before multiplying */			\
   562  	dt = sec(et) - sec(st);						\
   563  	dt = dt * 1000000L + (usec(et) - usec(st));			\
   564  									\
   565  	if (dt >= half_tick)						\
   566  	  break;							\
   567        }									\
   568  									\
   569      cyc = dt * 1e-6 / dc;						\
   570  									\
   571      if (speed_option_verbose >= 2)					\
   572        printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
   573  	      name, dc, dt, cyc);					\
   574  									\
   575      return dt * 1e-6 / dc;						\
   576  									\
   577    } while (0)
   578  
   579  
   580  
   581  
   582  /* The measuring routines use these big macros to save duplication for
   583     similar forms.  They also get used for some automatically generated
   584     measuring of new implementations of functions.
   585  
   586     Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
   587     function pointer is considered undesirable since it's not the way a
   588     normal application will be calling, and some processors might do
   589     different things with an indirect call, like not branch predicting, or
   590     doing a full pipe flush.  At least some of the "functions" measured are
   591     actually macros too.
   592  
   593     The net effect is to bloat the object code, possibly in a big way, but
   594     only what's being measured is being run, so that doesn't matter.
   595  
   596     The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
   597     ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
   598     function pointer doesn't work in gcc 3.2.  Using an actual non-pure
   599     function pointer variable works, but stands a real risk of a
   600     non-optimizing compiler generating unnecessary overheads in the call.
   601     Currently the best idea is not to use those attributes for a timing
   602     program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
   603     gmp-impl.h to omit them from routines there.  */
   604  
   605  #define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
   606  
   607  /* For mpn_copy or similar. */
   608  #define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
   609    {									\
   610      mp_ptr    wp;							\
   611      unsigned  i;							\
   612      double    t;							\
   613      TMP_DECL;								\
   614  									\
   615      SPEED_RESTRICT_COND (s->size >= 0);					\
   616  									\
   617      TMP_MARK;								\
   618      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
   619  									\
   620      speed_operand_src (s, s->xp, s->size);				\
   621      speed_operand_dst (s, wp, s->size);					\
   622      speed_cache_fill (s);						\
   623  									\
   624      speed_starttime ();							\
   625      i = s->reps;							\
   626      do									\
   627        call;								\
   628      while (--i != 0);							\
   629      t = speed_endtime ();						\
   630  									\
   631      TMP_FREE;								\
   632      return t;								\
   633    }
   634  #define SPEED_ROUTINE_MPN_COPY(function)				\
   635    SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
   636  
   637  #define SPEED_ROUTINE_MPN_TABSELECT(function)				\
   638    {									\
   639      mp_ptr    xp, wp;							\
   640      unsigned  i;							\
   641      double    t;							\
   642      TMP_DECL;								\
   643  									\
   644      SPEED_RESTRICT_COND (s->size >= 0);					\
   645  									\
   646      if (s->r == 0)							\
   647        s->r = s->size;	/* default to a quadratic shape */		\
   648  									\
   649      TMP_MARK;								\
   650      SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp);		\
   651      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
   652  									\
   653      speed_operand_src (s, xp, s->size * s->r);				\
   654      speed_operand_dst (s, wp, s->size);					\
   655      speed_cache_fill (s);						\
   656  									\
   657      speed_starttime ();							\
   658      i = s->reps;							\
   659      do									\
   660        function (wp, xp, s->size, s->r, (s->r) / 2);			\
   661      while (--i != 0);							\
   662      t = speed_endtime () / s->r;					\
   663  									\
   664      TMP_FREE;								\
   665      return t;								\
   666    }
   667  
   668  
   669  #define SPEED_ROUTINE_MPN_COPYC(function)				\
   670    {									\
   671      mp_ptr    wp;							\
   672      unsigned  i;							\
   673      double    t;							\
   674      TMP_DECL;								\
   675  									\
   676      SPEED_RESTRICT_COND (s->size >= 0);					\
   677  									\
   678      TMP_MARK;								\
   679      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
   680  									\
   681      speed_operand_src (s, s->xp, s->size);				\
   682      speed_operand_dst (s, wp, s->size);					\
   683      speed_cache_fill (s);						\
   684  									\
   685      speed_starttime ();							\
   686      i = s->reps;							\
   687      do									\
   688        function (wp, s->xp, s->size, 0);					\
   689      while (--i != 0);							\
   690      t = speed_endtime ();						\
   691  									\
   692      TMP_FREE;								\
   693      return t;								\
   694    }
   695  
   696  /* s->size is still in limbs, and it's limbs which are copied, but
   697     "function" takes a size in bytes not limbs.  */
   698  #define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
   699    {									\
   700      mp_ptr    wp;							\
   701      unsigned  i;							\
   702      double    t;							\
   703      TMP_DECL;								\
   704  									\
   705      SPEED_RESTRICT_COND (s->size >= 0);					\
   706  									\
   707      TMP_MARK;								\
   708      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
   709  									\
   710      speed_operand_src (s, s->xp, s->size);				\
   711      speed_operand_dst (s, wp, s->size);					\
   712      speed_cache_fill (s);						\
   713  									\
   714      speed_starttime ();							\
   715      i = s->reps;							\
   716      do									\
   717        function (wp, s->xp, s->size * GMP_LIMB_BYTES);		\
   718      while (--i != 0);							\
   719      t = speed_endtime ();						\
   720  									\
   721      TMP_FREE;								\
   722      return t;								\
   723    }
   724  
   725  
   726  /* For mpn_add_n, mpn_sub_n, or similar. */
   727  #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
   728    {									\
   729      mp_ptr     wp;							\
   730      mp_ptr     xp, yp;							\
   731      unsigned   i;							\
   732      double     t;							\
   733      TMP_DECL;								\
   734  									\
   735      SPEED_RESTRICT_COND (s->size >= 1);					\
   736  									\
   737      TMP_MARK;								\
   738      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
   739  									\
   740      xp = s->xp;								\
   741      yp = s->yp;								\
   742  									\
   743      if (s->r == 0)	;						\
   744      else if (s->r == 1) { xp = wp;	    }				\
   745      else if (s->r == 2) {	   yp = wp; }				\
   746      else if (s->r == 3) { xp = wp; yp = wp; }				\
   747      else if (s->r == 4) {     yp = xp;	    }				\
   748      else		{						\
   749        TMP_FREE;								\
   750        return -1.0;							\
   751      }									\
   752  									\
   753      /* initialize wp if operand overlap */				\
   754      if (xp == wp || yp == wp)						\
   755        MPN_COPY (wp, s->xp, s->size);					\
   756  									\
   757      speed_operand_src (s, xp, s->size);					\
   758      speed_operand_src (s, yp, s->size);					\
   759      speed_operand_dst (s, wp, s->size);					\
   760      speed_cache_fill (s);						\
   761  									\
   762      speed_starttime ();							\
   763      i = s->reps;							\
   764      do									\
   765        call;								\
   766      while (--i != 0);							\
   767      t = speed_endtime ();						\
   768  									\
   769      TMP_FREE;								\
   770      return t;								\
   771    }
   772  
   773  
   774  /* For mpn_aors_errK_n, where 1 <= K <= 3. */
   775  #define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K)			\
   776    {									\
   777      mp_ptr     wp;							\
   778      mp_ptr     xp, yp;							\
   779      mp_ptr     zp[K];							\
   780      mp_limb_t  ep[2*K];							\
   781      unsigned   i;							\
   782      double     t;							\
   783      TMP_DECL;								\
   784  									\
   785      SPEED_RESTRICT_COND (s->size >= 1);					\
   786  									\
   787      TMP_MARK;								\
   788      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
   789  									\
   790      /* (don't have a mechanism to specify zp alignments) */		\
   791      for (i = 0; i < K; i++)						\
   792        SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0);			\
   793  									\
   794      xp = s->xp;								\
   795      yp = s->yp;								\
   796  									\
   797      if (s->r == 0)	;						\
   798      else if (s->r == 1) { xp = wp;	    }				\
   799      else if (s->r == 2) {	   yp = wp; }				\
   800      else if (s->r == 3) { xp = wp; yp = wp; }				\
   801      else if (s->r == 4) {     yp = xp;	    }				\
   802      else		{						\
   803        TMP_FREE;								\
   804        return -1.0;							\
   805      }									\
   806  									\
   807      /* initialize wp if operand overlap */				\
   808      if (xp == wp || yp == wp)						\
   809        MPN_COPY (wp, s->xp, s->size);					\
   810  									\
   811      speed_operand_src (s, xp, s->size);					\
   812      speed_operand_src (s, yp, s->size);					\
   813      for (i = 0; i < K; i++)						\
   814        speed_operand_src (s, zp[i], s->size);				\
   815      speed_operand_dst (s, wp, s->size);					\
   816      speed_cache_fill (s);						\
   817  									\
   818      speed_starttime ();							\
   819      i = s->reps;							\
   820      do									\
   821        call;								\
   822      while (--i != 0);							\
   823      t = speed_endtime ();						\
   824  									\
   825      TMP_FREE;								\
   826      return t;								\
   827    }
   828  
   829  #define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function)			\
   830    SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
   831  
   832  #define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function)			\
   833    SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
   834  
   835  #define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function)			\
   836    SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
   837  
   838  
   839  /* For mpn_add_n, mpn_sub_n, or similar. */
   840  #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
   841    {									\
   842      mp_ptr     ap, sp;							\
   843      mp_ptr     xp, yp;							\
   844      unsigned   i;							\
   845      double     t;							\
   846      TMP_DECL;								\
   847  									\
   848      SPEED_RESTRICT_COND (s->size >= 1);					\
   849  									\
   850      TMP_MARK;								\
   851      SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
   852      SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
   853  									\
   854      xp = s->xp;								\
   855      yp = s->yp;								\
   856  									\
   857      if ((s->r & 1) != 0) { xp = ap; }					\
   858      if ((s->r & 2) != 0) { yp = ap; }					\
   859      if ((s->r & 4) != 0) { xp = sp; }					\
   860      if ((s->r & 8) != 0) { yp = sp; }					\
   861      if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
   862        {									\
   863  	TMP_FREE;							\
   864  	return -1.0;							\
   865        }									\
   866  									\
   867      /* initialize ap if operand overlap */				\
   868      if (xp == ap || yp == ap)						\
   869        MPN_COPY (ap, s->xp, s->size);					\
   870      /* initialize sp if operand overlap */				\
   871      if (xp == sp || yp == sp)						\
   872        MPN_COPY (sp, s->xp, s->size);					\
   873  									\
   874      speed_operand_src (s, xp, s->size);					\
   875      speed_operand_src (s, yp, s->size);					\
   876      speed_operand_dst (s, ap, s->size);					\
   877      speed_operand_dst (s, sp, s->size);					\
   878      speed_cache_fill (s);						\
   879  									\
   880      speed_starttime ();							\
   881      i = s->reps;							\
   882      do									\
   883        call;								\
   884      while (--i != 0);							\
   885      t = speed_endtime ();						\
   886  									\
   887      TMP_FREE;								\
   888      return t;								\
   889    }
   890  
   891  #define SPEED_ROUTINE_MPN_BINARY_N(function)				\
   892     SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
   893  
   894  #define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
   895     SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
   896  
   897  
   898  /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
   899  #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
   900    {									\
   901      mp_ptr    wp;							\
   902      unsigned  i;							\
   903      double    t;							\
   904      TMP_DECL;								\
   905  									\
   906      SPEED_RESTRICT_COND (s->size >= 1);					\
   907  									\
   908      TMP_MARK;								\
   909      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
   910  									\
   911      speed_operand_src (s, s->xp, s->size);				\
   912      speed_operand_dst (s, wp, s->size);					\
   913      speed_cache_fill (s);						\
   914  									\
   915      speed_starttime ();							\
   916      i = s->reps;							\
   917      do									\
   918        call;								\
   919      while (--i != 0);							\
   920      t = speed_endtime ();						\
   921  									\
   922      TMP_FREE;								\
   923      return t;								\
   924    }
   925  
   926  #define SPEED_ROUTINE_MPN_UNARY_1(function)				\
   927    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
   928  
   929  #define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
   930    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
   931  
   932  /* FIXME: wp is uninitialized here, should start it off from xp */
   933  #define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
   934    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
   935  
   936  #define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
   937    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
   938  
   939  #define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
   940      SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
   941  
   942  #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
   943    {									\
   944      unsigned   shift;							\
   945      mp_limb_t  dinv;							\
   946  									\
   947      SPEED_RESTRICT_COND (s->size > 0);					\
   948      SPEED_RESTRICT_COND (s->r != 0);					\
   949  									\
   950      count_trailing_zeros (shift, s->r);					\
   951      binvert_limb (dinv, s->r >> shift);					\
   952  									\
   953      SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
   954    }
   955  #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
   956    SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
   957    ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
   958  
   959  #define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
   960    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
   961  
   962  #define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
   963    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
   964  
   965  #define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
   966    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
   967  
   968  #define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
   969    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
   970  
   971  #define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
   972    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
   973  
   974  
   975  #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
   976    {									\
   977      unsigned   shift;							\
   978      mp_limb_t  dinv;							\
   979  									\
   980      SPEED_RESTRICT_COND (s->size >= 0);					\
   981      SPEED_RESTRICT_COND (s->r != 0);					\
   982  									\
   983      count_leading_zeros (shift, s->r);					\
   984      invert_limb (dinv, s->r << shift);					\
   985  									\
   986      SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
   987    }									\
   988  
   989  #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
   990    SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
   991    ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
   992  
   993  /* s->size limbs worth of fraction part */
   994  #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
   995    SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
   996    ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
   997  
   998  
   999  /* s->r is duplicated to form the multiplier, defaulting to
  1000     MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
  1001     least it provides some control.  */
  1002  #define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
  1003    {									\
  1004      mp_ptr     wp;							\
  1005      mp_size_t  wn;							\
  1006      unsigned   i;							\
  1007      double     t;							\
  1008      mp_limb_t  yp[N];							\
  1009      TMP_DECL;								\
  1010  									\
  1011      SPEED_RESTRICT_COND (s->size >= N);					\
  1012  									\
  1013      TMP_MARK;								\
  1014      wn = s->size + N-1;							\
  1015      SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
  1016      for (i = 0; i < N; i++)						\
  1017        yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
  1018  									\
  1019      speed_operand_src (s, s->xp, s->size);				\
  1020      speed_operand_src (s, yp, (mp_size_t) N);				\
  1021      speed_operand_dst (s, wp, wn);					\
  1022      speed_cache_fill (s);						\
  1023  									\
  1024      speed_starttime ();							\
  1025      i = s->reps;							\
  1026      do									\
  1027        function (wp, s->xp, s->size, yp);				\
  1028      while (--i != 0);							\
  1029      t = speed_endtime ();						\
  1030  									\
  1031      TMP_FREE;								\
  1032      return t;								\
  1033    }
  1034  
  1035  #define SPEED_ROUTINE_MPN_UNARY_2(function)				\
  1036    SPEED_ROUTINE_MPN_UNARY_N (function, 2)
  1037  #define SPEED_ROUTINE_MPN_UNARY_3(function)				\
  1038    SPEED_ROUTINE_MPN_UNARY_N (function, 3)
  1039  #define SPEED_ROUTINE_MPN_UNARY_4(function)				\
  1040    SPEED_ROUTINE_MPN_UNARY_N (function, 4)
  1041  #define SPEED_ROUTINE_MPN_UNARY_5(function)				\
  1042    SPEED_ROUTINE_MPN_UNARY_N (function, 5)
  1043  #define SPEED_ROUTINE_MPN_UNARY_6(function)				\
  1044    SPEED_ROUTINE_MPN_UNARY_N (function, 6)
  1045  #define SPEED_ROUTINE_MPN_UNARY_7(function)				\
  1046    SPEED_ROUTINE_MPN_UNARY_N (function, 7)
  1047  #define SPEED_ROUTINE_MPN_UNARY_8(function)				\
  1048    SPEED_ROUTINE_MPN_UNARY_N (function, 8)
  1049  
  1050  
  1051  /* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */
  1052  #define SPEED_ROUTINE_MPN_MUL(function)					\
  1053    {									\
  1054      mp_ptr    wp;							\
  1055      mp_size_t size1;							\
  1056      unsigned  i;							\
  1057      double    t;							\
  1058      TMP_DECL;								\
  1059  									\
  1060      size1 = (s->r == 0 ? s->size : s->r);				\
  1061      if (size1 < 0) size1 = -size1 - s->size;				\
  1062  									\
  1063      SPEED_RESTRICT_COND (size1 >= 1);					\
  1064      SPEED_RESTRICT_COND (s->size >= size1);				\
  1065  									\
  1066      TMP_MARK;								\
  1067      SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
  1068  									\
  1069      speed_operand_src (s, s->xp, s->size);				\
  1070      speed_operand_src (s, s->yp, size1);				\
  1071      speed_operand_dst (s, wp, size1 + s->size);				\
  1072      speed_cache_fill (s);						\
  1073  									\
  1074      speed_starttime ();							\
  1075      i = s->reps;							\
  1076      do									\
  1077        function (wp, s->xp, s->size, s->yp, size1);			\
  1078      while (--i != 0);							\
  1079      t = speed_endtime ();						\
  1080  									\
  1081      TMP_FREE;								\
  1082      return t;								\
  1083    }
  1084  
  1085  
  1086  #define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
  1087    {									\
  1088      mp_ptr    wp;							\
  1089      unsigned  i;							\
  1090      double    t;							\
  1091      TMP_DECL;								\
  1092  									\
  1093      SPEED_RESTRICT_COND (s->size >= 1);					\
  1094  									\
  1095      TMP_MARK;								\
  1096      SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
  1097  									\
  1098      speed_operand_src (s, s->xp, s->size);				\
  1099      speed_operand_src (s, s->yp, s->size);				\
  1100      speed_operand_dst (s, wp, 2*s->size);				\
  1101      speed_cache_fill (s);						\
  1102  									\
  1103      speed_starttime ();							\
  1104      i = s->reps;							\
  1105      do									\
  1106        call;								\
  1107      while (--i != 0);							\
  1108      t = speed_endtime ();						\
  1109  									\
  1110      TMP_FREE;								\
  1111      return t;								\
  1112    }
  1113  
  1114  #define SPEED_ROUTINE_MPN_MUL_N(function)				\
  1115    SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));
  1116  
  1117  #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
  1118    {									\
  1119      mp_ptr    wp;							\
  1120      unsigned  i;							\
  1121      double    t;							\
  1122      TMP_DECL;								\
  1123  									\
  1124      SPEED_RESTRICT_COND (s->size >= 1);					\
  1125  									\
  1126      TMP_MARK;								\
  1127      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
  1128  									\
  1129      speed_operand_src (s, s->xp, s->size);				\
  1130      speed_operand_src (s, s->yp, s->size);				\
  1131      speed_operand_dst (s, wp, s->size);					\
  1132      speed_cache_fill (s);						\
  1133  									\
  1134      speed_starttime ();							\
  1135      i = s->reps;							\
  1136      do									\
  1137        call;								\
  1138      while (--i != 0);							\
  1139      t = speed_endtime ();						\
  1140  									\
  1141      TMP_FREE;								\
  1142      return t;								\
  1143    }
  1144  
  1145  #define SPEED_ROUTINE_MPN_MULLO_N(function)				\
  1146    SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
  1147  
  1148  #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
  1149    SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
  1150  
  1151  #define SPEED_ROUTINE_MPN_SQRLO(function)				\
  1152    {									\
  1153      mp_ptr    wp;							\
  1154      unsigned  i;							\
  1155      double    t;							\
  1156      TMP_DECL;								\
  1157  									\
  1158      SPEED_RESTRICT_COND (s->size >= 1);					\
  1159  									\
  1160      TMP_MARK;								\
  1161      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
  1162  									\
  1163      speed_operand_src (s, s->xp, s->size);				\
  1164      speed_operand_dst (s, wp, s->size);					\
  1165      speed_cache_fill (s);						\
  1166  									\
  1167      speed_starttime ();							\
  1168      i = s->reps;							\
  1169      do									\
  1170        function (wp, s->xp, s->size);					\
  1171      while (--i != 0);							\
  1172      t = speed_endtime ();						\
  1173  									\
  1174      TMP_FREE;								\
  1175      return t;								\
  1176    }
  1177  
  1178  /* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */
  1179  #define SPEED_ROUTINE_MPN_MULMID(function)				\
  1180    {									\
  1181      mp_ptr    wp, xp;							\
  1182      mp_size_t size1;							\
  1183      unsigned  i;							\
  1184      double    t;							\
  1185      TMP_DECL;								\
  1186  									\
  1187      size1 = (s->r == 0 ? (2 * s->size - 1) : s->r);			\
  1188  									\
  1189      SPEED_RESTRICT_COND (s->size >= 1);					\
  1190      SPEED_RESTRICT_COND (size1 >= s->size);				\
  1191  									\
  1192      TMP_MARK;								\
  1193      SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
  1194      SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
  1195  									\
  1196      speed_operand_src (s, xp, size1);					\
  1197      speed_operand_src (s, s->yp, s->size);				\
  1198      speed_operand_dst (s, wp, size1 - s->size + 3);			\
  1199      speed_cache_fill (s);						\
  1200  									\
  1201      speed_starttime ();							\
  1202      i = s->reps;							\
  1203      do									\
  1204        function (wp, xp, size1, s->yp, s->size);				\
  1205      while (--i != 0);							\
  1206      t = speed_endtime ();						\
  1207  									\
  1208      TMP_FREE;								\
  1209      return t;								\
  1210    }
  1211  
  1212  #define SPEED_ROUTINE_MPN_MULMID_N(function)				\
  1213    {									\
  1214      mp_ptr    wp, xp;							\
  1215      mp_size_t size1;							\
  1216      unsigned  i;							\
  1217      double    t;							\
  1218      TMP_DECL;								\
  1219  									\
  1220      size1 = 2 * s->size - 1;						\
  1221  									\
  1222      SPEED_RESTRICT_COND (s->size >= 1);					\
  1223  									\
  1224      TMP_MARK;								\
  1225      SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
  1226      SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
  1227  									\
  1228      speed_operand_src (s, xp, size1);					\
  1229      speed_operand_src (s, s->yp, s->size);				\
  1230      speed_operand_dst (s, wp, size1 - s->size + 3);			\
  1231      speed_cache_fill (s);						\
  1232  									\
  1233      speed_starttime ();							\
  1234      i = s->reps;							\
  1235      do									\
  1236        function (wp, xp, s->yp, s->size);				\
  1237      while (--i != 0);							\
  1238      t = speed_endtime ();						\
  1239  									\
  1240      TMP_FREE;								\
  1241      return t;								\
  1242    }
  1243  
  1244  #define SPEED_ROUTINE_MPN_TOOM42_MULMID(function)			\
  1245    {									\
  1246      mp_ptr    wp, xp, scratch;						\
  1247      mp_size_t size1, scratch_size;					\
  1248      unsigned  i;							\
  1249      double    t;							\
  1250      TMP_DECL;								\
  1251  									\
  1252      size1 = 2 * s->size - 1;						\
  1253  									\
  1254      SPEED_RESTRICT_COND (s->size >= 1);					\
  1255  									\
  1256      TMP_MARK;								\
  1257      SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
  1258      SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
  1259      scratch_size = mpn_toom42_mulmid_itch (s->size);			\
  1260      SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0);			\
  1261  									\
  1262      speed_operand_src (s, xp, size1);					\
  1263      speed_operand_src (s, s->yp, s->size);				\
  1264      speed_operand_dst (s, wp, size1 - s->size + 3);			\
  1265      speed_cache_fill (s);						\
  1266  									\
  1267      speed_starttime ();							\
  1268      i = s->reps;							\
  1269      do									\
  1270        function (wp, xp, s->yp, s->size, scratch);			\
  1271      while (--i != 0);							\
  1272      t = speed_endtime ();						\
  1273  									\
  1274      TMP_FREE;								\
  1275      return t;								\
  1276    }
  1277  
  1278  #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
  1279    {									\
  1280      mp_ptr    wp, tp;							\
  1281      unsigned  i;							\
  1282      double    t;							\
  1283      mp_size_t itch;							\
  1284      TMP_DECL;								\
  1285  									\
  1286      SPEED_RESTRICT_COND (s->size >= 1);					\
  1287  									\
  1288      itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
  1289  									\
  1290      TMP_MARK;								\
  1291      SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
  1292      SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
  1293  									\
  1294      speed_operand_src (s, s->xp, s->size);				\
  1295      speed_operand_src (s, s->yp, s->size);				\
  1296      speed_operand_dst (s, wp, 2 * s->size);				\
  1297      speed_operand_dst (s, tp, itch);					\
  1298      speed_cache_fill (s);						\
  1299  									\
  1300      speed_starttime ();							\
  1301      i = s->reps;							\
  1302      do									\
  1303        call;								\
  1304      while (--i != 0);							\
  1305      t = speed_endtime ();						\
  1306  									\
  1307      TMP_FREE;								\
  1308      return t;								\
  1309    }
  1310  #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
  1311    {									\
  1312      mp_ptr    wp, tp;							\
  1313      unsigned  i;							\
  1314      double    t;							\
  1315      mp_size_t size, itch;						\
  1316      TMP_DECL;								\
  1317  									\
  1318      SPEED_RESTRICT_COND (s->size >= 1);					\
  1319  									\
  1320      size = mpn_mulmod_bnm1_next_size (s->size);				\
  1321      itch = mpn_mulmod_bnm1_itch (size, size, size);			\
  1322  									\
  1323      TMP_MARK;								\
  1324      SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp);			\
  1325      SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
  1326  									\
  1327      speed_operand_src (s, s->xp, s->size);				\
  1328      speed_operand_src (s, s->yp, s->size);				\
  1329      speed_operand_dst (s, wp, size);					\
  1330      speed_operand_dst (s, tp, itch);					\
  1331      speed_cache_fill (s);						\
  1332  									\
  1333      speed_starttime ();							\
  1334      i = s->reps;							\
  1335      do									\
  1336        function (wp, size, s->xp, s->size, s->yp, s->size, tp);		\
  1337      while (--i != 0);							\
  1338      t = speed_endtime ();						\
  1339  									\
  1340      TMP_FREE;								\
  1341      return t;								\
  1342    }
  1343  
  1344  #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
  1345    {									\
  1346      mp_ptr    wp, tspace;						\
  1347      unsigned  i;							\
  1348      double    t;							\
  1349      TMP_DECL;								\
  1350  									\
  1351      SPEED_RESTRICT_COND (s->size >= minsize);				\
  1352  									\
  1353      TMP_MARK;								\
  1354      SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
  1355      SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
  1356  									\
  1357      speed_operand_src (s, s->xp, s->size);				\
  1358      speed_operand_src (s, s->yp, s->size);				\
  1359      speed_operand_dst (s, wp, 2*s->size);				\
  1360      speed_operand_dst (s, tspace, tsize);				\
  1361      speed_cache_fill (s);						\
  1362  									\
  1363      speed_starttime ();							\
  1364      i = s->reps;							\
  1365      do									\
  1366        call;								\
  1367      while (--i != 0);							\
  1368      t = speed_endtime ();						\
  1369  									\
  1370      TMP_FREE;								\
  1371      return t;								\
  1372    }
  1373  
  1374  #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
  1375    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1376      (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
  1377       mpn_toom22_mul_itch (s->size, s->size),				\
  1378       MPN_TOOM22_MUL_MINSIZE)
  1379  
  1380  #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
  1381    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1382      (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
  1383       mpn_toom33_mul_itch (s->size, s->size),				\
  1384       MPN_TOOM33_MUL_MINSIZE)
  1385  
  1386  #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
  1387    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1388      (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
  1389       mpn_toom44_mul_itch (s->size, s->size),				\
  1390       MPN_TOOM44_MUL_MINSIZE)
  1391  
  1392  #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
  1393    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1394      (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
  1395       mpn_toom6h_mul_itch (s->size, s->size),				\
  1396       MPN_TOOM6H_MUL_MINSIZE)
  1397  
  1398  #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
  1399    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1400      (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
  1401       mpn_toom8h_mul_itch (s->size, s->size),				\
  1402       MPN_TOOM8H_MUL_MINSIZE)
  1403  
  1404  #define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
  1405    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1406      (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
  1407       mpn_toom32_mul_itch (s->size, 2*s->size/3),			\
  1408       MPN_TOOM32_MUL_MINSIZE)
  1409  
  1410  #define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
  1411    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1412      (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
  1413       mpn_toom42_mul_itch (s->size, s->size/2),				\
  1414       MPN_TOOM42_MUL_MINSIZE)
  1415  
  1416  #define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
  1417    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1418      (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
  1419       mpn_toom43_mul_itch (s->size, s->size*3/4),			\
  1420       MPN_TOOM43_MUL_MINSIZE)
  1421  
  1422  #define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
  1423    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1424      (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
  1425       mpn_toom63_mul_itch (s->size, s->size/2),				\
  1426       MPN_TOOM63_MUL_MINSIZE)
  1427  
  1428  #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
  1429    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1430      (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
  1431       mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
  1432       MPN_TOOM32_MUL_MINSIZE)
  1433  #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
  1434    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1435      (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
  1436       mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
  1437       MPN_TOOM43_MUL_MINSIZE)
  1438  
  1439  #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
  1440    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1441      (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
  1442       mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
  1443       MPN_TOOM32_MUL_MINSIZE)
  1444  #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
  1445    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1446      (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
  1447       mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
  1448       MPN_TOOM53_MUL_MINSIZE)
  1449  
  1450  #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
  1451    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1452      (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
  1453       mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
  1454       MPN_TOOM42_MUL_MINSIZE)
  1455  #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
  1456    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1457      (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
  1458       mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
  1459       MPN_TOOM53_MUL_MINSIZE)
  1460  
  1461  #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
  1462    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1463      (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
  1464       mpn_toom42_mul_itch (s->size, 5*s->size/6),			\
  1465       MPN_TOOM54_MUL_MINSIZE)
  1466  #define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)		\
  1467    SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
  1468      (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
  1469       mpn_toom54_mul_itch (s->size, 5*s->size/6),			\
  1470       MPN_TOOM54_MUL_MINSIZE)
  1471  
  1472  
  1473  
  1474  #define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
  1475    {									\
  1476      mp_ptr    wp;							\
  1477      unsigned  i;							\
  1478      double    t;							\
  1479      TMP_DECL;								\
  1480  									\
  1481      SPEED_RESTRICT_COND (s->size >= 1);					\
  1482  									\
  1483      TMP_MARK;								\
  1484      SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
  1485  									\
  1486      speed_operand_src (s, s->xp, s->size);				\
  1487      speed_operand_dst (s, wp, 2*s->size);				\
  1488      speed_cache_fill (s);						\
  1489  									\
  1490      speed_starttime ();							\
  1491      i = s->reps;							\
  1492      do									\
  1493        call;								\
  1494      while (--i != 0);							\
  1495      t = speed_endtime ();						\
  1496  									\
  1497      TMP_FREE;								\
  1498      return t;								\
  1499    }
  1500  
  1501  #define SPEED_ROUTINE_MPN_SQR(function)					\
  1502    SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
  1503  
  1504  #define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call)			\
  1505    {									\
  1506      mp_ptr    wp, tp;							\
  1507      unsigned  i;							\
  1508      double    t;							\
  1509      TMP_DECL;								\
  1510  									\
  1511      SPEED_RESTRICT_COND (s->size >= 2);					\
  1512  									\
  1513      TMP_MARK;								\
  1514      SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp);		\
  1515      SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
  1516  									\
  1517      speed_operand_src (s, s->xp, s->size);				\
  1518      speed_operand_src (s, tp, 2 * s->size);				\
  1519      speed_operand_dst (s, wp, 2 * s->size);				\
  1520      speed_cache_fill (s);						\
  1521  									\
  1522      speed_starttime ();							\
  1523      i = s->reps;							\
  1524      do									\
  1525        call;								\
  1526      while (--i != 0);							\
  1527      t = speed_endtime () / 2;						\
  1528  									\
  1529      TMP_FREE;								\
  1530      return t;								\
  1531    }
  1532  
  1533  #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
  1534    {									\
  1535      mp_ptr    wp, tspace;						\
  1536      unsigned  i;							\
  1537      double    t;							\
  1538      TMP_DECL;								\
  1539  									\
  1540      SPEED_RESTRICT_COND (s->size >= minsize);				\
  1541  									\
  1542      TMP_MARK;								\
  1543      SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
  1544      SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
  1545  									\
  1546      speed_operand_src (s, s->xp, s->size);				\
  1547      speed_operand_dst (s, wp, 2*s->size);				\
  1548      speed_operand_dst (s, tspace, tsize);				\
  1549      speed_cache_fill (s);						\
  1550  									\
  1551      speed_starttime ();							\
  1552      i = s->reps;							\
  1553      do									\
  1554        call;								\
  1555      while (--i != 0);							\
  1556      t = speed_endtime ();						\
  1557  									\
  1558      TMP_FREE;								\
  1559      return t;								\
  1560    }
  1561  
  1562  #define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
  1563    SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
  1564  				mpn_toom2_sqr_itch (s->size),		\
  1565  				MPN_TOOM2_SQR_MINSIZE)
  1566  
  1567  #define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
  1568    SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
  1569  				mpn_toom3_sqr_itch (s->size),		\
  1570  				MPN_TOOM3_SQR_MINSIZE)
  1571  
  1572  
  1573  #define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
  1574    SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
  1575  				mpn_toom4_sqr_itch (s->size),		\
  1576  				MPN_TOOM4_SQR_MINSIZE)
  1577  
  1578  #define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
  1579    SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
  1580  				mpn_toom6_sqr_itch (s->size),		\
  1581  				MPN_TOOM6_SQR_MINSIZE)
  1582  
  1583  #define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
  1584    SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
  1585  				mpn_toom8_sqr_itch (s->size),		\
  1586  				MPN_TOOM8_SQR_MINSIZE)
  1587  
  1588  #define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
  1589    {									\
  1590      unsigned   i;							\
  1591  									\
  1592      SPEED_RESTRICT_COND (s->size >= 0);					\
  1593  									\
  1594      speed_operand_src (s, s->xp, s->size);				\
  1595      speed_cache_fill (s);						\
  1596  									\
  1597      speed_starttime ();							\
  1598      i = s->reps;							\
  1599      do									\
  1600        call;								\
  1601      while (--i != 0);							\
  1602  									\
  1603      return speed_endtime ();						\
  1604    }
  1605  
  1606  #define SPEED_ROUTINE_MPN_MOD_1(function)				\
  1607     SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
  1608  
  1609  #define SPEED_ROUTINE_MPN_MOD_1C(function)				\
  1610     SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
  1611  
  1612  #define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
  1613    SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r));
  1614  
  1615  #define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
  1616    SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)));
  1617  
  1618  #define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)				\
  1619     SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
  1620  
  1621  #define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)			\
  1622    {									\
  1623      unsigned   i;							\
  1624      mp_limb_t  inv;							\
  1625  									\
  1626      SPEED_RESTRICT_COND (s->size >= 0);					\
  1627      SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);			\
  1628  									\
  1629      invert_limb (inv, s->r);						\
  1630      speed_operand_src (s, s->xp, s->size);				\
  1631      speed_cache_fill (s);						\
  1632  									\
  1633      speed_starttime ();							\
  1634      i = s->reps;							\
  1635      do									\
  1636        (*function) (s->xp, s->size, s->r, inv);				\
  1637      while (--i != 0);							\
  1638  									\
  1639      return speed_endtime ();						\
  1640    }
  1641  
  1642  #define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)			\
  1643    {									\
  1644      unsigned   i;							\
  1645      mp_limb_t  inv[4];							\
  1646  									\
  1647      SPEED_RESTRICT_COND (s->size >= 2);					\
  1648  									\
  1649      mpn_mod_1_1p_cps (inv, s->r);					\
  1650      speed_operand_src (s, s->xp, s->size);				\
  1651      speed_cache_fill (s);						\
  1652  									\
  1653      speed_starttime ();							\
  1654      i = s->reps;							\
  1655      do {								\
  1656        pfunc (inv, s->r);						\
  1657        function (s->xp, s->size, s->r << inv[1], inv);				\
  1658      } while (--i != 0);							\
  1659  									\
  1660      return speed_endtime ();						\
  1661    }
  1662  #define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
  1663    {									\
  1664      unsigned   i;							\
  1665      mp_limb_t  inv[N+3];						\
  1666  									\
  1667      SPEED_RESTRICT_COND (s->size >= 1);					\
  1668      SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N);			\
  1669  									\
  1670      speed_operand_src (s, s->xp, s->size);				\
  1671      speed_cache_fill (s);						\
  1672  									\
  1673      speed_starttime ();							\
  1674      i = s->reps;							\
  1675      do {								\
  1676        pfunc (inv, s->r);						\
  1677        function (s->xp, s->size, s->r, inv);				\
  1678      } while (--i != 0);							\
  1679  									\
  1680      return speed_endtime ();						\
  1681    }
  1682  
  1683  
  1684  /* A division of 2*s->size by s->size limbs */
  1685  
  1686  #define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
  1687    {									\
  1688      unsigned  i;							\
  1689      mp_ptr    a, d, q, r;						\
  1690      double    t;							\
  1691      gmp_pi1_t dinv;							\
  1692      TMP_DECL;								\
  1693  									\
  1694      SPEED_RESTRICT_COND (s->size >= 1);					\
  1695  									\
  1696      TMP_MARK;								\
  1697      SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
  1698      SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
  1699      SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
  1700      SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
  1701  									\
  1702      MPN_COPY (a, s->xp, s->size);					\
  1703      MPN_COPY (a+s->size, s->xp, s->size);				\
  1704  									\
  1705      MPN_COPY (d, s->yp, s->size);					\
  1706  									\
  1707      /* normalize the data */						\
  1708      d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  1709      a[2*s->size-1] = d[s->size-1] - 1;					\
  1710  									\
  1711      invert_pi1 (dinv, d[s->size-1], d[s->size-2]);			\
  1712  									\
  1713      speed_operand_src (s, a, 2*s->size);				\
  1714      speed_operand_src (s, d, s->size);					\
  1715      speed_operand_dst (s, q, s->size+1);				\
  1716      speed_operand_dst (s, r, s->size);					\
  1717      speed_cache_fill (s);						\
  1718  									\
  1719      speed_starttime ();							\
  1720      i = s->reps;							\
  1721      do									\
  1722        call;								\
  1723      while (--i != 0);							\
  1724      t = speed_endtime ();						\
  1725  									\
  1726      TMP_FREE;								\
  1727      return t;								\
  1728    }
  1729  
  1730  
  1731  /* A remainder 2*s->size by s->size limbs */
  1732  
  1733  #define SPEED_ROUTINE_MPZ_MOD(function)					\
  1734    {									\
  1735      unsigned   i;							\
  1736      mpz_t      a, d, r;							\
  1737  									\
  1738      SPEED_RESTRICT_COND (s->size >= 1);					\
  1739  									\
  1740      mpz_init_set_n (d, s->yp, s->size);					\
  1741  									\
  1742      /* high part less than d, low part a duplicate copied in */		\
  1743      mpz_init_set_n (a, s->xp, s->size);					\
  1744      mpz_mod (a, a, d);							\
  1745      mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
  1746      MPN_COPY (PTR(a), s->xp, s->size);					\
  1747  									\
  1748      mpz_init (r);							\
  1749  									\
  1750      speed_operand_src (s, PTR(a), SIZ(a));				\
  1751      speed_operand_src (s, PTR(d), SIZ(d));				\
  1752      speed_cache_fill (s);						\
  1753  									\
  1754      speed_starttime ();							\
  1755      i = s->reps;							\
  1756      do									\
  1757        function (r, a, d);						\
  1758      while (--i != 0);							\
  1759      return speed_endtime ();						\
  1760    }
  1761  
  1762  #define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)		\
  1763    {									\
  1764      unsigned   i;							\
  1765      mp_ptr     dp, tp, ap, qp;						\
  1766      gmp_pi1_t  inv;							\
  1767      double     t;							\
  1768      mp_size_t size1;							\
  1769      TMP_DECL;								\
  1770  									\
  1771      size1 = (s->r == 0 ? 2 * s->size : s->r);				\
  1772  									\
  1773      SPEED_RESTRICT_COND (s->size >= DMIN);				\
  1774      SPEED_RESTRICT_COND (size1 - s->size >= QMIN);			\
  1775  									\
  1776      TMP_MARK;								\
  1777      SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);			\
  1778      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  1779      SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
  1780      SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);			\
  1781  									\
  1782      /* we don't fill in dividend completely when size1 > s->size */	\
  1783      MPN_COPY (ap,         s->xp, s->size);				\
  1784      MPN_COPY (ap + size1 - s->size, s->xp, s->size);			\
  1785  									\
  1786      MPN_COPY (dp,         s->yp, s->size);				\
  1787  									\
  1788      /* normalize the data */						\
  1789      dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  1790      ap[size1 - 1] = dp[s->size - 1] - 1;				\
  1791  									\
  1792      invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);			\
  1793  									\
  1794      speed_operand_src (s, ap, size1);					\
  1795      speed_operand_dst (s, tp, size1);					\
  1796      speed_operand_src (s, dp, s->size);					\
  1797      speed_operand_dst (s, qp, size1 - s->size);				\
  1798      speed_cache_fill (s);						\
  1799  									\
  1800      speed_starttime ();							\
  1801      i = s->reps;							\
  1802      do {								\
  1803        MPN_COPY (tp, ap, size1);						\
  1804        function (qp, tp, size1, dp, s->size, INV);			\
  1805      } while (--i != 0);							\
  1806      t = speed_endtime ();						\
  1807  									\
  1808      TMP_FREE;								\
  1809      return t;								\
  1810    }
  1811  #define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
  1812    {									\
  1813      unsigned   i;							\
  1814      mp_ptr     dp, tp, qp, scratch;					\
  1815      double     t;							\
  1816      mp_size_t itch;							\
  1817      TMP_DECL;								\
  1818  									\
  1819      SPEED_RESTRICT_COND (s->size >= 2);					\
  1820  									\
  1821      itch = itchfn (2 * s->size, s->size, 0);				\
  1822      TMP_MARK;								\
  1823      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  1824      SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
  1825      SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
  1826      SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
  1827  									\
  1828      MPN_COPY (tp,         s->xp, s->size);				\
  1829      MPN_COPY (tp+s->size, s->xp, s->size);				\
  1830  									\
  1831      /* normalize the data */						\
  1832      dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  1833      tp[2*s->size-1] = dp[s->size-1] - 1;				\
  1834  									\
  1835      speed_operand_dst (s, qp, s->size);					\
  1836      speed_operand_src (s, tp, 2 * s->size);				\
  1837      speed_operand_src (s, dp, s->size);					\
  1838      speed_operand_dst (s, scratch, itch);				\
  1839      speed_cache_fill (s);						\
  1840  									\
  1841      speed_starttime ();							\
  1842      i = s->reps;							\
  1843      do {								\
  1844        function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
  1845      } while (--i != 0);							\
  1846      t = speed_endtime ();						\
  1847  									\
  1848      TMP_FREE;								\
  1849      return t;								\
  1850    }
  1851  #define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)			\
  1852    {									\
  1853      unsigned   i;							\
  1854      mp_ptr     dp, tp, qp, rp, scratch;					\
  1855      double     t;							\
  1856      mp_size_t size1, itch;						\
  1857      TMP_DECL;								\
  1858  									\
  1859      size1 = (s->r == 0 ? 2 * s->size : s->r);				\
  1860  									\
  1861      SPEED_RESTRICT_COND (s->size >= 2);					\
  1862      SPEED_RESTRICT_COND (size1 >= s->size);				\
  1863  									\
  1864      itch = itchfn (size1, s->size, 0);					\
  1865      TMP_MARK;								\
  1866      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  1867      SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
  1868      SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
  1869      SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
  1870      SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
  1871  									\
  1872      /* we don't fill in dividend completely when size1 > s->size */	\
  1873      MPN_COPY (tp,         s->xp, s->size);				\
  1874      MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
  1875  									\
  1876      MPN_COPY (dp,         s->yp, s->size);				\
  1877  									\
  1878      /* normalize the data */						\
  1879      dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  1880      tp[size1 - 1] = dp[s->size - 1] - 1;				\
  1881  									\
  1882      speed_operand_dst (s, qp, size1 - s->size);				\
  1883      speed_operand_dst (s, rp, s->size);					\
  1884      speed_operand_src (s, tp, size1);					\
  1885      speed_operand_src (s, dp, s->size);					\
  1886      speed_operand_dst (s, scratch, itch);				\
  1887      speed_cache_fill (s);						\
  1888  									\
  1889      speed_starttime ();							\
  1890      i = s->reps;							\
  1891      do {								\
  1892        function (qp, rp, tp, size1, dp, s->size, scratch);		\
  1893      } while (--i != 0);							\
  1894      t = speed_endtime ();						\
  1895  									\
  1896      TMP_FREE;								\
  1897      return t;								\
  1898    }
  1899  #define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn)			\
  1900    {									\
  1901      unsigned   i;							\
  1902      mp_ptr     dp, tp, qp, rp, ip, scratch, tmp;			\
  1903      double     t;							\
  1904      mp_size_t  size1, itch;						\
  1905      TMP_DECL;								\
  1906  									\
  1907      size1 = (s->r == 0 ? 2 * s->size : s->r);				\
  1908  									\
  1909      SPEED_RESTRICT_COND (s->size >= 2);					\
  1910      SPEED_RESTRICT_COND (size1 >= s->size);				\
  1911  									\
  1912      itch = itchfn (size1, s->size, s->size);				\
  1913      TMP_MARK;								\
  1914      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  1915      SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
  1916      SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
  1917      SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
  1918      SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
  1919      SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */	\
  1920  									\
  1921      /* we don't fill in dividend completely when size1 > s->size */	\
  1922      MPN_COPY (tp,         s->xp, s->size);				\
  1923      MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
  1924  									\
  1925      MPN_COPY (dp,         s->yp, s->size);				\
  1926  									\
  1927      /* normalize the data */						\
  1928      dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  1929      tp[size1 - 1] = dp[s->size-1] - 1;					\
  1930  									\
  1931      tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size));			\
  1932      mpn_invert (ip, dp, s->size, tmp);					\
  1933  									\
  1934      speed_operand_dst (s, qp, size1 - s->size);				\
  1935      speed_operand_dst (s, rp, s->size);					\
  1936      speed_operand_src (s, tp, size1);					\
  1937      speed_operand_src (s, dp, s->size);					\
  1938      speed_operand_src (s, ip, s->size);					\
  1939      speed_operand_dst (s, scratch, itch);				\
  1940      speed_cache_fill (s);						\
  1941  									\
  1942      speed_starttime ();							\
  1943      i = s->reps;							\
  1944      do {								\
  1945        function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch);	\
  1946      } while (--i != 0);							\
  1947      t = speed_endtime ();						\
  1948  									\
  1949      TMP_FREE;								\
  1950      return t;								\
  1951    }
  1952  
  1953  #define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function)				\
  1954    {									\
  1955      unsigned   i;							\
  1956      mp_ptr     dp, tp, ap, qp;						\
  1957      mp_limb_t  inv;							\
  1958      double     t;							\
  1959      TMP_DECL;								\
  1960  									\
  1961      SPEED_RESTRICT_COND (s->size >= 1);					\
  1962  									\
  1963      TMP_MARK;								\
  1964      SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
  1965      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  1966      SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
  1967      SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
  1968  									\
  1969      MPN_COPY (ap,         s->xp, s->size);				\
  1970      MPN_COPY (ap+s->size, s->xp, s->size);				\
  1971  									\
  1972      /* divisor must be odd */						\
  1973      MPN_COPY (dp, s->yp, s->size);					\
  1974      dp[0] |= 1;								\
  1975      binvert_limb (inv, dp[0]);						\
  1976      inv = -inv;								\
  1977  									\
  1978      speed_operand_src (s, ap, 2*s->size);				\
  1979      speed_operand_dst (s, tp, 2*s->size);				\
  1980      speed_operand_src (s, dp, s->size);					\
  1981      speed_operand_dst (s, qp, s->size);					\
  1982      speed_cache_fill (s);						\
  1983  									\
  1984      speed_starttime ();							\
  1985      i = s->reps;							\
  1986      do {								\
  1987        MPN_COPY (tp, ap, 2*s->size);					\
  1988        function (qp, tp, 2*s->size, dp, s->size, inv);			\
  1989      } while (--i != 0);							\
  1990      t = speed_endtime ();						\
  1991  									\
  1992      TMP_FREE;								\
  1993      return t;								\
  1994    }
  1995  #define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function)				\
  1996    {									\
  1997      unsigned   i;							\
  1998      mp_ptr     dp, tp, qp;						\
  1999      mp_limb_t  inv;							\
  2000      double     t;							\
  2001      TMP_DECL;								\
  2002  									\
  2003      SPEED_RESTRICT_COND (s->size >= 1);					\
  2004  									\
  2005      TMP_MARK;								\
  2006      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  2007      SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
  2008      SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2);			\
  2009  									\
  2010      /* divisor must be odd */						\
  2011      MPN_COPY (dp, s->yp, s->size);					\
  2012      dp[0] |= 1;								\
  2013      binvert_limb (inv, dp[0]);						\
  2014      inv = -inv;								\
  2015  									\
  2016      speed_operand_src (s, s->xp, s->size);				\
  2017      speed_operand_dst (s, tp, s->size);					\
  2018      speed_operand_src (s, dp, s->size);					\
  2019      speed_operand_dst (s, qp, s->size);					\
  2020      speed_cache_fill (s);						\
  2021  									\
  2022      speed_starttime ();							\
  2023      i = s->reps;							\
  2024      do {								\
  2025        MPN_COPY (tp, s->xp, s->size);					\
  2026        function (qp, tp, s->size, dp, s->size, inv);			\
  2027      } while (--i != 0);							\
  2028      t = speed_endtime ();						\
  2029  									\
  2030      TMP_FREE;								\
  2031      return t;								\
  2032    }
  2033  #define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn)			\
  2034    {									\
  2035      unsigned   i;							\
  2036      mp_ptr     dp, qp, scratch;						\
  2037      double     t;							\
  2038      mp_size_t itch;							\
  2039      TMP_DECL;								\
  2040  									\
  2041      SPEED_RESTRICT_COND (s->size >= 2);					\
  2042  									\
  2043      itch = itchfn (s->size, s->size);					\
  2044      TMP_MARK;								\
  2045      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  2046      SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
  2047      SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
  2048  									\
  2049      /* divisor must be odd */						\
  2050      MPN_COPY (dp, s->yp, s->size);					\
  2051      dp[0] |= 1;								\
  2052  									\
  2053      speed_operand_dst (s, qp, s->size);					\
  2054      speed_operand_src (s, s->xp, s->size);				\
  2055      speed_operand_src (s, dp, s->size);					\
  2056      speed_operand_dst (s, scratch, itch);				\
  2057      speed_cache_fill (s);						\
  2058  									\
  2059      speed_starttime ();							\
  2060      i = s->reps;							\
  2061      do {								\
  2062        function (qp, s->xp, s->size, dp, s->size, scratch);		\
  2063      } while (--i != 0);							\
  2064      t = speed_endtime ();						\
  2065  									\
  2066      TMP_FREE;								\
  2067      return t;								\
  2068    }
  2069  #define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn)			\
  2070    {									\
  2071      unsigned   i;							\
  2072      mp_ptr     dp, tp, qp, rp, scratch;					\
  2073      double     t;							\
  2074      mp_size_t itch;							\
  2075      TMP_DECL;								\
  2076  									\
  2077      SPEED_RESTRICT_COND (s->size >= 2);					\
  2078  									\
  2079      itch = itchfn (2 * s->size, s->size);				\
  2080      TMP_MARK;								\
  2081      SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
  2082      SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
  2083      SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
  2084      SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
  2085      SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
  2086  									\
  2087      MPN_COPY (tp,         s->xp, s->size);				\
  2088      MPN_COPY (tp+s->size, s->xp, s->size);				\
  2089  									\
  2090      /* divisor must be odd */						\
  2091      MPN_COPY (dp, s->yp, s->size);					\
  2092      dp[0] |= 1;								\
  2093  									\
  2094      speed_operand_dst (s, qp, s->size);					\
  2095      speed_operand_dst (s, rp, s->size);					\
  2096      speed_operand_src (s, tp, 2 * s->size);				\
  2097      speed_operand_src (s, dp, s->size);					\
  2098      speed_operand_dst (s, scratch, itch);				\
  2099      speed_cache_fill (s);						\
  2100  									\
  2101      speed_starttime ();							\
  2102      i = s->reps;							\
  2103      do {								\
  2104        function (qp, rp, tp, 2 * s->size, dp, s->size, scratch);		\
  2105      } while (--i != 0);							\
  2106      t = speed_endtime ();						\
  2107  									\
  2108      TMP_FREE;								\
  2109      return t;								\
  2110    }
  2111  
  2112  #define SPEED_ROUTINE_MPN_BROOT(function)	\
  2113    {						\
  2114      SPEED_RESTRICT_COND (s->r & 1);		\
  2115      s->xp[0] |= 1;				\
  2116      SPEED_ROUTINE_MPN_UNARY_1_CALL		\
  2117        ((*function) (wp, s->xp, s->size, s->r));	\
  2118    }
  2119  
  2120  #define SPEED_ROUTINE_MPN_BROOTINV(function, itch)	\
  2121    {							\
  2122      mp_ptr    wp, tp;					\
  2123      unsigned  i;					\
  2124      double    t;					\
  2125      TMP_DECL;						\
  2126      TMP_MARK;						\
  2127      SPEED_RESTRICT_COND (s->size >= 1);			\
  2128      SPEED_RESTRICT_COND (s->r & 1);			\
  2129      wp = TMP_ALLOC_LIMBS (s->size);			\
  2130      tp = TMP_ALLOC_LIMBS ( (itch));			\
  2131      s->xp[0] |= 1;					\
  2132  							\
  2133      speed_operand_src (s, s->xp, s->size);		\
  2134      speed_operand_dst (s, wp, s->size);			\
  2135      speed_cache_fill (s);				\
  2136  							\
  2137      speed_starttime ();					\
  2138      i = s->reps;					\
  2139      do							\
  2140        (*function) (wp, s->xp, s->size, s->r, tp);	\
  2141      while (--i != 0);					\
  2142      t = speed_endtime ();				\
  2143  							\
  2144      TMP_FREE;						\
  2145      return t;						\
  2146    }
  2147  
  2148  #define SPEED_ROUTINE_MPN_INVERT(function,itchfn)			\
  2149    {									\
  2150      long  i;								\
  2151      mp_ptr    up, tp, ip;						\
  2152      double    t;							\
  2153      TMP_DECL;								\
  2154  									\
  2155      SPEED_RESTRICT_COND (s->size >= 1);					\
  2156  									\
  2157      TMP_MARK;								\
  2158      SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
  2159      SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
  2160      SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
  2161  									\
  2162      MPN_COPY (up, s->xp, s->size);					\
  2163  									\
  2164      /* normalize the data */						\
  2165      up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  2166  									\
  2167      speed_operand_src (s, up, s->size);					\
  2168      speed_operand_dst (s, tp, s->size);					\
  2169      speed_operand_dst (s, ip, s->size);					\
  2170      speed_cache_fill (s);						\
  2171  									\
  2172      speed_starttime ();							\
  2173      i = s->reps;							\
  2174      do									\
  2175        function (ip, up, s->size, tp);					\
  2176      while (--i != 0);							\
  2177      t = speed_endtime ();						\
  2178  									\
  2179      TMP_FREE;								\
  2180      return t;								\
  2181    }
  2182  
  2183  #define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn)			\
  2184    {									\
  2185      long  i;								\
  2186      mp_ptr    up, tp, ip;						\
  2187      double    t;							\
  2188      TMP_DECL;								\
  2189  									\
  2190      SPEED_RESTRICT_COND (s->size >= 1);					\
  2191  									\
  2192      TMP_MARK;								\
  2193      SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
  2194      SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
  2195      SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
  2196  									\
  2197      MPN_COPY (up, s->xp, s->size);					\
  2198  									\
  2199      /* normalize the data */						\
  2200      up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  2201  									\
  2202      speed_operand_src (s, up, s->size);					\
  2203      speed_operand_dst (s, tp, s->size);					\
  2204      speed_operand_dst (s, ip, s->size);					\
  2205      speed_cache_fill (s);						\
  2206  									\
  2207      speed_starttime ();							\
  2208      i = s->reps;							\
  2209      do									\
  2210        function (ip, up, s->size, tp);					\
  2211      while (--i != 0);							\
  2212      t = speed_endtime ();						\
  2213  									\
  2214      TMP_FREE;								\
  2215      return t;								\
  2216    }
  2217  
  2218  #define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn)		\
  2219    {									\
  2220      long  i;								\
  2221      mp_ptr    up, tp, ip;						\
  2222      double    t;							\
  2223      TMP_DECL;								\
  2224  									\
  2225      SPEED_RESTRICT_COND (s->size >= 3);					\
  2226  									\
  2227      TMP_MARK;								\
  2228      SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
  2229      SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
  2230      SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
  2231  									\
  2232      MPN_COPY (up, s->xp, s->size);					\
  2233  									\
  2234      /* normalize the data */						\
  2235      up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
  2236  									\
  2237      speed_operand_src (s, up, s->size);					\
  2238      speed_operand_dst (s, tp, s->size);					\
  2239      speed_operand_dst (s, ip, s->size);					\
  2240      speed_cache_fill (s);						\
  2241  									\
  2242      speed_starttime ();							\
  2243      i = s->reps;							\
  2244      do									\
  2245        function (ip, up, s->size, tp);					\
  2246      while (--i != 0);							\
  2247      t = speed_endtime ();						\
  2248  									\
  2249      TMP_FREE;								\
  2250      return t;								\
  2251    }
  2252  
  2253  #define SPEED_ROUTINE_MPN_BINVERT(function,itchfn)			\
  2254    {									\
  2255      long  i;								\
  2256      mp_ptr    up, tp, ip;						\
  2257      double    t;							\
  2258      TMP_DECL;								\
  2259  									\
  2260      SPEED_RESTRICT_COND (s->size >= 1);					\
  2261  									\
  2262      TMP_MARK;								\
  2263      SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
  2264      SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
  2265      SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
  2266  									\
  2267      MPN_COPY (up, s->xp, s->size);					\
  2268  									\
  2269      /* normalize the data */						\
  2270      up[0] |= 1;								\
  2271  									\
  2272      speed_operand_src (s, up, s->size);					\
  2273      speed_operand_dst (s, tp, s->size);					\
  2274      speed_operand_dst (s, ip, s->size);					\
  2275      speed_cache_fill (s);						\
  2276  									\
  2277      speed_starttime ();							\
  2278      i = s->reps;							\
  2279      do									\
  2280        function (ip, up, s->size, tp);					\
  2281      while (--i != 0);							\
  2282      t = speed_endtime ();						\
  2283  									\
  2284      TMP_FREE;								\
  2285      return t;								\
  2286    }
  2287  
        /* Function body timing
             function (ip, up, mp, n, 2*n*GMP_NUMB_BITS, tp)
           with n = s->size >= 1.  mp is a copy of s->yp with bit 0 set.  up is
           refilled from s->xp inside the timed loop before every call, so that
           copy is included in the measured time (presumably because function
           overwrites up -- confirm against the routine being measured).
           Note that up is registered as a source operand before anything has
           been written to it, mp is filled only after speed_cache_fill, and mp
           is not registered as an operand at all.  */
  2288  #define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn)			\
  2289    {									\
  2290      long  i;								\
  2291      mp_ptr    up, mp, tp, ip;						\
  2292      double    t;							\
  2293      TMP_DECL;								\
  2294  									\
  2295      SPEED_RESTRICT_COND (s->size >= 1);					\
  2296  									\
  2297      TMP_MARK;								\
  2298      SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
  2299      SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
  2300      SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp);			\
  2301      SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
  2302  									\
  2303      speed_operand_src (s, up, s->size);					\
  2304      speed_operand_dst (s, tp, s->size);					\
  2305      speed_operand_dst (s, ip, s->size);					\
  2306      speed_cache_fill (s);						\
  2307  									\
  2308      MPN_COPY (mp, s->yp, s->size);					\
  2309      /* Must be odd */							\
  2310      mp[0] |= 1;								\
  2311      speed_starttime ();							\
  2312      i = s->reps;							\
  2313      do									\
  2314        {									\
  2315  	MPN_COPY (up, s->xp, s->size);					\
  2316  	function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp);	\
  2317        }									\
  2318      while (--i != 0);							\
  2319      t = speed_endtime ();						\
  2320  									\
  2321      TMP_FREE;								\
  2322      return t;								\
  2323    }
  2324  
        /* Function body timing function (cp, tp, mp, n, inv) with
           n = s->size >= 1.  ap holds two back-to-back copies of s->xp (2*n
           limbs) and is copied into tp before every call, inside the timed
           loop.  mp is s->yp with bit 0 set, and inv is the negated result of
           binvert_limb on mp[0].  ap and tp are allocated and registered with
           2*n+1 limbs, although only 2*n limbs are ever copied into them.  */
  2325  #define SPEED_ROUTINE_REDC_1(function)					\
  2326    {									\
  2327      unsigned   i;							\
  2328      mp_ptr     cp, mp, tp, ap;						\
  2329      mp_limb_t  inv;							\
  2330      double     t;							\
  2331      TMP_DECL;								\
  2332  									\
  2333      SPEED_RESTRICT_COND (s->size >= 1);					\
  2334  									\
  2335      TMP_MARK;								\
  2336      SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
  2337      SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
  2338      SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
  2339      SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
  2340  									\
  2341      MPN_COPY (ap,         s->xp, s->size);				\
  2342      MPN_COPY (ap+s->size, s->xp, s->size);				\
  2343  									\
  2344      /* modulus must be odd */						\
  2345      MPN_COPY (mp, s->yp, s->size);					\
  2346      mp[0] |= 1;								\
  2347      binvert_limb (inv, mp[0]);						\
  2348      inv = -inv;								\
  2349  									\
  2350      speed_operand_src (s, ap, 2*s->size+1);				\
  2351      speed_operand_dst (s, tp, 2*s->size+1);				\
  2352      speed_operand_src (s, mp, s->size);					\
  2353      speed_operand_dst (s, cp, s->size);					\
  2354      speed_cache_fill (s);						\
  2355  									\
  2356      speed_starttime ();							\
  2357      i = s->reps;							\
  2358      do {								\
  2359        MPN_COPY (tp, ap, 2*s->size);					\
  2360        function (cp, tp, mp, s->size, inv);				\
  2361      } while (--i != 0);							\
  2362      t = speed_endtime ();						\
  2363  									\
  2364      TMP_FREE;								\
  2365      return t;								\
  2366    }
        /* Function body timing function (cp, tp, mp, n, invp) with
           n = s->size >= 1, where invp is a 2-limb value.  ap holds two
           back-to-back copies of s->xp and is copied into tp before every call,
           inside the timed loop.  mp is s->yp with bit 0 set.  invp comes from
           mpn_binvert over the low 2 limbs of mp (tp doubles as its scratch
           before the timed loop) and is then negated as a 2-limb number:
           low limb negated, high limb complemented.
           NOTE(review): mpn_binvert is asked to read 2 limbs of mp, but mp is
           only requested with s->size limbs and s->size == 1 is permitted --
           confirm whether size 1 is meant to be supported here.  */
  2367  #define SPEED_ROUTINE_REDC_2(function)					\
  2368    {									\
  2369      unsigned   i;							\
  2370      mp_ptr     cp, mp, tp, ap;						\
  2371      mp_limb_t  invp[2];							\
  2372      double     t;							\
  2373      TMP_DECL;								\
  2374  									\
  2375      SPEED_RESTRICT_COND (s->size >= 1);					\
  2376  									\
  2377      TMP_MARK;								\
  2378      SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
  2379      SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
  2380      SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
  2381      SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
  2382  									\
  2383      MPN_COPY (ap,         s->xp, s->size);				\
  2384      MPN_COPY (ap+s->size, s->xp, s->size);				\
  2385  									\
  2386      /* modulus must be odd */						\
  2387      MPN_COPY (mp, s->yp, s->size);					\
  2388      mp[0] |= 1;								\
  2389      mpn_binvert (invp, mp, 2, tp);					\
  2390      invp[0] = -invp[0]; invp[1] = ~invp[1];				\
  2391  									\
  2392      speed_operand_src (s, ap, 2*s->size+1);				\
  2393      speed_operand_dst (s, tp, 2*s->size+1);				\
  2394      speed_operand_src (s, mp, s->size);					\
  2395      speed_operand_dst (s, cp, s->size);					\
  2396      speed_cache_fill (s);						\
  2397  									\
  2398      speed_starttime ();							\
  2399      i = s->reps;							\
  2400      do {								\
  2401        MPN_COPY (tp, ap, 2*s->size);					\
  2402        function (cp, tp, mp, s->size, invp);				\
  2403      } while (--i != 0);							\
  2404      t = speed_endtime ();						\
  2405  									\
  2406      TMP_FREE;								\
  2407      return t;								\
  2408    }
        /* Function body timing function (cp, tp, mp, n, invp) with n = s->size,
           which must be greater than 8.  invp is an n-limb value produced by
           mpn_binvert from mp (s->yp with bit 0 set), using tp as scratch before
           the timed loop; unlike the REDC_1 and REDC_2 bodies it is passed on
           without being negated.  ap holds two back-to-back copies of s->xp and
           is copied into tp before every call, inside the timed loop.  */
  2409  #define SPEED_ROUTINE_REDC_N(function)					\
  2410    {									\
  2411      unsigned   i;							\
  2412      mp_ptr     cp, mp, tp, ap, invp;					\
  2413      double     t;							\
  2414      TMP_DECL;								\
  2415  									\
  2416      SPEED_RESTRICT_COND (s->size > 8);					\
  2417  									\
  2418      TMP_MARK;								\
  2419      SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
  2420      SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
  2421      SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
  2422      SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
  2423      SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); /* align? */	\
  2424  									\
  2425      MPN_COPY (ap,         s->xp, s->size);				\
  2426      MPN_COPY (ap+s->size, s->xp, s->size);				\
  2427  									\
  2428      /* modulus must be odd */						\
  2429      MPN_COPY (mp, s->yp, s->size);					\
  2430      mp[0] |= 1;								\
  2431      mpn_binvert (invp, mp, s->size, tp);				\
  2432  									\
  2433      speed_operand_src (s, ap, 2*s->size+1);				\
  2434      speed_operand_dst (s, tp, 2*s->size+1);				\
  2435      speed_operand_src (s, mp, s->size);					\
  2436      speed_operand_dst (s, cp, s->size);					\
  2437      speed_cache_fill (s);						\
  2438  									\
  2439      speed_starttime ();							\
  2440      i = s->reps;							\
  2441      do {								\
  2442        MPN_COPY (tp, ap, 2*s->size);					\
  2443        function (cp, tp, mp, s->size, invp);				\
  2444      } while (--i != 0);							\
  2445      t = speed_endtime ();						\
  2446  									\
  2447      TMP_FREE;								\
  2448      return t;								\
  2449    }
  2450  
  2451  
  2452  #define SPEED_ROUTINE_MPN_POPCOUNT(function)				\
  2453    {									\
  2454      unsigned i;								\
  2455  									\
  2456      SPEED_RESTRICT_COND (s->size >= 1);					\
  2457  									\
  2458      speed_operand_src (s, s->xp, s->size);				\
  2459      speed_cache_fill (s);						\
  2460  									\
  2461      speed_starttime ();							\
  2462      i = s->reps;							\
  2463      do									\
  2464        function (s->xp, s->size);					\
  2465      while (--i != 0);							\
  2466  									\
  2467      return speed_endtime ();						\
  2468    }
  2469  
  2470  #define SPEED_ROUTINE_MPN_HAMDIST(function)				\
  2471    {									\
  2472      unsigned i;								\
  2473  									\
  2474      SPEED_RESTRICT_COND (s->size >= 1);					\
  2475  									\
  2476      speed_operand_src (s, s->xp, s->size);				\
  2477      speed_operand_src (s, s->yp, s->size);				\
  2478      speed_cache_fill (s);						\
  2479  									\
  2480      speed_starttime ();							\
  2481      i = s->reps;							\
  2482      do									\
  2483        function (s->xp, s->yp, s->size);					\
  2484      while (--i != 0);							\
  2485  									\
  2486      return speed_endtime ();						\
  2487    }
  2488  
  2489  
  2490  #define SPEED_ROUTINE_MPZ_UI(function)					\
  2491    {									\
  2492      mpz_t     z;							\
  2493      unsigned  i;							\
  2494      double    t;							\
  2495  									\
  2496      SPEED_RESTRICT_COND (s->size >= 0);					\
  2497  									\
  2498      mpz_init (z);							\
  2499  									\
  2500      speed_starttime ();							\
  2501      i = s->reps;							\
  2502      do									\
  2503        function (z, s->size);						\
  2504      while (--i != 0);							\
  2505      t = speed_endtime ();						\
  2506  									\
  2507      mpz_clear (z);							\
  2508      return t;								\
  2509    }
  2510  
        /* The FAC_UI, FIB_UI and LUCNUM_UI bodies are plain aliases for
           SPEED_ROUTINE_MPZ_UI: each times function (mpz_t, s->size).  */
  2511  #define SPEED_ROUTINE_MPZ_FAC_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
  2512  #define SPEED_ROUTINE_MPZ_FIB_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
  2513  #define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
  2514  
  2515  
  2516  #define SPEED_ROUTINE_MPZ_2_UI(function)				\
  2517    {									\
  2518      mpz_t     z, z2;							\
  2519      unsigned  i;							\
  2520      double    t;							\
  2521  									\
  2522      SPEED_RESTRICT_COND (s->size >= 0);					\
  2523  									\
  2524      mpz_init (z);							\
  2525      mpz_init (z2);							\
  2526  									\
  2527      speed_starttime ();							\
  2528      i = s->reps;							\
  2529      do									\
  2530        function (z, z2, s->size);					\
  2531      while (--i != 0);							\
  2532      t = speed_endtime ();						\
  2533  									\
  2534      mpz_clear (z);							\
  2535      mpz_clear (z2);							\
  2536      return t;								\
  2537    }
  2538  
        /* The FIB2_UI and LUCNUM2_UI bodies are plain aliases for
           SPEED_ROUTINE_MPZ_2_UI: each times function (mpz_t, mpz_t, s->size).  */
  2539  #define SPEED_ROUTINE_MPZ_FIB2_UI(function)    SPEED_ROUTINE_MPZ_2_UI(function)
  2540  #define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
  2541  
  2542  
  2543  #define SPEED_ROUTINE_MPN_FIB2_UI(function)				\
  2544    {									\
  2545      mp_ptr     fp, f1p;							\
  2546      mp_size_t  alloc;							\
  2547      unsigned   i;							\
  2548      double     t;							\
  2549      TMP_DECL;								\
  2550  									\
  2551      SPEED_RESTRICT_COND (s->size >= 0);					\
  2552  									\
  2553      TMP_MARK;								\
  2554      alloc = MPN_FIB2_SIZE (s->size);					\
  2555      SPEED_TMP_ALLOC_LIMBS (fp,	alloc, s->align_xp);			\
  2556      SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp);			\
  2557  									\
  2558      speed_starttime ();							\
  2559      i = s->reps;							\
  2560      do									\
  2561        function (fp, f1p, s->size);					\
  2562      while (--i != 0);							\
  2563      t = speed_endtime ();						\
  2564  									\
  2565      TMP_FREE;								\
  2566      return t;								\
  2567    }
  2568  
  2569  
  2570  
  2571  /* Time function (result, base, expo, mod), i.e. base^expo mod mod.  base
  2572     and mod are s->size limbs from s->xp and s->yp; mod gets bit 0 set so
  2573     that it is odd and redc can be used.  expo is always the first 6 limbs
           of s->xp_block, which keeps the calculation from taking too long. */
  2574  #define SPEED_ROUTINE_MPZ_POWM(function)				\
  2575    {									\
  2576      mpz_t     result, base, expo, mod;					\
  2577      unsigned  remaining;						\
  2578      double    elapsed;							\
  2579  									\
  2580      SPEED_RESTRICT_COND (s->size >= 1);					\
  2581  									\
  2582      mpz_init (result);							\
  2583      mpz_init_set_n (base, s->xp, s->size);				\
  2584      mpz_init_set_n (mod, s->yp, s->size);				\
  2585      mpz_setbit (mod, 0);	/* modulus must be odd */		\
  2586      mpz_init_set_n (expo, s->xp_block, 6);				\
  2587  									\
  2588      speed_starttime ();							\
  2589      remaining = s->reps;						\
  2590      do {								\
  2591        function (result, base, expo, mod);				\
  2592      } while (--remaining != 0);						\
  2593      elapsed = speed_endtime ();						\
  2594  									\
  2595      mpz_clear (result);							\
  2596      mpz_clear (base);							\
  2597      mpz_clear (expo);							\
  2598      mpz_clear (mod);							\
  2599      return elapsed;							\
  2600    }
  2601  
  2602  /* (m-2)^e mod m, where e is s->r if that is non-zero and (~0UL)/3 (the
           repeating bit pattern 0x55...55) otherwise.  m is s->size limbs copied
           from s->xp and forced odd.  Requires s->size >= 1; the value returned
           is what speed_endtime reports for s->reps calls of
           function (r, b, e, m).  */
  2603  #define SPEED_ROUTINE_MPZ_POWM_UI(function)				\
  2604    {									\
  2605      mpz_t     r, b, m;							\
  2606      unsigned  long  e;							\
  2607      unsigned  i;							\
  2608      double    t;							\
  2609  									\
  2610      SPEED_RESTRICT_COND (s->size >= 1);					\
  2611  									\
  2612      mpz_init (r);							\
  2613  									\
  2614      /* force m to odd; done through mpz_setbit, as in			\
               SPEED_ROUTINE_MPZ_POWM, so that m's size field is updated too	\
               should the copied value happen to be zero */			\
  2615      mpz_init (m);							\
  2616      mpz_set_n (m, s->xp, s->size);					\
  2617      mpz_setbit (m, 0);							\
  2618  									\
  2619      e = (~ (unsigned long) 0) / 3;					\
  2620      if (s->r != 0)							\
  2621        e = s->r;								\
  2622  									\
  2623      mpz_init_set (b, m);						\
  2624      mpz_sub_ui (b, b, 2);						\
  2626      i = s->reps;							\
  2627      speed_starttime ();							\
  2628      do									\
  2629        function (r, b, e, m);						\
  2630      while (--i != 0);							\
  2631      t = speed_endtime ();						\
  2632  									\
  2633      mpz_clear (r);							\
  2634      mpz_clear (b);							\
  2635      mpz_clear (m);							\
  2636      return t;								\
  2637    }
  2638  
  2639  
        /* Function body timing the expression `call', which may refer to the
           locals wp, wp2 (two s->size limb destinations) and xp, yp (the two
           sources).  s->r selects how sources and destinations overlap:
             0  xp = s->xp, yp = s->yp (no overlap)
             1  xp = wp
             2  yp = wp2
             3  xp = wp,  yp = wp2
             4  xp = wp2, yp = wp
           Any other s->r frees the temporaries and returns -1.0.  Whenever a
           source is redirected onto a destination, the original data is copied
           there first.  Requires s->size >= 0.  */
  2640  #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)				\
  2641    {									\
  2642      mp_ptr    wp, wp2, xp, yp;						\
  2643      unsigned  i;							\
  2644      double    t;							\
  2645      TMP_DECL;								\
  2646  									\
  2647      SPEED_RESTRICT_COND (s->size >= 0);					\
  2648  									\
  2649      TMP_MARK;								\
  2650      SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
  2651      SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
  2652      xp = s->xp;								\
  2653      yp = s->yp;								\
  2654  									\
  2655      if (s->r == 0)	;						\
  2656      else if (s->r == 1) { xp = wp;	      }				\
  2657      else if (s->r == 2) {	    yp = wp2; }				\
  2658      else if (s->r == 3) { xp = wp;  yp = wp2; }				\
  2659      else if (s->r == 4) { xp = wp2; yp = wp;  }				\
  2660      else {								\
  2661        TMP_FREE;								\
  2662        return -1.0;							\
  2663      }									\
  2664      if (xp != s->xp) MPN_COPY (xp, s->xp, s->size);			\
  2665      if (yp != s->yp) MPN_COPY (yp, s->yp, s->size);			\
  2666  									\
  2667      speed_operand_src (s, xp, s->size);					\
  2668      speed_operand_src (s, yp, s->size);					\
  2669      speed_operand_dst (s, wp, s->size);					\
  2670      speed_operand_dst (s, wp2, s->size);				\
  2671      speed_cache_fill (s);						\
  2672  									\
  2673      speed_starttime ();							\
  2674      i = s->reps;							\
  2675      do									\
  2676        call;								\
  2677      while (--i != 0);							\
  2678      t = speed_endtime ();						\
  2679  									\
  2680      TMP_FREE;								\
  2681      return t;								\
  2682    }
  2683  
        /* SPEED_ROUTINE_MPN_ADDSUB_CALL body timing
           function (wp, wp2, xp, yp, s->size).  The expansion ends in a stray
           `;' after the closing brace of the body.  */
  2684  #define SPEED_ROUTINE_MPN_ADDSUB_N(function)				\
  2685    SPEED_ROUTINE_MPN_ADDSUB_CALL						\
  2686      (function (wp, wp2, xp, yp, s->size));
  2687  
        /* SPEED_ROUTINE_MPN_ADDSUB_CALL body timing
           function (wp, wp2, xp, yp, s->size, 0), i.e. the variant taking an
           extra final argument, always passed as 0.  The expansion ends in a
           stray `;' after the closing brace of the body.  */
  2688  #define SPEED_ROUTINE_MPN_ADDSUB_NC(function)				\
  2689    SPEED_ROUTINE_MPN_ADDSUB_CALL						\
  2690      (function (wp, wp2, xp, yp, s->size, 0));
  2691  
  2692  
  2693  /* Doing an Nx1 gcd with the given r: times function (xp, s->size, s->r),
           where xp is a private copy of s->xp.  Requires s->size >= 1 and
           s->r != 0.  The value returned is what speed_endtime reports for
           s->reps consecutive calls.  */
  2694  #define SPEED_ROUTINE_MPN_GCD_1N(function)				\
  2695    {									\
  2696      mp_ptr    xp;							\
  2697      unsigned  i;							\
  2698      double    t;							\
  2699      TMP_DECL;								\
  2700  									\
  2701      SPEED_RESTRICT_COND (s->size >= 1);					\
  2702      SPEED_RESTRICT_COND (s->r != 0);					\
  2703  									\
  2704      TMP_MARK;								\
  2705      SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
  2706      MPN_COPY (xp, s->xp, s->size);					\
            /* keep the operand non-zero (assumes refmpn_zero_p yields 1 for	\
               an all-zero operand -- confirm) */				\
  2707      xp[0] |= refmpn_zero_p (xp, s->size);				\
  2708  									\
            /* register the buffer the timed call actually reads, which is	\
               the private copy and not s->xp */				\
  2709      speed_operand_src (s, xp, s->size);					\
  2710      speed_cache_fill (s);						\
  2711  									\
  2712      speed_starttime ();							\
  2713      i = s->reps;							\
  2714      do									\
  2715        function (xp, s->size, s->r);					\
  2716      while (--i != 0);							\
  2717      t = speed_endtime ();						\
  2718  									\
  2719      TMP_FREE;								\
  2720      return t;								\
  2721    }
  2722  
  2723  
  2724  /* SPEED_BLOCK_SIZE many single-limb GCDs, with x of s->size bits and y of
           s->r bits (or s->size bits when s->r is zero). */
  2725  
        /* Function body timing `call' once per element of a SPEED_BLOCK_SIZE
           block, for s->reps passes over the block.  px and py are copies of
           s->xp_block and s->yp_block; each px[i] is masked to the low s->size
           bits and each py[i] to the low s->r bits (s->size bits when s->r is
           zero), and a value that masks to zero is replaced by 1.  `setup' runs
           once per element after the masking and may use px, py and i; `call'
           may use px, py and j, with j counting down from SPEED_BLOCK_SIZE to 1.
           s->time_divisor is set to SPEED_BLOCK_SIZE.  Requires
           1 <= s->size <= mp_bits_per_limb.
           NOTE(review): s->r is not range-checked before being handed to
           MP_LIMB_T_LOWBITMASK, which shifts by it; a value above the limb
           width would be an out-of-range shift -- confirm callers never pass
           one.  */
  2726  #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)			\
  2727    {									\
  2728      unsigned  i, j;							\
  2729      mp_ptr    px, py;							\
  2730      mp_limb_t x_mask, y_mask;						\
  2731      double    t;							\
  2732      TMP_DECL;								\
  2733  									\
  2734      SPEED_RESTRICT_COND (s->size >= 1);					\
  2735      SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);			\
  2736  									\
  2737      TMP_MARK;								\
  2738      SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);		\
  2739      SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);		\
  2740      MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);			\
  2741      MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);			\
  2742  									\
  2743      x_mask = MP_LIMB_T_LOWBITMASK (s->size);				\
  2744      y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);		\
  2745      for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
  2746        {									\
  2747  	px[i] &= x_mask; px[i] += (px[i] == 0);				\
  2748  	py[i] &= y_mask; py[i] += (py[i] == 0);				\
  2749  	setup;								\
  2750        }									\
  2751  									\
  2752      speed_operand_src (s, px, SPEED_BLOCK_SIZE);			\
  2753      speed_operand_src (s, py, SPEED_BLOCK_SIZE);			\
  2754      speed_cache_fill (s);						\
  2755  									\
  2756      speed_starttime ();							\
  2757      i = s->reps;							\
  2758      do									\
  2759        {									\
  2760  	j = SPEED_BLOCK_SIZE;						\
  2761  	do								\
  2762  	  {								\
  2763  	    call;							\
  2764  	  }								\
  2765  	while (--j != 0);						\
  2766        }									\
  2767      while (--i != 0);							\
  2768      t = speed_endtime ();						\
  2769  									\
  2770      TMP_FREE;								\
  2771  									\
  2772      s->time_divisor = SPEED_BLOCK_SIZE;					\
  2773      return t;								\
  2774    }
  2775  
        /* SPEED_ROUTINE_MPN_GCD_1_CALL body with no per-element setup, timing
           function (&px[j-1], 1, py[j-1]): a one-limb operand against a single
           limb.  */
  2776  #define SPEED_ROUTINE_MPN_GCD_1(function)				\
  2777    SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
  2778  
        /* SPEED_ROUTINE_MPN_GCD_1_CALL body timing
           function (px[j-1], py[j-1], 0).  The per-element setup reduces px[i]
           modulo py[i], sets bit 0 of both, and replaces a py[i] equal to 1
           by 3.  */
  2779  #define SPEED_ROUTINE_MPN_JACBASE(function)				\
  2780    SPEED_ROUTINE_MPN_GCD_1_CALL						\
  2781      ({									\
  2782         /* require x<y, y odd, y!=1 */					\
  2783         px[i] %= py[i];							\
  2784         px[i] |= 1;							\
  2785         py[i] |= 1;							\
  2786         if (py[i]==1) py[i]=3;						\
  2787       },									\
  2788       function (px[j-1], py[j-1], 0))
  2789  
  2790  
        /* Function body timing func (ap, bp, n, &hgcd, wp) with n = s->size.
           Returns -1 straight away when n < 2.  Bit 0 of the most significant
           limb of s->xp and of s->yp is set in place (so those limbs are
           non-zero); note this modifies the shared operands themselves, not a
           copy.  Inside the timed loop ap and bp are refilled from s->xp and
           s->yp and the matrix is re-initialized on tmp1
           (MPN_HGCD_MATRIX_INIT_ITCH (n) limbs) before every call; wp is
           itchfunc (n) limbs of scratch.  The result of func is stored in res
           and not otherwise used here.  */
  2791  #define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc)			\
  2792    {									\
  2793      mp_size_t hgcd_init_itch, hgcd_itch;				\
  2794      mp_ptr ap, bp, wp, tmp1;						\
  2795      struct hgcd_matrix hgcd;						\
  2796      int res;								\
  2797      unsigned i;								\
  2798      double t;								\
  2799      TMP_DECL;								\
  2800  									\
  2801      if (s->size < 2)							\
  2802        return -1;							\
  2803  									\
  2804      TMP_MARK;								\
  2805  									\
  2806      SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
  2807      SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
  2808  									\
  2809      s->xp[s->size - 1] |= 1;						\
  2810      s->yp[s->size - 1] |= 1;						\
  2811  									\
  2812      hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
  2813      hgcd_itch = itchfunc (s->size);					\
  2814  									\
  2815      SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
  2816      SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp);			\
  2817  									\
  2818      speed_operand_src (s, s->xp, s->size);				\
  2819      speed_operand_src (s, s->yp, s->size);				\
  2820      speed_operand_dst (s, ap, s->size + 1);				\
  2821      speed_operand_dst (s, bp, s->size + 1);				\
  2822      speed_operand_dst (s, wp, hgcd_itch);				\
  2823      speed_operand_dst (s, tmp1, hgcd_init_itch);			\
  2824      speed_cache_fill (s);						\
  2825  									\
  2826      speed_starttime ();							\
  2827      i = s->reps;							\
  2828      do									\
  2829        {									\
  2830  	MPN_COPY (ap, s->xp, s->size);					\
  2831  	MPN_COPY (bp, s->yp, s->size);					\
  2832  	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
  2833  	res = func (ap, bp, s->size, &hgcd, wp);			\
  2834        }									\
  2835      while (--i != 0);							\
  2836      t = speed_endtime ();						\
  2837      TMP_FREE;								\
  2838      return t;								\
  2839    }
  2840  
        /* Function body timing func (&hgcd, ap, bp, n, p, wp) with n = s->size
           and p = n/2.  Returns -1 straight away when n < 2.  Bit 0 of the most
           significant limb of s->xp and of s->yp is set in place (so those
           limbs are non-zero); note this modifies the shared operands
           themselves, not a copy.  Inside the timed loop ap and bp are refilled
           from s->xp and s->yp and the matrix is re-initialized on tmp1
           (MPN_HGCD_MATRIX_INIT_ITCH (n) limbs) before every call; wp is
           itchfunc (n, p) limbs of scratch.  The result of func is stored in
           res and not otherwise used here.  */
  2841  #define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)		\
  2842    {									\
  2843      mp_size_t hgcd_init_itch, hgcd_step_itch;				\
  2844      mp_ptr ap, bp, wp, tmp1;						\
  2845      struct hgcd_matrix hgcd;						\
  2846      mp_size_t p = s->size/2;						\
  2847      int res;								\
  2848      unsigned i;								\
  2849      double t;								\
  2850      TMP_DECL;								\
  2851  									\
  2852      if (s->size < 2)							\
  2853        return -1;							\
  2854  									\
  2855      TMP_MARK;								\
  2856  									\
  2857      SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
  2858      SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
  2859  									\
  2860      s->xp[s->size - 1] |= 1;						\
  2861      s->yp[s->size - 1] |= 1;						\
  2862  									\
  2863      hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
  2864      hgcd_step_itch = itchfunc (s->size, p);				\
  2865  									\
  2866      SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
  2867      SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp);			\
  2868  									\
  2869      speed_operand_src (s, s->xp, s->size);				\
  2870      speed_operand_src (s, s->yp, s->size);				\
  2871      speed_operand_dst (s, ap, s->size + 1);				\
  2872      speed_operand_dst (s, bp, s->size + 1);				\
  2873      speed_operand_dst (s, wp, hgcd_step_itch);				\
  2874      speed_operand_dst (s, tmp1, hgcd_init_itch);			\
  2875      speed_cache_fill (s);						\
  2876  									\
  2877      speed_starttime ();							\
  2878      i = s->reps;							\
  2879      do									\
  2880        {									\
  2881  	MPN_COPY (ap, s->xp, s->size);					\
  2882  	MPN_COPY (bp, s->yp, s->size);					\
  2883  	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
  2884  	res = func (&hgcd, ap, bp, s->size, p, wp);			\
  2885        }									\
  2886      while (--i != 0);							\
  2887      t = speed_endtime ();						\
  2888      TMP_FREE;								\
  2889      return t;								\
  2890    }
  2891  
  2892  /* Run some GCDs of s->size limbs each.  The number of different data values
  2893     is decreased as s->size**2, since GCD is a quadratic algorithm.
  2894     SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
  2895     though, because the plain gcd is about twice as fast as gcdext.  */
  2896  
        /* Function body timing `call' on `pieces' different operand pairs of
           s->size limbs each, for s->reps passes over all pieces.  `call' may
           use wp, wp2, xtmp and ytmp (each s->size+1 limbs); xtmp and ytmp are
           refilled from the current piece before every call, inside the timed
           loop.
           pieces = SPEED_BLOCK_SIZE * datafactor / s->size^2, capped at
           SPEED_BLOCK_SIZE / s->size and raised to at least 1.  With a single
           piece the data comes from s->xp and s->yp, otherwise from
           s->xp_block and s->yp_block.  s->time_divisor is set to pieces.
           Requires s->size >= 1.  */
  2897  #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)			\
  2898    {									\
  2899      unsigned  i;							\
  2900      mp_size_t j, pieces, psize;						\
  2901      mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
  2902      double    t;							\
  2903      TMP_DECL;								\
  2904  									\
  2905      SPEED_RESTRICT_COND (s->size >= 1);					\
  2906  									\
  2907      TMP_MARK;								\
  2908      SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
  2909      SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
  2910      SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);		\
  2911      SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);		\
  2912  									\
  2913      pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size;		\
  2914      pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);			\
  2915      pieces = MAX (pieces, 1);						\
  2916  									\
  2917      psize = pieces * s->size;						\
  2918      px = TMP_ALLOC_LIMBS (psize);					\
  2919      py = TMP_ALLOC_LIMBS (psize);					\
  2920      MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
  2921      MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
  2922  									\
  2923      /* Requirements: x >= y, y must be odd, high limbs != 0.		\
  2924         No need to ensure random numbers are really great.  */		\
  2925      for (j = 0; j < pieces; j++)					\
  2926        {									\
  2927  	mp_ptr	x = px + j * s->size;					\
  2928  	mp_ptr	y = py + j * s->size;					\
  2929  	if (x[s->size - 1] == 0) x[s->size - 1] = 1;			\
  2930  	if (y[s->size - 1] == 0) y[s->size - 1] = 1;			\
  2931  									\
  2932  	if (x[s->size - 1] < y[s->size - 1])				\
  2933  	  MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);		\
  2934  	else if (x[s->size - 1] == y[s->size - 1])			\
  2935  	  {								\
  2936  	    x[s->size - 1] = 2;						\
  2937  	    y[s->size - 1] = 1;						\
  2938  	  }								\
  2939  	y[0] |= 1;							\
  2940        }									\
  2941  									\
  2942      speed_operand_src (s, px, psize);					\
  2943      speed_operand_src (s, py, psize);					\
  2944      speed_operand_dst (s, xtmp, s->size);				\
  2945      speed_operand_dst (s, ytmp, s->size);				\
  2946      speed_operand_dst (s, wp, s->size);					\
  2947      speed_cache_fill (s);						\
  2948  									\
  2949      speed_starttime ();							\
  2950      i = s->reps;							\
  2951      do									\
  2952        {									\
  2953  	j = pieces;							\
  2954  	do								\
  2955  	  {								\
  2956  	    MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);		\
  2957  	    MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);		\
  2958  	    call;							\
  2959  	  }								\
  2960  	while (--j != 0);						\
  2961        }									\
  2962      while (--i != 0);							\
  2963      t = speed_endtime ();						\
  2964  									\
  2965      TMP_FREE;								\
  2966  									\
  2967      s->time_divisor = pieces;						\
  2968      return t;								\
  2969    }
  2970  
        /* SPEED_ROUTINE_MPN_GCD_CALL body with datafactor 8, timing
           function (wp, xtmp, s->size, ytmp, s->size).  */
  2971  #define SPEED_ROUTINE_MPN_GCD(function)	\
  2972    SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
  2973  
        /* SPEED_ROUTINE_MPN_GCD_CALL body with datafactor 4, timing
           function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size).  wp2size
           is a throwaway local that only receives the size written through the
           third argument.  */
  2974  #define SPEED_ROUTINE_MPN_GCDEXT(function)				\
  2975    SPEED_ROUTINE_MPN_GCD_CALL						\
  2976      (4, { mp_size_t  wp2size;						\
  2977  	  function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
  2978  
  2979  
  2980  #define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)				\
  2981    {									\
  2982      unsigned  i;							\
  2983      mp_size_t j, pieces, psize, wp2size;				\
  2984      mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
  2985      double    t;							\
  2986      TMP_DECL;								\
  2987  									\
  2988      SPEED_RESTRICT_COND (s->size >= 1);					\
  2989  									\
  2990      TMP_MARK;								\
  2991  									\
  2992      SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
  2993      SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
  2994      MPN_COPY (xtmp, s->xp, s->size);					\
  2995      MPN_COPY (ytmp, s->yp, s->size);					\
  2996  									\
  2997      SPEED_TMP_ALLOC_LIMBS (wp,	s->size+1, s->align_wp);		\
  2998      SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);		\
  2999  									\
  3000      pieces = SPEED_BLOCK_SIZE / 3;					\
  3001      psize = 3 * pieces;							\
  3002      px = TMP_ALLOC_LIMBS (psize);					\
  3003      py = TMP_ALLOC_LIMBS (psize);					\
  3004      MPN_COPY (px, s->xp_block, psize);					\
  3005      MPN_COPY (py, s->yp_block, psize);					\
  3006  									\
  3007      /* x must have at least as many bits as y,				\
  3008         high limbs must be non-zero */					\
  3009      for (j = 0; j < pieces; j++)					\
  3010        {									\
  3011  	mp_ptr	x = px+3*j;						\
  3012  	mp_ptr	y = py+3*j;						\
  3013  	x[2] += (x[2] == 0);						\
  3014  	y[2] += (y[2] == 0);						\
  3015  	if (x[2] < y[2])						\
  3016  	  MP_LIMB_T_SWAP (x[2], y[2]);					\
  3017        }									\
  3018  									\
  3019      speed_operand_src (s, px, psize);					\
  3020      speed_operand_src (s, py, psize);					\
  3021      speed_operand_dst (s, xtmp, s->size);				\
  3022      speed_operand_dst (s, ytmp, s->size);				\
  3023      speed_operand_dst (s, wp, s->size);					\
  3024      speed_cache_fill (s);						\
  3025  									\
  3026      speed_starttime ();							\
  3027      i = s->reps;							\
  3028      do									\
  3029        {									\
  3030  	mp_ptr	x = px;							\
  3031  	mp_ptr	y = py;							\
  3032  	mp_ptr	xth = &xtmp[s->size-3];					\
  3033  	mp_ptr	yth = &ytmp[s->size-3];					\
  3034  	j = pieces;							\
  3035  	do								\
  3036  	  {								\
  3037  	    xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];		\
  3038  	    yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];		\
  3039  									\
  3040  	    ytmp[0] |= 1; /* y must be odd, */				\
  3041  									\
  3042  	    function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size);	\
  3043  									\
  3044  	    x += 3;							\
  3045  	    y += 3;							\
  3046  	  }								\
  3047  	while (--j != 0);						\
  3048        }									\
  3049      while (--i != 0);							\
  3050      t = speed_endtime ();						\
  3051  									\
  3052      TMP_FREE;								\
  3053  									\
  3054      s->time_divisor = pieces;						\
  3055      return t;								\
  3056    }
  3057  
        /* Function body timing function (a, b) on `pieces' operand pairs of
           s->size limbs each, for s->reps passes over all pieces.
           pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1), at least 1; with a
           single piece the data comes from s->xp and s->yp, otherwise from
           s->xp_block and s->yp_block.  Each y gets bit 0 set and both high
           limbs are made non-zero.  a and b are never passed to mpz_init: only
           their size field is set (to s->size) and their limb pointer is aimed
           at the current piece before every call.  s->time_divisor is set to
           pieces.
           NOTE(review): there is no SPEED_RESTRICT_COND here; with
           s->size == 0 the setup loop would index x[s->size-1] and
           y[s->size-1] -- confirm the size is always at least 1 on entry.  */
  3058  #define SPEED_ROUTINE_MPZ_JACOBI(function)				\
  3059    {									\
  3060      mpz_t     a, b;							\
  3061      unsigned  i;							\
  3062      mp_size_t j, pieces, psize;						\
  3063      mp_ptr    px, py;							\
  3064      double    t;							\
  3065      TMP_DECL;								\
  3066  									\
  3067      TMP_MARK;								\
  3068      pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1);			\
  3069      pieces = MAX (pieces, 1);						\
  3070      s->time_divisor = pieces;						\
  3071  									\
  3072      psize = pieces * s->size;						\
  3073      px = TMP_ALLOC_LIMBS (psize);					\
  3074      py = TMP_ALLOC_LIMBS (psize);					\
  3075      MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
  3076      MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
  3077  									\
  3078      for (j = 0; j < pieces; j++)					\
  3079        {									\
  3080  	mp_ptr	x = px+j*s->size;					\
  3081  	mp_ptr	y = py+j*s->size;					\
  3082  									\
  3083  	/* y odd */							\
  3084  	y[0] |= 1;							\
  3085  									\
  3086  	/* high limbs non-zero */					\
  3087  	if (x[s->size-1] == 0) x[s->size-1] = 1;			\
  3088  	if (y[s->size-1] == 0) y[s->size-1] = 1;			\
  3089        }									\
  3090  									\
  3091      SIZ(a) = s->size;							\
  3092      SIZ(b) = s->size;							\
  3093  									\
  3094      speed_operand_src (s, px, psize);					\
  3095      speed_operand_src (s, py, psize);					\
  3096      speed_cache_fill (s);						\
  3097  									\
  3098      speed_starttime ();							\
  3099      i = s->reps;							\
  3100      do									\
  3101        {									\
  3102  	j = pieces;							\
  3103  	do								\
  3104  	  {								\
  3105  	    PTR(a) = px+(j-1)*s->size;					\
  3106  	    PTR(b) = py+(j-1)*s->size;					\
  3107  	    function (a, b);						\
  3108  	  }								\
  3109  	while (--j != 0);						\
  3110        }									\
  3111      while (--i != 0);							\
  3112      t = speed_endtime ();						\
  3113  									\
  3114      TMP_FREE;								\
  3115      return t;								\
  3116    }
  3117  
  /* Body of a speed routine timing
     "function (quotient, 0, dividend, s->size, divisor)" with a two-limb
     divisor whose high bit is forced on.  Needs s->size >= 2.  */
  #define SPEED_ROUTINE_MPN_DIVREM_2(function) \
    { \
      mp_limb_t divisor[2]; \
      mp_ptr    quot, num; \
      double    elapsed; \
      unsigned  rep; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 2); \
      \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (num, s->size, s->align_xp); \
      SPEED_TMP_ALLOC_LIMBS (quot, s->size, s->align_wp); \
      \
      /* the call destroys its source, so it gets a private copy */ \
      MPN_COPY (num, s->xp, s->size); \
      \
      /* normalize the divisor: top bit of the high limb set */ \
      MPN_COPY (divisor, s->yp_block, 2); \
      divisor[1] |= GMP_NUMB_HIGHBIT; \
      \
      speed_operand_src (s, num, s->size); \
      speed_operand_src (s, divisor, 2); \
      speed_operand_dst (s, quot, s->size); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      rep = s->reps; \
      do \
        { \
          function (quot, 0, num, s->size, divisor); \
        } \
      while (--rep != 0); \
      elapsed = speed_endtime (); \
      \
      TMP_FREE; \
      return elapsed; \
    }
  3154  
  /* Body of a speed routine timing
     "r = function (wp, wp+s->size-1, s->xp, s->size, d)".  The divisor d
     is s->r, or 1 when s->r is zero.  Needs s->size >= 1.  */
  #define SPEED_ROUTINE_MPN_DIV_QR_1(function) \
    { \
      mp_ptr    wp; \
      mp_limb_t d; \
      mp_limb_t r; \
      unsigned  i; \
      double    t; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 1); \
      \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
      \
      /* a zero divisor is replaced by 1 */ \
      d = s->r; \
      if (d == 0) \
        d = 1; \
      speed_operand_src (s, s->xp, s->size); \
      speed_operand_dst (s, wp, s->size); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      i = s->reps; \
      do \
        r = function (wp, wp+s->size-1, s->xp, s->size, d); \
      while (--i != 0); \
      t = speed_endtime (); \
      \
      TMP_FREE; \
      return t; \
    }
  3186  
  /* Body of a speed routine timing
     "r = function (wp, s->xp, s->size, 0, d, dinv)", where d is s->r and
     dinv is obtained from invert_limb (dinv, d).  Needs s->size >= 1 and
     a d with its high bit set.  */
  #define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function) \
    { \
      mp_ptr    wp; \
      mp_limb_t d, dinv; \
      mp_limb_t r; \
      unsigned  i; \
      double    t; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 1); \
      \
      /* divisor must be normalized; this is checked before TMP_MARK so \
         that a rejected divisor never returns with memory allocated */ \
      d = s->r; \
      SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT); \
      \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
      \
      invert_limb (dinv, d); \
      speed_operand_src (s, s->xp, s->size); \
      speed_operand_dst (s, wp, s->size); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      i = s->reps; \
      do \
        r = function (wp, s->xp, s->size, 0, d, dinv); \
      while (--i != 0); \
      t = speed_endtime (); \
      \
      TMP_FREE; \
      return t; \
    }
  3219  
  /* Body of a speed routine timing
     "function (wp, rp, s->xp, s->size, yp)" with a two-limb divisor yp
     and a two-limb remainder area rp.

     norm selects the shape of the divisor: non-zero forces the high bit
     of its high limb on; zero clears that bit and keeps the high limb
     non-zero.  Needs s->size >= 2.  */
  #define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm) \
    { \
      mp_ptr    wp; \
      mp_limb_t yp[2]; \
      mp_limb_t rp[2]; \
      unsigned  i; \
      double    t; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 2); \
      \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
      \
      MPN_COPY (yp, s->yp_block, 2); \
      if (norm) \
        { \
          /* normalized divisor: high bit set */ \
          yp[1] |= GMP_NUMB_HIGHBIT; \
        } \
      else \
        { \
          /* unnormalized divisor: high bit clear, high limb non-zero */ \
          yp[1] &= ~GMP_NUMB_HIGHBIT; \
          if (yp[1] == 0) \
            yp[1] = 1; \
        } \
      speed_operand_src (s, s->xp, s->size); \
      speed_operand_src (s, yp, 2); \
      speed_operand_dst (s, wp, s->size); \
      speed_operand_dst (s, rp, 2); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      i = s->reps; \
      do \
        function (wp, rp, s->xp, s->size, yp); \
      while (--i != 0); \
      t = speed_endtime (); \
      \
      TMP_FREE; \
      return t; \
    }
  3260  
  /* Body of a speed routine timing "function (n, n)" over
     SPEED_BLOCK_SIZE values per repetition.  Each n is the previous n
     plus twice a limb of s->xp_block, so it starts at 1 and stays odd.
     s->time_divisor is set to SPEED_BLOCK_SIZE.  */
  #define SPEED_ROUTINE_MODLIMB_INVERT(function) \
    { \
      unsigned   i, j; \
      mp_ptr     xp; \
      mp_limb_t  n = 1; \
      double     t; \
      \
      /* j counts down from SPEED_BLOCK_SIZE to 1, so limbs are read as \
         xp[j-1]; no pointer below the start of the block is formed */ \
      xp = s->xp_block; \
      \
      speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      i = s->reps; \
      do \
        { \
          j = SPEED_BLOCK_SIZE; \
          do \
            { \
              /* randomized but successively dependent */ \
              n += (xp[j-1] << 1); \
              \
              function (n, n); \
            } \
          while (--j != 0); \
        } \
      while (--i != 0); \
      t = speed_endtime (); \
      \
      /* make sure the compiler won't optimize away n */ \
      noop_1 (n); \
      \
      s->time_divisor = SPEED_BLOCK_SIZE; \
      return t; \
    }
  3296  
  3297  
  /* Body of a speed routine that times the statement "call" s->reps
     times.  Two destination areas of s->size limbs, wp and wp2, are
     allocated for it, and s->xp is registered as the source.  Needs
     s->size >= 1.  */
  #define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call) \
    { \
      double    t; \
      unsigned  i; \
      mp_ptr    wp; \
      mp_ptr    wp2; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 1); \
      \
      /* destination areas */ \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp); \
      SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
      \
      /* describe the operands, then prime the cache */ \
      speed_operand_src (s, s->xp, s->size); \
      speed_operand_dst (s, wp, s->size); \
      speed_operand_dst (s, wp2, s->size); \
      speed_cache_fill (s); \
      \
      /* timed region */ \
      speed_starttime (); \
      i = s->reps; \
      do \
        { \
          call; \
        } \
      while (--i != 0); \
      t = speed_endtime (); \
      \
      TMP_FREE; \
      return t; \
    }
  3326  
  3327  
  /* Body of a speed routine timing "function (str, base, ptr, s->size)".
     s->size is the number of limbs in the input and s->r the base, with
     s->r == 0 meaning decimal.  Bases outside 2..256 are refused.  The
     input is copied into a scratch area on every repetition, inside the
     timed region.  */
  #define SPEED_ROUTINE_MPN_GET_STR(function) \
    { \
      unsigned char *str; \
      mp_size_t      strsize; \
      mp_ptr         scratch; \
      int            radix; \
      unsigned       rep; \
      double         elapsed; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 1); \
      \
      if (s->r == 0) \
        radix = 10; \
      else \
        radix = s->r; \
      SPEED_RESTRICT_COND (radix >= 2 && radix <= 256); \
      \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (scratch, s->size + 1, s->align_xp); \
      \
      /* output buffer, sized by MPN_SIZEINBASE for the source operand */ \
      MPN_SIZEINBASE (strsize, s->xp, s->size, radix); \
      str = (unsigned char *) TMP_ALLOC (strsize); \
      \
      /* During development, overflow of str can be guarded against by \
         copying s->xp into scratch and checking with ASSERT_ALWAYS that \
         mpn_get_str (str, radix, scratch, s->size) is <= strsize.  */ \
      \
      speed_operand_src (s, s->xp, s->size); \
      speed_operand_dst (s, scratch, s->size); \
      speed_operand_dst (s, (mp_ptr) str, strsize/GMP_LIMB_BYTES); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      rep = s->reps; \
      do \
        { \
          MPN_COPY (scratch, s->xp, s->size); \
          function (str, radix, scratch, s->size); \
        } \
      while (--rep != 0); \
      elapsed = speed_endtime (); \
      \
      TMP_FREE; \
      return elapsed; \
    }
  3375  
  /* Body of a speed routine that times the statement "call" s->reps
     times.  s->size is the number of digits in the input and s->r the
     base, with s->r == 0 meaning decimal.  Bases outside 2..256 are
     refused.  The digit string xp holds s->xp[i] % base for each i, and
     wp is a destination of wn limbs.  */
  #define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \
    { \
      unsigned char *xp; \
      mp_ptr         wp; \
      mp_size_t      wn; \
      int            base; \
      unsigned       i; \
      double         t; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 1); \
      \
      if (s->r == 0) \
        base = 10; \
      else \
        base = s->r; \
      SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
      \
      TMP_MARK; \
      \
      /* one digit per source limb, reduced into the range of the base */ \
      xp = (unsigned char *) TMP_ALLOC (s->size); \
      for (i = 0; i < s->size; i++) \
        { \
          xp[i] = s->xp[i] % base; \
        } \
      \
      /* destination limbs, sized by LIMBS_PER_DIGIT_IN_BASE */ \
      LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \
      SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
      \
      /* During development, wn can be checked to be big enough with \
         ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn).  */ \
      \
      speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES); \
      speed_operand_dst (s, wp, wn); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      i = s->reps; \
      do \
        { \
          call; \
        } \
      while (--i != 0); \
      t = speed_endtime (); \
      \
      TMP_FREE; \
      return t; \
    }
  3421  
  3422  
  /* Run an accel gcd find_a() function over a range of data values, in
     case some of them run particularly fast or slow.  s->size is not
     used: SPEED_BLOCK_SIZE two-limb operands are always tested, and
     s->time_divisor is set to match.  */

  #define SPEED_ROUTINE_MPN_GCD_FINDA(function) \
    { \
      mp_limb_t pairs[SPEED_BLOCK_SIZE][2]; \
      unsigned  rep, k; \
      double    elapsed; \
      TMP_DECL; \
      \
      TMP_MARK; \
      \
      /* each pair needs an odd low limb and a non-zero high limb */ \
      for (k = 0; k < SPEED_BLOCK_SIZE; k++) \
        { \
          mp_limb_t  high = s->yp_block[k]; \
          \
          if (high == 0) \
            high = 1; \
          pairs[k][0] = s->xp_block[k] | 1; \
          pairs[k][1] = high; \
        } \
      \
      speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE); \
      speed_cache_fill (s); \
      \
      speed_starttime (); \
      rep = s->reps; \
      do \
        { \
          /* last pair first, down to pairs[0] */ \
          k = SPEED_BLOCK_SIZE; \
          do \
            function (pairs[k-1]); \
          while (--k != 0); \
        } \
      while (--rep != 0); \
      elapsed = speed_endtime (); \
      \
      TMP_FREE; \
      \
      s->time_divisor = SPEED_BLOCK_SIZE; \
      return elapsed; \
    }
  3465  
  3466  
  /* Speed routine bodies for count-zeros style operations.

     "call" should do "count_foo_zeros(c,n)", where c (an int) and n (an
     mp_limb_t) are locals declared by SPEED_ROUTINE_COUNT_ZEROS_A.
     Give leading=1 if foo is leading zeros, leading=0 for trailing.
     Give zero=1 if n=0 is allowed in the call, zero=0 if not.
     leading and zero are handed to speed_routine_count_zeros_setup, which
     prepares the SPEED_BLOCK_SIZE operands; if it reports failure the
     routine returns -1.0.

     SPEED_ROUTINE_COUNT_ZEROS_A opens the routine body and both timing
     loops, SPEED_ROUTINE_COUNT_ZEROS_B closes them again, and
     SPEED_ROUTINE_COUNT_ZEROS_C places "call" between the two.  */

  #define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \
    { \
      mp_ptr     xp; \
      int        i, c; \
      unsigned   j; \
      mp_limb_t  n; \
      double     t; \
      TMP_DECL; \
      \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \
      \
      if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \
        { \
          /* give xp back before bailing out */ \
          TMP_FREE; \
          return -1.0; \
        } \
      speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \
      speed_cache_fill (s); \
      \
      c = 0; \
      speed_starttime (); \
      j = s->reps; \
      do { \
        for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
          { \
            n = xp[i]; \
            /* fold the current value of c into the operand */ \
            n ^= c;

  #define SPEED_ROUTINE_COUNT_ZEROS_B() \
          } \
      } while (--j != 0); \
      t = speed_endtime (); \
      \
      /* don't let c go dead */ \
      noop_1 (c); \
      \
      s->time_divisor = SPEED_BLOCK_SIZE; \
      \
      TMP_FREE; \
      return t; \
    }

  #define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \
    do { \
      SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \
      call; \
      SPEED_ROUTINE_COUNT_ZEROS_B (); \
    } while (0)

  /* The _C forms take the complete call, the plain forms take just the
     name of a "fun (c, n)" operation and never allow n=0.  */
  #define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \
    SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
  #define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \
    SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)

  #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \
    SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
  #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun) \
    SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
  3527  
  3528  
  /* Body of a speed routine that times the statement "call" over
     SPEED_BLOCK_SIZE divisors per repetition.  Before each call d is set
     to dinv XOR a limb of s->xp_block, with the high bit forced on; call
     is presumably meant to compute dinv from d.  s->time_divisor is set
     to SPEED_BLOCK_SIZE.  */
  #define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \
    { \
      unsigned   i, j; \
      mp_limb_t  d, dinv=0; \
      /* j counts down from SPEED_BLOCK_SIZE to 1, so limbs are read as \
         xp[j-1]; no pointer below the start of the block is formed */ \
      mp_ptr     xp = s->xp_block; \
      double     t; \
      \
      s->time_divisor = SPEED_BLOCK_SIZE; \
      \
      speed_starttime (); \
      i = s->reps; \
      do \
        { \
          j = SPEED_BLOCK_SIZE; \
          do \
            { \
              d = dinv ^ xp[j-1]; \
              d |= GMP_LIMB_HIGHBIT; \
              do { call; } while (0); \
            } \
          while (--j != 0); \
        } \
      while (--i != 0); \
      /* stop the clock before the keep-alive call below */ \
      t = speed_endtime (); \
      \
      /* don't let the compiler optimize everything away */ \
      noop_1 (dinv); \
      \
      return t; \
    }
  3557  
  3558  
  /* Body of a speed routine that calls "function ()", which takes no
     arguments, s->reps times in a row and returns the measured time.  */
  #define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \
    { \
      unsigned  rep; \
      double    elapsed; \
      \
      speed_starttime (); \
      rep = s->reps; \
      do \
        { \
          function (); \
        } \
      while (--rep != 0); \
      elapsed = speed_endtime (); \
      return elapsed; \
    }
  3569  
  3570  
  /* Body of a speed routine that times the statement "call" s->reps
     times.  wp is a destination area of s->size limbs which call may
     refer to.  Needs s->size >= 0.  */
  #define SPEED_ROUTINE_MPN_ZERO_CALL(call) \
    { \
      double    t; \
      unsigned  i; \
      mp_ptr    wp; \
      TMP_DECL; \
      \
      SPEED_RESTRICT_COND (s->size >= 0); \
      \
      /* the destination is the only operand */ \
      TMP_MARK; \
      SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
      speed_operand_dst (s, wp, s->size); \
      speed_cache_fill (s); \
      \
      /* timed region */ \
      speed_starttime (); \
      i = s->reps; \
      do \
        { \
          call; \
        } \
      while (--i != 0); \
      t = speed_endtime (); \
      \
      TMP_FREE; \
      return t; \
    }

  /* As above, for a "function (ptr, size)" applied to the whole of wp.  */
  #define SPEED_ROUTINE_MPN_ZERO(function) \
    SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
  3598  
  3599  
  3600  #endif