github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/tune/speed.h (about) 1 /* Header for speed and threshold things. 2 3 Copyright 1999-2003, 2005, 2006, 2008-2015 Free Software Foundation, Inc. 4 5 This file is part of the GNU MP Library. 6 7 The GNU MP Library is free software; you can redistribute it and/or modify 8 it under the terms of either: 9 10 * the GNU Lesser General Public License as published by the Free 11 Software Foundation; either version 3 of the License, or (at your 12 option) any later version. 13 14 or 15 16 * the GNU General Public License as published by the Free Software 17 Foundation; either version 2 of the License, or (at your option) any 18 later version. 19 20 or both in parallel, as here. 21 22 The GNU MP Library is distributed in the hope that it will be useful, but 23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 for more details. 26 27 You should have received copies of the GNU General Public License and the 28 GNU Lesser General Public License along with the GNU MP Library. If not, 29 see https://www.gnu.org/licenses/. */ 30 31 #ifndef __SPEED_H__ 32 #define __SPEED_H__ 33 34 35 /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it 36 newsize long. */ 37 #define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \ 38 do { \ 39 ASSERT ((newsize) >= (oldsize)); \ 40 MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \ 41 } while (0) 42 43 /* A mask of the least significant n bits. Note 1<<32 doesn't give zero on 44 x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */ 45 #define MP_LIMB_T_LOWBITMASK(n) \ 46 ((n) == GMP_LIMB_BITS ? 
MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1) 47 48 49 /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */ 50 51 #define TMP_ALLOC_ALIGNED(bytes, align) \ 52 align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align)) 53 #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \ 54 ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align)) 55 56 /* CACHE_LINE_SIZE is our default alignment for speed operands, and the 57 limit on what s->align_xp etc and then request for off-alignment. Maybe 58 this should be an option of some sort, but in any case here are some line 59 sizes, 60 61 bytes 62 32 pentium 63 64 athlon 64 64 itanium-2 L1 65 128 itanium-2 L2 66 */ 67 #define CACHE_LINE_SIZE 64 /* bytes */ 68 69 #define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1) 70 71 /* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb 72 alignment. */ 73 #define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align) \ 74 do { \ 75 mp_ptr __ptr; \ 76 mp_size_t __ptr_align, __ptr_add; \ 77 \ 78 ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0); \ 79 __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK); \ 80 __ptr_align = (__ptr - (mp_ptr) NULL); \ 81 __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK; \ 82 (ptr) = __ptr + __ptr_add; \ 83 } while (0) 84 85 86 /* This is the size for s->xp_block and s->yp_block, used in certain 87 routines that want to run across many different data values and use 88 s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1. 89 90 512 means 2kbytes of data for each of xp_block and yp_block, making 4k 91 total, which should fit easily in any L1 data cache. 
*/ 92 93 #define SPEED_BLOCK_SIZE 512 /* limbs */ 94 95 96 extern double speed_unittime; 97 extern double speed_cycletime; 98 extern int speed_precision; 99 extern char speed_time_string[]; 100 void speed_time_init (void); 101 void speed_cycletime_fail (const char *str); 102 void speed_cycletime_init (void); 103 void speed_cycletime_need_cycles (void); 104 void speed_cycletime_need_seconds (void); 105 void speed_starttime (void); 106 double speed_endtime (void); 107 108 109 struct speed_params { 110 unsigned reps; /* how many times to run the routine */ 111 mp_ptr xp; /* first argument */ 112 mp_ptr yp; /* second argument */ 113 mp_size_t size; /* size of both arguments */ 114 mp_limb_t r; /* user supplied parameter */ 115 mp_size_t align_xp; /* alignment of xp */ 116 mp_size_t align_yp; /* alignment of yp */ 117 mp_size_t align_wp; /* intended alignment of wp */ 118 mp_size_t align_wp2; /* intended alignment of wp2 */ 119 mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */ 120 mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */ 121 122 double time_divisor; /* optionally set by the speed routine */ 123 124 /* used by the cache priming things */ 125 int cache; 126 unsigned src_num, dst_num; 127 struct { 128 mp_ptr ptr; 129 mp_size_t size; 130 } src[5], dst[4]; 131 }; 132 133 typedef double (*speed_function_t) (struct speed_params *); 134 135 double speed_measure (speed_function_t fun, struct speed_params *); 136 137 /* Prototypes for speed measuring routines */ 138 139 double speed_back_to_back (struct speed_params *); 140 double speed_count_leading_zeros (struct speed_params *); 141 double speed_count_trailing_zeros (struct speed_params *); 142 double speed_find_a (struct speed_params *); 143 double speed_gmp_allocate_free (struct speed_params *); 144 double speed_gmp_allocate_reallocate_free (struct speed_params *); 145 double speed_invert_limb (struct speed_params *); 146 double speed_malloc_free (struct speed_params *); 147 double 
speed_malloc_realloc_free (struct speed_params *); 148 double speed_memcpy (struct speed_params *); 149 double speed_binvert_limb (struct speed_params *); 150 double speed_binvert_limb_mul1 (struct speed_params *); 151 double speed_binvert_limb_loop (struct speed_params *); 152 double speed_binvert_limb_cond (struct speed_params *); 153 double speed_binvert_limb_arith (struct speed_params *); 154 155 double speed_mpf_init_clear (struct speed_params *); 156 157 double speed_mpn_add_n (struct speed_params *); 158 double speed_mpn_add_1 (struct speed_params *); 159 double speed_mpn_add_1_inplace (struct speed_params *); 160 double speed_mpn_add_err1_n (struct speed_params *); 161 double speed_mpn_add_err2_n (struct speed_params *); 162 double speed_mpn_add_err3_n (struct speed_params *); 163 double speed_mpn_addlsh_n (struct speed_params *); 164 double speed_mpn_addlsh1_n (struct speed_params *); 165 double speed_mpn_addlsh2_n (struct speed_params *); 166 double speed_mpn_addlsh_n_ip1 (struct speed_params *); 167 double speed_mpn_addlsh1_n_ip1 (struct speed_params *); 168 double speed_mpn_addlsh2_n_ip1 (struct speed_params *); 169 double speed_mpn_addlsh_n_ip2 (struct speed_params *); 170 double speed_mpn_addlsh1_n_ip2 (struct speed_params *); 171 double speed_mpn_addlsh2_n_ip2 (struct speed_params *); 172 double speed_mpn_add_n_sub_n (struct speed_params *); 173 double speed_mpn_and_n (struct speed_params *); 174 double speed_mpn_andn_n (struct speed_params *); 175 double speed_mpn_addmul_1 (struct speed_params *); 176 double speed_mpn_addmul_2 (struct speed_params *); 177 double speed_mpn_addmul_3 (struct speed_params *); 178 double speed_mpn_addmul_4 (struct speed_params *); 179 double speed_mpn_addmul_5 (struct speed_params *); 180 double speed_mpn_addmul_6 (struct speed_params *); 181 double speed_mpn_addmul_7 (struct speed_params *); 182 double speed_mpn_addmul_8 (struct speed_params *); 183 double speed_mpn_cnd_add_n (struct speed_params *); 184 double 
speed_mpn_cnd_sub_n (struct speed_params *); 185 double speed_mpn_com (struct speed_params *); 186 double speed_mpn_neg (struct speed_params *); 187 double speed_mpn_copyd (struct speed_params *); 188 double speed_mpn_copyi (struct speed_params *); 189 double speed_MPN_COPY (struct speed_params *); 190 double speed_MPN_COPY_DECR (struct speed_params *); 191 double speed_MPN_COPY_INCR (struct speed_params *); 192 double speed_mpn_sec_tabselect (struct speed_params *); 193 double speed_mpn_divexact_1 (struct speed_params *); 194 double speed_mpn_divexact_by3 (struct speed_params *); 195 double speed_mpn_bdiv_q_1 (struct speed_params *); 196 double speed_mpn_pi1_bdiv_q_1 (struct speed_params *); 197 double speed_mpn_bdiv_dbm1c (struct speed_params *); 198 double speed_mpn_divrem_1 (struct speed_params *); 199 double speed_mpn_divrem_1f (struct speed_params *); 200 double speed_mpn_divrem_1c (struct speed_params *); 201 double speed_mpn_divrem_1cf (struct speed_params *); 202 double speed_mpn_divrem_1_div (struct speed_params *); 203 double speed_mpn_divrem_1f_div (struct speed_params *); 204 double speed_mpn_divrem_1_inv (struct speed_params *); 205 double speed_mpn_divrem_1f_inv (struct speed_params *); 206 double speed_mpn_divrem_2 (struct speed_params *); 207 double speed_mpn_divrem_2_div (struct speed_params *); 208 double speed_mpn_divrem_2_inv (struct speed_params *); 209 double speed_mpn_div_qr_1n_pi1 (struct speed_params *); 210 double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *); 211 double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *); 212 double speed_mpn_div_qr_1 (struct speed_params *); 213 double speed_mpn_div_qr_2n (struct speed_params *); 214 double speed_mpn_div_qr_2u (struct speed_params *); 215 double speed_mpn_fib2_ui (struct speed_params *); 216 double speed_mpn_matrix22_mul (struct speed_params *); 217 double speed_mpn_hgcd (struct speed_params *); 218 double speed_mpn_hgcd_lehmer (struct speed_params *); 219 double speed_mpn_hgcd_appr 
(struct speed_params *); 220 double speed_mpn_hgcd_appr_lehmer (struct speed_params *); 221 double speed_mpn_hgcd_reduce (struct speed_params *); 222 double speed_mpn_hgcd_reduce_1 (struct speed_params *); 223 double speed_mpn_hgcd_reduce_2 (struct speed_params *); 224 double speed_mpn_gcd (struct speed_params *); 225 double speed_mpn_gcd_1 (struct speed_params *); 226 double speed_mpn_gcd_1N (struct speed_params *); 227 double speed_mpn_gcdext (struct speed_params *); 228 double speed_mpn_gcdext_double (struct speed_params *); 229 double speed_mpn_gcdext_one_double (struct speed_params *); 230 double speed_mpn_gcdext_one_single (struct speed_params *); 231 double speed_mpn_gcdext_single (struct speed_params *); 232 double speed_mpn_get_str (struct speed_params *); 233 double speed_mpn_hamdist (struct speed_params *); 234 double speed_mpn_ior_n (struct speed_params *); 235 double speed_mpn_iorn_n (struct speed_params *); 236 double speed_mpn_jacobi_base (struct speed_params *); 237 double speed_mpn_jacobi_base_1 (struct speed_params *); 238 double speed_mpn_jacobi_base_2 (struct speed_params *); 239 double speed_mpn_jacobi_base_3 (struct speed_params *); 240 double speed_mpn_jacobi_base_4 (struct speed_params *); 241 double speed_mpn_lshift (struct speed_params *); 242 double speed_mpn_lshiftc (struct speed_params *); 243 double speed_mpn_mod_1 (struct speed_params *); 244 double speed_mpn_mod_1c (struct speed_params *); 245 double speed_mpn_mod_1_div (struct speed_params *); 246 double speed_mpn_mod_1_inv (struct speed_params *); 247 double speed_mpn_mod_1_1 (struct speed_params *); 248 double speed_mpn_mod_1_1_1 (struct speed_params *); 249 double speed_mpn_mod_1_1_2 (struct speed_params *); 250 double speed_mpn_mod_1_2 (struct speed_params *); 251 double speed_mpn_mod_1_3 (struct speed_params *); 252 double speed_mpn_mod_1_4 (struct speed_params *); 253 double speed_mpn_mod_34lsub1 (struct speed_params *); 254 double speed_mpn_modexact_1_odd (struct speed_params 
*); 255 double speed_mpn_modexact_1c_odd (struct speed_params *); 256 double speed_mpn_mul_1 (struct speed_params *); 257 double speed_mpn_mul_1_inplace (struct speed_params *); 258 double speed_mpn_mul_2 (struct speed_params *); 259 double speed_mpn_mul_3 (struct speed_params *); 260 double speed_mpn_mul_4 (struct speed_params *); 261 double speed_mpn_mul_5 (struct speed_params *); 262 double speed_mpn_mul_6 (struct speed_params *); 263 double speed_mpn_mul (struct speed_params *); 264 double speed_mpn_mul_basecase (struct speed_params *); 265 double speed_mpn_mulmid (struct speed_params *); 266 double speed_mpn_mulmid_basecase (struct speed_params *); 267 double speed_mpn_mul_fft (struct speed_params *); 268 double speed_mpn_mul_fft_sqr (struct speed_params *); 269 double speed_mpn_fft_mul (struct speed_params *); 270 double speed_mpn_fft_sqr (struct speed_params *); 271 #if WANT_OLD_FFT_FULL 272 double speed_mpn_mul_fft_full (struct speed_params *); 273 double speed_mpn_mul_fft_full_sqr (struct speed_params *); 274 #endif 275 double speed_mpn_nussbaumer_mul (struct speed_params *); 276 double speed_mpn_nussbaumer_mul_sqr (struct speed_params *); 277 double speed_mpn_mul_n (struct speed_params *); 278 double speed_mpn_mul_n_sqr (struct speed_params *); 279 double speed_mpn_mulmid_n (struct speed_params *); 280 double speed_mpn_sqrlo (struct speed_params *); 281 double speed_mpn_sqrlo_basecase (struct speed_params *); 282 double speed_mpn_mullo_n (struct speed_params *); 283 double speed_mpn_mullo_basecase (struct speed_params *); 284 double speed_mpn_nand_n (struct speed_params *); 285 double speed_mpn_nior_n (struct speed_params *); 286 double speed_mpn_popcount (struct speed_params *); 287 double speed_mpn_preinv_divrem_1 (struct speed_params *); 288 double speed_mpn_preinv_divrem_1f (struct speed_params *); 289 double speed_mpn_preinv_mod_1 (struct speed_params *); 290 double speed_mpn_sbpi1_div_qr (struct speed_params *); 291 double speed_mpn_dcpi1_div_qr 
(struct speed_params *); 292 double speed_mpn_sbpi1_divappr_q (struct speed_params *); 293 double speed_mpn_dcpi1_divappr_q (struct speed_params *); 294 double speed_mpn_mu_div_qr (struct speed_params *); 295 double speed_mpn_mu_divappr_q (struct speed_params *); 296 double speed_mpn_mupi_div_qr (struct speed_params *); 297 double speed_mpn_mu_div_q (struct speed_params *); 298 double speed_mpn_sbpi1_bdiv_qr (struct speed_params *); 299 double speed_mpn_dcpi1_bdiv_qr (struct speed_params *); 300 double speed_mpn_sbpi1_bdiv_q (struct speed_params *); 301 double speed_mpn_dcpi1_bdiv_q (struct speed_params *); 302 double speed_mpn_mu_bdiv_q (struct speed_params *); 303 double speed_mpn_mu_bdiv_qr (struct speed_params *); 304 double speed_mpn_broot (struct speed_params *); 305 double speed_mpn_broot_invm1 (struct speed_params *); 306 double speed_mpn_brootinv (struct speed_params *); 307 double speed_mpn_invert (struct speed_params *); 308 double speed_mpn_invertappr (struct speed_params *); 309 double speed_mpn_ni_invertappr (struct speed_params *); 310 double speed_mpn_sec_invert (struct speed_params *s); 311 double speed_mpn_binvert (struct speed_params *); 312 double speed_mpn_redc_1 (struct speed_params *); 313 double speed_mpn_redc_2 (struct speed_params *); 314 double speed_mpn_redc_n (struct speed_params *); 315 double speed_mpn_rsblsh_n (struct speed_params *); 316 double speed_mpn_rsblsh1_n (struct speed_params *); 317 double speed_mpn_rsblsh2_n (struct speed_params *); 318 double speed_mpn_rsh1add_n (struct speed_params *); 319 double speed_mpn_rsh1sub_n (struct speed_params *); 320 double speed_mpn_rshift (struct speed_params *); 321 double speed_mpn_sb_divrem_m3 (struct speed_params *); 322 double speed_mpn_sb_divrem_m3_div (struct speed_params *); 323 double speed_mpn_sb_divrem_m3_inv (struct speed_params *); 324 double speed_mpn_set_str (struct speed_params *); 325 double speed_mpn_bc_set_str (struct speed_params *); 326 double speed_mpn_dc_set_str 
(struct speed_params *); 327 double speed_mpn_set_str_pre (struct speed_params *); 328 double speed_mpn_sqr_basecase (struct speed_params *); 329 double speed_mpn_sqr_diag_addlsh1 (struct speed_params *); 330 double speed_mpn_sqr_diagonal (struct speed_params *); 331 double speed_mpn_sqr (struct speed_params *); 332 double speed_mpn_sqrtrem (struct speed_params *); 333 double speed_mpn_rootrem (struct speed_params *); 334 double speed_mpn_sqrt (struct speed_params *); 335 double speed_mpn_root (struct speed_params *); 336 double speed_mpn_sub_n (struct speed_params *); 337 double speed_mpn_sub_1 (struct speed_params *); 338 double speed_mpn_sub_1_inplace (struct speed_params *); 339 double speed_mpn_sub_err1_n (struct speed_params *); 340 double speed_mpn_sub_err2_n (struct speed_params *); 341 double speed_mpn_sub_err3_n (struct speed_params *); 342 double speed_mpn_sublsh_n (struct speed_params *); 343 double speed_mpn_sublsh1_n (struct speed_params *); 344 double speed_mpn_sublsh2_n (struct speed_params *); 345 double speed_mpn_sublsh_n_ip1 (struct speed_params *); 346 double speed_mpn_sublsh1_n_ip1 (struct speed_params *); 347 double speed_mpn_sublsh2_n_ip1 (struct speed_params *); 348 double speed_mpn_submul_1 (struct speed_params *); 349 double speed_mpn_toom2_sqr (struct speed_params *); 350 double speed_mpn_toom3_sqr (struct speed_params *); 351 double speed_mpn_toom4_sqr (struct speed_params *); 352 double speed_mpn_toom6_sqr (struct speed_params *); 353 double speed_mpn_toom8_sqr (struct speed_params *); 354 double speed_mpn_toom22_mul (struct speed_params *); 355 double speed_mpn_toom33_mul (struct speed_params *); 356 double speed_mpn_toom44_mul (struct speed_params *); 357 double speed_mpn_toom6h_mul (struct speed_params *); 358 double speed_mpn_toom8h_mul (struct speed_params *); 359 double speed_mpn_toom32_mul (struct speed_params *); 360 double speed_mpn_toom42_mul (struct speed_params *); 361 double speed_mpn_toom43_mul (struct speed_params *); 362 
double speed_mpn_toom63_mul (struct speed_params *); 363 double speed_mpn_toom32_for_toom43_mul (struct speed_params *); 364 double speed_mpn_toom43_for_toom32_mul (struct speed_params *); 365 double speed_mpn_toom32_for_toom53_mul (struct speed_params *); 366 double speed_mpn_toom53_for_toom32_mul (struct speed_params *); 367 double speed_mpn_toom42_for_toom53_mul (struct speed_params *); 368 double speed_mpn_toom53_for_toom42_mul (struct speed_params *); 369 double speed_mpn_toom43_for_toom54_mul (struct speed_params *); 370 double speed_mpn_toom54_for_toom43_mul (struct speed_params *); 371 double speed_mpn_toom42_mulmid (struct speed_params *); 372 double speed_mpn_mulmod_bnm1 (struct speed_params *); 373 double speed_mpn_bc_mulmod_bnm1 (struct speed_params *); 374 double speed_mpn_mulmod_bnm1_rounded (struct speed_params *); 375 double speed_mpn_sqrmod_bnm1 (struct speed_params *); 376 double speed_mpn_udiv_qrnnd (struct speed_params *); 377 double speed_mpn_udiv_qrnnd_r (struct speed_params *); 378 double speed_mpn_umul_ppmm (struct speed_params *); 379 double speed_mpn_umul_ppmm_r (struct speed_params *); 380 double speed_mpn_xnor_n (struct speed_params *); 381 double speed_mpn_xor_n (struct speed_params *); 382 double speed_MPN_ZERO (struct speed_params *); 383 384 double speed_mpq_init_clear (struct speed_params *); 385 386 double speed_mpz_add (struct speed_params *); 387 double speed_mpz_bin_uiui (struct speed_params *); 388 double speed_mpz_bin_ui (struct speed_params *); 389 double speed_mpz_fac_ui (struct speed_params *); 390 double speed_mpz_2fac_ui (struct speed_params *); 391 double speed_mpz_fib_ui (struct speed_params *); 392 double speed_mpz_fib2_ui (struct speed_params *); 393 double speed_mpz_init_clear (struct speed_params *); 394 double speed_mpz_init_realloc_clear (struct speed_params *); 395 double speed_mpz_jacobi (struct speed_params *); 396 double speed_mpz_lucnum_ui (struct speed_params *); 397 double speed_mpz_lucnum2_ui (struct 
speed_params *); 398 double speed_mpz_mod (struct speed_params *); 399 double speed_mpz_powm (struct speed_params *); 400 double speed_mpz_powm_mod (struct speed_params *); 401 double speed_mpz_powm_redc (struct speed_params *); 402 double speed_mpz_powm_sec (struct speed_params *); 403 double speed_mpz_powm_ui (struct speed_params *); 404 double speed_mpz_urandomb (struct speed_params *); 405 406 double speed_gmp_randseed (struct speed_params *); 407 double speed_gmp_randseed_ui (struct speed_params *); 408 409 double speed_noop (struct speed_params *); 410 double speed_noop_wxs (struct speed_params *); 411 double speed_noop_wxys (struct speed_params *); 412 413 double speed_operator_div (struct speed_params *); 414 double speed_operator_mod (struct speed_params *); 415 416 double speed_udiv_qrnnd (struct speed_params *); 417 double speed_udiv_qrnnd_preinv1 (struct speed_params *); 418 double speed_udiv_qrnnd_preinv2 (struct speed_params *); 419 double speed_udiv_qrnnd_preinv3 (struct speed_params *); 420 double speed_udiv_qrnnd_c (struct speed_params *); 421 double speed_umul_ppmm (struct speed_params *); 422 423 /* Prototypes for other routines */ 424 425 #if defined (__cplusplus) 426 extern "C" { 427 #endif 428 429 /* low 32-bits in p[0], high 32-bits in p[1] */ 430 void speed_cyclecounter (unsigned p[2]); 431 432 #if defined (__cplusplus) 433 } 434 #endif 435 436 void mftb_function (unsigned p[2]); 437 438 double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]); 439 int gettimeofday_microseconds_p (void); 440 int getrusage_microseconds_p (void); 441 int cycles_works_p (void); 442 long clk_tck (void); 443 double freq_measure (const char *, double (*)(void)); 444 445 int double_cmp_ptr (const double *, const double *); 446 void pentium_wbinvd (void); 447 typedef int (*qsort_function_t) (const void *, const void *); 448 449 void noop (void); 450 void noop_1 (mp_limb_t); 451 void noop_wxs (mp_ptr, mp_srcptr, mp_size_t); 452 void noop_wxys (mp_ptr, 
mp_srcptr, mp_srcptr, mp_size_t); 453 void mpn_cache_fill (mp_srcptr, mp_size_t); 454 void mpn_cache_fill_dummy (mp_limb_t); 455 void speed_cache_fill (struct speed_params *); 456 void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t); 457 void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t); 458 459 extern int speed_option_addrs; 460 extern int speed_option_verbose; 461 extern int speed_option_cycles_broken; 462 void speed_option_set (const char *); 463 464 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); 465 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); 466 467 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 468 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 469 mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); 470 mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); 471 472 int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int); 473 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int); 474 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int); 475 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int); 476 477 mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t); 478 mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t); 479 480 mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]); 481 mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]); 482 483 void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t); 484 void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t); 485 486 mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 487 mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 488 mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 489 mp_size_t 
mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 490 mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); 491 mp_size_t mpn_hgcd_lehmer_itch (mp_size_t); 492 493 mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); 494 mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t); 495 496 mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); 497 mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t); 498 499 mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); 500 mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t); 501 502 mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); 503 mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); 504 505 mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int); 506 void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr); 507 508 void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); 509 void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); 510 511 int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int); 512 513 514 /* "get" is called repeatedly until it ticks over, just in case on a fast 515 processor it takes less than a microsecond, though this is probably 516 unlikely if it's a system call. 517 518 speed_cyclecounter is called on the same side of the "get" for the start 519 and end measurements. It doesn't matter how long it takes from the "get" 520 sample to the cycles sample, since that period will cancel out in the 521 difference calculation (assuming it's the same each time). 
522 523 Letting the test run for more than a process time slice is probably only 524 going to reduce accuracy, especially for getrusage when the cycle counter 525 is real time, or for gettimeofday if the cycle counter is in fact process 526 time. Use CLK_TCK/2 as a reasonable stop. 527 528 It'd be desirable to be quite accurate here. The default speed_precision 529 for a cycle counter is 10000 cycles, so to mix that with getrusage or 530 gettimeofday the frequency should be at least that accurate. But running 531 measurements for 10000 microseconds (or more) is too long. Be satisfied 532 with just a half clock tick (5000 microseconds usually). */ 533 534 #define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \ 535 do { \ 536 type st1, st, et1, et; \ 537 unsigned sc[2], ec[2]; \ 538 long dt, half_tick; \ 539 double dc, cyc; \ 540 \ 541 half_tick = (1000000L / clk_tck()) / 2; \ 542 \ 543 get (st1); \ 544 do { \ 545 get (st); \ 546 } while (usec(st) == usec(st1) && sec(st) == sec(st1)); \ 547 \ 548 getc (sc); \ 549 \ 550 for (;;) \ 551 { \ 552 get (et1); \ 553 do { \ 554 get (et); \ 555 } while (usec(et) == usec(et1) && sec(et) == sec(et1)); \ 556 \ 557 getc (ec); \ 558 \ 559 dc = speed_cyclecounter_diff (ec, sc); \ 560 \ 561 /* allow secs to cancel before multiplying */ \ 562 dt = sec(et) - sec(st); \ 563 dt = dt * 1000000L + (usec(et) - usec(st)); \ 564 \ 565 if (dt >= half_tick) \ 566 break; \ 567 } \ 568 \ 569 cyc = dt * 1e-6 / dc; \ 570 \ 571 if (speed_option_verbose >= 2) \ 572 printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \ 573 name, dc, dt, cyc); \ 574 \ 575 return dt * 1e-6 / dc; \ 576 \ 577 } while (0) 578 579 580 581 582 /* The measuring routines use these big macros to save duplication for 583 similar forms. They also get used for some automatically generated 584 measuring of new implementations of functions. 
585 586 Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a 587 function pointer is considered undesirable since it's not the way a 588 normal application will be calling, and some processors might do 589 different things with an indirect call, like not branch predicting, or 590 doing a full pipe flush. At least some of the "functions" measured are 591 actually macros too. 592 593 The net effect is to bloat the object code, possibly in a big way, but 594 only what's being measured is being run, so that doesn't matter. 595 596 The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or 597 ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure 598 function pointer doesn't work in gcc 3.2. Using an actual non-pure 599 function pointer variable works, but stands a real risk of a 600 non-optimizing compiler generating unnecessary overheads in the call. 601 Currently the best idea is not to use those attributes for a timing 602 program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and 603 gmp-impl.h to omit them from routines there. */ 604 605 #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0; 606 607 /* For mpn_copy or similar. 
*/ 608 #define SPEED_ROUTINE_MPN_COPY_CALL(call) \ 609 { \ 610 mp_ptr wp; \ 611 unsigned i; \ 612 double t; \ 613 TMP_DECL; \ 614 \ 615 SPEED_RESTRICT_COND (s->size >= 0); \ 616 \ 617 TMP_MARK; \ 618 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 619 \ 620 speed_operand_src (s, s->xp, s->size); \ 621 speed_operand_dst (s, wp, s->size); \ 622 speed_cache_fill (s); \ 623 \ 624 speed_starttime (); \ 625 i = s->reps; \ 626 do \ 627 call; \ 628 while (--i != 0); \ 629 t = speed_endtime (); \ 630 \ 631 TMP_FREE; \ 632 return t; \ 633 } 634 #define SPEED_ROUTINE_MPN_COPY(function) \ 635 SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size)) 636 637 #define SPEED_ROUTINE_MPN_TABSELECT(function) \ 638 { \ 639 mp_ptr xp, wp; \ 640 unsigned i; \ 641 double t; \ 642 TMP_DECL; \ 643 \ 644 SPEED_RESTRICT_COND (s->size >= 0); \ 645 \ 646 if (s->r == 0) \ 647 s->r = s->size; /* default to a quadratic shape */ \ 648 \ 649 TMP_MARK; \ 650 SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp); \ 651 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 652 \ 653 speed_operand_src (s, xp, s->size * s->r); \ 654 speed_operand_dst (s, wp, s->size); \ 655 speed_cache_fill (s); \ 656 \ 657 speed_starttime (); \ 658 i = s->reps; \ 659 do \ 660 function (wp, xp, s->size, s->r, (s->r) / 2); \ 661 while (--i != 0); \ 662 t = speed_endtime () / s->r; \ 663 \ 664 TMP_FREE; \ 665 return t; \ 666 } 667 668 669 #define SPEED_ROUTINE_MPN_COPYC(function) \ 670 { \ 671 mp_ptr wp; \ 672 unsigned i; \ 673 double t; \ 674 TMP_DECL; \ 675 \ 676 SPEED_RESTRICT_COND (s->size >= 0); \ 677 \ 678 TMP_MARK; \ 679 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 680 \ 681 speed_operand_src (s, s->xp, s->size); \ 682 speed_operand_dst (s, wp, s->size); \ 683 speed_cache_fill (s); \ 684 \ 685 speed_starttime (); \ 686 i = s->reps; \ 687 do \ 688 function (wp, s->xp, s->size, 0); \ 689 while (--i != 0); \ 690 t = speed_endtime (); \ 691 \ 692 TMP_FREE; \ 693 return t; \ 694 } 695 696 /* s->size is 
still in limbs, and it's limbs which are copied, but 697 "function" takes a size in bytes not limbs. */ 698 #define SPEED_ROUTINE_MPN_COPY_BYTES(function) \ 699 { \ 700 mp_ptr wp; \ 701 unsigned i; \ 702 double t; \ 703 TMP_DECL; \ 704 \ 705 SPEED_RESTRICT_COND (s->size >= 0); \ 706 \ 707 TMP_MARK; \ 708 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 709 \ 710 speed_operand_src (s, s->xp, s->size); \ 711 speed_operand_dst (s, wp, s->size); \ 712 speed_cache_fill (s); \ 713 \ 714 speed_starttime (); \ 715 i = s->reps; \ 716 do \ 717 function (wp, s->xp, s->size * GMP_LIMB_BYTES); \ 718 while (--i != 0); \ 719 t = speed_endtime (); \ 720 \ 721 TMP_FREE; \ 722 return t; \ 723 } 724 725 726 /* For mpn_add_n, mpn_sub_n, or similar. */ 727 #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \ 728 { \ 729 mp_ptr wp; \ 730 mp_ptr xp, yp; \ 731 unsigned i; \ 732 double t; \ 733 TMP_DECL; \ 734 \ 735 SPEED_RESTRICT_COND (s->size >= 1); \ 736 \ 737 TMP_MARK; \ 738 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 739 \ 740 xp = s->xp; \ 741 yp = s->yp; \ 742 \ 743 if (s->r == 0) ; \ 744 else if (s->r == 1) { xp = wp; } \ 745 else if (s->r == 2) { yp = wp; } \ 746 else if (s->r == 3) { xp = wp; yp = wp; } \ 747 else if (s->r == 4) { yp = xp; } \ 748 else { \ 749 TMP_FREE; \ 750 return -1.0; \ 751 } \ 752 \ 753 /* initialize wp if operand overlap */ \ 754 if (xp == wp || yp == wp) \ 755 MPN_COPY (wp, s->xp, s->size); \ 756 \ 757 speed_operand_src (s, xp, s->size); \ 758 speed_operand_src (s, yp, s->size); \ 759 speed_operand_dst (s, wp, s->size); \ 760 speed_cache_fill (s); \ 761 \ 762 speed_starttime (); \ 763 i = s->reps; \ 764 do \ 765 call; \ 766 while (--i != 0); \ 767 t = speed_endtime (); \ 768 \ 769 TMP_FREE; \ 770 return t; \ 771 } 772 773 774 /* For mpn_aors_errK_n, where 1 <= K <= 3. 
*/ 775 #define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K) \ 776 { \ 777 mp_ptr wp; \ 778 mp_ptr xp, yp; \ 779 mp_ptr zp[K]; \ 780 mp_limb_t ep[2*K]; \ 781 unsigned i; \ 782 double t; \ 783 TMP_DECL; \ 784 \ 785 SPEED_RESTRICT_COND (s->size >= 1); \ 786 \ 787 TMP_MARK; \ 788 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 789 \ 790 /* (don't have a mechanism to specify zp alignments) */ \ 791 for (i = 0; i < K; i++) \ 792 SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0); \ 793 \ 794 xp = s->xp; \ 795 yp = s->yp; \ 796 \ 797 if (s->r == 0) ; \ 798 else if (s->r == 1) { xp = wp; } \ 799 else if (s->r == 2) { yp = wp; } \ 800 else if (s->r == 3) { xp = wp; yp = wp; } \ 801 else if (s->r == 4) { yp = xp; } \ 802 else { \ 803 TMP_FREE; \ 804 return -1.0; \ 805 } \ 806 \ 807 /* initialize wp if operand overlap */ \ 808 if (xp == wp || yp == wp) \ 809 MPN_COPY (wp, s->xp, s->size); \ 810 \ 811 speed_operand_src (s, xp, s->size); \ 812 speed_operand_src (s, yp, s->size); \ 813 for (i = 0; i < K; i++) \ 814 speed_operand_src (s, zp[i], s->size); \ 815 speed_operand_dst (s, wp, s->size); \ 816 speed_cache_fill (s); \ 817 \ 818 speed_starttime (); \ 819 i = s->reps; \ 820 do \ 821 call; \ 822 while (--i != 0); \ 823 t = speed_endtime (); \ 824 \ 825 TMP_FREE; \ 826 return t; \ 827 } 828 829 #define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function) \ 830 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1) 831 832 #define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function) \ 833 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2) 834 835 #define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function) \ 836 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3) 837 838 839 /* For mpn_add_n, mpn_sub_n, or similar. 
*/ 840 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \ 841 { \ 842 mp_ptr ap, sp; \ 843 mp_ptr xp, yp; \ 844 unsigned i; \ 845 double t; \ 846 TMP_DECL; \ 847 \ 848 SPEED_RESTRICT_COND (s->size >= 1); \ 849 \ 850 TMP_MARK; \ 851 SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \ 852 SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \ 853 \ 854 xp = s->xp; \ 855 yp = s->yp; \ 856 \ 857 if ((s->r & 1) != 0) { xp = ap; } \ 858 if ((s->r & 2) != 0) { yp = ap; } \ 859 if ((s->r & 4) != 0) { xp = sp; } \ 860 if ((s->r & 8) != 0) { yp = sp; } \ 861 if ((s->r & 3) == 3 || (s->r & 12) == 12) \ 862 { \ 863 TMP_FREE; \ 864 return -1.0; \ 865 } \ 866 \ 867 /* initialize ap if operand overlap */ \ 868 if (xp == ap || yp == ap) \ 869 MPN_COPY (ap, s->xp, s->size); \ 870 /* initialize sp if operand overlap */ \ 871 if (xp == sp || yp == sp) \ 872 MPN_COPY (sp, s->xp, s->size); \ 873 \ 874 speed_operand_src (s, xp, s->size); \ 875 speed_operand_src (s, yp, s->size); \ 876 speed_operand_dst (s, ap, s->size); \ 877 speed_operand_dst (s, sp, s->size); \ 878 speed_cache_fill (s); \ 879 \ 880 speed_starttime (); \ 881 i = s->reps; \ 882 do \ 883 call; \ 884 while (--i != 0); \ 885 t = speed_endtime (); \ 886 \ 887 TMP_FREE; \ 888 return t; \ 889 } 890 891 #define SPEED_ROUTINE_MPN_BINARY_N(function) \ 892 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size)) 893 894 #define SPEED_ROUTINE_MPN_BINARY_NC(function) \ 895 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0)) 896 897 898 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. 
*/
/* "call" may use the destination wp (s->size limbs), s->xp, s->size and
   s->r.  Sizes below 1 are excluded.  */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_UNARY_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

/* In-place form: "function" reads and writes wp.  wp is a fresh temporary,
   so it is seeded from s->xp before the timed loop; otherwise the first
   call would read uninitialized limbs.  Operand registration and the
   timed loop match SPEED_ROUTINE_MPN_UNARY_1_CALL.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    MPN_COPY (wp, s->xp, s->size); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      (*function) (wp, wp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

/* Adds "shift" (trailing zero count of s->r) and "dinv" (binvert_limb of
   s->r >> shift) for use by "call".  s->r must be non-zero.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call) \
  { \
    unsigned shift; \
    mp_limb_t dinv; \
    \
    SPEED_RESTRICT_COND (s->size > 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_trailing_zeros (shift, s->r); \
    binvert_limb (dinv, s->r >> shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL \
  ((*function) (wp, s->xp, s->size, s->r, dinv, shift))

#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))


/* Adds "shift" (leading zero count of s->r) and "dinv" (invert_limb of
   s->r << shift) for use by "call".  s->r must be non-zero.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \
  { \
    unsigned shift; \
    mp_limb_t dinv; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_leading_zeros (shift, s->r); \
    invert_limb (dinv, s->r << shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }

#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
  ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))

/* s->size limbs worth of fraction part */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
  ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))


/* The N limb multiplier yp is s->r repeated N times, or
   MP_BASES_BIG_BASE_10 repeated when s->r is zero.  The destination has
   s->size + N-1 limbs.  */
#define SPEED_ROUTINE_MPN_UNARY_N(function,N) \
  { \
    mp_ptr wp; \
    mp_size_t wn; \
    unsigned i; \
    double t; \
    mp_limb_t yp[N]; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= N); \
    \
    TMP_MARK; \
    wn = s->size + N-1; \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
    for (i = 0; i < N; i++) \
      { \
        yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10); \
      } \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, yp, (mp_size_t) N); \
    speed_operand_dst (s, wp, wn); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { function (wp, s->xp, s->size, yp); } while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_UNARY_2(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 2)
#define SPEED_ROUTINE_MPN_UNARY_3(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 3)
#define SPEED_ROUTINE_MPN_UNARY_4(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 4)
#define SPEED_ROUTINE_MPN_UNARY_5(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 5)
#define SPEED_ROUTINE_MPN_UNARY_6(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 6)
#define SPEED_ROUTINE_MPN_UNARY_7(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 7)
#define SPEED_ROUTINE_MPN_UNARY_8(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 8)


/* For mpn_mul, mpn_mul_basecase.  The x operand is s->size limbs.  The y
   operand is size1 limbs, where size1 is s->size when s->r is 0 and s->r
   otherwise; a negative size1 is replaced by -size1 - s->size.  Needs
   1 <= size1 <= s->size.  */
#define SPEED_ROUTINE_MPN_MUL(function) \
  { \
    mp_ptr wp; \
    mp_size_t size1; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    size1 = (s->r == 0 ? s->size : s->r); \
    if (size1 < 0) size1 = -size1 - s->size; \
    \
    SPEED_RESTRICT_COND (size1 >= 1); \
    SPEED_RESTRICT_COND (s->size >= size1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, size1); \
    speed_operand_dst (s, wp, size1 + s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { function (wp, s->xp, s->size, s->yp, size1); } while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }


/* Equal-size product: "call" may use wp (2*s->size limbs), s->xp, s->yp
   and s->size.  */
#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { call; } while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));

/* Low-half product: as SPEED_ROUTINE_MPN_MUL_N_CALL but wp has only
   s->size limbs.  */
#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
return t; \ 1143 } 1144 1145 #define SPEED_ROUTINE_MPN_MULLO_N(function) \ 1146 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size)); 1147 1148 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function) \ 1149 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size)); 1150 1151 #define SPEED_ROUTINE_MPN_SQRLO(function) \ 1152 { \ 1153 mp_ptr wp; \ 1154 unsigned i; \ 1155 double t; \ 1156 TMP_DECL; \ 1157 \ 1158 SPEED_RESTRICT_COND (s->size >= 1); \ 1159 \ 1160 TMP_MARK; \ 1161 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 1162 \ 1163 speed_operand_src (s, s->xp, s->size); \ 1164 speed_operand_dst (s, wp, s->size); \ 1165 speed_cache_fill (s); \ 1166 \ 1167 speed_starttime (); \ 1168 i = s->reps; \ 1169 do \ 1170 function (wp, s->xp, s->size); \ 1171 while (--i != 0); \ 1172 t = speed_endtime (); \ 1173 \ 1174 TMP_FREE; \ 1175 return t; \ 1176 } 1177 1178 /* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */ 1179 #define SPEED_ROUTINE_MPN_MULMID(function) \ 1180 { \ 1181 mp_ptr wp, xp; \ 1182 mp_size_t size1; \ 1183 unsigned i; \ 1184 double t; \ 1185 TMP_DECL; \ 1186 \ 1187 size1 = (s->r == 0 ? 
(2 * s->size - 1) : s->r); \ 1188 \ 1189 SPEED_RESTRICT_COND (s->size >= 1); \ 1190 SPEED_RESTRICT_COND (size1 >= s->size); \ 1191 \ 1192 TMP_MARK; \ 1193 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1194 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1195 \ 1196 speed_operand_src (s, xp, size1); \ 1197 speed_operand_src (s, s->yp, s->size); \ 1198 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1199 speed_cache_fill (s); \ 1200 \ 1201 speed_starttime (); \ 1202 i = s->reps; \ 1203 do \ 1204 function (wp, xp, size1, s->yp, s->size); \ 1205 while (--i != 0); \ 1206 t = speed_endtime (); \ 1207 \ 1208 TMP_FREE; \ 1209 return t; \ 1210 } 1211 1212 #define SPEED_ROUTINE_MPN_MULMID_N(function) \ 1213 { \ 1214 mp_ptr wp, xp; \ 1215 mp_size_t size1; \ 1216 unsigned i; \ 1217 double t; \ 1218 TMP_DECL; \ 1219 \ 1220 size1 = 2 * s->size - 1; \ 1221 \ 1222 SPEED_RESTRICT_COND (s->size >= 1); \ 1223 \ 1224 TMP_MARK; \ 1225 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1226 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1227 \ 1228 speed_operand_src (s, xp, size1); \ 1229 speed_operand_src (s, s->yp, s->size); \ 1230 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1231 speed_cache_fill (s); \ 1232 \ 1233 speed_starttime (); \ 1234 i = s->reps; \ 1235 do \ 1236 function (wp, xp, s->yp, s->size); \ 1237 while (--i != 0); \ 1238 t = speed_endtime (); \ 1239 \ 1240 TMP_FREE; \ 1241 return t; \ 1242 } 1243 1244 #define SPEED_ROUTINE_MPN_TOOM42_MULMID(function) \ 1245 { \ 1246 mp_ptr wp, xp, scratch; \ 1247 mp_size_t size1, scratch_size; \ 1248 unsigned i; \ 1249 double t; \ 1250 TMP_DECL; \ 1251 \ 1252 size1 = 2 * s->size - 1; \ 1253 \ 1254 SPEED_RESTRICT_COND (s->size >= 1); \ 1255 \ 1256 TMP_MARK; \ 1257 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1258 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1259 scratch_size = mpn_toom42_mulmid_itch (s->size); \ 1260 SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0); \ 
1261 \ 1262 speed_operand_src (s, xp, size1); \ 1263 speed_operand_src (s, s->yp, s->size); \ 1264 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1265 speed_cache_fill (s); \ 1266 \ 1267 speed_starttime (); \ 1268 i = s->reps; \ 1269 do \ 1270 function (wp, xp, s->yp, s->size, scratch); \ 1271 while (--i != 0); \ 1272 t = speed_endtime (); \ 1273 \ 1274 TMP_FREE; \ 1275 return t; \ 1276 } 1277 1278 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \ 1279 { \ 1280 mp_ptr wp, tp; \ 1281 unsigned i; \ 1282 double t; \ 1283 mp_size_t itch; \ 1284 TMP_DECL; \ 1285 \ 1286 SPEED_RESTRICT_COND (s->size >= 1); \ 1287 \ 1288 itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \ 1289 \ 1290 TMP_MARK; \ 1291 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ 1292 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ 1293 \ 1294 speed_operand_src (s, s->xp, s->size); \ 1295 speed_operand_src (s, s->yp, s->size); \ 1296 speed_operand_dst (s, wp, 2 * s->size); \ 1297 speed_operand_dst (s, tp, itch); \ 1298 speed_cache_fill (s); \ 1299 \ 1300 speed_starttime (); \ 1301 i = s->reps; \ 1302 do \ 1303 call; \ 1304 while (--i != 0); \ 1305 t = speed_endtime (); \ 1306 \ 1307 TMP_FREE; \ 1308 return t; \ 1309 } 1310 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \ 1311 { \ 1312 mp_ptr wp, tp; \ 1313 unsigned i; \ 1314 double t; \ 1315 mp_size_t size, itch; \ 1316 TMP_DECL; \ 1317 \ 1318 SPEED_RESTRICT_COND (s->size >= 1); \ 1319 \ 1320 size = mpn_mulmod_bnm1_next_size (s->size); \ 1321 itch = mpn_mulmod_bnm1_itch (size, size, size); \ 1322 \ 1323 TMP_MARK; \ 1324 SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \ 1325 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ 1326 \ 1327 speed_operand_src (s, s->xp, s->size); \ 1328 speed_operand_src (s, s->yp, s->size); \ 1329 speed_operand_dst (s, wp, size); \ 1330 speed_operand_dst (s, tp, itch); \ 1331 speed_cache_fill (s); \ 1332 \ 1333 speed_starttime (); \ 1334 i = s->reps; \ 1335 do \ 1336 function (wp, size, s->xp, 
s->size, s->yp, s->size, tp); \ 1337 while (--i != 0); \ 1338 t = speed_endtime (); \ 1339 \ 1340 TMP_FREE; \ 1341 return t; \ 1342 } 1343 1344 #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \ 1345 { \ 1346 mp_ptr wp, tspace; \ 1347 unsigned i; \ 1348 double t; \ 1349 TMP_DECL; \ 1350 \ 1351 SPEED_RESTRICT_COND (s->size >= minsize); \ 1352 \ 1353 TMP_MARK; \ 1354 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1355 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ 1356 \ 1357 speed_operand_src (s, s->xp, s->size); \ 1358 speed_operand_src (s, s->yp, s->size); \ 1359 speed_operand_dst (s, wp, 2*s->size); \ 1360 speed_operand_dst (s, tspace, tsize); \ 1361 speed_cache_fill (s); \ 1362 \ 1363 speed_starttime (); \ 1364 i = s->reps; \ 1365 do \ 1366 call; \ 1367 while (--i != 0); \ 1368 t = speed_endtime (); \ 1369 \ 1370 TMP_FREE; \ 1371 return t; \ 1372 } 1373 1374 #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \ 1375 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1376 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1377 mpn_toom22_mul_itch (s->size, s->size), \ 1378 MPN_TOOM22_MUL_MINSIZE) 1379 1380 #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \ 1381 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1382 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1383 mpn_toom33_mul_itch (s->size, s->size), \ 1384 MPN_TOOM33_MUL_MINSIZE) 1385 1386 #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \ 1387 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1388 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1389 mpn_toom44_mul_itch (s->size, s->size), \ 1390 MPN_TOOM44_MUL_MINSIZE) 1391 1392 #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \ 1393 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1394 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1395 mpn_toom6h_mul_itch (s->size, s->size), \ 1396 MPN_TOOM6H_MUL_MINSIZE) 1397 1398 #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \ 1399 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1400 (function (wp, s->xp, s->size, s->yp, s->size, 
tspace), \
     mpn_toom8h_mul_itch (s->size, s->size), \
     MPN_TOOM8H_MUL_MINSIZE)

/* Unbalanced Toom variants: x is s->size limbs, y is the stated fraction
   of s->size.  */
#define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \
     mpn_toom32_mul_itch (s->size, 2*s->size/3), \
     MPN_TOOM32_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
     mpn_toom42_mul_itch (s->size, s->size/2), \
     MPN_TOOM42_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \
     mpn_toom43_mul_itch (s->size, s->size*3/4), \
     MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
     mpn_toom63_mul_itch (s->size, s->size/2), \
     MPN_TOOM63_MUL_MINSIZE)

/* Paired macros: each pair uses one common y size so that two routines
   can be timed on the same operand shape.  */
#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
     mpn_toom32_mul_itch (s->size, 17*s->size/24), \
     MPN_TOOM32_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
     mpn_toom43_mul_itch (s->size, 17*s->size/24), \
     MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
     mpn_toom32_mul_itch (s->size, 19*s->size/30), \
     MPN_TOOM32_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
1447 mpn_toom53_mul_itch (s->size, 19*s->size/30), \ 1448 MPN_TOOM53_MUL_MINSIZE) 1449 1450 #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \ 1451 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1452 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1453 mpn_toom42_mul_itch (s->size, 11*s->size/20), \ 1454 MPN_TOOM42_MUL_MINSIZE) 1455 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \ 1456 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1457 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1458 mpn_toom53_mul_itch (s->size, 11*s->size/20), \ 1459 MPN_TOOM53_MUL_MINSIZE) 1460 1461 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function) \ 1462 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1463 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ 1464 mpn_toom42_mul_itch (s->size, 5*s->size/6), \ 1465 MPN_TOOM54_MUL_MINSIZE) 1466 #define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function) \ 1467 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1468 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ 1469 mpn_toom54_mul_itch (s->size, 5*s->size/6), \ 1470 MPN_TOOM54_MUL_MINSIZE) 1471 1472 1473 1474 #define SPEED_ROUTINE_MPN_SQR_CALL(call) \ 1475 { \ 1476 mp_ptr wp; \ 1477 unsigned i; \ 1478 double t; \ 1479 TMP_DECL; \ 1480 \ 1481 SPEED_RESTRICT_COND (s->size >= 1); \ 1482 \ 1483 TMP_MARK; \ 1484 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1485 \ 1486 speed_operand_src (s, s->xp, s->size); \ 1487 speed_operand_dst (s, wp, 2*s->size); \ 1488 speed_cache_fill (s); \ 1489 \ 1490 speed_starttime (); \ 1491 i = s->reps; \ 1492 do \ 1493 call; \ 1494 while (--i != 0); \ 1495 t = speed_endtime (); \ 1496 \ 1497 TMP_FREE; \ 1498 return t; \ 1499 } 1500 1501 #define SPEED_ROUTINE_MPN_SQR(function) \ 1502 SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size)) 1503 1504 #define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call) \ 1505 { \ 1506 mp_ptr wp, tp; \ 1507 unsigned i; \ 1508 double t; \ 1509 TMP_DECL; \ 1510 \ 1511 SPEED_RESTRICT_COND (s->size >= 2); \ 
1512 \ 1513 TMP_MARK; \ 1514 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp); \ 1515 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ 1516 \ 1517 speed_operand_src (s, s->xp, s->size); \ 1518 speed_operand_src (s, tp, 2 * s->size); \ 1519 speed_operand_dst (s, wp, 2 * s->size); \ 1520 speed_cache_fill (s); \ 1521 \ 1522 speed_starttime (); \ 1523 i = s->reps; \ 1524 do \ 1525 call; \ 1526 while (--i != 0); \ 1527 t = speed_endtime () / 2; \ 1528 \ 1529 TMP_FREE; \ 1530 return t; \ 1531 } 1532 1533 #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \ 1534 { \ 1535 mp_ptr wp, tspace; \ 1536 unsigned i; \ 1537 double t; \ 1538 TMP_DECL; \ 1539 \ 1540 SPEED_RESTRICT_COND (s->size >= minsize); \ 1541 \ 1542 TMP_MARK; \ 1543 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1544 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ 1545 \ 1546 speed_operand_src (s, s->xp, s->size); \ 1547 speed_operand_dst (s, wp, 2*s->size); \ 1548 speed_operand_dst (s, tspace, tsize); \ 1549 speed_cache_fill (s); \ 1550 \ 1551 speed_starttime (); \ 1552 i = s->reps; \ 1553 do \ 1554 call; \ 1555 while (--i != 0); \ 1556 t = speed_endtime (); \ 1557 \ 1558 TMP_FREE; \ 1559 return t; \ 1560 } 1561 1562 #define SPEED_ROUTINE_MPN_TOOM2_SQR(function) \ 1563 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1564 mpn_toom2_sqr_itch (s->size), \ 1565 MPN_TOOM2_SQR_MINSIZE) 1566 1567 #define SPEED_ROUTINE_MPN_TOOM3_SQR(function) \ 1568 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1569 mpn_toom3_sqr_itch (s->size), \ 1570 MPN_TOOM3_SQR_MINSIZE) 1571 1572 1573 #define SPEED_ROUTINE_MPN_TOOM4_SQR(function) \ 1574 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1575 mpn_toom4_sqr_itch (s->size), \ 1576 MPN_TOOM4_SQR_MINSIZE) 1577 1578 #define SPEED_ROUTINE_MPN_TOOM6_SQR(function) \ 1579 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1580 mpn_toom6_sqr_itch (s->size), \ 1581 
MPN_TOOM6_SQR_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM8_SQR(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
                                mpn_toom8_sqr_itch (s->size), \
                                MPN_TOOM8_SQR_MINSIZE)

/* Remainder-only routines: no destination operand.  "call" may use s->xp,
   s->size and s->r.  */
#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
  { \
    unsigned i; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    \
    return speed_endtime (); \
  }

#define SPEED_ROUTINE_MPN_MOD_1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MOD_1C(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r));

#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)));

#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))

/* s->r must have its high bit set; its inverse from invert_limb is
   computed once, outside the timed loop.  */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
  { \
    unsigned i; \
    mp_limb_t inv; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \
    \
    invert_limb (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      (*function) (s->xp, s->size, s->r, inv); \
    while (--i != 0); \
    \
    return speed_endtime (); \
  }

/* The precomputation pfunc is timed together with function, on every
   repetition.  The divisor is passed shifted left by inv[1].  */
#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \
  { \
    unsigned i; \
    mp_limb_t inv[4]; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    mpn_mod_1_1p_cps (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { \
      pfunc (inv, s->r); \
      function (s->xp, s->size, s->r << inv[1], inv); \
    } while (--i != 0); \
    \
    return speed_endtime (); \
  }
/* As above with an N+3 limb precomputation area; s->r is limited to
   ~(mp_limb_t)0 / N.  */
#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \
  { \
    unsigned i; \
    mp_limb_t inv[N+3]; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { \
      pfunc (inv, s->r); \
      function (s->xp, s->size, s->r, inv); \
    } while (--i != 0); \
    \
    return speed_endtime (); \
  }


/* A division of 2*s->size by s->size limbs.  "call" may use the dividend
   a, the divisor d, the quotient q (s->size+1 limbs), the remainder r and
   the inverse dinv.  At least 2 limbs are required, because the inverse
   is built from the two high divisor limbs d[s->size-1] and d[s->size-2].  */

#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \
  { \
    unsigned i; \
    mp_ptr a, d, q, r; \
    double t; \
    gmp_pi1_t dinv; \
    TMP_DECL; \
    \
    /* size 1 would read d[-1] in invert_pi1 below */ \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (r, s->size, s->align_wp2); \
    \
    MPN_COPY (a, s->xp, s->size); \
    MPN_COPY (a+s->size, s->xp, s->size); \
    \
    MPN_COPY (d, s->yp, s->size); \
    \
    /* normalize the data */ \
    d[s->size-1] |= GMP_NUMB_HIGHBIT; \
    a[2*s->size-1] = d[s->size-1] - 1; \
    \
    invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \
    \
    speed_operand_src (s, a, 2*s->size); \
    speed_operand_src (s, d, s->size); \
    speed_operand_dst (s, q, s->size+1); \
    speed_operand_dst (s, r, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }


/* A remainder 2*s->size by s->size limbs.  The divisor d comes from
   s->yp and the dividend a is built from s->xp; function (r, a, d) is
   timed.  The three mpz_t variables are cleared before returning.  */

#define SPEED_ROUTINE_MPZ_MOD(function) \
  { \
    unsigned i; \
    double t; \
    mpz_t a, d, r; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init_set_n (d, s->yp, s->size); \
    \
    /* high part less than d, low part a duplicate copied in */ \
    mpz_init_set_n (a, s->xp, s->size); \
    mpz_mod (a, a, d); \
    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \
    MPN_COPY (PTR(a), s->xp, s->size); \
    \
    mpz_init (r); \
    \
    speed_operand_src (s, PTR(a), SIZ(a)); \
    speed_operand_src (s, PTR(d), SIZ(d)); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (r, a, d); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    /* free a, d and r; without this they leak on every call */ \
    mpz_clear (r); \
    mpz_clear (a); \
    mpz_clear (d); \
    return t; \
  }

#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \
  { \
    unsigned i; \
    mp_ptr dp, tp, ap, qp; \
    gmp_pi1_t inv; \
    double t; \
    mp_size_t size1; \
    TMP_DECL; \
    \
    size1 = (s->r == 0 ? \
2 * s->size : s->r); \ 1772 \ 1773 SPEED_RESTRICT_COND (s->size >= DMIN); \ 1774 SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \ 1775 \ 1776 TMP_MARK; \ 1777 SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \ 1778 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1779 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1780 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \ 1781 \ 1782 /* we don't fill in dividend completely when size1 > s->size */ \ 1783 MPN_COPY (ap, s->xp, s->size); \ 1784 MPN_COPY (ap + size1 - s->size, s->xp, s->size); \ 1785 \ 1786 MPN_COPY (dp, s->yp, s->size); \ 1787 \ 1788 /* normalize the data */ \ 1789 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1790 ap[size1 - 1] = dp[s->size - 1] - 1; \ 1791 \ 1792 invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \ 1793 \ 1794 speed_operand_src (s, ap, size1); \ 1795 speed_operand_dst (s, tp, size1); \ 1796 speed_operand_src (s, dp, s->size); \ 1797 speed_operand_dst (s, qp, size1 - s->size); \ 1798 speed_cache_fill (s); \ 1799 \ 1800 speed_starttime (); \ 1801 i = s->reps; \ 1802 do { \ 1803 MPN_COPY (tp, ap, size1); \ 1804 function (qp, tp, size1, dp, s->size, INV); \ 1805 } while (--i != 0); \ 1806 t = speed_endtime (); \ 1807 \ 1808 TMP_FREE; \ 1809 return t; \ 1810 } 1811 #define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \ 1812 { \ 1813 unsigned i; \ 1814 mp_ptr dp, tp, qp, scratch; \ 1815 double t; \ 1816 mp_size_t itch; \ 1817 TMP_DECL; \ 1818 \ 1819 SPEED_RESTRICT_COND (s->size >= 2); \ 1820 \ 1821 itch = itchfn (2 * s->size, s->size, 0); \ 1822 TMP_MARK; \ 1823 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1824 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1825 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ 1826 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1827 \ 1828 MPN_COPY (tp, s->xp, s->size); \ 1829 MPN_COPY (tp+s->size, s->xp, s->size); \ 1830 \ 1831 /* normalize the data */ \ 1832 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1833 tp[2*s->size-1] = 
dp[s->size-1] - 1; \ 1834 \ 1835 speed_operand_dst (s, qp, s->size); \ 1836 speed_operand_src (s, tp, 2 * s->size); \ 1837 speed_operand_src (s, dp, s->size); \ 1838 speed_operand_dst (s, scratch, itch); \ 1839 speed_cache_fill (s); \ 1840 \ 1841 speed_starttime (); \ 1842 i = s->reps; \ 1843 do { \ 1844 function (qp, tp, 2 * s->size, dp, s->size, scratch); \ 1845 } while (--i != 0); \ 1846 t = speed_endtime (); \ 1847 \ 1848 TMP_FREE; \ 1849 return t; \ 1850 } 1851 #define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \ 1852 { \ 1853 unsigned i; \ 1854 mp_ptr dp, tp, qp, rp, scratch; \ 1855 double t; \ 1856 mp_size_t size1, itch; \ 1857 TMP_DECL; \ 1858 \ 1859 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1860 \ 1861 SPEED_RESTRICT_COND (s->size >= 2); \ 1862 SPEED_RESTRICT_COND (size1 >= s->size); \ 1863 \ 1864 itch = itchfn (size1, s->size, 0); \ 1865 TMP_MARK; \ 1866 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1867 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1868 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1869 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1870 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? 
*/ \ 1871 \ 1872 /* we don't fill in dividend completely when size1 > s->size */ \ 1873 MPN_COPY (tp, s->xp, s->size); \ 1874 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1875 \ 1876 MPN_COPY (dp, s->yp, s->size); \ 1877 \ 1878 /* normalize the data */ \ 1879 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1880 tp[size1 - 1] = dp[s->size - 1] - 1; \ 1881 \ 1882 speed_operand_dst (s, qp, size1 - s->size); \ 1883 speed_operand_dst (s, rp, s->size); \ 1884 speed_operand_src (s, tp, size1); \ 1885 speed_operand_src (s, dp, s->size); \ 1886 speed_operand_dst (s, scratch, itch); \ 1887 speed_cache_fill (s); \ 1888 \ 1889 speed_starttime (); \ 1890 i = s->reps; \ 1891 do { \ 1892 function (qp, rp, tp, size1, dp, s->size, scratch); \ 1893 } while (--i != 0); \ 1894 t = speed_endtime (); \ 1895 \ 1896 TMP_FREE; \ 1897 return t; \ 1898 } 1899 #define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \ 1900 { \ 1901 unsigned i; \ 1902 mp_ptr dp, tp, qp, rp, ip, scratch, tmp; \ 1903 double t; \ 1904 mp_size_t size1, itch; \ 1905 TMP_DECL; \ 1906 \ 1907 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1908 \ 1909 SPEED_RESTRICT_COND (s->size >= 2); \ 1910 SPEED_RESTRICT_COND (size1 >= s->size); \ 1911 \ 1912 itch = itchfn (size1, s->size, s->size); \ 1913 TMP_MARK; \ 1914 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1915 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1916 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1917 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1918 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 1919 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? 
*/ \ 1920 \ 1921 /* we don't fill in dividend completely when size1 > s->size */ \ 1922 MPN_COPY (tp, s->xp, s->size); \ 1923 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1924 \ 1925 MPN_COPY (dp, s->yp, s->size); \ 1926 \ 1927 /* normalize the data */ \ 1928 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1929 tp[size1 - 1] = dp[s->size-1] - 1; \ 1930 \ 1931 tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size)); \ 1932 mpn_invert (ip, dp, s->size, tmp); \ 1933 \ 1934 speed_operand_dst (s, qp, size1 - s->size); \ 1935 speed_operand_dst (s, rp, s->size); \ 1936 speed_operand_src (s, tp, size1); \ 1937 speed_operand_src (s, dp, s->size); \ 1938 speed_operand_src (s, ip, s->size); \ 1939 speed_operand_dst (s, scratch, itch); \ 1940 speed_cache_fill (s); \ 1941 \ 1942 speed_starttime (); \ 1943 i = s->reps; \ 1944 do { \ 1945 function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \ 1946 } while (--i != 0); \ 1947 t = speed_endtime (); \ 1948 \ 1949 TMP_FREE; \ 1950 return t; \ 1951 } 1952 1953 #define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \ 1954 { \ 1955 unsigned i; \ 1956 mp_ptr dp, tp, ap, qp; \ 1957 mp_limb_t inv; \ 1958 double t; \ 1959 TMP_DECL; \ 1960 \ 1961 SPEED_RESTRICT_COND (s->size >= 1); \ 1962 \ 1963 TMP_MARK; \ 1964 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \ 1965 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1966 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1967 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \ 1968 \ 1969 MPN_COPY (ap, s->xp, s->size); \ 1970 MPN_COPY (ap+s->size, s->xp, s->size); \ 1971 \ 1972 /* divisor must be odd */ \ 1973 MPN_COPY (dp, s->yp, s->size); \ 1974 dp[0] |= 1; \ 1975 binvert_limb (inv, dp[0]); \ 1976 inv = -inv; \ 1977 \ 1978 speed_operand_src (s, ap, 2*s->size); \ 1979 speed_operand_dst (s, tp, 2*s->size); \ 1980 speed_operand_src (s, dp, s->size); \ 1981 speed_operand_dst (s, qp, s->size); \ 1982 speed_cache_fill (s); \ 1983 \ 1984 speed_starttime (); \ 1985 i = s->reps; \ 1986 do { \ 
1987 MPN_COPY (tp, ap, 2*s->size); \ 1988 function (qp, tp, 2*s->size, dp, s->size, inv); \ 1989 } while (--i != 0); \ 1990 t = speed_endtime (); \ 1991 \ 1992 TMP_FREE; \ 1993 return t; \ 1994 } 1995 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \ 1996 { \ 1997 unsigned i; \ 1998 mp_ptr dp, tp, qp; \ 1999 mp_limb_t inv; \ 2000 double t; \ 2001 TMP_DECL; \ 2002 \ 2003 SPEED_RESTRICT_COND (s->size >= 1); \ 2004 \ 2005 TMP_MARK; \ 2006 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2007 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2008 SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \ 2009 \ 2010 /* divisor must be odd */ \ 2011 MPN_COPY (dp, s->yp, s->size); \ 2012 dp[0] |= 1; \ 2013 binvert_limb (inv, dp[0]); \ 2014 inv = -inv; \ 2015 \ 2016 speed_operand_src (s, s->xp, s->size); \ 2017 speed_operand_dst (s, tp, s->size); \ 2018 speed_operand_src (s, dp, s->size); \ 2019 speed_operand_dst (s, qp, s->size); \ 2020 speed_cache_fill (s); \ 2021 \ 2022 speed_starttime (); \ 2023 i = s->reps; \ 2024 do { \ 2025 MPN_COPY (tp, s->xp, s->size); \ 2026 function (qp, tp, s->size, dp, s->size, inv); \ 2027 } while (--i != 0); \ 2028 t = speed_endtime (); \ 2029 \ 2030 TMP_FREE; \ 2031 return t; \ 2032 } 2033 #define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \ 2034 { \ 2035 unsigned i; \ 2036 mp_ptr dp, qp, scratch; \ 2037 double t; \ 2038 mp_size_t itch; \ 2039 TMP_DECL; \ 2040 \ 2041 SPEED_RESTRICT_COND (s->size >= 2); \ 2042 \ 2043 itch = itchfn (s->size, s->size); \ 2044 TMP_MARK; \ 2045 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2046 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2047 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 2048 \ 2049 /* divisor must be odd */ \ 2050 MPN_COPY (dp, s->yp, s->size); \ 2051 dp[0] |= 1; \ 2052 \ 2053 speed_operand_dst (s, qp, s->size); \ 2054 speed_operand_src (s, s->xp, s->size); \ 2055 speed_operand_src (s, dp, s->size); \ 2056 speed_operand_dst (s, scratch, itch); \ 2057 speed_cache_fill 
(s); \ 2058 \ 2059 speed_starttime (); \ 2060 i = s->reps; \ 2061 do { \ 2062 function (qp, s->xp, s->size, dp, s->size, scratch); \ 2063 } while (--i != 0); \ 2064 t = speed_endtime (); \ 2065 \ 2066 TMP_FREE; \ 2067 return t; \ 2068 } 2069 #define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \ 2070 { \ 2071 unsigned i; \ 2072 mp_ptr dp, tp, qp, rp, scratch; \ 2073 double t; \ 2074 mp_size_t itch; \ 2075 TMP_DECL; \ 2076 \ 2077 SPEED_RESTRICT_COND (s->size >= 2); \ 2078 \ 2079 itch = itchfn (2 * s->size, s->size); \ 2080 TMP_MARK; \ 2081 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2082 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2083 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ 2084 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 2085 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 2086 \ 2087 MPN_COPY (tp, s->xp, s->size); \ 2088 MPN_COPY (tp+s->size, s->xp, s->size); \ 2089 \ 2090 /* divisor must be odd */ \ 2091 MPN_COPY (dp, s->yp, s->size); \ 2092 dp[0] |= 1; \ 2093 \ 2094 speed_operand_dst (s, qp, s->size); \ 2095 speed_operand_dst (s, rp, s->size); \ 2096 speed_operand_src (s, tp, 2 * s->size); \ 2097 speed_operand_src (s, dp, s->size); \ 2098 speed_operand_dst (s, scratch, itch); \ 2099 speed_cache_fill (s); \ 2100 \ 2101 speed_starttime (); \ 2102 i = s->reps; \ 2103 do { \ 2104 function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \ 2105 } while (--i != 0); \ 2106 t = speed_endtime (); \ 2107 \ 2108 TMP_FREE; \ 2109 return t; \ 2110 } 2111 2112 #define SPEED_ROUTINE_MPN_BROOT(function) \ 2113 { \ 2114 SPEED_RESTRICT_COND (s->r & 1); \ 2115 s->xp[0] |= 1; \ 2116 SPEED_ROUTINE_MPN_UNARY_1_CALL \ 2117 ((*function) (wp, s->xp, s->size, s->r)); \ 2118 } 2119 2120 #define SPEED_ROUTINE_MPN_BROOTINV(function, itch) \ 2121 { \ 2122 mp_ptr wp, tp; \ 2123 unsigned i; \ 2124 double t; \ 2125 TMP_DECL; \ 2126 TMP_MARK; \ 2127 SPEED_RESTRICT_COND (s->size >= 1); \ 2128 SPEED_RESTRICT_COND (s->r & 1); \ 2129 
wp = TMP_ALLOC_LIMBS (s->size); \ 2130 tp = TMP_ALLOC_LIMBS ( (itch)); \ 2131 s->xp[0] |= 1; \ 2132 \ 2133 speed_operand_src (s, s->xp, s->size); \ 2134 speed_operand_dst (s, wp, s->size); \ 2135 speed_cache_fill (s); \ 2136 \ 2137 speed_starttime (); \ 2138 i = s->reps; \ 2139 do \ 2140 (*function) (wp, s->xp, s->size, s->r, tp); \ 2141 while (--i != 0); \ 2142 t = speed_endtime (); \ 2143 \ 2144 TMP_FREE; \ 2145 return t; \ 2146 } 2147 2148 #define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \ 2149 { \ 2150 long i; \ 2151 mp_ptr up, tp, ip; \ 2152 double t; \ 2153 TMP_DECL; \ 2154 \ 2155 SPEED_RESTRICT_COND (s->size >= 1); \ 2156 \ 2157 TMP_MARK; \ 2158 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2159 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2160 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2161 \ 2162 MPN_COPY (up, s->xp, s->size); \ 2163 \ 2164 /* normalize the data */ \ 2165 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2166 \ 2167 speed_operand_src (s, up, s->size); \ 2168 speed_operand_dst (s, tp, s->size); \ 2169 speed_operand_dst (s, ip, s->size); \ 2170 speed_cache_fill (s); \ 2171 \ 2172 speed_starttime (); \ 2173 i = s->reps; \ 2174 do \ 2175 function (ip, up, s->size, tp); \ 2176 while (--i != 0); \ 2177 t = speed_endtime (); \ 2178 \ 2179 TMP_FREE; \ 2180 return t; \ 2181 } 2182 2183 #define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \ 2184 { \ 2185 long i; \ 2186 mp_ptr up, tp, ip; \ 2187 double t; \ 2188 TMP_DECL; \ 2189 \ 2190 SPEED_RESTRICT_COND (s->size >= 1); \ 2191 \ 2192 TMP_MARK; \ 2193 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2194 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2195 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2196 \ 2197 MPN_COPY (up, s->xp, s->size); \ 2198 \ 2199 /* normalize the data */ \ 2200 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2201 \ 2202 speed_operand_src (s, up, s->size); \ 2203 speed_operand_dst (s, tp, s->size); \ 2204 speed_operand_dst (s, ip, s->size); \ 
2205 speed_cache_fill (s); \ 2206 \ 2207 speed_starttime (); \ 2208 i = s->reps; \ 2209 do \ 2210 function (ip, up, s->size, tp); \ 2211 while (--i != 0); \ 2212 t = speed_endtime (); \ 2213 \ 2214 TMP_FREE; \ 2215 return t; \ 2216 } 2217 2218 #define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \ 2219 { \ 2220 long i; \ 2221 mp_ptr up, tp, ip; \ 2222 double t; \ 2223 TMP_DECL; \ 2224 \ 2225 SPEED_RESTRICT_COND (s->size >= 3); \ 2226 \ 2227 TMP_MARK; \ 2228 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2229 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2230 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2231 \ 2232 MPN_COPY (up, s->xp, s->size); \ 2233 \ 2234 /* normalize the data */ \ 2235 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2236 \ 2237 speed_operand_src (s, up, s->size); \ 2238 speed_operand_dst (s, tp, s->size); \ 2239 speed_operand_dst (s, ip, s->size); \ 2240 speed_cache_fill (s); \ 2241 \ 2242 speed_starttime (); \ 2243 i = s->reps; \ 2244 do \ 2245 function (ip, up, s->size, tp); \ 2246 while (--i != 0); \ 2247 t = speed_endtime (); \ 2248 \ 2249 TMP_FREE; \ 2250 return t; \ 2251 } 2252 2253 #define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \ 2254 { \ 2255 long i; \ 2256 mp_ptr up, tp, ip; \ 2257 double t; \ 2258 TMP_DECL; \ 2259 \ 2260 SPEED_RESTRICT_COND (s->size >= 1); \ 2261 \ 2262 TMP_MARK; \ 2263 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2264 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2265 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2266 \ 2267 MPN_COPY (up, s->xp, s->size); \ 2268 \ 2269 /* normalize the data */ \ 2270 up[0] |= 1; \ 2271 \ 2272 speed_operand_src (s, up, s->size); \ 2273 speed_operand_dst (s, tp, s->size); \ 2274 speed_operand_dst (s, ip, s->size); \ 2275 speed_cache_fill (s); \ 2276 \ 2277 speed_starttime (); \ 2278 i = s->reps; \ 2279 do \ 2280 function (ip, up, s->size, tp); \ 2281 while (--i != 0); \ 2282 t = speed_endtime (); \ 2283 \ 2284 TMP_FREE; \ 2285 return 
t; \ 2286 } 2287 2288 #define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn) \ 2289 { \ 2290 long i; \ 2291 mp_ptr up, mp, tp, ip; \ 2292 double t; \ 2293 TMP_DECL; \ 2294 \ 2295 SPEED_RESTRICT_COND (s->size >= 1); \ 2296 \ 2297 TMP_MARK; \ 2298 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2299 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2300 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2301 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2302 \ 2303 speed_operand_src (s, up, s->size); \ 2304 speed_operand_dst (s, tp, s->size); \ 2305 speed_operand_dst (s, ip, s->size); \ 2306 speed_cache_fill (s); \ 2307 \ 2308 MPN_COPY (mp, s->yp, s->size); \ 2309 /* Must be odd */ \ 2310 mp[0] |= 1; \ 2311 speed_starttime (); \ 2312 i = s->reps; \ 2313 do \ 2314 { \ 2315 MPN_COPY (up, s->xp, s->size); \ 2316 function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp); \ 2317 } \ 2318 while (--i != 0); \ 2319 t = speed_endtime (); \ 2320 \ 2321 TMP_FREE; \ 2322 return t; \ 2323 } 2324 2325 #define SPEED_ROUTINE_REDC_1(function) \ 2326 { \ 2327 unsigned i; \ 2328 mp_ptr cp, mp, tp, ap; \ 2329 mp_limb_t inv; \ 2330 double t; \ 2331 TMP_DECL; \ 2332 \ 2333 SPEED_RESTRICT_COND (s->size >= 1); \ 2334 \ 2335 TMP_MARK; \ 2336 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2337 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2338 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2339 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2340 \ 2341 MPN_COPY (ap, s->xp, s->size); \ 2342 MPN_COPY (ap+s->size, s->xp, s->size); \ 2343 \ 2344 /* modulus must be odd */ \ 2345 MPN_COPY (mp, s->yp, s->size); \ 2346 mp[0] |= 1; \ 2347 binvert_limb (inv, mp[0]); \ 2348 inv = -inv; \ 2349 \ 2350 speed_operand_src (s, ap, 2*s->size+1); \ 2351 speed_operand_dst (s, tp, 2*s->size+1); \ 2352 speed_operand_src (s, mp, s->size); \ 2353 speed_operand_dst (s, cp, s->size); \ 2354 speed_cache_fill (s); \ 2355 \ 2356 speed_starttime (); \ 2357 i = s->reps; 
\ 2358 do { \ 2359 MPN_COPY (tp, ap, 2*s->size); \ 2360 function (cp, tp, mp, s->size, inv); \ 2361 } while (--i != 0); \ 2362 t = speed_endtime (); \ 2363 \ 2364 TMP_FREE; \ 2365 return t; \ 2366 } 2367 #define SPEED_ROUTINE_REDC_2(function) \ 2368 { \ 2369 unsigned i; \ 2370 mp_ptr cp, mp, tp, ap; \ 2371 mp_limb_t invp[2]; \ 2372 double t; \ 2373 TMP_DECL; \ 2374 \ 2375 SPEED_RESTRICT_COND (s->size >= 1); \ 2376 \ 2377 TMP_MARK; \ 2378 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2379 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2380 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2381 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2382 \ 2383 MPN_COPY (ap, s->xp, s->size); \ 2384 MPN_COPY (ap+s->size, s->xp, s->size); \ 2385 \ 2386 /* modulus must be odd */ \ 2387 MPN_COPY (mp, s->yp, s->size); \ 2388 mp[0] |= 1; \ 2389 mpn_binvert (invp, mp, 2, tp); \ 2390 invp[0] = -invp[0]; invp[1] = ~invp[1]; \ 2391 \ 2392 speed_operand_src (s, ap, 2*s->size+1); \ 2393 speed_operand_dst (s, tp, 2*s->size+1); \ 2394 speed_operand_src (s, mp, s->size); \ 2395 speed_operand_dst (s, cp, s->size); \ 2396 speed_cache_fill (s); \ 2397 \ 2398 speed_starttime (); \ 2399 i = s->reps; \ 2400 do { \ 2401 MPN_COPY (tp, ap, 2*s->size); \ 2402 function (cp, tp, mp, s->size, invp); \ 2403 } while (--i != 0); \ 2404 t = speed_endtime (); \ 2405 \ 2406 TMP_FREE; \ 2407 return t; \ 2408 } 2409 #define SPEED_ROUTINE_REDC_N(function) \ 2410 { \ 2411 unsigned i; \ 2412 mp_ptr cp, mp, tp, ap, invp; \ 2413 double t; \ 2414 TMP_DECL; \ 2415 \ 2416 SPEED_RESTRICT_COND (s->size > 8); \ 2417 \ 2418 TMP_MARK; \ 2419 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2420 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2421 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2422 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2423 SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); /* align? 
*/ \ 2424 \ 2425 MPN_COPY (ap, s->xp, s->size); \ 2426 MPN_COPY (ap+s->size, s->xp, s->size); \ 2427 \ 2428 /* modulus must be odd */ \ 2429 MPN_COPY (mp, s->yp, s->size); \ 2430 mp[0] |= 1; \ 2431 mpn_binvert (invp, mp, s->size, tp); \ 2432 \ 2433 speed_operand_src (s, ap, 2*s->size+1); \ 2434 speed_operand_dst (s, tp, 2*s->size+1); \ 2435 speed_operand_src (s, mp, s->size); \ 2436 speed_operand_dst (s, cp, s->size); \ 2437 speed_cache_fill (s); \ 2438 \ 2439 speed_starttime (); \ 2440 i = s->reps; \ 2441 do { \ 2442 MPN_COPY (tp, ap, 2*s->size); \ 2443 function (cp, tp, mp, s->size, invp); \ 2444 } while (--i != 0); \ 2445 t = speed_endtime (); \ 2446 \ 2447 TMP_FREE; \ 2448 return t; \ 2449 } 2450 2451 2452 #define SPEED_ROUTINE_MPN_POPCOUNT(function) \ 2453 { \ 2454 unsigned i; \ 2455 \ 2456 SPEED_RESTRICT_COND (s->size >= 1); \ 2457 \ 2458 speed_operand_src (s, s->xp, s->size); \ 2459 speed_cache_fill (s); \ 2460 \ 2461 speed_starttime (); \ 2462 i = s->reps; \ 2463 do \ 2464 function (s->xp, s->size); \ 2465 while (--i != 0); \ 2466 \ 2467 return speed_endtime (); \ 2468 } 2469 2470 #define SPEED_ROUTINE_MPN_HAMDIST(function) \ 2471 { \ 2472 unsigned i; \ 2473 \ 2474 SPEED_RESTRICT_COND (s->size >= 1); \ 2475 \ 2476 speed_operand_src (s, s->xp, s->size); \ 2477 speed_operand_src (s, s->yp, s->size); \ 2478 speed_cache_fill (s); \ 2479 \ 2480 speed_starttime (); \ 2481 i = s->reps; \ 2482 do \ 2483 function (s->xp, s->yp, s->size); \ 2484 while (--i != 0); \ 2485 \ 2486 return speed_endtime (); \ 2487 } 2488 2489 2490 #define SPEED_ROUTINE_MPZ_UI(function) \ 2491 { \ 2492 mpz_t z; \ 2493 unsigned i; \ 2494 double t; \ 2495 \ 2496 SPEED_RESTRICT_COND (s->size >= 0); \ 2497 \ 2498 mpz_init (z); \ 2499 \ 2500 speed_starttime (); \ 2501 i = s->reps; \ 2502 do \ 2503 function (z, s->size); \ 2504 while (--i != 0); \ 2505 t = speed_endtime (); \ 2506 \ 2507 mpz_clear (z); \ 2508 return t; \ 2509 } 2510 2511 #define SPEED_ROUTINE_MPZ_FAC_UI(function) 
SPEED_ROUTINE_MPZ_UI(function) 2512 #define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2513 #define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2514 2515 2516 #define SPEED_ROUTINE_MPZ_2_UI(function) \ 2517 { \ 2518 mpz_t z, z2; \ 2519 unsigned i; \ 2520 double t; \ 2521 \ 2522 SPEED_RESTRICT_COND (s->size >= 0); \ 2523 \ 2524 mpz_init (z); \ 2525 mpz_init (z2); \ 2526 \ 2527 speed_starttime (); \ 2528 i = s->reps; \ 2529 do \ 2530 function (z, z2, s->size); \ 2531 while (--i != 0); \ 2532 t = speed_endtime (); \ 2533 \ 2534 mpz_clear (z); \ 2535 mpz_clear (z2); \ 2536 return t; \ 2537 } 2538 2539 #define SPEED_ROUTINE_MPZ_FIB2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2540 #define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2541 2542 2543 #define SPEED_ROUTINE_MPN_FIB2_UI(function) \ 2544 { \ 2545 mp_ptr fp, f1p; \ 2546 mp_size_t alloc; \ 2547 unsigned i; \ 2548 double t; \ 2549 TMP_DECL; \ 2550 \ 2551 SPEED_RESTRICT_COND (s->size >= 0); \ 2552 \ 2553 TMP_MARK; \ 2554 alloc = MPN_FIB2_SIZE (s->size); \ 2555 SPEED_TMP_ALLOC_LIMBS (fp, alloc, s->align_xp); \ 2556 SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \ 2557 \ 2558 speed_starttime (); \ 2559 i = s->reps; \ 2560 do \ 2561 function (fp, f1p, s->size); \ 2562 while (--i != 0); \ 2563 t = speed_endtime (); \ 2564 \ 2565 TMP_FREE; \ 2566 return t; \ 2567 } 2568 2569 2570 2571 /* Calculate b^e mod m for random b and m of s->size limbs and random e of 6 2572 limbs. m is forced to odd so that redc can be used. e is limited in 2573 size so the calculation doesn't take too long. 
*/ 2574 #define SPEED_ROUTINE_MPZ_POWM(function) \ 2575 { \ 2576 mpz_t r, b, e, m; \ 2577 unsigned i; \ 2578 double t; \ 2579 \ 2580 SPEED_RESTRICT_COND (s->size >= 1); \ 2581 \ 2582 mpz_init (r); \ 2583 mpz_init_set_n (b, s->xp, s->size); \ 2584 mpz_init_set_n (m, s->yp, s->size); \ 2585 mpz_setbit (m, 0); /* force m to odd */ \ 2586 mpz_init_set_n (e, s->xp_block, 6); \ 2587 \ 2588 speed_starttime (); \ 2589 i = s->reps; \ 2590 do \ 2591 function (r, b, e, m); \ 2592 while (--i != 0); \ 2593 t = speed_endtime (); \ 2594 \ 2595 mpz_clear (r); \ 2596 mpz_clear (b); \ 2597 mpz_clear (e); \ 2598 mpz_clear (m); \ 2599 return t; \ 2600 } 2601 2602 /* (m-2)^0xAAAAAAAA mod m */ 2603 #define SPEED_ROUTINE_MPZ_POWM_UI(function) \ 2604 { \ 2605 mpz_t r, b, m; \ 2606 unsigned long e; \ 2607 unsigned i; \ 2608 double t; \ 2609 \ 2610 SPEED_RESTRICT_COND (s->size >= 1); \ 2611 \ 2612 mpz_init (r); \ 2613 \ 2614 /* force m to odd */ \ 2615 mpz_init (m); \ 2616 mpz_set_n (m, s->xp, s->size); \ 2617 PTR(m)[0] |= 1; \ 2618 \ 2619 e = (~ (unsigned long) 0) / 3; \ 2620 if (s->r != 0) \ 2621 e = s->r; \ 2622 \ 2623 mpz_init_set (b, m); \ 2624 mpz_sub_ui (b, b, 2); \ 2625 /* printf ("%X\n", mpz_get_ui(m)); */ \ 2626 i = s->reps; \ 2627 speed_starttime (); \ 2628 do \ 2629 function (r, b, e, m); \ 2630 while (--i != 0); \ 2631 t = speed_endtime (); \ 2632 \ 2633 mpz_clear (r); \ 2634 mpz_clear (b); \ 2635 mpz_clear (m); \ 2636 return t; \ 2637 } 2638 2639 2640 #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \ 2641 { \ 2642 mp_ptr wp, wp2, xp, yp; \ 2643 unsigned i; \ 2644 double t; \ 2645 TMP_DECL; \ 2646 \ 2647 SPEED_RESTRICT_COND (s->size >= 0); \ 2648 \ 2649 TMP_MARK; \ 2650 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 2651 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 2652 xp = s->xp; \ 2653 yp = s->yp; \ 2654 \ 2655 if (s->r == 0) ; \ 2656 else if (s->r == 1) { xp = wp; } \ 2657 else if (s->r == 2) { yp = wp2; } \ 2658 else if (s->r == 3) { xp = wp; yp = wp2; } \ 2659 
else if (s->r == 4) { xp = wp2; yp = wp; } \ 2660 else { \ 2661 TMP_FREE; \ 2662 return -1.0; \ 2663 } \ 2664 if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \ 2665 if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \ 2666 \ 2667 speed_operand_src (s, xp, s->size); \ 2668 speed_operand_src (s, yp, s->size); \ 2669 speed_operand_dst (s, wp, s->size); \ 2670 speed_operand_dst (s, wp2, s->size); \ 2671 speed_cache_fill (s); \ 2672 \ 2673 speed_starttime (); \ 2674 i = s->reps; \ 2675 do \ 2676 call; \ 2677 while (--i != 0); \ 2678 t = speed_endtime (); \ 2679 \ 2680 TMP_FREE; \ 2681 return t; \ 2682 } 2683 2684 #define SPEED_ROUTINE_MPN_ADDSUB_N(function) \ 2685 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2686 (function (wp, wp2, xp, yp, s->size)); 2687 2688 #define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \ 2689 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2690 (function (wp, wp2, xp, yp, s->size, 0)); 2691 2692 2693 /* Doing an Nx1 gcd with the given r. */ 2694 #define SPEED_ROUTINE_MPN_GCD_1N(function) \ 2695 { \ 2696 mp_ptr xp; \ 2697 unsigned i; \ 2698 double t; \ 2699 TMP_DECL; \ 2700 \ 2701 SPEED_RESTRICT_COND (s->size >= 1); \ 2702 SPEED_RESTRICT_COND (s->r != 0); \ 2703 \ 2704 TMP_MARK; \ 2705 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ 2706 MPN_COPY (xp, s->xp, s->size); \ 2707 xp[0] |= refmpn_zero_p (xp, s->size); \ 2708 \ 2709 speed_operand_src (s, s->xp, s->size); \ 2710 speed_cache_fill (s); \ 2711 \ 2712 speed_starttime (); \ 2713 i = s->reps; \ 2714 do \ 2715 function (xp, s->size, s->r); \ 2716 while (--i != 0); \ 2717 t = speed_endtime (); \ 2718 \ 2719 TMP_FREE; \ 2720 return t; \ 2721 } 2722 2723 2724 /* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. 
*/ 2725 2726 #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \ 2727 { \ 2728 unsigned i, j; \ 2729 mp_ptr px, py; \ 2730 mp_limb_t x_mask, y_mask; \ 2731 double t; \ 2732 TMP_DECL; \ 2733 \ 2734 SPEED_RESTRICT_COND (s->size >= 1); \ 2735 SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \ 2736 \ 2737 TMP_MARK; \ 2738 SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \ 2739 SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \ 2740 MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \ 2741 MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \ 2742 \ 2743 x_mask = MP_LIMB_T_LOWBITMASK (s->size); \ 2744 y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \ 2745 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 2746 { \ 2747 px[i] &= x_mask; px[i] += (px[i] == 0); \ 2748 py[i] &= y_mask; py[i] += (py[i] == 0); \ 2749 setup; \ 2750 } \ 2751 \ 2752 speed_operand_src (s, px, SPEED_BLOCK_SIZE); \ 2753 speed_operand_src (s, py, SPEED_BLOCK_SIZE); \ 2754 speed_cache_fill (s); \ 2755 \ 2756 speed_starttime (); \ 2757 i = s->reps; \ 2758 do \ 2759 { \ 2760 j = SPEED_BLOCK_SIZE; \ 2761 do \ 2762 { \ 2763 call; \ 2764 } \ 2765 while (--j != 0); \ 2766 } \ 2767 while (--i != 0); \ 2768 t = speed_endtime (); \ 2769 \ 2770 TMP_FREE; \ 2771 \ 2772 s->time_divisor = SPEED_BLOCK_SIZE; \ 2773 return t; \ 2774 } 2775 2776 #define SPEED_ROUTINE_MPN_GCD_1(function) \ 2777 SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1])) 2778 2779 #define SPEED_ROUTINE_MPN_JACBASE(function) \ 2780 SPEED_ROUTINE_MPN_GCD_1_CALL \ 2781 ({ \ 2782 /* require x<y, y odd, y!=1 */ \ 2783 px[i] %= py[i]; \ 2784 px[i] |= 1; \ 2785 py[i] |= 1; \ 2786 if (py[i]==1) py[i]=3; \ 2787 }, \ 2788 function (px[j-1], py[j-1], 0)) 2789 2790 2791 #define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc) \ 2792 { \ 2793 mp_size_t hgcd_init_itch, hgcd_itch; \ 2794 mp_ptr ap, bp, wp, tmp1; \ 2795 struct hgcd_matrix hgcd; \ 2796 int res; \ 2797 unsigned i; \ 2798 double t; \ 2799 TMP_DECL; \ 2800 \ 2801 if (s->size 
< 2) \ 2802 return -1; \ 2803 \ 2804 TMP_MARK; \ 2805 \ 2806 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ 2807 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ 2808 \ 2809 s->xp[s->size - 1] |= 1; \ 2810 s->yp[s->size - 1] |= 1; \ 2811 \ 2812 hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ 2813 hgcd_itch = itchfunc (s->size); \ 2814 \ 2815 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ 2816 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp); \ 2817 \ 2818 speed_operand_src (s, s->xp, s->size); \ 2819 speed_operand_src (s, s->yp, s->size); \ 2820 speed_operand_dst (s, ap, s->size + 1); \ 2821 speed_operand_dst (s, bp, s->size + 1); \ 2822 speed_operand_dst (s, wp, hgcd_itch); \ 2823 speed_operand_dst (s, tmp1, hgcd_init_itch); \ 2824 speed_cache_fill (s); \ 2825 \ 2826 speed_starttime (); \ 2827 i = s->reps; \ 2828 do \ 2829 { \ 2830 MPN_COPY (ap, s->xp, s->size); \ 2831 MPN_COPY (bp, s->yp, s->size); \ 2832 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ 2833 res = func (ap, bp, s->size, &hgcd, wp); \ 2834 } \ 2835 while (--i != 0); \ 2836 t = speed_endtime (); \ 2837 TMP_FREE; \ 2838 return t; \ 2839 } 2840 2841 #define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \ 2842 { \ 2843 mp_size_t hgcd_init_itch, hgcd_step_itch; \ 2844 mp_ptr ap, bp, wp, tmp1; \ 2845 struct hgcd_matrix hgcd; \ 2846 mp_size_t p = s->size/2; \ 2847 int res; \ 2848 unsigned i; \ 2849 double t; \ 2850 TMP_DECL; \ 2851 \ 2852 if (s->size < 2) \ 2853 return -1; \ 2854 \ 2855 TMP_MARK; \ 2856 \ 2857 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ 2858 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ 2859 \ 2860 s->xp[s->size - 1] |= 1; \ 2861 s->yp[s->size - 1] |= 1; \ 2862 \ 2863 hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ 2864 hgcd_step_itch = itchfunc (s->size, p); \ 2865 \ 2866 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ 2867 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp); \ 2868 \ 2869 
speed_operand_src (s, s->xp, s->size); \ 2870 speed_operand_src (s, s->yp, s->size); \ 2871 speed_operand_dst (s, ap, s->size + 1); \ 2872 speed_operand_dst (s, bp, s->size + 1); \ 2873 speed_operand_dst (s, wp, hgcd_step_itch); \ 2874 speed_operand_dst (s, tmp1, hgcd_init_itch); \ 2875 speed_cache_fill (s); \ 2876 \ 2877 speed_starttime (); \ 2878 i = s->reps; \ 2879 do \ 2880 { \ 2881 MPN_COPY (ap, s->xp, s->size); \ 2882 MPN_COPY (bp, s->yp, s->size); \ 2883 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ 2884 res = func (&hgcd, ap, bp, s->size, p, wp); \ 2885 } \ 2886 while (--i != 0); \ 2887 t = speed_endtime (); \ 2888 TMP_FREE; \ 2889 return t; \ 2890 } 2891 2892 /* Run some GCDs of s->size limbs each. The number of different data values 2893 is decreased as s->size**2, since GCD is a quadratic algorithm. 2894 SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT 2895 though, because the plain gcd is about twice as fast as gcdext. */ 2896 2897 #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \ 2898 { \ 2899 unsigned i; \ 2900 mp_size_t j, pieces, psize; \ 2901 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 2902 double t; \ 2903 TMP_DECL; \ 2904 \ 2905 SPEED_RESTRICT_COND (s->size >= 1); \ 2906 \ 2907 TMP_MARK; \ 2908 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 2909 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 2910 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ 2911 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 2912 \ 2913 pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \ 2914 pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \ 2915 pieces = MAX (pieces, 1); \ 2916 \ 2917 psize = pieces * s->size; \ 2918 px = TMP_ALLOC_LIMBS (psize); \ 2919 py = TMP_ALLOC_LIMBS (psize); \ 2920 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 2921 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 2922 \ 2923 /* Requirements: x >= y, y must be odd, high limbs != 0. 
\ 2924 No need to ensure random numbers are really great. */ \ 2925 for (j = 0; j < pieces; j++) \ 2926 { \ 2927 mp_ptr x = px + j * s->size; \ 2928 mp_ptr y = py + j * s->size; \ 2929 if (x[s->size - 1] == 0) x[s->size - 1] = 1; \ 2930 if (y[s->size - 1] == 0) y[s->size - 1] = 1; \ 2931 \ 2932 if (x[s->size - 1] < y[s->size - 1]) \ 2933 MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]); \ 2934 else if (x[s->size - 1] == y[s->size - 1]) \ 2935 { \ 2936 x[s->size - 1] = 2; \ 2937 y[s->size - 1] = 1; \ 2938 } \ 2939 y[0] |= 1; \ 2940 } \ 2941 \ 2942 speed_operand_src (s, px, psize); \ 2943 speed_operand_src (s, py, psize); \ 2944 speed_operand_dst (s, xtmp, s->size); \ 2945 speed_operand_dst (s, ytmp, s->size); \ 2946 speed_operand_dst (s, wp, s->size); \ 2947 speed_cache_fill (s); \ 2948 \ 2949 speed_starttime (); \ 2950 i = s->reps; \ 2951 do \ 2952 { \ 2953 j = pieces; \ 2954 do \ 2955 { \ 2956 MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \ 2957 MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \ 2958 call; \ 2959 } \ 2960 while (--j != 0); \ 2961 } \ 2962 while (--i != 0); \ 2963 t = speed_endtime (); \ 2964 \ 2965 TMP_FREE; \ 2966 \ 2967 s->time_divisor = pieces; \ 2968 return t; \ 2969 } 2970 2971 #define SPEED_ROUTINE_MPN_GCD(function) \ 2972 SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size)) 2973 2974 #define SPEED_ROUTINE_MPN_GCDEXT(function) \ 2975 SPEED_ROUTINE_MPN_GCD_CALL \ 2976 (4, { mp_size_t wp2size; \ 2977 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); }) 2978 2979 2980 #define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \ 2981 { \ 2982 unsigned i; \ 2983 mp_size_t j, pieces, psize, wp2size; \ 2984 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 2985 double t; \ 2986 TMP_DECL; \ 2987 \ 2988 SPEED_RESTRICT_COND (s->size >= 1); \ 2989 \ 2990 TMP_MARK; \ 2991 \ 2992 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 2993 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 2994 MPN_COPY (xtmp, s->xp, s->size); \ 2995 MPN_COPY 
(ytmp, s->yp, s->size); \ 2996 \ 2997 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ 2998 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 2999 \ 3000 pieces = SPEED_BLOCK_SIZE / 3; \ 3001 psize = 3 * pieces; \ 3002 px = TMP_ALLOC_LIMBS (psize); \ 3003 py = TMP_ALLOC_LIMBS (psize); \ 3004 MPN_COPY (px, s->xp_block, psize); \ 3005 MPN_COPY (py, s->yp_block, psize); \ 3006 \ 3007 /* x must have at least as many bits as y, \ 3008 high limbs must be non-zero */ \ 3009 for (j = 0; j < pieces; j++) \ 3010 { \ 3011 mp_ptr x = px+3*j; \ 3012 mp_ptr y = py+3*j; \ 3013 x[2] += (x[2] == 0); \ 3014 y[2] += (y[2] == 0); \ 3015 if (x[2] < y[2]) \ 3016 MP_LIMB_T_SWAP (x[2], y[2]); \ 3017 } \ 3018 \ 3019 speed_operand_src (s, px, psize); \ 3020 speed_operand_src (s, py, psize); \ 3021 speed_operand_dst (s, xtmp, s->size); \ 3022 speed_operand_dst (s, ytmp, s->size); \ 3023 speed_operand_dst (s, wp, s->size); \ 3024 speed_cache_fill (s); \ 3025 \ 3026 speed_starttime (); \ 3027 i = s->reps; \ 3028 do \ 3029 { \ 3030 mp_ptr x = px; \ 3031 mp_ptr y = py; \ 3032 mp_ptr xth = &xtmp[s->size-3]; \ 3033 mp_ptr yth = &ytmp[s->size-3]; \ 3034 j = pieces; \ 3035 do \ 3036 { \ 3037 xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \ 3038 yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \ 3039 \ 3040 ytmp[0] |= 1; /* y must be odd, */ \ 3041 \ 3042 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \ 3043 \ 3044 x += 3; \ 3045 y += 3; \ 3046 } \ 3047 while (--j != 0); \ 3048 } \ 3049 while (--i != 0); \ 3050 t = speed_endtime (); \ 3051 \ 3052 TMP_FREE; \ 3053 \ 3054 s->time_divisor = pieces; \ 3055 return t; \ 3056 } 3057 3058 #define SPEED_ROUTINE_MPZ_JACOBI(function) \ 3059 { \ 3060 mpz_t a, b; \ 3061 unsigned i; \ 3062 mp_size_t j, pieces, psize; \ 3063 mp_ptr px, py; \ 3064 double t; \ 3065 TMP_DECL; \ 3066 \ 3067 TMP_MARK; \ 3068 pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \ 3069 pieces = MAX (pieces, 1); \ 3070 s->time_divisor = pieces; \ 3071 \ 3072 psize = pieces * 
s->size; \ 3073 px = TMP_ALLOC_LIMBS (psize); \ 3074 py = TMP_ALLOC_LIMBS (psize); \ 3075 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 3076 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 3077 \ 3078 for (j = 0; j < pieces; j++) \ 3079 { \ 3080 mp_ptr x = px+j*s->size; \ 3081 mp_ptr y = py+j*s->size; \ 3082 \ 3083 /* y odd */ \ 3084 y[0] |= 1; \ 3085 \ 3086 /* high limbs non-zero */ \ 3087 if (x[s->size-1] == 0) x[s->size-1] = 1; \ 3088 if (y[s->size-1] == 0) y[s->size-1] = 1; \ 3089 } \ 3090 \ 3091 SIZ(a) = s->size; \ 3092 SIZ(b) = s->size; \ 3093 \ 3094 speed_operand_src (s, px, psize); \ 3095 speed_operand_src (s, py, psize); \ 3096 speed_cache_fill (s); \ 3097 \ 3098 speed_starttime (); \ 3099 i = s->reps; \ 3100 do \ 3101 { \ 3102 j = pieces; \ 3103 do \ 3104 { \ 3105 PTR(a) = px+(j-1)*s->size; \ 3106 PTR(b) = py+(j-1)*s->size; \ 3107 function (a, b); \ 3108 } \ 3109 while (--j != 0); \ 3110 } \ 3111 while (--i != 0); \ 3112 t = speed_endtime (); \ 3113 \ 3114 TMP_FREE; \ 3115 return t; \ 3116 } 3117 3118 #define SPEED_ROUTINE_MPN_DIVREM_2(function) \ 3119 { \ 3120 mp_ptr wp, xp; \ 3121 mp_limb_t yp[2]; \ 3122 unsigned i; \ 3123 double t; \ 3124 TMP_DECL; \ 3125 \ 3126 SPEED_RESTRICT_COND (s->size >= 2); \ 3127 \ 3128 TMP_MARK; \ 3129 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ 3130 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3131 \ 3132 /* source is destroyed */ \ 3133 MPN_COPY (xp, s->xp, s->size); \ 3134 \ 3135 /* divisor must be normalized */ \ 3136 MPN_COPY (yp, s->yp_block, 2); \ 3137 yp[1] |= GMP_NUMB_HIGHBIT; \ 3138 \ 3139 speed_operand_src (s, xp, s->size); \ 3140 speed_operand_src (s, yp, 2); \ 3141 speed_operand_dst (s, wp, s->size); \ 3142 speed_cache_fill (s); \ 3143 \ 3144 speed_starttime (); \ 3145 i = s->reps; \ 3146 do \ 3147 function (wp, 0, xp, s->size, yp); \ 3148 while (--i != 0); \ 3149 t = speed_endtime (); \ 3150 \ 3151 TMP_FREE; \ 3152 return t; \ 3153 } 3154 3155 #define 
SPEED_ROUTINE_MPN_DIV_QR_1(function) \ 3156 { \ 3157 mp_ptr wp, xp; \ 3158 mp_limb_t d; \ 3159 mp_limb_t r; \ 3160 unsigned i; \ 3161 double t; \ 3162 TMP_DECL; \ 3163 \ 3164 SPEED_RESTRICT_COND (s->size >= 1); \ 3165 \ 3166 TMP_MARK; \ 3167 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3168 \ 3169 d = s->r; \ 3170 if (d == 0) \ 3171 d = 1; \ 3172 speed_operand_src (s, s->xp, s->size); \ 3173 speed_operand_dst (s, wp, s->size); \ 3174 speed_cache_fill (s); \ 3175 \ 3176 speed_starttime (); \ 3177 i = s->reps; \ 3178 do \ 3179 r = function (wp, wp+s->size-1, s->xp, s->size, d); \ 3180 while (--i != 0); \ 3181 t = speed_endtime (); \ 3182 \ 3183 TMP_FREE; \ 3184 return t; \ 3185 } 3186 3187 #define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function) \ 3188 { \ 3189 mp_ptr wp, xp; \ 3190 mp_limb_t d, dinv; \ 3191 mp_limb_t r; \ 3192 unsigned i; \ 3193 double t; \ 3194 TMP_DECL; \ 3195 \ 3196 SPEED_RESTRICT_COND (s->size >= 1); \ 3197 \ 3198 TMP_MARK; \ 3199 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3200 \ 3201 d = s->r; \ 3202 /* divisor must be normalized */ \ 3203 SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT); \ 3204 invert_limb (dinv, d); \ 3205 speed_operand_src (s, s->xp, s->size); \ 3206 speed_operand_dst (s, wp, s->size); \ 3207 speed_cache_fill (s); \ 3208 \ 3209 speed_starttime (); \ 3210 i = s->reps; \ 3211 do \ 3212 r = function (wp, s->xp, s->size, 0, d, dinv); \ 3213 while (--i != 0); \ 3214 t = speed_endtime (); \ 3215 \ 3216 TMP_FREE; \ 3217 return t; \ 3218 } 3219 3220 #define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm) \ 3221 { \ 3222 mp_ptr wp, xp; \ 3223 mp_limb_t yp[2]; \ 3224 mp_limb_t rp[2]; \ 3225 unsigned i; \ 3226 double t; \ 3227 TMP_DECL; \ 3228 \ 3229 SPEED_RESTRICT_COND (s->size >= 2); \ 3230 \ 3231 TMP_MARK; \ 3232 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3233 \ 3234 /* divisor must be normalized */ \ 3235 MPN_COPY (yp, s->yp_block, 2); \ 3236 if (norm) \ 3237 yp[1] |= GMP_NUMB_HIGHBIT; \ 3238 else \ 3239 { \ 3240 yp[1] &= 
~GMP_NUMB_HIGHBIT; \ 3241 if (yp[1] == 0) \ 3242 yp[1] = 1; \ 3243 } \ 3244 speed_operand_src (s, s->xp, s->size); \ 3245 speed_operand_src (s, yp, 2); \ 3246 speed_operand_dst (s, wp, s->size); \ 3247 speed_operand_dst (s, rp, 2); \ 3248 speed_cache_fill (s); \ 3249 \ 3250 speed_starttime (); \ 3251 i = s->reps; \ 3252 do \ 3253 function (wp, rp, s->xp, s->size, yp); \ 3254 while (--i != 0); \ 3255 t = speed_endtime (); \ 3256 \ 3257 TMP_FREE; \ 3258 return t; \ 3259 } 3260 3261 #define SPEED_ROUTINE_MODLIMB_INVERT(function) \ 3262 { \ 3263 unsigned i, j; \ 3264 mp_ptr xp; \ 3265 mp_limb_t n = 1; \ 3266 double t; \ 3267 \ 3268 xp = s->xp_block-1; \ 3269 \ 3270 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \ 3271 speed_cache_fill (s); \ 3272 \ 3273 speed_starttime (); \ 3274 i = s->reps; \ 3275 do \ 3276 { \ 3277 j = SPEED_BLOCK_SIZE; \ 3278 do \ 3279 { \ 3280 /* randomized but successively dependent */ \ 3281 n += (xp[j] << 1); \ 3282 \ 3283 function (n, n); \ 3284 } \ 3285 while (--j != 0); \ 3286 } \ 3287 while (--i != 0); \ 3288 t = speed_endtime (); \ 3289 \ 3290 /* make sure the compiler won't optimize away n */ \ 3291 noop_1 (n); \ 3292 \ 3293 s->time_divisor = SPEED_BLOCK_SIZE; \ 3294 return t; \ 3295 } 3296 3297 3298 #define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call) \ 3299 { \ 3300 mp_ptr wp, wp2; \ 3301 unsigned i; \ 3302 double t; \ 3303 TMP_DECL; \ 3304 \ 3305 SPEED_RESTRICT_COND (s->size >= 1); \ 3306 \ 3307 TMP_MARK; \ 3308 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3309 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 3310 \ 3311 speed_operand_src (s, s->xp, s->size); \ 3312 speed_operand_dst (s, wp, s->size); \ 3313 speed_operand_dst (s, wp2, s->size); \ 3314 speed_cache_fill (s); \ 3315 \ 3316 speed_starttime (); \ 3317 i = s->reps; \ 3318 do \ 3319 call; \ 3320 while (--i != 0); \ 3321 t = speed_endtime (); \ 3322 \ 3323 TMP_FREE; \ 3324 return t; \ 3325 } 3326 3327 3328 /* s->size controls the number of limbs in the input, s->r 
is the base, or 3329 decimal by default. */ 3330 #define SPEED_ROUTINE_MPN_GET_STR(function) \ 3331 { \ 3332 unsigned char *wp; \ 3333 mp_size_t wn; \ 3334 mp_ptr xp; \ 3335 int base; \ 3336 unsigned i; \ 3337 double t; \ 3338 TMP_DECL; \ 3339 \ 3340 SPEED_RESTRICT_COND (s->size >= 1); \ 3341 \ 3342 base = s->r == 0 ? 10 : s->r; \ 3343 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 3344 \ 3345 TMP_MARK; \ 3346 SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \ 3347 \ 3348 MPN_SIZEINBASE (wn, s->xp, s->size, base); \ 3349 wp = (unsigned char *) TMP_ALLOC (wn); \ 3350 \ 3351 /* use this during development to guard against overflowing wp */ \ 3352 /* \ 3353 MPN_COPY (xp, s->xp, s->size); \ 3354 ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \ 3355 */ \ 3356 \ 3357 speed_operand_src (s, s->xp, s->size); \ 3358 speed_operand_dst (s, xp, s->size); \ 3359 speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES); \ 3360 speed_cache_fill (s); \ 3361 \ 3362 speed_starttime (); \ 3363 i = s->reps; \ 3364 do \ 3365 { \ 3366 MPN_COPY (xp, s->xp, s->size); \ 3367 function (wp, base, xp, s->size); \ 3368 } \ 3369 while (--i != 0); \ 3370 t = speed_endtime (); \ 3371 \ 3372 TMP_FREE; \ 3373 return t; \ 3374 } 3375 3376 /* s->size controls the number of digits in the input, s->r is the base, or 3377 decimal by default. */ 3378 #define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \ 3379 { \ 3380 unsigned char *xp; \ 3381 mp_ptr wp; \ 3382 mp_size_t wn; \ 3383 unsigned i; \ 3384 int base; \ 3385 double t; \ 3386 TMP_DECL; \ 3387 \ 3388 SPEED_RESTRICT_COND (s->size >= 1); \ 3389 \ 3390 base = s->r == 0 ? 
10 : s->r; \ 3391 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 3392 \ 3393 TMP_MARK; \ 3394 \ 3395 xp = (unsigned char *) TMP_ALLOC (s->size); \ 3396 for (i = 0; i < s->size; i++) \ 3397 xp[i] = s->xp[i] % base; \ 3398 \ 3399 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \ 3400 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \ 3401 \ 3402 /* use this during development to check wn is big enough */ \ 3403 /* \ 3404 ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \ 3405 */ \ 3406 \ 3407 speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES); \ 3408 speed_operand_dst (s, wp, wn); \ 3409 speed_cache_fill (s); \ 3410 \ 3411 speed_starttime (); \ 3412 i = s->reps; \ 3413 do \ 3414 call; \ 3415 while (--i != 0); \ 3416 t = speed_endtime (); \ 3417 \ 3418 TMP_FREE; \ 3419 return t; \ 3420 } 3421 3422 3423 /* Run an accel gcd find_a() function over various data values. A set of 3424 values is used in case some run particularly fast or slow. The size 3425 parameter is ignored, the amount of data tested is fixed. */ 3426 3427 #define SPEED_ROUTINE_MPN_GCD_FINDA(function) \ 3428 { \ 3429 unsigned i, j; \ 3430 mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \ 3431 double t; \ 3432 TMP_DECL; \ 3433 \ 3434 TMP_MARK; \ 3435 \ 3436 /* low must be odd, high must be non-zero */ \ 3437 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 3438 { \ 3439 cp[i][0] = s->xp_block[i] | 1; \ 3440 cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \ 3441 } \ 3442 \ 3443 speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \ 3444 speed_cache_fill (s); \ 3445 \ 3446 speed_starttime (); \ 3447 i = s->reps; \ 3448 do \ 3449 { \ 3450 j = SPEED_BLOCK_SIZE; \ 3451 do \ 3452 { \ 3453 function (cp[j-1]); \ 3454 } \ 3455 while (--j != 0); \ 3456 } \ 3457 while (--i != 0); \ 3458 t = speed_endtime (); \ 3459 \ 3460 TMP_FREE; \ 3461 \ 3462 s->time_divisor = SPEED_BLOCK_SIZE; \ 3463 return t; \ 3464 } 3465 3466 3467 /* "call" should do "count_foo_zeros(c,n)". 
3468 Give leading=1 if foo is leading zeros, leading=0 for trailing. 3469 Give zero=1 if n=0 is allowed in the call, zero=0 if not. */ 3470 3471 #define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \ 3472 { \ 3473 mp_ptr xp; \ 3474 int i, c; \ 3475 unsigned j; \ 3476 mp_limb_t n; \ 3477 double t; \ 3478 TMP_DECL; \ 3479 \ 3480 TMP_MARK; \ 3481 SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \ 3482 \ 3483 if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \ 3484 return -1.0; \ 3485 speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \ 3486 speed_cache_fill (s); \ 3487 \ 3488 c = 0; \ 3489 speed_starttime (); \ 3490 j = s->reps; \ 3491 do { \ 3492 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 3493 { \ 3494 n = xp[i]; \ 3495 n ^= c; \ 3496 3497 #define SPEED_ROUTINE_COUNT_ZEROS_B() \ 3498 } \ 3499 } while (--j != 0); \ 3500 t = speed_endtime (); \ 3501 \ 3502 /* don't let c go dead */ \ 3503 noop_1 (c); \ 3504 \ 3505 s->time_divisor = SPEED_BLOCK_SIZE; \ 3506 \ 3507 TMP_FREE; \ 3508 return t; \ 3509 } \ 3510 3511 #define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \ 3512 do { \ 3513 SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \ 3514 call; \ 3515 SPEED_ROUTINE_COUNT_ZEROS_B (); \ 3516 } while (0) \ 3517 3518 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \ 3519 SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero) 3520 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \ 3521 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0) 3522 3523 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \ 3524 SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero) 3525 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call) \ 3526 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0) 3527 3528 3529 #define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \ 3530 { \ 3531 unsigned i, j; \ 3532 mp_limb_t d, dinv=0; \ 3533 mp_ptr xp = s->xp_block - 1; \ 3534 \ 3535 s->time_divisor = SPEED_BLOCK_SIZE; \ 3536 \ 3537 speed_starttime (); \ 3538 i = s->reps; \ 3539 do \ 3540 { \ 3541 j = SPEED_BLOCK_SIZE; \ 3542 do \ 3543 
{ \ 3544 d = dinv ^ xp[j]; \ 3545 d |= GMP_LIMB_HIGHBIT; \ 3546 do { call; } while (0); \ 3547 } \ 3548 while (--j != 0); \ 3549 } \ 3550 while (--i != 0); \ 3551 \ 3552 /* don't let the compiler optimize everything away */ \ 3553 noop_1 (dinv); \ 3554 \ 3555 return speed_endtime(); \ 3556 } 3557 3558 3559 #define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \ 3560 { \ 3561 unsigned i; \ 3562 speed_starttime (); \ 3563 i = s->reps; \ 3564 do \ 3565 function (); \ 3566 while (--i != 0); \ 3567 return speed_endtime (); \ 3568 } 3569 3570 3571 #define SPEED_ROUTINE_MPN_ZERO_CALL(call) \ 3572 { \ 3573 mp_ptr wp; \ 3574 unsigned i; \ 3575 double t; \ 3576 TMP_DECL; \ 3577 \ 3578 SPEED_RESTRICT_COND (s->size >= 0); \ 3579 \ 3580 TMP_MARK; \ 3581 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3582 speed_operand_dst (s, wp, s->size); \ 3583 speed_cache_fill (s); \ 3584 \ 3585 speed_starttime (); \ 3586 i = s->reps; \ 3587 do \ 3588 call; \ 3589 while (--i != 0); \ 3590 t = speed_endtime (); \ 3591 \ 3592 TMP_FREE; \ 3593 return t; \ 3594 } 3595 3596 #define SPEED_ROUTINE_MPN_ZERO(function) \ 3597 SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size)) 3598 3599 3600 #endif