github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/tune/tuneup.c (about) 1 /* Create tuned thresholds for various algorithms. 2 3 Copyright 1999-2003, 2005, 2006, 2008-2012 Free Software Foundation, Inc. 4 5 This file is part of the GNU MP Library. 6 7 The GNU MP Library is free software; you can redistribute it and/or modify 8 it under the terms of either: 9 10 * the GNU Lesser General Public License as published by the Free 11 Software Foundation; either version 3 of the License, or (at your 12 option) any later version. 13 14 or 15 16 * the GNU General Public License as published by the Free Software 17 Foundation; either version 2 of the License, or (at your option) any 18 later version. 19 20 or both in parallel, as here. 21 22 The GNU MP Library is distributed in the hope that it will be useful, but 23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 for more details. 26 27 You should have received copies of the GNU General Public License and the 28 GNU Lesser General Public License along with the GNU MP Library. If not, 29 see https://www.gnu.org/licenses/. */ 30 31 32 /* Usage: tuneup [-t] [-t] [-p precision] 33 34 -t turns on some diagnostic traces, a second -t turns on more traces. 35 36 Notes: 37 38 The code here isn't a vision of loveliness, mainly because it's subject 39 to ongoing changes according to new things wanting to be tuned, and 40 practical requirements of systems tested. 41 42 Sometimes running the program twice produces slightly different results. 43 This is probably because there's so little separating algorithms near 44 their crossover, and on that basis it should make little or no difference 45 to the final speed of the relevant routines, but nothing has been done to 46 check that carefully. 47 48 Algorithm: 49 50 The thresholds are determined as follows. A crossover may not be a 51 single size but rather a range where it oscillates between method A or 52 method B faster. If the threshold is set making B used where A is faster 53 (or vice versa) that's bad. Badness is the percentage time lost and 54 total badness is the sum of this over all sizes measured. The threshold 55 is set to minimize total badness. 56 57 Suppose, as sizes increase, method B becomes faster than method A. The 58 effect of the rule is that, as you look at increasing sizes, isolated 59 points where B is faster are ignored, but when it's consistently faster, 60 or faster on balance, then the threshold is set there. The same result 61 is obtained thinking in the other direction of A becoming faster at 62 smaller sizes. 63 64 In practice the thresholds tend to be chosen to bring on the next 65 algorithm fairly quickly. 66 67 This rule is attractive because it's got a basis in reason and is fairly 68 easy to implement, but no work has been done to actually compare it in 69 absolute terms to other possibilities. 70 71 Implementation: 72 73 In a normal library build the thresholds are constants. To tune them 74 selected objects are recompiled with the thresholds as global variables 75 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at 76 the end of gmp-impl.h, and rules in tune/Makefile.am. 77 78 MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The 79 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one 80 level, but recurse into the basecase. 81 82 MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value. 83 Other routines in turn will make use of both of those. Naturally the 84 dependants must be tuned first. 85 86 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, 87 just a threshold based on comparing two routines (mpn_divrem_1 and 88 mpn_divexact_1), and no further use of the value determined. 89 90 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being 91 just comparisons between certain routines on representative data. 92 93 Shortcuts are applied when native (assembler) versions of routines exist. 94 For instance a native mpn_sqr_basecase is assumed to be always faster 95 than mpn_mul_basecase, with no measuring. 96 97 No attempt is made to tune within assembler routines, for instance 98 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be 99 written and tuned all by hand. Assembler routines that might have hard 100 limits are recompiled though, to make them accept a bigger range of sizes 101 than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr. 102 103 Limitations: 104 105 The FFTs aren't subject to the same badness rule as the other thresholds, 106 so each k is probably being brought on a touch early. This isn't likely 107 to make a difference, and the simpler probing means fewer tests. 108 109 */ 110 111 #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ 112 113 #include "config.h" 114 115 #include <math.h> 116 #include <stdio.h> 117 #include <stdlib.h> 118 #include <time.h> 119 #if HAVE_UNISTD_H 120 #include <unistd.h> 121 #endif 122 123 #include "gmp.h" 124 #include "gmp-impl.h" 125 #include "longlong.h" 126 127 #include "tests.h" 128 #include "speed.h" 129 130 #if !HAVE_DECL_OPTARG 131 extern char *optarg; 132 extern int optind, opterr; 133 #endif 134 135 136 #define DEFAULT_MAX_SIZE 1000 /* limbs */ 137 138 #if WANT_FFT 139 mp_size_t option_fft_max_size = 50000; /* limbs */ 140 #else 141 mp_size_t option_fft_max_size = 0; 142 #endif 143 int option_trace = 0; 144 int option_fft_trace = 0; 145 struct speed_params s; 146 147 struct dat_t { 148 mp_size_t size; 149 double d; 150 } *dat = NULL; 151 int ndat = 0; 152 int allocdat = 0; 153 154 /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that 155 case use zero here, which for params.max_size means no limit. */ 156 #ifndef TUNE_SQR_TOOM2_MAX 157 #define TUNE_SQR_TOOM2_MAX 0 158 #endif 159 160 mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX; 161 mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT; 162 mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; 163 mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT; 164 mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT; 165 mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX; 166 mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX; 167 mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX; 168 mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX; 169 mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX; 170 mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; 171 mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; 172 mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; 173 mp_size_t sqr_toom2_threshold 174 = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX); 175 mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 176 mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 177 mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 178 mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 179 mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; 180 mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; 181 mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX; 182 mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX; 183 mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX; 184 mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX; 185 mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX; 186 mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX; 187 mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX; 188 mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX; 189 mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX; 190 mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX; 191 mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX; 192 mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX; 193 mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX; 194 mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX; 195 mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX; 196 mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX; 197 mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX; 198 mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX; 199 mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX; 200 mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX; 201 mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX; 202 mp_size_t inv_newton_threshold = MP_SIZE_T_MAX; 203 mp_size_t inv_appr_threshold = MP_SIZE_T_MAX; 204 mp_size_t binv_newton_threshold = MP_SIZE_T_MAX; 205 mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX; 206 mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX; 207 mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; 208 mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; 209 mp_size_t hgcd_threshold = MP_SIZE_T_MAX; 210 mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX; 211 mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX; 212 mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; 213 mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; 214 int div_qr_1n_pi1_method = 0; 215 mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX; 216 mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX; 217 mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; 218 mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; 219 mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; 220 mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; 221 int mod_1_1p_method = 0; 222 mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX; 223 mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX; 224 mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX; 225 mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX; 226 mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX; 227 mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; 228 mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; 229 mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; 230 mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; 231 mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; 232 mp_size_t fac_odd_threshold = 0; 233 mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT; 234 235 mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; 236 mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; 237 238 struct param_t { 239 const char *name; 240 speed_function_t function; 241 speed_function_t function2; 242 double step_factor; /* how much to step relatively */ 243 int step; /* how much to step absolutely */ 244 double function_fudge; /* multiplier for "function" speeds */ 245 int stop_since_change; 246 double stop_factor; 247 mp_size_t min_size; 248 int min_is_always; 249 mp_size_t max_size; 250 mp_size_t check_size; 251 mp_size_t size_extra; 252 253 #define DATA_HIGH_LT_R 1 254 #define DATA_HIGH_GE_R 2 255 int data_high; 256 257 int noprint; 258 }; 259 260 261 /* These are normally undefined when false, which suits "#if" fine. 262 But give them zero values so they can be used in plain C "if"s. */ 263 #ifndef UDIV_PREINV_ALWAYS 264 #define UDIV_PREINV_ALWAYS 0 265 #endif 266 #ifndef HAVE_NATIVE_mpn_divexact_1 267 #define HAVE_NATIVE_mpn_divexact_1 0 268 #endif 269 #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1 270 #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0 271 #endif 272 #ifndef HAVE_NATIVE_mpn_divrem_1 273 #define HAVE_NATIVE_mpn_divrem_1 0 274 #endif 275 #ifndef HAVE_NATIVE_mpn_divrem_2 276 #define HAVE_NATIVE_mpn_divrem_2 0 277 #endif 278 #ifndef HAVE_NATIVE_mpn_mod_1 279 #define HAVE_NATIVE_mpn_mod_1 0 280 #endif 281 #ifndef HAVE_NATIVE_mpn_mod_1_1p 282 #define HAVE_NATIVE_mpn_mod_1_1p 0 283 #endif 284 #ifndef HAVE_NATIVE_mpn_modexact_1_odd 285 #define HAVE_NATIVE_mpn_modexact_1_odd 0 286 #endif 287 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 288 #define HAVE_NATIVE_mpn_preinv_divrem_1 0 289 #endif 290 #ifndef HAVE_NATIVE_mpn_preinv_mod_1 291 #define HAVE_NATIVE_mpn_preinv_mod_1 0 292 #endif 293 #ifndef HAVE_NATIVE_mpn_sqr_basecase 294 #define HAVE_NATIVE_mpn_sqr_basecase 0 295 #endif 296 297 298 #define MAX3(a,b,c) MAX (MAX (a, b), c) 299 300 mp_limb_t 301 randlimb_norm (void) 302 { 303 mp_limb_t n; 304 mpn_random (&n, 1); 305 n |= GMP_NUMB_HIGHBIT; 306 return n; 307 } 308 309 #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) 310 311 mp_limb_t 312 randlimb_half (void) 313 { 314 mp_limb_t n; 315 mpn_random (&n, 1); 316 n &= GMP_NUMB_HALFMASK; 317 n += (n==0); 318 return n; 319 } 320 321 322 /* Add an entry to the end of the dat[] array, reallocing to make it bigger 323 if necessary. */ 324 void 325 add_dat (mp_size_t size, double d) 326 { 327 #define ALLOCDAT_STEP 500 328 329 ASSERT_ALWAYS (ndat <= allocdat); 330 331 if (ndat == allocdat) 332 { 333 dat = (struct dat_t *) __gmp_allocate_or_reallocate 334 (dat, allocdat * sizeof(dat[0]), 335 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); 336 allocdat += ALLOCDAT_STEP; 337 } 338 339 dat[ndat].size = size; 340 dat[ndat].d = d; 341 ndat++; 342 } 343 344 345 /* Return the threshold size based on the data accumulated. */ 346 mp_size_t 347 analyze_dat (int final) 348 { 349 double x, min_x; 350 int j, min_j; 351 352 /* If the threshold is set at dat[0].size, any positive values are bad. */ 353 x = 0.0; 354 for (j = 0; j < ndat; j++) 355 if (dat[j].d > 0.0) 356 x += dat[j].d; 357 358 if (option_trace >= 2 && final) 359 { 360 printf ("\n"); 361 printf ("x is the sum of the badness from setting thresh at given size\n"); 362 printf (" (minimum x is sought)\n"); 363 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x); 364 } 365 366 min_x = x; 367 min_j = 0; 368 369 370 /* When stepping to the next dat[j].size, positive values are no longer 371 bad (so subtracted), negative values become bad (so add the absolute 372 value, meaning subtract). */ 373 for (j = 0; j < ndat; x -= dat[j].d, j++) 374 { 375 if (option_trace >= 2 && final) 376 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); 377 378 if (x < min_x) 379 { 380 min_x = x; 381 min_j = j; 382 } 383 } 384 385 return min_j; 386 } 387 388 389 /* Measuring for recompiled mpn/generic/div_qr_1.c, 390 * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */ 391 392 mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t); 393 394 #if defined (__cplusplus) 395 extern "C" { 396 #endif 397 398 mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 399 mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t); 400 void mpz_fac_ui_tune (mpz_ptr, unsigned long); 401 402 #if defined (__cplusplus) 403 } 404 #endif 405 406 double 407 speed_mpn_mod_1_tune (struct speed_params *s) 408 { 409 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); 410 } 411 double 412 speed_mpn_divrem_1_tune (struct speed_params *s) 413 { 414 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); 415 } 416 double 417 speed_mpz_fac_ui_tune (struct speed_params *s) 418 { 419 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune); 420 } 421 double 422 speed_mpn_div_qr_1_tune (struct speed_params *s) 423 { 424 SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune); 425 } 426 427 double 428 tuneup_measure (speed_function_t fun, 429 const struct param_t *param, 430 struct speed_params *s) 431 { 432 static struct param_t dummy; 433 double t; 434 TMP_DECL; 435 436 if (! param) 437 param = &dummy; 438 439 s->size += param->size_extra; 440 441 TMP_MARK; 442 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); 443 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); 444 445 mpn_random (s->xp, s->size); 446 mpn_random (s->yp, s->size); 447 448 switch (param->data_high) { 449 case DATA_HIGH_LT_R: 450 s->xp[s->size-1] %= s->r; 451 s->yp[s->size-1] %= s->r; 452 break; 453 case DATA_HIGH_GE_R: 454 s->xp[s->size-1] |= s->r; 455 s->yp[s->size-1] |= s->r; 456 break; 457 } 458 459 t = speed_measure (fun, s); 460 461 s->size -= param->size_extra; 462 463 TMP_FREE; 464 return t; 465 } 466 467 468 #define PRINT_WIDTH 31 469 470 void 471 print_define_start (const char *name) 472 { 473 printf ("#define %-*s ", PRINT_WIDTH, name); 474 if (option_trace) 475 printf ("...\n"); 476 } 477 478 void 479 print_define_end_remark (const char *name, mp_size_t value, const char *remark) 480 { 481 if (option_trace) 482 printf ("#define %-*s ", PRINT_WIDTH, name); 483 484 if (value == MP_SIZE_T_MAX) 485 printf ("MP_SIZE_T_MAX"); 486 else 487 printf ("%5ld", (long) value); 488 489 if (remark != NULL) 490 printf (" /* %s */", remark); 491 printf ("\n"); 492 fflush (stdout); 493 } 494 495 void 496 print_define_end (const char *name, mp_size_t value) 497 { 498 const char *remark; 499 if (value == MP_SIZE_T_MAX) 500 remark = "never"; 501 else if (value == 0) 502 remark = "always"; 503 else 504 remark = NULL; 505 print_define_end_remark (name, value, remark); 506 } 507 508 void 509 print_define (const char *name, mp_size_t value) 510 { 511 print_define_start (name); 512 print_define_end (name, value); 513 } 514 515 void 516 print_define_remark (const char *name, mp_size_t value, const char *remark) 517 { 518 print_define_start (name); 519 print_define_end_remark (name, value, remark); 520 } 521 522 523 void 524 one (mp_size_t *threshold, struct param_t *param) 525 { 526 int since_positive, since_thresh_change; 527 int thresh_idx, new_thresh_idx; 528 529 #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) 530 531 DEFAULT (param->function_fudge, 1.0); 532 DEFAULT (param->function2, param->function); 533 DEFAULT (param->step_factor, 0.01); /* small steps by default */ 534 DEFAULT (param->step, 1); /* small steps by default */ 535 DEFAULT (param->stop_since_change, 80); 536 DEFAULT (param->stop_factor, 1.2); 537 DEFAULT (param->min_size, 10); 538 DEFAULT (param->max_size, DEFAULT_MAX_SIZE); 539 540 if (param->check_size != 0) 541 { 542 double t1, t2; 543 s.size = param->check_size; 544 545 *threshold = s.size+1; 546 t1 = tuneup_measure (param->function, param, &s); 547 548 *threshold = s.size; 549 t2 = tuneup_measure (param->function2, param, &s); 550 if (t1 == -1.0 || t2 == -1.0) 551 { 552 printf ("Oops, can't run both functions at size %ld\n", 553 (long) s.size); 554 abort (); 555 } 556 t1 *= param->function_fudge; 557 558 /* ask that t2 is at least 4% below t1 */ 559 if (t1 < t2*1.04) 560 { 561 if (option_trace) 562 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); 563 *threshold = MP_SIZE_T_MAX; 564 if (! param->noprint) 565 print_define (param->name, *threshold); 566 return; 567 } 568 569 if (option_trace >= 2) 570 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", 571 (long) s.size, t1, t2); 572 } 573 574 if (! param->noprint || option_trace) 575 print_define_start (param->name); 576 577 ndat = 0; 578 since_positive = 0; 579 since_thresh_change = 0; 580 thresh_idx = 0; 581 582 if (option_trace >= 2) 583 { 584 printf (" algorithm-A algorithm-B ratio possible\n"); 585 printf (" (seconds) (seconds) diff thresh\n"); 586 } 587 588 for (s.size = param->min_size; 589 s.size < param->max_size; 590 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step)) 591 { 592 double ti, tiplus1, d; 593 594 /* 595 FIXME: check minimum size requirements are met, possibly by just 596 checking for the -1 returns from the speed functions. 597 */ 598 599 /* using method A at this size */ 600 *threshold = s.size+1; 601 ti = tuneup_measure (param->function, param, &s); 602 if (ti == -1.0) 603 abort (); 604 ti *= param->function_fudge; 605 606 /* using method B at this size */ 607 *threshold = s.size; 608 tiplus1 = tuneup_measure (param->function2, param, &s); 609 if (tiplus1 == -1.0) 610 abort (); 611 612 /* Calculate the fraction by which the one or the other routine is 613 slower. */ 614 if (tiplus1 >= ti) 615 d = (tiplus1 - ti) / tiplus1; /* negative */ 616 else 617 d = (tiplus1 - ti) / ti; /* positive */ 618 619 add_dat (s.size, d); 620 621 new_thresh_idx = analyze_dat (0); 622 623 if (option_trace >= 2) 624 printf ("size=%ld %.9f %.9f % .4f %c %ld\n", 625 (long) s.size, ti, tiplus1, d, 626 ti > tiplus1 ? '#' : ' ', 627 (long) dat[new_thresh_idx].size); 628 629 /* Stop if the last time method i was faster was more than a 630 certain number of measurements ago. */ 631 #define STOP_SINCE_POSITIVE 200 632 if (d >= 0) 633 since_positive = 0; 634 else 635 if (++since_positive > STOP_SINCE_POSITIVE) 636 { 637 if (option_trace >= 1) 638 printf ("stopped due to since_positive (%d)\n", 639 STOP_SINCE_POSITIVE); 640 break; 641 } 642 643 /* Stop if method A has become slower by a certain factor. */ 644 if (ti >= tiplus1 * param->stop_factor) 645 { 646 if (option_trace >= 1) 647 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", 648 param->stop_factor); 649 break; 650 } 651 652 /* Stop if the threshold implied hasn't changed in a certain 653 number of measurements. (It's this condition that usually 654 stops the loop.) */ 655 if (thresh_idx != new_thresh_idx) 656 since_thresh_change = 0, thresh_idx = new_thresh_idx; 657 else 658 if (++since_thresh_change > param->stop_since_change) 659 { 660 if (option_trace >= 1) 661 printf ("stopped due to since_thresh_change (%d)\n", 662 param->stop_since_change); 663 break; 664 } 665 666 /* Stop if the threshold implied is more than a certain number of 667 measurements ago. */ 668 #define STOP_SINCE_AFTER 500 669 if (ndat - thresh_idx > STOP_SINCE_AFTER) 670 { 671 if (option_trace >= 1) 672 printf ("stopped due to ndat - thresh_idx > amount (%d)\n", 673 STOP_SINCE_AFTER); 674 break; 675 } 676 677 /* Stop when the size limit is reached before the end of the 678 crossover, but only show this as an error for >= the default max 679 size. FIXME: Maybe should make it a param choice whether this is 680 an error. */ 681 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) 682 { 683 fprintf (stderr, "%s\n", param->name); 684 fprintf (stderr, "sizes %ld to %ld total %d measurements\n", 685 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 686 fprintf (stderr, " max size reached before end of crossover\n"); 687 break; 688 } 689 } 690 691 if (option_trace >= 1) 692 printf ("sizes %ld to %ld total %d measurements\n", 693 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 694 695 *threshold = dat[analyze_dat (1)].size; 696 697 if (param->min_is_always) 698 { 699 if (*threshold == param->min_size) 700 *threshold = 0; 701 } 702 703 if (! param->noprint || option_trace) 704 print_define_end (param->name, *threshold); 705 } 706 707 708 /* Special probing for the fft thresholds. The size restrictions on the 709 FFTs mean the graph of time vs size has a step effect. See this for 710 example using 711 712 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 713 gnuplot foo.gnuplot 714 715 The current approach is to compare routines at the midpoint of relevant 716 steps. Arguably a more sophisticated system of threshold data is wanted 717 if this step effect remains. */ 718 719 struct fft_param_t { 720 const char *table_name; 721 const char *threshold_name; 722 const char *modf_threshold_name; 723 mp_size_t *p_threshold; 724 mp_size_t *p_modf_threshold; 725 mp_size_t first_size; 726 mp_size_t max_size; 727 speed_function_t function; 728 speed_function_t mul_modf_function; 729 speed_function_t mul_function; 730 mp_size_t sqr; 731 }; 732 733 734 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with 735 N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple 736 of 2^(k-1) bits. */ 737 738 mp_size_t 739 fft_step_size (int k) 740 { 741 mp_size_t step; 742 743 step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS; 744 step *= (mp_size_t) 1 << k; 745 746 if (step <= 0) 747 { 748 printf ("Can't handle k=%d\n", k); 749 abort (); 750 } 751 752 return step; 753 } 754 755 mp_size_t 756 fft_next_size (mp_size_t pl, int k) 757 { 758 mp_size_t m = fft_step_size (k); 759 760 /* printf ("[k=%d %ld] %ld ->", k, m, pl); */ 761 762 if (pl == 0 || (pl & (m-1)) != 0) 763 pl = (pl | (m-1)) + 1; 764 765 /* printf (" %ld\n", pl); */ 766 return pl; 767 } 768 769 #define NMAX_DEFAULT 1000000 770 #define MAX_REPS 25 771 #define MIN_REPS 5 772 773 static inline size_t 774 mpn_mul_fft_lcm (size_t a, unsigned int k) 775 { 776 unsigned int l = k; 777 778 while (a % 2 == 0 && k > 0) 779 { 780 a >>= 1; 781 k--; 782 } 783 return a << l; 784 } 785 786 mp_size_t 787 fftfill (mp_size_t pl, int k, int sqr) 788 { 789 mp_size_t maxLK; 790 mp_bitcnt_t N, Nprime, nprime, M; 791 792 N = pl * GMP_NUMB_BITS; 793 M = N >> k; 794 795 maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); 796 797 Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; 798 nprime = Nprime / GMP_NUMB_BITS; 799 if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 800 { 801 size_t K2; 802 for (;;) 803 { 804 K2 = 1L << mpn_fft_best_k (nprime, sqr); 805 if ((nprime & (K2 - 1)) == 0) 806 break; 807 nprime = (nprime + K2 - 1) & -K2; 808 Nprime = nprime * GMP_LIMB_BITS; 809 } 810 } 811 ASSERT_ALWAYS (nprime < pl); 812 813 return Nprime; 814 } 815 816 static int 817 compare_double (const void *ap, const void *bp) 818 { 819 double a = * (const double *) ap; 820 double b = * (const double *) bp; 821 822 if (a < b) 823 return -1; 824 else if (a > b) 825 return 1; 826 else 827 return 0; 828 } 829 830 double 831 median (double *times, int n) 832 { 833 qsort (times, n, sizeof (double), compare_double); 834 return times[n/2]; 835 } 836 837 #define FFT_CACHE_SIZE 25 838 typedef struct fft_cache 839 { 840 mp_size_t n; 841 double time; 842 } fft_cache_t; 843 844 fft_cache_t fft_cache[FFT_CACHE_SIZE]; 845 846 double 847 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k, 848 int n_measurements) 849 { 850 int i; 851 double t, ttab[MAX_REPS]; 852 853 if (fft_cache[k].n == n) 854 return fft_cache[k].time; 855 856 for (i = 0; i < n_measurements; i++) 857 { 858 speed_starttime (); 859 mpn_mul_fft (rp, n, ap, n, bp, n, k); 860 ttab[i] = speed_endtime (); 861 } 862 863 t = median (ttab, n_measurements); 864 fft_cache[k].n = n; 865 fft_cache[k].time = t; 866 return t; 867 } 868 869 #define INSERT_FFTTAB(idx, nval, kval) \ 870 do { \ 871 fft_tab[idx].n = nval; \ 872 fft_tab[idx].k = kval; \ 873 fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \ 874 fft_tab[idx+1].k = (1 << 5) - 1; \ 875 } while (0) 876 877 int 878 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print) 879 { 880 mp_size_t n, n1, prev_n1; 881 int k, best_k, last_best_k, kmax; 882 int eff, prev_eff; 883 double t0, t1; 884 int n_measurements; 885 mp_limb_t *ap, *bp, *rp; 886 mp_size_t alloc; 887 struct fft_table_nk *fft_tab; 888 889 fft_tab = mpn_fft_table3[p->sqr]; 890 891 for (k = 0; k < FFT_CACHE_SIZE; k++) 892 fft_cache[k].n = 0; 893 894 if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 895 { 896 nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD); 897 } 898 899 if (print) 900 printf ("#define %s%*s", p->table_name, 38, ""); 901 902 if (idx == 0) 903 { 904 INSERT_FFTTAB (0, nmin, initial_k); 905 906 if (print) 907 { 908 printf ("\\\n { "); 909 printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k); 910 } 911 912 idx = 1; 913 } 914 915 ap = (mp_ptr) malloc (sizeof (mp_limb_t)); 916 if (p->sqr) 917 bp = ap; 918 else 919 bp = (mp_ptr) malloc (sizeof (mp_limb_t)); 920 rp = (mp_ptr) malloc (sizeof (mp_limb_t)); 921 alloc = 1; 922 923 /* Round n to comply to initial k value */ 924 n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k); 925 926 n_measurements = (18 - initial_k) | 1; 927 n_measurements = MAX (n_measurements, MIN_REPS); 928 n_measurements = MIN (n_measurements, MAX_REPS); 929 930 last_best_k = initial_k; 931 best_k = initial_k; 932 933 while (n < nmax) 934 { 935 int start_k, end_k; 936 937 /* Assume the current best k is best until we hit its next FFT step. */ 938 t0 = 99999; 939 940 prev_n1 = n + 1; 941 942 start_k = MAX (4, best_k - 4); 943 end_k = MIN (24, best_k + 4); 944 for (k = start_k; k <= end_k; k++) 945 { 946 n1 = mpn_fft_next_size (prev_n1, k); 947 948 eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr); 949 950 if (eff < 70) /* avoid measuring too slow fft:s */ 951 continue; 952 953 if (n1 > alloc) 954 { 955 alloc = n1; 956 if (p->sqr) 957 { 958 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); 959 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); 960 ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); 961 mpn_random (ap, alloc); 962 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); 963 } 964 else 965 { 966 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); 967 bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t)); 968 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); 969 ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); 970 mpn_random (ap, alloc); 971 bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t)); 972 mpn_random (bp, alloc); 973 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); 974 } 975 } 976 977 t1 = cached_measure (rp, ap, bp, n1, k, n_measurements); 978 979 if (t1 * n_measurements > 0.3) 980 n_measurements -= 2; 981 n_measurements = MAX (n_measurements, MIN_REPS); 982 983 if (t1 < t0) 984 { 985 best_k = k; 986 t0 = t1; 987 } 988 } 989 990 n1 = mpn_fft_next_size (prev_n1, best_k); 991 992 if (last_best_k != best_k) 993 { 994 ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1); 995 996 if (idx >= FFT_TABLE3_SIZE) 997 { 998 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 999 abort (); 1000 } 1001 INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k); 1002 1003 if (print) 1004 { 1005 printf (", "); 1006 if (idx % 4 == 0) 1007 printf ("\\\n "); 1008 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1009 } 1010 1011 if (option_trace >= 2) 1012 { 1013 printf ("{%lu,%u}\n", prev_n1, best_k); 1014 fflush (stdout); 1015 } 1016 1017 last_best_k = best_k; 1018 idx++; 1019 } 1020 1021 for (;;) 1022 { 1023 prev_n1 = n1; 1024 prev_eff = fftfill (prev_n1, best_k, p->sqr); 1025 n1 = mpn_fft_next_size (prev_n1 + 1, best_k); 1026 eff = fftfill (n1, best_k, p->sqr); 1027 1028 if (eff != prev_eff) 1029 break; 1030 } 1031 1032 n = prev_n1; 1033 } 1034 1035 kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */ 1036 kmax = MIN (kmax, 25-1); 1037 for (k = last_best_k + 1; k <= kmax; k++) 1038 { 1039 if (idx >= FFT_TABLE3_SIZE) 1040 { 1041 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 1042 abort (); 1043 } 1044 INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k); 1045 1046 if (print) 1047 { 1048 printf (", "); 1049 if (idx % 4 == 0) 1050 printf ("\\\n "); 1051 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1052 } 1053 1054 idx++; 1055 } 1056 1057 if (print) 1058 printf (" }\n"); 1059 1060 free (ap); 1061 if (! p->sqr) 1062 free (bp); 1063 free (rp); 1064 1065 return idx; 1066 } 1067 1068 void 1069 fft (struct fft_param_t *p) 1070 { 1071 mp_size_t size; 1072 int k, idx, initial_k; 1073 1074 /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/ 1075 1076 #if 1 1077 { 1078 /* Use plain one() mechanism, for some reasonable initial values of k. The 1079 advantage is that we don't depend on mpn_fft_table3, which can therefore 1080 leave it completely uninitialized. */ 1081 1082 static struct param_t param; 1083 mp_size_t thres, best_thres; 1084 int best_k; 1085 char buf[20]; 1086 1087 best_thres = MP_SIZE_T_MAX; 1088 best_k = -1; 1089 1090 for (k = 5; k <= 7; k++) 1091 { 1092 param.name = p->modf_threshold_name; 1093 param.min_size = 100; 1094 param.max_size = 2000; 1095 param.function = p->mul_function; 1096 param.step_factor = 0.0; 1097 param.step = 4; 1098 param.function2 = p->mul_modf_function; 1099 param.noprint = 1; 1100 s.r = k; 1101 one (&thres, ¶m); 1102 if (thres < best_thres) 1103 { 1104 best_thres = thres; 1105 best_k = k; 1106 } 1107 } 1108 1109 *(p->p_modf_threshold) = best_thres; 1110 sprintf (buf, "k = %d", best_k); 1111 print_define_remark (p->modf_threshold_name, best_thres, buf); 1112 initial_k = best_k; 1113 } 1114 #else 1115 size = p->first_size; 1116 for (;;) 1117 { 1118 double tk, tm; 1119 1120 size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr)); 1121 k = mpn_fft_best_k (size, p->sqr); 1122 1123 if (size >= p->max_size) 1124 break; 1125 1126 s.size = size + fft_step_size (k) / 2; 1127 s.r = k; 1128 tk = tuneup_measure (p->mul_modf_function, NULL, &s); 1129 if (tk == -1.0) 1130 abort (); 1131 1132 tm = tuneup_measure (p->mul_function, NULL, &s); 1133 if (tm == -1.0) 1134 abort (); 1135 1136 if (option_trace >= 2) 1137 printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n", 1138 (long) size, 1139 (long) size + fft_step_size (k) / 2, k, tk, 1140 (long) s.size, tm); 1141 1142 if (tk < tm) 1143 { 1144 *p->p_modf_threshold = s.size; 1145 print_define (p->modf_threshold_name, *p->p_modf_threshold); 1146 break; 1147 } 1148 } 1149 initial_k = ?; 1150 #endif 1151 1152 /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/ 1153 1154 idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1); 1155 printf ("#define %s_SIZE %d\n", p->table_name, idx); 1156 1157 /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/ 1158 1159 size = 2 * *p->p_modf_threshold; /* OK? */ 1160 for (;;) 1161 { 1162 double tk, tm; 1163 mp_size_t mulmod_size, mul_size;; 1164 1165 if (size >= p->max_size) 1166 break; 1167 1168 mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2; 1169 mul_size = (size + mulmod_size) / 2; /* middle of step */ 1170 1171 s.size = mulmod_size; 1172 tk = tuneup_measure (p->function, NULL, &s); 1173 if (tk == -1.0) 1174 abort (); 1175 1176 s.size = mul_size; 1177 tm = tuneup_measure (p->mul_function, NULL, &s); 1178 if (tm == -1.0) 1179 abort (); 1180 1181 if (option_trace >= 2) 1182 printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n", 1183 (long) size, 1184 (long) mulmod_size, tk, 1185 (long) mul_size, tm); 1186 1187 size = mulmod_size; 1188 1189 if (tk < tm) 1190 { 1191 *p->p_threshold = s.size; 1192 print_define (p->threshold_name, *p->p_threshold); 1193 break; 1194 } 1195 } 1196 } 1197 1198 1199 1200 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, 1201 giving wrong results. */ 1202 void 1203 tune_mul_n (void) 1204 { 1205 static struct param_t param; 1206 mp_size_t next_toom_start; 1207 int something_changed; 1208 1209 param.function = speed_mpn_mul_n; 1210 1211 param.name = "MUL_TOOM22_THRESHOLD"; 1212 param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE); 1213 param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1; 1214 one (&mul_toom22_threshold, ¶m); 1215 1216 param.noprint = 1; 1217 1218 /* Threshold sequence loop. Disable functions that would be used in a very 1219 narrow range, re-measuring things when that happens. */ 1220 something_changed = 1; 1221 while (something_changed) 1222 { 1223 something_changed = 0; 1224 1225 next_toom_start = mul_toom22_threshold; 1226 1227 if (mul_toom33_threshold != 0) 1228 { 1229 param.name = "MUL_TOOM33_THRESHOLD"; 1230 param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE); 1231 param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1; 1232 one (&mul_toom33_threshold, ¶m); 1233 1234 if (next_toom_start * 1.05 >= mul_toom33_threshold) 1235 { 1236 mul_toom33_threshold = 0; 1237 something_changed = 1; 1238 } 1239 } 1240 1241 next_toom_start = MAX (next_toom_start, mul_toom33_threshold); 1242 1243 if (mul_toom44_threshold != 0) 1244 { 1245 param.name = "MUL_TOOM44_THRESHOLD"; 1246 param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE); 1247 param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; 1248 one (&mul_toom44_threshold, ¶m); 1249 1250 if (next_toom_start * 1.05 >= mul_toom44_threshold) 1251 { 1252 mul_toom44_threshold = 0; 1253 something_changed = 1; 1254 } 1255 } 1256 1257 next_toom_start = MAX (next_toom_start, mul_toom44_threshold); 1258 1259 if (mul_toom6h_threshold != 0) 1260 { 1261 param.name = "MUL_TOOM6H_THRESHOLD"; 1262 param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE); 1263 param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1; 1264 one (&mul_toom6h_threshold, ¶m); 1265 1266 if (next_toom_start * 1.05 >= mul_toom6h_threshold) 1267 { 1268 mul_toom6h_threshold = 0; 1269 something_changed = 1; 1270 } 1271 } 1272 1273 next_toom_start = MAX (next_toom_start, mul_toom6h_threshold); 1274 1275 if (mul_toom8h_threshold != 0) 1276 { 1277 param.name = "MUL_TOOM8H_THRESHOLD"; 1278 param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE); 1279 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1; 1280 one (&mul_toom8h_threshold, ¶m); 1281 1282 if (next_toom_start * 1.05 >= mul_toom8h_threshold) 1283 { 1284 mul_toom8h_threshold = 0; 1285 something_changed = 1; 1286 } 1287 } 1288 } 1289 1290 print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD); 1291 print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD); 1292 print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD); 1293 print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD); 1294 1295 /* disabled until tuned */ 1296 MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; 1297 } 1298 1299 void 1300 tune_mul (void) 1301 { 1302 static struct param_t param; 1303 mp_size_t thres; 1304 1305 param.noprint = 1; 1306 1307 param.function = speed_mpn_toom32_for_toom43_mul; 1308 param.function2 = speed_mpn_toom43_for_toom32_mul; 1309 param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD"; 1310 param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17; 1311 one (&thres, ¶m); 1312 mul_toom32_to_toom43_threshold = thres * 17 / 24; 1313 print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold); 1314 1315 param.function = speed_mpn_toom32_for_toom53_mul; 1316 param.function2 = speed_mpn_toom53_for_toom32_mul; 1317 param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD"; 1318 param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19; 1319 one (&thres, ¶m); 1320 mul_toom32_to_toom53_threshold = thres * 19 / 30; 1321 print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold); 1322 1323 param.function = speed_mpn_toom42_for_toom53_mul; 1324 param.function2 = speed_mpn_toom53_for_toom42_mul; 1325 param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD"; 1326 param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11; 1327 one (&thres, ¶m); 1328 mul_toom42_to_toom53_threshold = thres * 11 / 20; 1329 print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold); 1330 1331 param.function = speed_mpn_toom42_mul; 1332 param.function2 = speed_mpn_toom63_mul; 1333 param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD"; 1334 param.min_size = MPN_TOOM63_MUL_MINSIZE * 2; 1335 one (&thres, ¶m); 1336 mul_toom42_to_toom63_threshold = thres / 2; 1337 print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold); 1338 1339 /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */ 1340 param.function = speed_mpn_toom43_for_toom54_mul; 1341 param.function2 = speed_mpn_toom54_for_toom43_mul; 1342 param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD"; 1343 param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5; 1344 one (&thres, ¶m); 1345 mul_toom43_to_toom54_threshold = thres * 5 / 6; 1346 print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold); 1347 } 1348 1349 1350 void 1351 tune_mullo (void) 1352 { 1353 static struct param_t param; 1354 1355 param.function = speed_mpn_mullo_n; 1356 1357 param.name = "MULLO_BASECASE_THRESHOLD"; 1358 param.min_size = 1; 1359 param.min_is_always = 1; 1360 param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1; 1361 param.stop_factor = 1.5; 1362 param.noprint = 1; 1363 one (&mullo_basecase_threshold, ¶m); 1364 1365 param.name = "MULLO_DC_THRESHOLD"; 1366 param.min_size = 8; 1367 param.min_is_always = 0; 1368 param.max_size = 1000; 1369 one (&mullo_dc_threshold, ¶m); 1370 1371 if (mullo_basecase_threshold >= mullo_dc_threshold) 1372 { 1373 print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold); 1374 print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase"); 1375 } 1376 else 1377 { 1378 print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold); 1379 print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold); 1380 } 1381 1382 if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2) 1383 { 1384 param.name = "MULLO_MUL_N_THRESHOLD"; 1385 param.min_size = mullo_dc_threshold; 1386 param.max_size = 2 * mul_fft_threshold; 1387 param.noprint = 0; 1388 param.step_factor = 0.03; 1389 one (&mullo_mul_n_threshold, ¶m); 1390 } 1391 else 1392 print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX, 1393 "without FFT use mullo forever"); 1394 } 1395 1396 void 1397 tune_sqrlo (void) 1398 { 1399 static struct param_t param; 1400 1401 param.function = speed_mpn_sqrlo; 1402 1403 param.name = "SQRLO_BASECASE_THRESHOLD"; 1404 param.min_size = 1; 1405 param.min_is_always = 1; 1406 param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1; 1407 param.stop_factor = 1.5; 1408 param.noprint = 1; 1409 one (&sqrlo_basecase_threshold, ¶m); 1410 1411 param.name = "SQRLO_DC_THRESHOLD"; 1412 param.min_size = 8; 1413 param.min_is_always = 0; 1414 param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1; 1415 one (&sqrlo_dc_threshold, ¶m); 1416 1417 if (sqrlo_basecase_threshold >= sqrlo_dc_threshold) 1418 { 1419 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold); 1420 print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase"); 1421 } 1422 else 1423 { 1424 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold); 1425 print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold); 1426 } 1427 1428 if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2) 1429 { 1430 param.name = "SQRLO_SQR_THRESHOLD"; 1431 param.min_size = sqrlo_dc_threshold; 1432 param.max_size = 2 * sqr_fft_threshold; 1433 param.noprint = 0; 1434 param.step_factor = 0.03; 1435 one (&sqrlo_sqr_threshold, ¶m); 1436 } 1437 else 1438 print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX, 1439 "without FFT use sqrlo forever"); 1440 } 1441 1442 void 1443 tune_mulmid (void) 1444 { 1445 static struct param_t param; 1446 1447 param.name = "MULMID_TOOM42_THRESHOLD"; 1448 param.function = speed_mpn_mulmid_n; 1449 param.min_size = 4; 1450 param.max_size = 100; 1451 one (&mulmid_toom42_threshold, ¶m); 1452 } 1453 1454 void 1455 tune_mulmod_bnm1 (void) 1456 { 1457 static struct param_t param; 1458 1459 param.name = "MULMOD_BNM1_THRESHOLD"; 1460 param.function = speed_mpn_mulmod_bnm1; 1461 param.min_size = 4; 1462 param.max_size = 100; 1463 one (&mulmod_bnm1_threshold, ¶m); 1464 } 1465 1466 void 1467 tune_sqrmod_bnm1 (void) 1468 { 1469 static struct param_t param; 1470 1471 param.name = "SQRMOD_BNM1_THRESHOLD"; 1472 param.function = speed_mpn_sqrmod_bnm1; 1473 param.min_size = 4; 1474 param.max_size = 100; 1475 one (&sqrmod_bnm1_threshold, ¶m); 1476 } 1477 1478 1479 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase 1480 is faster only at size==2 then we don't want to bother with extra code 1481 just for that. Start karatsuba from 4 same as MUL above. */ 1482 1483 void 1484 tune_sqr (void) 1485 { 1486 /* disabled until tuned */ 1487 SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; 1488 1489 if (HAVE_NATIVE_mpn_sqr_basecase) 1490 { 1491 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); 1492 sqr_basecase_threshold = 0; 1493 } 1494 else 1495 { 1496 static struct param_t param; 1497 param.name = "SQR_BASECASE_THRESHOLD"; 1498 param.function = speed_mpn_sqr; 1499 param.min_size = 3; 1500 param.min_is_always = 1; 1501 param.max_size = TUNE_SQR_TOOM2_MAX; 1502 param.noprint = 1; 1503 one (&sqr_basecase_threshold, ¶m); 1504 } 1505 1506 { 1507 static struct param_t param; 1508 param.name = "SQR_TOOM2_THRESHOLD"; 1509 param.function = speed_mpn_sqr; 1510 param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE); 1511 param.max_size = TUNE_SQR_TOOM2_MAX; 1512 param.noprint = 1; 1513 one (&sqr_toom2_threshold, ¶m); 1514 1515 if (! HAVE_NATIVE_mpn_sqr_basecase 1516 && sqr_toom2_threshold < sqr_basecase_threshold) 1517 { 1518 /* Karatsuba becomes faster than mul_basecase before 1519 sqr_basecase does. Arrange for the expression 1520 "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which 1521 selects mpn_sqr_basecase in mpn_sqr to be false, by setting 1522 SQR_TOOM2_THRESHOLD to zero, making 1523 SQR_BASECASE_THRESHOLD the toom2 threshold. */ 1524 1525 sqr_basecase_threshold = SQR_TOOM2_THRESHOLD; 1526 SQR_TOOM2_THRESHOLD = 0; 1527 1528 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, 1529 "toom2"); 1530 print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD, 1531 "never sqr_basecase"); 1532 } 1533 else 1534 { 1535 if (! HAVE_NATIVE_mpn_sqr_basecase) 1536 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); 1537 print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD); 1538 } 1539 } 1540 1541 { 1542 static struct param_t param; 1543 mp_size_t next_toom_start; 1544 int something_changed; 1545 1546 param.function = speed_mpn_sqr; 1547 param.noprint = 1; 1548 1549 /* Threshold sequence loop. Disable functions that would be used in a very 1550 narrow range, re-measuring things when that happens. */ 1551 something_changed = 1; 1552 while (something_changed) 1553 { 1554 something_changed = 0; 1555 1556 next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold); 1557 1558 sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 1559 param.name = "SQR_TOOM3_THRESHOLD"; 1560 param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE); 1561 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; 1562 one (&sqr_toom3_threshold, ¶m); 1563 1564 next_toom_start = MAX (next_toom_start, sqr_toom3_threshold); 1565 1566 if (sqr_toom4_threshold != 0) 1567 { 1568 param.name = "SQR_TOOM4_THRESHOLD"; 1569 sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 1570 param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE); 1571 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; 1572 one (&sqr_toom4_threshold, ¶m); 1573 1574 if (next_toom_start * 1.05 >= sqr_toom4_threshold) 1575 { 1576 sqr_toom4_threshold = 0; 1577 something_changed = 1; 1578 } 1579 } 1580 1581 next_toom_start = MAX (next_toom_start, sqr_toom4_threshold); 1582 1583 if (sqr_toom6_threshold != 0) 1584 { 1585 param.name = "SQR_TOOM6_THRESHOLD"; 1586 sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 1587 param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE); 1588 param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1; 1589 one (&sqr_toom6_threshold, ¶m); 1590 1591 if (next_toom_start * 1.05 >= sqr_toom6_threshold) 1592 { 1593 sqr_toom6_threshold = 0; 1594 something_changed = 1; 1595 } 1596 } 1597 1598 next_toom_start = MAX (next_toom_start, sqr_toom6_threshold); 1599 1600 if (sqr_toom8_threshold != 0) 1601 { 1602 param.name = "SQR_TOOM8_THRESHOLD"; 1603 sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 1604 param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE); 1605 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1; 1606 one (&sqr_toom8_threshold, ¶m); 1607 1608 if (next_toom_start * 1.05 >= sqr_toom8_threshold) 1609 { 1610 sqr_toom8_threshold = 0; 1611 something_changed = 1; 1612 } 1613 } 1614 } 1615 1616 print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD); 1617 print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD); 1618 print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD); 1619 print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD); 1620 } 1621 } 1622 1623 1624 void 1625 tune_dc_div (void) 1626 { 1627 s.r = 0; /* clear to make speed function do 2n/n */ 1628 { 1629 static struct param_t param; 1630 param.name = "DC_DIV_QR_THRESHOLD"; 1631 param.function = speed_mpn_sbpi1_div_qr; 1632 param.function2 = speed_mpn_dcpi1_div_qr; 1633 param.min_size = 6; 1634 one (&dc_div_qr_threshold, ¶m); 1635 } 1636 { 1637 static struct param_t param; 1638 param.name = "DC_DIVAPPR_Q_THRESHOLD"; 1639 param.function = speed_mpn_sbpi1_divappr_q; 1640 param.function2 = speed_mpn_dcpi1_divappr_q; 1641 param.min_size = 6; 1642 one (&dc_divappr_q_threshold, ¶m); 1643 } 1644 } 1645 1646 static double 1647 speed_mpn_sbordcpi1_div_qr (struct speed_params *s) 1648 { 1649 if (s->size < DC_DIV_QR_THRESHOLD) 1650 return speed_mpn_sbpi1_div_qr (s); 1651 else 1652 return speed_mpn_dcpi1_div_qr (s); 1653 } 1654 1655 void 1656 tune_mu_div (void) 1657 { 1658 s.r = 0; /* clear to make speed function do 2n/n */ 1659 { 1660 static struct param_t param; 1661 param.name = "MU_DIV_QR_THRESHOLD"; 1662 param.function = speed_mpn_dcpi1_div_qr; 1663 param.function2 = speed_mpn_mu_div_qr; 1664 param.min_size = mul_toom22_threshold; 1665 param.max_size = 5000; 1666 param.step_factor = 0.02; 1667 one (&mu_div_qr_threshold, ¶m); 1668 } 1669 { 1670 static struct param_t param; 1671 param.name = "MU_DIVAPPR_Q_THRESHOLD"; 1672 param.function = speed_mpn_dcpi1_divappr_q; 1673 param.function2 = speed_mpn_mu_divappr_q; 1674 param.min_size = mul_toom22_threshold; 1675 param.max_size = 5000; 1676 param.step_factor = 0.02; 1677 one (&mu_divappr_q_threshold, ¶m); 1678 } 1679 { 1680 static struct param_t param; 1681 param.name = "MUPI_DIV_QR_THRESHOLD"; 1682 param.function = speed_mpn_sbordcpi1_div_qr; 1683 param.function2 = speed_mpn_mupi_div_qr; 1684 param.min_size = 6; 1685 param.min_is_always = 1; 1686 param.max_size = 1000; 1687 param.step_factor = 0.02; 1688 one (&mupi_div_qr_threshold, ¶m); 1689 } 1690 } 1691 1692 void 1693 tune_dc_bdiv (void) 1694 { 1695 s.r = 0; /* clear to make speed function do 2n/n*/ 1696 { 1697 static struct param_t param; 1698 param.name = "DC_BDIV_QR_THRESHOLD"; 1699 param.function = speed_mpn_sbpi1_bdiv_qr; 1700 param.function2 = speed_mpn_dcpi1_bdiv_qr; 1701 param.min_size = 4; 1702 one (&dc_bdiv_qr_threshold, ¶m); 1703 } 1704 { 1705 static struct param_t param; 1706 param.name = "DC_BDIV_Q_THRESHOLD"; 1707 param.function = speed_mpn_sbpi1_bdiv_q; 1708 param.function2 = speed_mpn_dcpi1_bdiv_q; 1709 param.min_size = 4; 1710 one (&dc_bdiv_q_threshold, ¶m); 1711 } 1712 } 1713 1714 void 1715 tune_mu_bdiv (void) 1716 { 1717 s.r = 0; /* clear to make speed function do 2n/n*/ 1718 { 1719 static struct param_t param; 1720 param.name = "MU_BDIV_QR_THRESHOLD"; 1721 param.function = speed_mpn_dcpi1_bdiv_qr; 1722 param.function2 = speed_mpn_mu_bdiv_qr; 1723 param.min_size = dc_bdiv_qr_threshold; 1724 param.max_size = 5000; 1725 param.step_factor = 0.02; 1726 one (&mu_bdiv_qr_threshold, ¶m); 1727 } 1728 { 1729 static struct param_t param; 1730 param.name = "MU_BDIV_Q_THRESHOLD"; 1731 param.function = speed_mpn_dcpi1_bdiv_q; 1732 param.function2 = speed_mpn_mu_bdiv_q; 1733 param.min_size = dc_bdiv_q_threshold; 1734 param.max_size = 5000; 1735 param.step_factor = 0.02; 1736 one (&mu_bdiv_q_threshold, ¶m); 1737 } 1738 } 1739 1740 void 1741 tune_invertappr (void) 1742 { 1743 static struct param_t param; 1744 1745 param.function = speed_mpn_ni_invertappr; 1746 param.name = "INV_MULMOD_BNM1_THRESHOLD"; 1747 param.min_size = 5; 1748 one (&inv_mulmod_bnm1_threshold, ¶m); 1749 1750 param.function = speed_mpn_invertappr; 1751 param.name = "INV_NEWTON_THRESHOLD"; 1752 param.min_size = 5; 1753 one (&inv_newton_threshold, ¶m); 1754 } 1755 1756 void 1757 tune_invert (void) 1758 { 1759 static struct param_t param; 1760 1761 param.function = speed_mpn_invert; 1762 param.name = "INV_APPR_THRESHOLD"; 1763 param.min_size = 5; 1764 one (&inv_appr_threshold, ¶m); 1765 } 1766 1767 void 1768 tune_binvert (void) 1769 { 1770 static struct param_t param; 1771 1772 param.function = speed_mpn_binvert; 1773 param.name = "BINV_NEWTON_THRESHOLD"; 1774 param.min_size = 8; /* pointless with smaller operands */ 1775 one (&binv_newton_threshold, ¶m); 1776 } 1777 1778 void 1779 tune_redc (void) 1780 { 1781 #define TUNE_REDC_2_MAX 100 1782 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 1783 #define WANT_REDC_2 1 1784 #endif 1785 1786 #if WANT_REDC_2 1787 { 1788 static struct param_t param; 1789 param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1790 param.function = speed_mpn_redc_1; 1791 param.function2 = speed_mpn_redc_2; 1792 param.min_size = 1; 1793 param.min_is_always = 1; 1794 param.max_size = TUNE_REDC_2_MAX; 1795 param.noprint = 1; 1796 param.stop_factor = 1.5; 1797 one (&redc_1_to_redc_2_threshold, ¶m); 1798 } 1799 { 1800 static struct param_t param; 1801 param.name = "REDC_2_TO_REDC_N_THRESHOLD"; 1802 param.function = speed_mpn_redc_2; 1803 param.function2 = speed_mpn_redc_n; 1804 param.min_size = 16; 1805 param.noprint = 1; 1806 one (&redc_2_to_redc_n_threshold, ¶m); 1807 } 1808 if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold) 1809 { 1810 redc_2_to_redc_n_threshold = 0; /* disable redc_2 */ 1811 1812 /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as 1813 REDC_1_TO_REDC_2_THRESHOLD. */ 1814 { 1815 static struct param_t param; 1816 param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1817 param.function = speed_mpn_redc_1; 1818 param.function2 = speed_mpn_redc_n; 1819 param.min_size = 16; 1820 param.noprint = 1; 1821 one (&redc_1_to_redc_2_threshold, ¶m); 1822 } 1823 } 1824 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD); 1825 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD); 1826 #else 1827 { 1828 static struct param_t param; 1829 param.name = "REDC_1_TO_REDC_N_THRESHOLD"; 1830 param.function = speed_mpn_redc_1; 1831 param.function2 = speed_mpn_redc_n; 1832 param.min_size = 16; 1833 one (&redc_1_to_redc_n_threshold, ¶m); 1834 } 1835 #endif 1836 } 1837 1838 void 1839 tune_matrix22_mul (void) 1840 { 1841 static struct param_t param; 1842 param.name = "MATRIX22_STRASSEN_THRESHOLD"; 1843 param.function = speed_mpn_matrix22_mul; 1844 param.min_size = 2; 1845 one (&matrix22_strassen_threshold, ¶m); 1846 } 1847 1848 void 1849 tune_hgcd (void) 1850 { 1851 static struct param_t param; 1852 param.name = "HGCD_THRESHOLD"; 1853 param.function = speed_mpn_hgcd; 1854 /* We seem to get strange results for small sizes */ 1855 param.min_size = 30; 1856 one (&hgcd_threshold, ¶m); 1857 } 1858 1859 void 1860 tune_hgcd_appr (void) 1861 { 1862 static struct param_t param; 1863 param.name = "HGCD_APPR_THRESHOLD"; 1864 param.function = speed_mpn_hgcd_appr; 1865 /* We seem to get strange results for small sizes */ 1866 param.min_size = 50; 1867 param.stop_since_change = 150; 1868 one (&hgcd_appr_threshold, ¶m); 1869 } 1870 1871 void 1872 tune_hgcd_reduce (void) 1873 { 1874 static struct param_t param; 1875 param.name = "HGCD_REDUCE_THRESHOLD"; 1876 param.function = speed_mpn_hgcd_reduce; 1877 param.min_size = 30; 1878 param.max_size = 7000; 1879 param.step_factor = 0.04; 1880 one (&hgcd_reduce_threshold, ¶m); 1881 } 1882 1883 void 1884 tune_gcd_dc (void) 1885 { 1886 static struct param_t param; 1887 param.name = "GCD_DC_THRESHOLD"; 1888 param.function = speed_mpn_gcd; 1889 param.min_size = hgcd_threshold; 1890 param.max_size = 3000; 1891 param.step_factor = 0.02; 1892 one (&gcd_dc_threshold, ¶m); 1893 } 1894 1895 void 1896 tune_gcdext_dc (void) 1897 { 1898 static struct param_t param; 1899 param.name = "GCDEXT_DC_THRESHOLD"; 1900 param.function = speed_mpn_gcdext; 1901 param.min_size = hgcd_threshold; 1902 param.max_size = 3000; 1903 param.step_factor = 0.02; 1904 one (&gcdext_dc_threshold, ¶m); 1905 } 1906 1907 /* In tune_powm_sec we compute the table used by the win_size function. The 1908 cutoff points are in exponent bits, disregarding other operand sizes. It is 1909 not possible to use the one framework since it currently uses a granularity 1910 of full limbs. 1911 */ 1912 1913 /* This win_size replaces the variant in the powm code, allowing us to 1914 control k in the k-ary algorithms. */ 1915 int winsize; 1916 int 1917 win_size (mp_bitcnt_t eb) 1918 { 1919 return winsize; 1920 } 1921 1922 void 1923 tune_powm_sec (void) 1924 { 1925 mp_size_t n; 1926 int k, i; 1927 mp_size_t itch; 1928 mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff; 1929 const int n_max = 3000 / GMP_NUMB_BITS; 1930 const int n_measurements = 5; 1931 mp_ptr rp, bp, ep, mp, tp; 1932 double ttab[n_measurements], tk, tkp1; 1933 TMP_DECL; 1934 TMP_MARK; 1935 1936 possible_nbits_cutoff = 0; 1937 1938 k = 1; 1939 1940 winsize = 10; /* the itch function needs this */ 1941 itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max); 1942 1943 rp = TMP_ALLOC_LIMBS (n_max); 1944 bp = TMP_ALLOC_LIMBS (n_max); 1945 ep = TMP_ALLOC_LIMBS (n_max); 1946 mp = TMP_ALLOC_LIMBS (n_max); 1947 tp = TMP_ALLOC_LIMBS (itch); 1948 1949 mpn_random (bp, n_max); 1950 mpn_random (mp, n_max); 1951 mp[0] |= 1; 1952 1953 /* How about taking the M operand size into account? 1954 1955 An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming 1956 B = O(M)). 1957 1958 Using k-ary and no sliding window, the precomputation will need time 1959 O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) + 1960 O(log(E)/k*M(N)), for the squarings, multiplications, respectively. 1961 1962 An operation R=powm_sec(B,E,N) will take time like powm. 1963 1964 Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the 1965 main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) + 1966 O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full 1967 table reads, respectively. */ 1968 1969 printf ("#define POWM_SEC_TABLE "); 1970 1971 /* For nbits == 1, we should always use k == 1, so no need to tune 1972 that. Starting with nbits == 2 also ensure that nbits always is 1973 larger than the windowsize k+1. */ 1974 for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; ) 1975 { 1976 n = (nbits - 1) / GMP_NUMB_BITS + 1; 1977 1978 /* Generate E such that sliding-window for k and k+1 works equally 1979 well/poorly (but sliding is not used in powm_sec, of course). */ 1980 for (i = 0; i < n; i++) 1981 ep[i] = ~CNST_LIMB(0); 1982 1983 winsize = k; 1984 for (i = 0; i < n_measurements; i++) 1985 { 1986 speed_starttime (); 1987 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); 1988 ttab[i] = speed_endtime (); 1989 } 1990 tk = median (ttab, n_measurements); 1991 1992 winsize = k + 1; 1993 speed_starttime (); 1994 for (i = 0; i < n_measurements; i++) 1995 { 1996 speed_starttime (); 1997 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); 1998 ttab[i] = speed_endtime (); 1999 } 2000 tkp1 = median (ttab, n_measurements); 2001 /* 2002 printf ("testing: %ld, %d", nbits, k, ep[n-1]); 2003 printf (" %10.5f %10.5f\n", tk, tkp1); 2004 */ 2005 if (tkp1 < tk) 2006 { 2007 if (possible_nbits_cutoff) 2008 { 2009 /* Two consecutive sizes indicate k increase, obey. */ 2010 2011 /* Must always have x[k] >= k */ 2012 ASSERT_ALWAYS (possible_nbits_cutoff >= k); 2013 2014 if (k > 1) 2015 printf (","); 2016 printf ("%ld", (long) possible_nbits_cutoff); 2017 k++; 2018 possible_nbits_cutoff = 0; 2019 } 2020 else 2021 { 2022 /* One measurement indicate k increase, save nbits for further 2023 consideration. */ 2024 /* The new larger k gets used for sizes > the cutoff 2025 value, hence the cutoff should be one less than the 2026 smallest size where it gives a speedup. */ 2027 possible_nbits_cutoff = nbits - 1; 2028 } 2029 } 2030 else 2031 possible_nbits_cutoff = 0; 2032 2033 nbits_next = nbits * 65 / 64; 2034 nbits = nbits_next + (nbits_next == nbits); 2035 } 2036 printf ("\n"); 2037 TMP_FREE; 2038 } 2039 2040 2041 /* size_extra==1 reflects the fact that with high<divisor one division is 2042 always skipped. Forcing high<divisor while testing ensures consistency 2043 while stepping through sizes, ie. that size-1 divides will be done each 2044 time. 2045 2046 min_size==2 and min_is_always are used so that if plain division is only 2047 better at size==1 then don't bother including that code just for that 2048 case, instead go with preinv always and get a size saving. */ 2049 2050 #define DIV_1_PARAMS \ 2051 param.check_size = 256; \ 2052 param.min_size = 2; \ 2053 param.min_is_always = 1; \ 2054 param.data_high = DATA_HIGH_LT_R; \ 2055 param.size_extra = 1; \ 2056 param.stop_factor = 2.0; 2057 2058 2059 double (*tuned_speed_mpn_divrem_1) (struct speed_params *); 2060 2061 void 2062 tune_divrem_1 (void) 2063 { 2064 /* plain version by default */ 2065 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1; 2066 2067 /* No support for tuning native assembler code, do that by hand and put 2068 the results in the .asm file, there's no need for such thresholds to 2069 appear in gmp-mparam.h. */ 2070 if (HAVE_NATIVE_mpn_divrem_1) 2071 return; 2072 2073 if (GMP_NAIL_BITS != 0) 2074 { 2075 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 2076 "no preinv with nails"); 2077 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 2078 "no preinv with nails"); 2079 return; 2080 } 2081 2082 if (UDIV_PREINV_ALWAYS) 2083 { 2084 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always"); 2085 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L); 2086 return; 2087 } 2088 2089 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune; 2090 2091 /* Tune for the integer part of mpn_divrem_1. This will very possibly be 2092 a bit out for the fractional part, but that's too bad, the integer part 2093 is more important. */ 2094 { 2095 static struct param_t param; 2096 param.name = "DIVREM_1_NORM_THRESHOLD"; 2097 DIV_1_PARAMS; 2098 s.r = randlimb_norm (); 2099 param.function = speed_mpn_divrem_1_tune; 2100 one (&divrem_1_norm_threshold, ¶m); 2101 } 2102 { 2103 static struct param_t param; 2104 param.name = "DIVREM_1_UNNORM_THRESHOLD"; 2105 DIV_1_PARAMS; 2106 s.r = randlimb_half (); 2107 param.function = speed_mpn_divrem_1_tune; 2108 one (&divrem_1_unnorm_threshold, ¶m); 2109 } 2110 } 2111 2112 void 2113 tune_div_qr_1 (void) 2114 { 2115 static struct param_t param; 2116 double t1, t2; 2117 2118 if (!HAVE_NATIVE_mpn_div_qr_1n_pi1) 2119 { 2120 static struct param_t param; 2121 double t1, t2; 2122 2123 s.size = 10; 2124 s.r = randlimb_norm (); 2125 2126 t1 = tuneup_measure (speed_mpn_div_qr_1n_pi1_1, ¶m, &s); 2127 t2 = tuneup_measure (speed_mpn_div_qr_1n_pi1_2, ¶m, &s); 2128 2129 if (t1 == -1.0 || t2 == -1.0) 2130 { 2131 printf ("Oops, can't measure all mpn_div_qr_1n_pi1 methods at %ld\n", 2132 (long) s.size); 2133 abort (); 2134 } 2135 div_qr_1n_pi1_method = (t1 < t2) ? 1 : 2; 2136 print_define ("DIV_QR_1N_PI1_METHOD", div_qr_1n_pi1_method); 2137 } 2138 2139 { 2140 static struct param_t param; 2141 param.name = "DIV_QR_1_NORM_THRESHOLD"; 2142 DIV_1_PARAMS; 2143 param.min_size = 1; 2144 param.min_is_always = 0; 2145 s.r = randlimb_norm (); 2146 param.function = speed_mpn_div_qr_1_tune; 2147 one (&div_qr_1_norm_threshold, ¶m); 2148 } 2149 { 2150 static struct param_t param; 2151 param.name = "DIV_QR_1_UNNORM_THRESHOLD"; 2152 DIV_1_PARAMS; 2153 param.min_size = 1; 2154 param.min_is_always = 0; 2155 s.r = randlimb_half(); 2156 param.function = speed_mpn_div_qr_1_tune; 2157 one (&div_qr_1_unnorm_threshold, ¶m); 2158 } 2159 } 2160 2161 2162 void 2163 tune_mod_1 (void) 2164 { 2165 /* No support for tuning native assembler code, do that by hand and put 2166 the results in the .asm file, there's no need for such thresholds to 2167 appear in gmp-mparam.h. */ 2168 if (HAVE_NATIVE_mpn_mod_1) 2169 return; 2170 2171 if (GMP_NAIL_BITS != 0) 2172 { 2173 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 2174 "no preinv with nails"); 2175 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 2176 "no preinv with nails"); 2177 return; 2178 } 2179 2180 if (!HAVE_NATIVE_mpn_mod_1_1p) 2181 { 2182 static struct param_t param; 2183 double t1, t2; 2184 2185 s.size = 10; 2186 s.r = randlimb_half (); 2187 2188 t1 = tuneup_measure (speed_mpn_mod_1_1_1, ¶m, &s); 2189 t2 = tuneup_measure (speed_mpn_mod_1_1_2, ¶m, &s); 2190 2191 if (t1 == -1.0 || t2 == -1.0) 2192 { 2193 printf ("Oops, can't measure all mpn_mod_1_1 methods at %ld\n", 2194 (long) s.size); 2195 abort (); 2196 } 2197 mod_1_1p_method = (t1 < t2) ? 1 : 2; 2198 print_define ("MOD_1_1P_METHOD", mod_1_1p_method); 2199 } 2200 2201 if (UDIV_PREINV_ALWAYS) 2202 { 2203 print_define ("MOD_1_NORM_THRESHOLD", 0L); 2204 print_define ("MOD_1_UNNORM_THRESHOLD", 0L); 2205 } 2206 else 2207 { 2208 { 2209 static struct param_t param; 2210 param.name = "MOD_1_NORM_THRESHOLD"; 2211 DIV_1_PARAMS; 2212 s.r = randlimb_norm (); 2213 param.function = speed_mpn_mod_1_tune; 2214 one (&mod_1_norm_threshold, ¶m); 2215 } 2216 { 2217 static struct param_t param; 2218 param.name = "MOD_1_UNNORM_THRESHOLD"; 2219 DIV_1_PARAMS; 2220 s.r = randlimb_half (); 2221 param.function = speed_mpn_mod_1_tune; 2222 one (&mod_1_unnorm_threshold, ¶m); 2223 } 2224 } 2225 { 2226 static struct param_t param; 2227 2228 param.check_size = 256; 2229 2230 s.r = randlimb_norm (); 2231 param.function = speed_mpn_mod_1_tune; 2232 2233 param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD"; 2234 param.min_size = 2; 2235 one (&mod_1n_to_mod_1_1_threshold, ¶m); 2236 } 2237 2238 { 2239 static struct param_t param; 2240 2241 param.check_size = 256; 2242 s.r = randlimb_half (); 2243 param.noprint = 1; 2244 2245 param.function = speed_mpn_mod_1_1; 2246 param.function2 = speed_mpn_mod_1_2; 2247 param.min_is_always = 1; 2248 param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD"; 2249 param.min_size = 2; 2250 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2251 2252 param.function = speed_mpn_mod_1_2; 2253 param.function2 = speed_mpn_mod_1_4; 2254 param.min_is_always = 1; 2255 param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD"; 2256 param.min_size = 1; 2257 one (&mod_1_2_to_mod_1_4_threshold, ¶m); 2258 2259 if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold) 2260 { 2261 /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */ 2262 mod_1_2_to_mod_1_4_threshold = 0; 2263 2264 param.function = speed_mpn_mod_1_1; 2265 param.function2 = speed_mpn_mod_1_4; 2266 param.min_is_always = 1; 2267 param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake"; 2268 param.min_size = 2; 2269 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2270 } 2271 2272 param.function = speed_mpn_mod_1_tune; 2273 param.function2 = NULL; 2274 param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD"; 2275 param.min_size = 2; 2276 param.min_is_always = 0; 2277 one (&mod_1u_to_mod_1_1_threshold, ¶m); 2278 2279 if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold) 2280 mod_1_1_to_mod_1_2_threshold = 0; 2281 if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold) 2282 mod_1_2_to_mod_1_4_threshold = 0; 2283 2284 print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL); 2285 print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, 2286 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL); 2287 print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, 2288 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL); 2289 } 2290 2291 { 2292 static struct param_t param; 2293 2294 param.check_size = 256; 2295 2296 param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD"; 2297 s.r = randlimb_norm (); 2298 param.function = speed_mpn_preinv_mod_1; 2299 param.function2 = speed_mpn_mod_1_tune; 2300 param.min_size = 1; 2301 one (&preinv_mod_1_to_mod_1_threshold, ¶m); 2302 } 2303 } 2304 2305 2306 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would 2307 imply that udiv_qrnnd_preinv is worth using, but it seems most 2308 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div 2309 directly. */ 2310 2311 void 2312 tune_preinv_divrem_1 (void) 2313 { 2314 static struct param_t param; 2315 speed_function_t divrem_1; 2316 const char *divrem_1_name; 2317 double t1, t2; 2318 2319 if (GMP_NAIL_BITS != 0) 2320 { 2321 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails"); 2322 return; 2323 } 2324 2325 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because 2326 it's faster than mpn_divrem_1. */ 2327 if (HAVE_NATIVE_mpn_preinv_divrem_1) 2328 { 2329 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); 2330 return; 2331 } 2332 2333 /* If udiv_qrnnd_preinv is the only division method then of course 2334 mpn_preinv_divrem_1 should be used. */ 2335 if (UDIV_PREINV_ALWAYS) 2336 { 2337 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); 2338 return; 2339 } 2340 2341 /* If we've got an assembler version of mpn_divrem_1, then compare against 2342 that, not the mpn_divrem_1_div generic C. */ 2343 if (HAVE_NATIVE_mpn_divrem_1) 2344 { 2345 divrem_1 = speed_mpn_divrem_1; 2346 divrem_1_name = "mpn_divrem_1"; 2347 } 2348 else 2349 { 2350 divrem_1 = speed_mpn_divrem_1_div; 2351 divrem_1_name = "mpn_divrem_1_div"; 2352 } 2353 2354 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ 2355 s.size = 200; /* generous but not too big */ 2356 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, 2357 since in general that's probably most common, though in fact for a 2358 64-bit limb mp_bases[10].big_base is normalized. */ 2359 s.r = urandom() & (GMP_NUMB_MASK >> 4); 2360 if (s.r == 0) s.r = 123; 2361 2362 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); 2363 t2 = tuneup_measure (divrem_1, ¶m, &s); 2364 if (t1 == -1.0 || t2 == -1.0) 2365 { 2366 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", 2367 divrem_1_name, (long) s.size); 2368 abort (); 2369 } 2370 if (option_trace >= 1) 2371 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", 2372 (long) s.size, t1, divrem_1_name, t2); 2373 2374 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); 2375 } 2376 2377 2378 2379 void 2380 tune_divrem_2 (void) 2381 { 2382 static struct param_t param; 2383 2384 /* No support for tuning native assembler code, do that by hand and put 2385 the results in the .asm file, and there's no need for such thresholds 2386 to appear in gmp-mparam.h. */ 2387 if (HAVE_NATIVE_mpn_divrem_2) 2388 return; 2389 2390 if (GMP_NAIL_BITS != 0) 2391 { 2392 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, 2393 "no preinv with nails"); 2394 return; 2395 } 2396 2397 if (UDIV_PREINV_ALWAYS) 2398 { 2399 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); 2400 return; 2401 } 2402 2403 /* Tune for the integer part of mpn_divrem_2. This will very possibly be 2404 a bit out for the fractional part, but that's too bad, the integer part 2405 is more important. 2406 2407 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save 2408 code space if plain division is better only at size==2 or size==3. */ 2409 param.name = "DIVREM_2_THRESHOLD"; 2410 param.check_size = 256; 2411 param.min_size = 4; 2412 param.min_is_always = 1; 2413 param.size_extra = 2; /* does qsize==nsize-2 divisions */ 2414 param.stop_factor = 2.0; 2415 2416 s.r = randlimb_norm (); 2417 param.function = speed_mpn_divrem_2; 2418 one (&divrem_2_threshold, ¶m); 2419 } 2420 2421 void 2422 tune_div_qr_2 (void) 2423 { 2424 static struct param_t param; 2425 param.name = "DIV_QR_2_PI2_THRESHOLD"; 2426 param.function = speed_mpn_div_qr_2n; 2427 param.check_size = 500; 2428 param.min_size = 4; 2429 one (&div_qr_2_pi2_threshold, ¶m); 2430 } 2431 2432 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so 2433 tune for that. Its speed can differ on odd or even divisor, so take an 2434 average threshold for the two. 2435 2436 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1 2437 might not vary that way, but don't test this since high<divisor isn't 2438 expected to occur often with small divisors. */ 2439 2440 void 2441 tune_divexact_1 (void) 2442 { 2443 static struct param_t param; 2444 mp_size_t thresh[2], average; 2445 int low, i; 2446 2447 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a 2448 full mpn_divrem_1. */ 2449 if (HAVE_NATIVE_mpn_divexact_1) 2450 { 2451 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)"); 2452 return; 2453 } 2454 2455 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL); 2456 2457 param.name = "DIVEXACT_1_THRESHOLD"; 2458 param.data_high = DATA_HIGH_GE_R; 2459 param.check_size = 256; 2460 param.min_size = 2; 2461 param.stop_factor = 1.5; 2462 param.function = tuned_speed_mpn_divrem_1; 2463 param.function2 = speed_mpn_divexact_1; 2464 param.noprint = 1; 2465 2466 print_define_start (param.name); 2467 2468 for (low = 0; low <= 1; low++) 2469 { 2470 s.r = randlimb_half(); 2471 if (low == 0) 2472 s.r |= 1; 2473 else 2474 s.r &= ~CNST_LIMB(7); 2475 2476 one (&thresh[low], ¶m); 2477 if (option_trace) 2478 printf ("low=%d thresh %ld\n", low, (long) thresh[low]); 2479 2480 if (thresh[low] == MP_SIZE_T_MAX) 2481 { 2482 average = MP_SIZE_T_MAX; 2483 goto divexact_1_done; 2484 } 2485 } 2486 2487 if (option_trace) 2488 { 2489 printf ("average of:"); 2490 for (i = 0; i < numberof(thresh); i++) 2491 printf (" %ld", (long) thresh[i]); 2492 printf ("\n"); 2493 } 2494 2495 average = 0; 2496 for (i = 0; i < numberof(thresh); i++) 2497 average += thresh[i]; 2498 average /= numberof(thresh); 2499 2500 /* If divexact turns out to be better as early as 3 limbs, then use it 2501 always, so as to reduce code size and conditional jumps. */ 2502 if (average <= 3) 2503 average = 0; 2504 2505 divexact_1_done: 2506 print_define_end (param.name, average); 2507 } 2508 2509 2510 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the 2511 same as mpn_mod_1, but this might not be true of an assembler 2512 implementation. The threshold used is an average based on data where a 2513 divide can be skipped and where it can't. 2514 2515 If modexact turns out to be better as early as 3 limbs, then use it 2516 always, so as to reduce code size and conditional jumps. */ 2517 2518 void 2519 tune_modexact_1_odd (void) 2520 { 2521 static struct param_t param; 2522 mp_size_t thresh_lt, thresh_ge, average; 2523 2524 #if 0 2525 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed 2526 of a full mpn_mod_1. */ 2527 if (HAVE_NATIVE_mpn_modexact_1_odd) 2528 { 2529 print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1"); 2530 return; 2531 } 2532 #endif 2533 2534 param.name = "BMOD_1_TO_MOD_1_THRESHOLD"; 2535 param.check_size = 256; 2536 param.min_size = 2; 2537 param.stop_factor = 1.5; 2538 param.function = speed_mpn_modexact_1c_odd; 2539 param.function2 = speed_mpn_mod_1_tune; 2540 param.noprint = 1; 2541 s.r = randlimb_half () | 1; 2542 2543 print_define_start (param.name); 2544 2545 param.data_high = DATA_HIGH_LT_R; 2546 one (&thresh_lt, ¶m); 2547 if (option_trace) 2548 printf ("lt thresh %ld\n", (long) thresh_lt); 2549 2550 average = thresh_lt; 2551 if (thresh_lt != MP_SIZE_T_MAX) 2552 { 2553 param.data_high = DATA_HIGH_GE_R; 2554 one (&thresh_ge, ¶m); 2555 if (option_trace) 2556 printf ("ge thresh %ld\n", (long) thresh_ge); 2557 2558 if (thresh_ge != MP_SIZE_T_MAX) 2559 { 2560 average = (thresh_ge + thresh_lt) / 2; 2561 if (thresh_ge <= 3) 2562 average = 0; 2563 } 2564 } 2565 2566 print_define_end (param.name, average); 2567 } 2568 2569 2570 void 2571 tune_jacobi_base (void) 2572 { 2573 static struct param_t param; 2574 double t1, t2, t3, t4; 2575 int method; 2576 2577 s.size = GMP_LIMB_BITS * 3 / 4; 2578 2579 t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s); 2580 if (option_trace >= 1) 2581 printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1); 2582 2583 t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s); 2584 if (option_trace >= 1) 2585 printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2); 2586 2587 t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s); 2588 if (option_trace >= 1) 2589 printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3); 2590 2591 t4 = tuneup_measure (speed_mpn_jacobi_base_4, ¶m, &s); 2592 if (option_trace >= 1) 2593 printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4); 2594 2595 if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0) 2596 { 2597 printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n", 2598 (long) s.size); 2599 abort (); 2600 } 2601 2602 if (t1 < t2 && t1 < t3 && t1 < t4) 2603 method = 1; 2604 else if (t2 < t3 && t2 < t4) 2605 method = 2; 2606 else if (t3 < t4) 2607 method = 3; 2608 else 2609 method = 4; 2610 2611 print_define ("JACOBI_BASE_METHOD", method); 2612 } 2613 2614 2615 void 2616 tune_get_str (void) 2617 { 2618 /* Tune for decimal, it being most common. Some rough testing suggests 2619 other bases are different, but not by very much. */ 2620 s.r = 10; 2621 { 2622 static struct param_t param; 2623 GET_STR_PRECOMPUTE_THRESHOLD = 0; 2624 param.name = "GET_STR_DC_THRESHOLD"; 2625 param.function = speed_mpn_get_str; 2626 param.min_size = 4; 2627 param.max_size = GET_STR_THRESHOLD_LIMIT; 2628 one (&get_str_dc_threshold, ¶m); 2629 } 2630 { 2631 static struct param_t param; 2632 param.name = "GET_STR_PRECOMPUTE_THRESHOLD"; 2633 param.function = speed_mpn_get_str; 2634 param.min_size = GET_STR_DC_THRESHOLD; 2635 param.max_size = GET_STR_THRESHOLD_LIMIT; 2636 one (&get_str_precompute_threshold, ¶m); 2637 } 2638 } 2639 2640 2641 double 2642 speed_mpn_pre_set_str (struct speed_params *s) 2643 { 2644 unsigned char *str; 2645 mp_ptr wp; 2646 mp_size_t wn; 2647 unsigned i; 2648 int base; 2649 double t; 2650 mp_ptr powtab_mem, tp; 2651 powers_t powtab[GMP_LIMB_BITS]; 2652 mp_size_t un; 2653 int chars_per_limb; 2654 TMP_DECL; 2655 2656 SPEED_RESTRICT_COND (s->size >= 1); 2657 2658 base = s->r == 0 ? 10 : s->r; 2659 SPEED_RESTRICT_COND (base >= 2 && base <= 256); 2660 2661 TMP_MARK; 2662 2663 str = (unsigned char *) TMP_ALLOC (s->size); 2664 for (i = 0; i < s->size; i++) 2665 str[i] = s->xp[i] % base; 2666 2667 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); 2668 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); 2669 2670 /* use this during development to check wn is big enough */ 2671 /* 2672 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); 2673 */ 2674 2675 speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES); 2676 speed_operand_dst (s, wp, wn); 2677 speed_cache_fill (s); 2678 2679 chars_per_limb = mp_bases[base].chars_per_limb; 2680 un = s->size / chars_per_limb + 1; 2681 powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un)); 2682 mpn_set_str_compute_powtab (powtab, powtab_mem, un, base); 2683 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); 2684 2685 speed_starttime (); 2686 i = s->reps; 2687 do 2688 { 2689 mpn_pre_set_str (wp, str, s->size, powtab, tp); 2690 } 2691 while (--i != 0); 2692 t = speed_endtime (); 2693 2694 TMP_FREE; 2695 return t; 2696 } 2697 2698 void 2699 tune_set_str (void) 2700 { 2701 s.r = 10; /* decimal */ 2702 { 2703 static struct param_t param; 2704 SET_STR_PRECOMPUTE_THRESHOLD = 0; 2705 param.step_factor = 0.01; 2706 param.name = "SET_STR_DC_THRESHOLD"; 2707 param.function = speed_mpn_pre_set_str; 2708 param.min_size = 100; 2709 param.max_size = 50000; 2710 one (&set_str_dc_threshold, ¶m); 2711 } 2712 { 2713 static struct param_t param; 2714 param.step_factor = 0.02; 2715 param.name = "SET_STR_PRECOMPUTE_THRESHOLD"; 2716 param.function = speed_mpn_set_str; 2717 param.min_size = SET_STR_DC_THRESHOLD; 2718 param.max_size = 100000; 2719 one (&set_str_precompute_threshold, ¶m); 2720 } 2721 } 2722 2723 2724 void 2725 tune_fft_mul (void) 2726 { 2727 static struct fft_param_t param; 2728 2729 if (option_fft_max_size == 0) 2730 return; 2731 2732 param.table_name = "MUL_FFT_TABLE3"; 2733 param.threshold_name = "MUL_FFT_THRESHOLD"; 2734 param.p_threshold = &mul_fft_threshold; 2735 param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; 2736 param.p_modf_threshold = &mul_fft_modf_threshold; 2737 param.first_size = MUL_TOOM33_THRESHOLD / 2; 2738 param.max_size = option_fft_max_size; 2739 param.function = speed_mpn_fft_mul; 2740 param.mul_modf_function = speed_mpn_mul_fft; 2741 param.mul_function = speed_mpn_mul_n; 2742 param.sqr = 0; 2743 fft (¶m); 2744 } 2745 2746 2747 void 2748 tune_fft_sqr (void) 2749 { 2750 static struct fft_param_t param; 2751 2752 if (option_fft_max_size == 0) 2753 return; 2754 2755 param.table_name = "SQR_FFT_TABLE3"; 2756 param.threshold_name = "SQR_FFT_THRESHOLD"; 2757 param.p_threshold = &sqr_fft_threshold; 2758 param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; 2759 param.p_modf_threshold = &sqr_fft_modf_threshold; 2760 param.first_size = SQR_TOOM3_THRESHOLD / 2; 2761 param.max_size = option_fft_max_size; 2762 param.function = speed_mpn_fft_sqr; 2763 param.mul_modf_function = speed_mpn_mul_fft_sqr; 2764 param.mul_function = speed_mpn_sqr; 2765 param.sqr = 1; 2766 fft (¶m); 2767 } 2768 2769 void 2770 tune_fac_ui (void) 2771 { 2772 static struct param_t param; 2773 2774 param.function = speed_mpz_fac_ui_tune; 2775 2776 param.name = "FAC_DSC_THRESHOLD"; 2777 param.min_size = 70; 2778 param.max_size = FAC_DSC_THRESHOLD_LIMIT; 2779 one (&fac_dsc_threshold, ¶m); 2780 2781 param.name = "FAC_ODD_THRESHOLD"; 2782 param.min_size = 22; 2783 param.stop_factor = 1.7; 2784 param.min_is_always = 1; 2785 one (&fac_odd_threshold, ¶m); 2786 } 2787 2788 void 2789 all (void) 2790 { 2791 time_t start_time, end_time; 2792 TMP_DECL; 2793 2794 TMP_MARK; 2795 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0); 2796 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0); 2797 2798 mpn_random (s.xp_block, SPEED_BLOCK_SIZE); 2799 mpn_random (s.yp_block, SPEED_BLOCK_SIZE); 2800 2801 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); 2802 2803 speed_time_init (); 2804 fprintf (stderr, "Using: %s\n", speed_time_string); 2805 2806 fprintf (stderr, "speed_precision %d", speed_precision); 2807 if (speed_unittime == 1.0) 2808 fprintf (stderr, ", speed_unittime 1 cycle"); 2809 else 2810 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); 2811 if (speed_cycletime == 1.0 || speed_cycletime == 0.0) 2812 fprintf (stderr, ", CPU freq unknown\n"); 2813 else 2814 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); 2815 2816 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", 2817 DEFAULT_MAX_SIZE, (long) option_fft_max_size); 2818 fprintf (stderr, "\n"); 2819 2820 time (&start_time); 2821 { 2822 struct tm *tp; 2823 tp = localtime (&start_time); 2824 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", 2825 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); 2826 2827 #ifdef __GNUC__ 2828 /* gcc sub-minor version doesn't seem to come through as a define */ 2829 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); 2830 #define PRINTED_COMPILER 2831 #endif 2832 #if defined (__SUNPRO_C) 2833 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); 2834 #define PRINTED_COMPILER 2835 #endif 2836 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) 2837 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ 2838 printf ("MIPSpro C %d.%d.%d */\n", 2839 _COMPILER_VERSION / 100, 2840 _COMPILER_VERSION / 10 % 10, 2841 _COMPILER_VERSION % 10); 2842 #define PRINTED_COMPILER 2843 #endif 2844 #if defined (__DECC) && defined (__DECC_VER) 2845 printf ("DEC C %d */\n", __DECC_VER); 2846 #define PRINTED_COMPILER 2847 #endif 2848 #if ! defined (PRINTED_COMPILER) 2849 printf ("system compiler */\n"); 2850 #endif 2851 } 2852 printf ("\n"); 2853 2854 tune_divrem_1 (); 2855 tune_mod_1 (); 2856 tune_preinv_divrem_1 (); 2857 tune_div_qr_1 (); 2858 #if 0 2859 tune_divrem_2 (); 2860 #endif 2861 tune_div_qr_2 (); 2862 tune_divexact_1 (); 2863 tune_modexact_1_odd (); 2864 printf("\n"); 2865 2866 tune_mul_n (); 2867 printf("\n"); 2868 2869 tune_mul (); 2870 printf("\n"); 2871 2872 tune_sqr (); 2873 printf("\n"); 2874 2875 tune_mulmid (); 2876 printf("\n"); 2877 2878 tune_mulmod_bnm1 (); 2879 tune_sqrmod_bnm1 (); 2880 printf("\n"); 2881 2882 tune_fft_mul (); 2883 printf("\n"); 2884 2885 tune_fft_sqr (); 2886 printf ("\n"); 2887 2888 tune_mullo (); 2889 tune_sqrlo (); 2890 printf("\n"); 2891 2892 tune_dc_div (); 2893 tune_dc_bdiv (); 2894 2895 printf("\n"); 2896 tune_invertappr (); 2897 tune_invert (); 2898 printf("\n"); 2899 2900 tune_binvert (); 2901 tune_redc (); 2902 printf("\n"); 2903 2904 tune_mu_div (); 2905 tune_mu_bdiv (); 2906 printf("\n"); 2907 2908 tune_powm_sec (); 2909 printf("\n"); 2910 2911 tune_get_str (); 2912 tune_set_str (); 2913 printf("\n"); 2914 2915 tune_fac_ui (); 2916 printf("\n"); 2917 2918 tune_matrix22_mul (); 2919 tune_hgcd (); 2920 tune_hgcd_appr (); 2921 tune_hgcd_reduce(); 2922 tune_gcd_dc (); 2923 tune_gcdext_dc (); 2924 tune_jacobi_base (); 2925 printf("\n"); 2926 2927 time (&end_time); 2928 printf ("/* Tuneup completed successfully, took %ld seconds */\n", 2929 (long) (end_time - start_time)); 2930 2931 TMP_FREE; 2932 } 2933 2934 2935 int 2936 main (int argc, char *argv[]) 2937 { 2938 int opt; 2939 2940 /* Unbuffered so if output is redirected to a file it isn't lost if the 2941 program is killed part way through. */ 2942 setbuf (stdout, NULL); 2943 setbuf (stderr, NULL); 2944 2945 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) 2946 { 2947 switch (opt) { 2948 case 'f': 2949 if (optarg[0] == 't') 2950 option_fft_trace = 2; 2951 else 2952 option_fft_max_size = atol (optarg); 2953 break; 2954 case 'o': 2955 speed_option_set (optarg); 2956 break; 2957 case 'p': 2958 speed_precision = atoi (optarg); 2959 break; 2960 case 't': 2961 option_trace++; 2962 break; 2963 case '?': 2964 exit(1); 2965 } 2966 } 2967 2968 all (); 2969 exit (0); 2970 }