github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/tune/tuneup.c (about)

     1  /* Create tuned thresholds for various algorithms.
     2  
     3  Copyright 1999-2003, 2005, 2006, 2008-2012 Free Software Foundation, Inc.
     4  
     5  This file is part of the GNU MP Library.
     6  
     7  The GNU MP Library is free software; you can redistribute it and/or modify
     8  it under the terms of either:
     9  
    10    * the GNU Lesser General Public License as published by the Free
    11      Software Foundation; either version 3 of the License, or (at your
    12      option) any later version.
    13  
    14  or
    15  
    16    * the GNU General Public License as published by the Free Software
    17      Foundation; either version 2 of the License, or (at your option) any
    18      later version.
    19  
    20  or both in parallel, as here.
    21  
    22  The GNU MP Library is distributed in the hope that it will be useful, but
    23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  for more details.
    26  
    27  You should have received copies of the GNU General Public License and the
    28  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  see https://www.gnu.org/licenses/.  */
    30  
    31  
    32  /* Usage: tuneup [-t] [-t] [-p precision]
    33  
    34     -t turns on some diagnostic traces, a second -t turns on more traces.
    35  
    36     Notes:
    37  
    38     The code here isn't a vision of loveliness, mainly because it's subject
    39     to ongoing changes according to new things wanting to be tuned, and
    40     practical requirements of systems tested.
    41  
    42     Sometimes running the program twice produces slightly different results.
    43     This is probably because there's so little separating algorithms near
    44     their crossover, and on that basis it should make little or no difference
    45     to the final speed of the relevant routines, but nothing has been done to
    46     check that carefully.
    47  
    48     Algorithm:
    49  
    50     The thresholds are determined as follows.  A crossover may not be a
    51     single size but rather a range where it oscillates between method A or
    52     method B faster.  If the threshold is set making B used where A is faster
    53     (or vice versa) that's bad.  Badness is the percentage time lost and
    54     total badness is the sum of this over all sizes measured.  The threshold
    55     is set to minimize total badness.
    56  
    57     Suppose, as sizes increase, method B becomes faster than method A.  The
    58     effect of the rule is that, as you look at increasing sizes, isolated
    59     points where B is faster are ignored, but when it's consistently faster,
    60     or faster on balance, then the threshold is set there.  The same result
    61     is obtained thinking in the other direction of A becoming faster at
    62     smaller sizes.
    63  
    64     In practice the thresholds tend to be chosen to bring on the next
    65     algorithm fairly quickly.
    66  
    67     This rule is attractive because it's got a basis in reason and is fairly
    68     easy to implement, but no work has been done to actually compare it in
    69     absolute terms to other possibilities.
    70  
    71     Implementation:
    72  
    73     In a normal library build the thresholds are constants.  To tune them
    74     selected objects are recompiled with the thresholds as global variables
    75     instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
    76     the end of gmp-impl.h, and rules in tune/Makefile.am.
    77  
    78     MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
    79     threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
    80     level, but recurse into the basecase.
    81  
    82     MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
    83     Other routines in turn will make use of both of those.  Naturally the
    84     dependants must be tuned first.
    85  
    86     In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
    87     just a threshold based on comparing two routines (mpn_divrem_1 and
    88     mpn_divexact_1), and no further use of the value determined.
    89  
    90     Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
    91     just comparisons between certain routines on representative data.
    92  
    93     Shortcuts are applied when native (assembler) versions of routines exist.
    94     For instance a native mpn_sqr_basecase is assumed to be always faster
    95     than mpn_mul_basecase, with no measuring.
    96  
    97     No attempt is made to tune within assembler routines, for instance
    98     DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
    99     written and tuned all by hand.  Assembler routines that might have hard
   100     limits are recompiled though, to make them accept a bigger range of sizes
   101     than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
   102  
   103     Limitations:
   104  
   105     The FFTs aren't subject to the same badness rule as the other thresholds,
   106     so each k is probably being brought on a touch early.  This isn't likely
   107     to make a difference, and the simpler probing means fewer tests.
   108  
   109  */
   110  
   111  #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
   112  
   113  #include "config.h"
   114  
   115  #include <math.h>
   116  #include <stdio.h>
   117  #include <stdlib.h>
   118  #include <time.h>
   119  #if HAVE_UNISTD_H
   120  #include <unistd.h>
   121  #endif
   122  
   123  #include "gmp.h"
   124  #include "gmp-impl.h"
   125  #include "longlong.h"
   126  
   127  #include "tests.h"
   128  #include "speed.h"
   129  
   130  #if !HAVE_DECL_OPTARG
   131  extern char *optarg;
   132  extern int optind, opterr;
   133  #endif
   134  
   135  
   136  #define DEFAULT_MAX_SIZE   1000  /* limbs */
   137  
   138  #if WANT_FFT
   139  mp_size_t  option_fft_max_size = 50000;  /* limbs */
   140  #else
   141  mp_size_t  option_fft_max_size = 0;
   142  #endif
   143  int        option_trace = 0;
   144  int        option_fft_trace = 0;
   145  struct speed_params  s;
   146  
   147  struct dat_t {
   148    mp_size_t  size;
   149    double     d;
   150  } *dat = NULL;
   151  int  ndat = 0;
   152  int  allocdat = 0;
   153  
   154  /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
   155     case use zero here, which for params.max_size means no limit.  */
   156  #ifndef TUNE_SQR_TOOM2_MAX
   157  #define TUNE_SQR_TOOM2_MAX  0
   158  #endif
   159  
   160  mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
   161  mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
   162  mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
   163  mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
   164  mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
   165  mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
   166  mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
   167  mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
   168  mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
   169  mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
   170  mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
   171  mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
   172  mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
   173  mp_size_t  sqr_toom2_threshold
   174    = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
   175  mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
   176  mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
   177  mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
   178  mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
   179  mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
   180  mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
   181  mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
   182  mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
   183  mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
   184  mp_size_t  sqrlo_basecase_threshold     = MP_SIZE_T_MAX;
   185  mp_size_t  sqrlo_dc_threshold           = MP_SIZE_T_MAX;
   186  mp_size_t  sqrlo_sqr_threshold          = MP_SIZE_T_MAX;
   187  mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
   188  mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
   189  mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
   190  mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
   191  mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
   192  mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
   193  mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
   194  mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
   195  mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
   196  mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
   197  mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
   198  mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
   199  mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
   200  mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
   201  mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
   202  mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
   203  mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
   204  mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
   205  mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
   206  mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
   207  mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
   208  mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
   209  mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
   210  mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
   211  mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
   212  mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
   213  mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
   214  int	   div_qr_1n_pi1_method		= 0;
   215  mp_size_t  div_qr_1_norm_threshold      = MP_SIZE_T_MAX;
   216  mp_size_t  div_qr_1_unnorm_threshold    = MP_SIZE_T_MAX;
   217  mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
   218  mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
   219  mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
   220  mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
   221  int	   mod_1_1p_method		= 0;
   222  mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
   223  mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
   224  mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
   225  mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
   226  mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
   227  mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
   228  mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
   229  mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
   230  mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
   231  mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
   232  mp_size_t  fac_odd_threshold            = 0;
   233  mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
   234  
   235  mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
   236  mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
   237  
   238  struct param_t {
   239    const char        *name;
   240    speed_function_t  function;
   241    speed_function_t  function2;
   242    double            step_factor;    /* how much to step relatively */
   243    int               step;           /* how much to step absolutely */
   244    double            function_fudge; /* multiplier for "function" speeds */
   245    int               stop_since_change;
   246    double            stop_factor;
   247    mp_size_t         min_size;
   248    int               min_is_always;
   249    mp_size_t         max_size;
   250    mp_size_t         check_size;
   251    mp_size_t         size_extra;
   252  
   253  #define DATA_HIGH_LT_R  1
   254  #define DATA_HIGH_GE_R  2
   255    int               data_high;
   256  
   257    int               noprint;
   258  };
   259  
   260  
   261  /* These are normally undefined when false, which suits "#if" fine.
   262     But give them zero values so they can be used in plain C "if"s.  */
   263  #ifndef UDIV_PREINV_ALWAYS
   264  #define UDIV_PREINV_ALWAYS 0
   265  #endif
   266  #ifndef HAVE_NATIVE_mpn_divexact_1
   267  #define HAVE_NATIVE_mpn_divexact_1 0
   268  #endif
   269  #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
   270  #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
   271  #endif
   272  #ifndef HAVE_NATIVE_mpn_divrem_1
   273  #define HAVE_NATIVE_mpn_divrem_1 0
   274  #endif
   275  #ifndef HAVE_NATIVE_mpn_divrem_2
   276  #define HAVE_NATIVE_mpn_divrem_2 0
   277  #endif
   278  #ifndef HAVE_NATIVE_mpn_mod_1
   279  #define HAVE_NATIVE_mpn_mod_1 0
   280  #endif
   281  #ifndef HAVE_NATIVE_mpn_mod_1_1p
   282  #define HAVE_NATIVE_mpn_mod_1_1p 0
   283  #endif
   284  #ifndef HAVE_NATIVE_mpn_modexact_1_odd
   285  #define HAVE_NATIVE_mpn_modexact_1_odd 0
   286  #endif
   287  #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
   288  #define HAVE_NATIVE_mpn_preinv_divrem_1 0
   289  #endif
   290  #ifndef HAVE_NATIVE_mpn_preinv_mod_1
   291  #define HAVE_NATIVE_mpn_preinv_mod_1 0
   292  #endif
   293  #ifndef HAVE_NATIVE_mpn_sqr_basecase
   294  #define HAVE_NATIVE_mpn_sqr_basecase 0
   295  #endif
   296  
   297  
   298  #define MAX3(a,b,c)  MAX (MAX (a, b), c)
   299  
   300  mp_limb_t
   301  randlimb_norm (void)
   302  {
   303    mp_limb_t  n;
   304    mpn_random (&n, 1);
   305    n |= GMP_NUMB_HIGHBIT;
   306    return n;
   307  }
   308  
   309  #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
   310  
   311  mp_limb_t
   312  randlimb_half (void)
   313  {
   314    mp_limb_t  n;
   315    mpn_random (&n, 1);
   316    n &= GMP_NUMB_HALFMASK;
   317    n += (n==0);
   318    return n;
   319  }
   320  
   321  
   322  /* Add an entry to the end of the dat[] array, reallocing to make it bigger
   323     if necessary.  */
   324  void
   325  add_dat (mp_size_t size, double d)
   326  {
   327  #define ALLOCDAT_STEP  500
   328  
   329    ASSERT_ALWAYS (ndat <= allocdat);
   330  
   331    if (ndat == allocdat)
   332      {
   333        dat = (struct dat_t *) __gmp_allocate_or_reallocate
   334          (dat, allocdat * sizeof(dat[0]),
   335           (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
   336        allocdat += ALLOCDAT_STEP;
   337      }
   338  
   339    dat[ndat].size = size;
   340    dat[ndat].d = d;
   341    ndat++;
   342  }
   343  
   344  
   345  /* Return the threshold size based on the data accumulated. */
   346  mp_size_t
   347  analyze_dat (int final)
   348  {
   349    double  x, min_x;
   350    int     j, min_j;
   351  
   352    /* If the threshold is set at dat[0].size, any positive values are bad. */
   353    x = 0.0;
   354    for (j = 0; j < ndat; j++)
   355      if (dat[j].d > 0.0)
   356        x += dat[j].d;
   357  
   358    if (option_trace >= 2 && final)
   359      {
   360        printf ("\n");
   361        printf ("x is the sum of the badness from setting thresh at given size\n");
   362        printf ("  (minimum x is sought)\n");
   363        printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
   364      }
   365  
   366    min_x = x;
   367    min_j = 0;
   368  
   369  
   370    /* When stepping to the next dat[j].size, positive values are no longer
   371       bad (so subtracted), negative values become bad (so add the absolute
   372       value, meaning subtract). */
   373    for (j = 0; j < ndat; x -= dat[j].d, j++)
   374      {
   375        if (option_trace >= 2 && final)
   376          printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
   377  
   378        if (x < min_x)
   379          {
   380            min_x = x;
   381            min_j = j;
   382          }
   383      }
   384  
   385    return min_j;
   386  }
   387  
   388  
   389  /* Measuring for recompiled mpn/generic/div_qr_1.c,
   390   * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
   391  
   392  mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
   393  
   394  #if defined (__cplusplus)
   395  extern "C" {
   396  #endif
   397  
   398  mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
   399  mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
   400  void mpz_fac_ui_tune (mpz_ptr, unsigned long);
   401  
   402  #if defined (__cplusplus)
   403  }
   404  #endif
   405  
   406  double
   407  speed_mpn_mod_1_tune (struct speed_params *s)
   408  {
   409    SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
   410  }
   411  double
   412  speed_mpn_divrem_1_tune (struct speed_params *s)
   413  {
   414    SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
   415  }
   416  double
   417  speed_mpz_fac_ui_tune (struct speed_params *s)
   418  {
   419    SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
   420  }
   421  double
   422  speed_mpn_div_qr_1_tune (struct speed_params *s)
   423  {
   424    SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
   425  }
   426  
   427  double
   428  tuneup_measure (speed_function_t fun,
   429                  const struct param_t *param,
   430                  struct speed_params *s)
   431  {
   432    static struct param_t  dummy;
   433    double   t;
   434    TMP_DECL;
   435  
   436    if (! param)
   437      param = &dummy;
   438  
   439    s->size += param->size_extra;
   440  
   441    TMP_MARK;
   442    SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
   443    SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
   444  
   445    mpn_random (s->xp, s->size);
   446    mpn_random (s->yp, s->size);
   447  
   448    switch (param->data_high) {
   449    case DATA_HIGH_LT_R:
   450      s->xp[s->size-1] %= s->r;
   451      s->yp[s->size-1] %= s->r;
   452      break;
   453    case DATA_HIGH_GE_R:
   454      s->xp[s->size-1] |= s->r;
   455      s->yp[s->size-1] |= s->r;
   456      break;
   457    }
   458  
   459    t = speed_measure (fun, s);
   460  
   461    s->size -= param->size_extra;
   462  
   463    TMP_FREE;
   464    return t;
   465  }
   466  
   467  
   468  #define PRINT_WIDTH  31
   469  
   470  void
   471  print_define_start (const char *name)
   472  {
   473    printf ("#define %-*s  ", PRINT_WIDTH, name);
   474    if (option_trace)
   475      printf ("...\n");
   476  }
   477  
   478  void
   479  print_define_end_remark (const char *name, mp_size_t value, const char *remark)
   480  {
   481    if (option_trace)
   482      printf ("#define %-*s  ", PRINT_WIDTH, name);
   483  
   484    if (value == MP_SIZE_T_MAX)
   485      printf ("MP_SIZE_T_MAX");
   486    else
   487      printf ("%5ld", (long) value);
   488  
   489    if (remark != NULL)
   490      printf ("  /* %s */", remark);
   491    printf ("\n");
   492    fflush (stdout);
   493  }
   494  
   495  void
   496  print_define_end (const char *name, mp_size_t value)
   497  {
   498    const char  *remark;
   499    if (value == MP_SIZE_T_MAX)
   500      remark = "never";
   501    else if (value == 0)
   502      remark = "always";
   503    else
   504      remark = NULL;
   505    print_define_end_remark (name, value, remark);
   506  }
   507  
   508  void
   509  print_define (const char *name, mp_size_t value)
   510  {
   511    print_define_start (name);
   512    print_define_end (name, value);
   513  }
   514  
   515  void
   516  print_define_remark (const char *name, mp_size_t value, const char *remark)
   517  {
   518    print_define_start (name);
   519    print_define_end_remark (name, value, remark);
   520  }
   521  
   522  
   523  void
   524  one (mp_size_t *threshold, struct param_t *param)
   525  {
   526    int  since_positive, since_thresh_change;
   527    int  thresh_idx, new_thresh_idx;
   528  
   529  #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
   530  
   531    DEFAULT (param->function_fudge, 1.0);
   532    DEFAULT (param->function2, param->function);
   533    DEFAULT (param->step_factor, 0.01);  /* small steps by default */
   534    DEFAULT (param->step, 1);            /* small steps by default */
   535    DEFAULT (param->stop_since_change, 80);
   536    DEFAULT (param->stop_factor, 1.2);
   537    DEFAULT (param->min_size, 10);
   538    DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
   539  
   540    if (param->check_size != 0)
   541      {
   542        double   t1, t2;
   543        s.size = param->check_size;
   544  
   545        *threshold = s.size+1;
   546        t1 = tuneup_measure (param->function, param, &s);
   547  
   548        *threshold = s.size;
   549        t2 = tuneup_measure (param->function2, param, &s);
   550        if (t1 == -1.0 || t2 == -1.0)
   551          {
   552            printf ("Oops, can't run both functions at size %ld\n",
   553                    (long) s.size);
   554            abort ();
   555          }
   556        t1 *= param->function_fudge;
   557  
   558        /* ask that t2 is at least 4% below t1 */
   559        if (t1 < t2*1.04)
   560          {
   561            if (option_trace)
   562              printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
   563            *threshold = MP_SIZE_T_MAX;
   564            if (! param->noprint)
   565              print_define (param->name, *threshold);
   566            return;
   567          }
   568  
   569        if (option_trace >= 2)
   570          printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
   571                  (long) s.size, t1, t2);
   572      }
   573  
   574    if (! param->noprint || option_trace)
   575      print_define_start (param->name);
   576  
   577    ndat = 0;
   578    since_positive = 0;
   579    since_thresh_change = 0;
   580    thresh_idx = 0;
   581  
   582    if (option_trace >= 2)
   583      {
   584        printf ("             algorithm-A  algorithm-B   ratio  possible\n");
   585        printf ("              (seconds)    (seconds)    diff    thresh\n");
   586      }
   587  
   588    for (s.size = param->min_size;
   589         s.size < param->max_size;
   590         s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
   591      {
   592        double   ti, tiplus1, d;
   593  
   594        /*
   595          FIXME: check minimum size requirements are met, possibly by just
   596          checking for the -1 returns from the speed functions.
   597        */
   598  
   599        /* using method A at this size */
   600        *threshold = s.size+1;
   601        ti = tuneup_measure (param->function, param, &s);
   602        if (ti == -1.0)
   603          abort ();
   604        ti *= param->function_fudge;
   605  
   606        /* using method B at this size */
   607        *threshold = s.size;
   608        tiplus1 = tuneup_measure (param->function2, param, &s);
   609        if (tiplus1 == -1.0)
   610          abort ();
   611  
   612        /* Calculate the fraction by which the one or the other routine is
   613           slower.  */
   614        if (tiplus1 >= ti)
   615          d = (tiplus1 - ti) / tiplus1;  /* negative */
   616        else
   617          d = (tiplus1 - ti) / ti;       /* positive */
   618  
   619        add_dat (s.size, d);
   620  
   621        new_thresh_idx = analyze_dat (0);
   622  
   623        if (option_trace >= 2)
   624          printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
   625                  (long) s.size, ti, tiplus1, d,
   626                  ti > tiplus1 ? '#' : ' ',
   627                  (long) dat[new_thresh_idx].size);
   628  
   629        /* Stop if the last time method i was faster was more than a
   630           certain number of measurements ago.  */
   631  #define STOP_SINCE_POSITIVE  200
   632        if (d >= 0)
   633          since_positive = 0;
   634        else
   635          if (++since_positive > STOP_SINCE_POSITIVE)
   636            {
   637              if (option_trace >= 1)
   638                printf ("stopped due to since_positive (%d)\n",
   639                        STOP_SINCE_POSITIVE);
   640              break;
   641            }
   642  
   643        /* Stop if method A has become slower by a certain factor. */
   644        if (ti >= tiplus1 * param->stop_factor)
   645          {
   646            if (option_trace >= 1)
   647              printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
   648                      param->stop_factor);
   649            break;
   650          }
   651  
   652        /* Stop if the threshold implied hasn't changed in a certain
   653           number of measurements.  (It's this condition that usually
   654           stops the loop.) */
   655        if (thresh_idx != new_thresh_idx)
   656          since_thresh_change = 0, thresh_idx = new_thresh_idx;
   657        else
   658          if (++since_thresh_change > param->stop_since_change)
   659            {
   660              if (option_trace >= 1)
   661                printf ("stopped due to since_thresh_change (%d)\n",
   662                        param->stop_since_change);
   663              break;
   664            }
   665  
   666        /* Stop if the threshold implied is more than a certain number of
   667           measurements ago.  */
   668  #define STOP_SINCE_AFTER   500
   669        if (ndat - thresh_idx > STOP_SINCE_AFTER)
   670          {
   671            if (option_trace >= 1)
   672              printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
   673                      STOP_SINCE_AFTER);
   674            break;
   675          }
   676  
   677        /* Stop when the size limit is reached before the end of the
   678           crossover, but only show this as an error for >= the default max
   679           size.  FIXME: Maybe should make it a param choice whether this is
   680           an error.  */
   681        if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
   682          {
   683            fprintf (stderr, "%s\n", param->name);
   684            fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
   685                     (long) dat[0].size, (long) dat[ndat-1].size, ndat);
   686            fprintf (stderr, "    max size reached before end of crossover\n");
   687            break;
   688          }
   689      }
   690  
   691    if (option_trace >= 1)
   692      printf ("sizes %ld to %ld total %d measurements\n",
   693              (long) dat[0].size, (long) dat[ndat-1].size, ndat);
   694  
   695    *threshold = dat[analyze_dat (1)].size;
   696  
   697    if (param->min_is_always)
   698      {
   699        if (*threshold == param->min_size)
   700          *threshold = 0;
   701      }
   702  
   703    if (! param->noprint || option_trace)
   704      print_define_end (param->name, *threshold);
   705  }
   706  
   707  
   708  /* Special probing for the fft thresholds.  The size restrictions on the
   709     FFTs mean the graph of time vs size has a step effect.  See this for
   710     example using
   711  
   712         ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
   713         gnuplot foo.gnuplot
   714  
   715     The current approach is to compare routines at the midpoint of relevant
   716     steps.  Arguably a more sophisticated system of threshold data is wanted
   717     if this step effect remains. */
   718  
   719  struct fft_param_t {
   720    const char        *table_name;
   721    const char        *threshold_name;
   722    const char        *modf_threshold_name;
   723    mp_size_t         *p_threshold;
   724    mp_size_t         *p_modf_threshold;
   725    mp_size_t         first_size;
   726    mp_size_t         max_size;
   727    speed_function_t  function;
   728    speed_function_t  mul_modf_function;
   729    speed_function_t  mul_function;
   730    mp_size_t         sqr;
   731  };
   732  
   733  
   734  /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
   735     N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
   736     of 2^(k-1) bits. */
   737  
   738  mp_size_t
   739  fft_step_size (int k)
   740  {
   741    mp_size_t  step;
   742  
   743    step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
   744    step *= (mp_size_t) 1 << k;
   745  
   746    if (step <= 0)
   747      {
   748        printf ("Can't handle k=%d\n", k);
   749        abort ();
   750      }
   751  
   752    return step;
   753  }
   754  
   755  mp_size_t
   756  fft_next_size (mp_size_t pl, int k)
   757  {
   758    mp_size_t  m = fft_step_size (k);
   759  
   760  /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
   761  
   762    if (pl == 0 || (pl & (m-1)) != 0)
   763      pl = (pl | (m-1)) + 1;
   764  
   765  /*    printf (" %ld\n", pl); */
   766    return pl;
   767  }
   768  
   769  #define NMAX_DEFAULT 1000000
   770  #define MAX_REPS 25
   771  #define MIN_REPS 5
   772  
   773  static inline size_t
   774  mpn_mul_fft_lcm (size_t a, unsigned int k)
   775  {
   776    unsigned int l = k;
   777  
   778    while (a % 2 == 0 && k > 0)
   779      {
   780        a >>= 1;
   781        k--;
   782      }
   783    return a << l;
   784  }
   785  
   786  mp_size_t
   787  fftfill (mp_size_t pl, int k, int sqr)
   788  {
   789    mp_size_t maxLK;
   790    mp_bitcnt_t N, Nprime, nprime, M;
   791  
   792    N = pl * GMP_NUMB_BITS;
   793    M = N >> k;
   794  
   795    maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
   796  
   797    Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
   798    nprime = Nprime / GMP_NUMB_BITS;
   799    if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
   800      {
   801        size_t K2;
   802        for (;;)
   803  	{
   804  	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
   805  	  if ((nprime & (K2 - 1)) == 0)
   806  	    break;
   807  	  nprime = (nprime + K2 - 1) & -K2;
   808  	  Nprime = nprime * GMP_LIMB_BITS;
   809  	}
   810      }
   811    ASSERT_ALWAYS (nprime < pl);
   812  
   813    return Nprime;
   814  }
   815  
   816  static int
   817  compare_double (const void *ap, const void *bp)
   818  {
   819    double a = * (const double *) ap;
   820    double b = * (const double *) bp;
   821  
   822    if (a < b)
   823      return -1;
   824    else if (a > b)
   825      return 1;
   826    else
   827      return 0;
   828  }
   829  
   830  double
   831  median (double *times, int n)
   832  {
   833    qsort (times, n, sizeof (double), compare_double);
   834    return times[n/2];
   835  }
   836  
   837  #define FFT_CACHE_SIZE 25
   838  typedef struct fft_cache
   839  {
   840    mp_size_t n;
   841    double time;
   842  } fft_cache_t;
   843  
   844  fft_cache_t fft_cache[FFT_CACHE_SIZE];
   845  
   846  double
   847  cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
   848  		int n_measurements)
   849  {
   850    int i;
   851    double t, ttab[MAX_REPS];
   852  
   853    if (fft_cache[k].n == n)
   854      return fft_cache[k].time;
   855  
   856    for (i = 0; i < n_measurements; i++)
   857      {
   858        speed_starttime ();
   859        mpn_mul_fft (rp, n, ap, n, bp, n, k);
   860        ttab[i] = speed_endtime ();
   861      }
   862  
   863    t = median (ttab, n_measurements);
   864    fft_cache[k].n = n;
   865    fft_cache[k].time = t;
   866    return t;
   867  }
   868  
   869  #define INSERT_FFTTAB(idx, nval, kval)					\
   870    do {									\
   871      fft_tab[idx].n = nval;						\
   872      fft_tab[idx].k = kval;						\
   873      fft_tab[idx+1].n = (1 << 27) - 1;	/* sentinel, 27b wide field */	\
   874      fft_tab[idx+1].k = (1 <<  5) - 1;					\
   875    } while (0)
   876  
   877  int
   878  fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
   879  {
   880    mp_size_t n, n1, prev_n1;
   881    int k, best_k, last_best_k, kmax;
   882    int eff, prev_eff;
   883    double t0, t1;
   884    int n_measurements;
   885    mp_limb_t *ap, *bp, *rp;
   886    mp_size_t alloc;
   887    struct fft_table_nk *fft_tab;
   888  
   889    fft_tab = mpn_fft_table3[p->sqr];
   890  
   891    for (k = 0; k < FFT_CACHE_SIZE; k++)
   892      fft_cache[k].n = 0;
   893  
   894    if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
   895      {
   896        nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
   897      }
   898  
   899    if (print)
   900      printf ("#define %s%*s", p->table_name, 38, "");
   901  
   902    if (idx == 0)
   903      {
   904        INSERT_FFTTAB (0, nmin, initial_k);
   905  
   906        if (print)
   907  	{
   908  	  printf ("\\\n  { ");
   909  	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
   910  	}
   911  
   912        idx = 1;
   913      }
   914  
   915    ap = (mp_ptr) malloc (sizeof (mp_limb_t));
   916    if (p->sqr)
   917      bp = ap;
   918    else
   919      bp = (mp_ptr) malloc (sizeof (mp_limb_t));
   920    rp = (mp_ptr) malloc (sizeof (mp_limb_t));
   921    alloc = 1;
   922  
   923    /* Round n to comply to initial k value */
   924    n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
   925  
   926    n_measurements = (18 - initial_k) | 1;
   927    n_measurements = MAX (n_measurements, MIN_REPS);
   928    n_measurements = MIN (n_measurements, MAX_REPS);
   929  
   930    last_best_k = initial_k;
   931    best_k = initial_k;
   932  
   933    while (n < nmax)
   934      {
   935        int start_k, end_k;
   936  
   937        /* Assume the current best k is best until we hit its next FFT step.  */
   938        t0 = 99999;
   939  
   940        prev_n1 = n + 1;
   941  
   942        start_k = MAX (4, best_k - 4);
   943        end_k = MIN (24, best_k + 4);
   944        for (k = start_k; k <= end_k; k++)
   945  	{
   946            n1 = mpn_fft_next_size (prev_n1, k);
   947  
   948  	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
   949  
   950  	  if (eff < 70)		/* avoid measuring too slow fft:s */
   951  	    continue;
   952  
   953  	  if (n1 > alloc)
   954  	    {
   955  	      alloc = n1;
   956  	      if (p->sqr)
   957  		{
   958  		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
   959  		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
   960  		  ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
   961  		  mpn_random (ap, alloc);
   962  		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
   963  		}
   964  	      else
   965  		{
   966  		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
   967  		  bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
   968  		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
   969  		  ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
   970  		  mpn_random (ap, alloc);
   971  		  bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
   972  		  mpn_random (bp, alloc);
   973  		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
   974  		}
   975  	    }
   976  
   977  	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
   978  
   979  	  if (t1 * n_measurements > 0.3)
   980  	    n_measurements -= 2;
   981  	  n_measurements = MAX (n_measurements, MIN_REPS);
   982  
   983  	  if (t1 < t0)
   984  	    {
   985  	      best_k = k;
   986  	      t0 = t1;
   987  	    }
   988  	}
   989  
   990        n1 = mpn_fft_next_size (prev_n1, best_k);
   991  
   992        if (last_best_k != best_k)
   993  	{
   994  	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
   995  
   996  	  if (idx >= FFT_TABLE3_SIZE)
   997  	    {
   998  	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
   999  	      abort ();
  1000  	    }
  1001  	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
  1002  
  1003  	  if (print)
  1004  	    {
  1005  	      printf (", ");
  1006  	      if (idx % 4 == 0)
  1007  		printf ("\\\n    ");
  1008  	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
  1009  	    }
  1010  
  1011  	  if (option_trace >= 2)
  1012  	    {
  1013  	      printf ("{%lu,%u}\n", prev_n1, best_k);
  1014  	      fflush (stdout);
  1015  	    }
  1016  
  1017  	  last_best_k = best_k;
  1018  	  idx++;
  1019  	}
  1020  
  1021        for (;;)
  1022  	{
  1023  	  prev_n1 = n1;
  1024  	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
  1025  	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
  1026  	  eff = fftfill (n1, best_k, p->sqr);
  1027  
  1028  	  if (eff != prev_eff)
  1029  	    break;
  1030  	}
  1031  
  1032        n = prev_n1;
  1033      }
  1034  
  1035    kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
  1036    kmax = MIN (kmax, 25-1);
  1037    for (k = last_best_k + 1; k <= kmax; k++)
  1038      {
  1039        if (idx >= FFT_TABLE3_SIZE)
  1040  	{
  1041  	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
  1042  	  abort ();
  1043  	}
  1044        INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
  1045  
  1046        if (print)
  1047  	{
  1048  	  printf (", ");
  1049  	  if (idx % 4 == 0)
  1050  	    printf ("\\\n    ");
  1051  	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
  1052  	}
  1053  
  1054        idx++;
  1055      }
  1056  
  1057    if (print)
  1058      printf (" }\n");
  1059  
  1060    free (ap);
  1061    if (! p->sqr)
  1062      free (bp);
  1063    free (rp);
  1064  
  1065    return idx;
  1066  }
  1067  
  1068  void
  1069  fft (struct fft_param_t *p)
  1070  {
  1071    mp_size_t  size;
  1072    int        k, idx, initial_k;
  1073  
  1074    /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
  1075  
  1076  #if 1
  1077    {
  1078      /* Use plain one() mechanism, for some reasonable initial values of k.  The
  1079         advantage is that we don't depend on mpn_fft_table3, which can therefore
  1080         leave it completely uninitialized.  */
  1081  
  1082      static struct param_t param;
  1083      mp_size_t thres, best_thres;
  1084      int best_k;
  1085      char buf[20];
  1086  
  1087      best_thres = MP_SIZE_T_MAX;
  1088      best_k = -1;
  1089  
  1090      for (k = 5; k <= 7; k++)
  1091        {
  1092  	param.name = p->modf_threshold_name;
  1093  	param.min_size = 100;
  1094  	param.max_size = 2000;
  1095  	param.function  = p->mul_function;
  1096  	param.step_factor = 0.0;
  1097  	param.step = 4;
  1098  	param.function2 = p->mul_modf_function;
  1099  	param.noprint = 1;
  1100  	s.r = k;
  1101  	one (&thres, &param);
  1102  	if (thres < best_thres)
  1103  	  {
  1104  	    best_thres = thres;
  1105  	    best_k = k;
  1106  	  }
  1107        }
  1108  
  1109      *(p->p_modf_threshold) = best_thres;
  1110      sprintf (buf, "k = %d", best_k);
  1111      print_define_remark (p->modf_threshold_name, best_thres, buf);
  1112      initial_k = best_k;
  1113    }
  1114  #else
  1115    size = p->first_size;
  1116    for (;;)
  1117      {
  1118        double  tk, tm;
  1119  
  1120        size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
  1121        k = mpn_fft_best_k (size, p->sqr);
  1122  
  1123        if (size >= p->max_size)
  1124          break;
  1125  
  1126        s.size = size + fft_step_size (k) / 2;
  1127        s.r = k;
  1128        tk = tuneup_measure (p->mul_modf_function, NULL, &s);
  1129        if (tk == -1.0)
  1130          abort ();
  1131  
  1132        tm = tuneup_measure (p->mul_function, NULL, &s);
  1133        if (tm == -1.0)
  1134          abort ();
  1135  
  1136        if (option_trace >= 2)
  1137          printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
  1138                  (long) size,
  1139                  (long) size + fft_step_size (k) / 2, k, tk,
  1140                  (long) s.size, tm);
  1141  
  1142        if (tk < tm)
  1143          {
  1144  	  *p->p_modf_threshold = s.size;
  1145  	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
  1146  	  break;
  1147          }
  1148      }
  1149    initial_k = ?;
  1150  #endif
  1151  
  1152    /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
  1153  
  1154    idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
  1155    printf ("#define %s_SIZE %d\n", p->table_name, idx);
  1156  
  1157    /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
  1158  
  1159    size = 2 * *p->p_modf_threshold;	/* OK? */
  1160    for (;;)
  1161      {
  1162        double  tk, tm;
  1163        mp_size_t mulmod_size, mul_size;;
  1164  
  1165        if (size >= p->max_size)
  1166          break;
  1167  
  1168        mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
  1169        mul_size = (size + mulmod_size) / 2;	/* middle of step */
  1170  
  1171        s.size = mulmod_size;
  1172        tk = tuneup_measure (p->function, NULL, &s);
  1173        if (tk == -1.0)
  1174          abort ();
  1175  
  1176        s.size = mul_size;
  1177        tm = tuneup_measure (p->mul_function, NULL, &s);
  1178        if (tm == -1.0)
  1179          abort ();
  1180  
  1181        if (option_trace >= 2)
  1182          printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
  1183                  (long) size,
  1184                  (long) mulmod_size, tk,
  1185                  (long) mul_size, tm);
  1186  
  1187        size = mulmod_size;
  1188  
  1189        if (tk < tm)
  1190          {
  1191  	  *p->p_threshold = s.size;
  1192  	  print_define (p->threshold_name, *p->p_threshold);
  1193  	  break;
  1194          }
  1195      }
  1196  }
  1197  
  1198  
  1199  
  1200  /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
  1201     giving wrong results.  */
  1202  void
  1203  tune_mul_n (void)
  1204  {
  1205    static struct param_t  param;
  1206    mp_size_t next_toom_start;
  1207    int something_changed;
  1208  
  1209    param.function = speed_mpn_mul_n;
  1210  
  1211    param.name = "MUL_TOOM22_THRESHOLD";
  1212    param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
  1213    param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
  1214    one (&mul_toom22_threshold, &param);
  1215  
  1216    param.noprint = 1;
  1217  
  1218    /* Threshold sequence loop.  Disable functions that would be used in a very
  1219       narrow range, re-measuring things when that happens.  */
  1220    something_changed = 1;
  1221    while (something_changed)
  1222      {
  1223        something_changed = 0;
  1224  
  1225  	next_toom_start = mul_toom22_threshold;
  1226  
  1227  	if (mul_toom33_threshold != 0)
  1228  	  {
  1229  	    param.name = "MUL_TOOM33_THRESHOLD";
  1230  	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
  1231  	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
  1232  	    one (&mul_toom33_threshold, &param);
  1233  
  1234  	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
  1235  	      {
  1236  		mul_toom33_threshold = 0;
  1237  		something_changed = 1;
  1238  	      }
  1239  	  }
  1240  
  1241  	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
  1242  
  1243  	if (mul_toom44_threshold != 0)
  1244  	  {
  1245  	    param.name = "MUL_TOOM44_THRESHOLD";
  1246  	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
  1247  	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
  1248  	    one (&mul_toom44_threshold, &param);
  1249  
  1250  	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
  1251  	      {
  1252  		mul_toom44_threshold = 0;
  1253  		something_changed = 1;
  1254  	      }
  1255  	  }
  1256  
  1257  	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
  1258  
  1259  	if (mul_toom6h_threshold != 0)
  1260  	  {
  1261  	    param.name = "MUL_TOOM6H_THRESHOLD";
  1262  	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
  1263  	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
  1264  	    one (&mul_toom6h_threshold, &param);
  1265  
  1266  	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
  1267  	      {
  1268  		mul_toom6h_threshold = 0;
  1269  		something_changed = 1;
  1270  	      }
  1271  	  }
  1272  
  1273  	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
  1274  
  1275  	if (mul_toom8h_threshold != 0)
  1276  	  {
  1277  	    param.name = "MUL_TOOM8H_THRESHOLD";
  1278  	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
  1279  	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
  1280  	    one (&mul_toom8h_threshold, &param);
  1281  
  1282  	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
  1283  	      {
  1284  		mul_toom8h_threshold = 0;
  1285  		something_changed = 1;
  1286  	      }
  1287  	  }
  1288      }
  1289  
  1290      print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
  1291      print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
  1292      print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
  1293      print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
  1294  
  1295    /* disabled until tuned */
  1296    MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
  1297  }
  1298  
  1299  void
  1300  tune_mul (void)
  1301  {
  1302    static struct param_t  param;
  1303    mp_size_t thres;
  1304  
  1305    param.noprint = 1;
  1306  
  1307    param.function = speed_mpn_toom32_for_toom43_mul;
  1308    param.function2 = speed_mpn_toom43_for_toom32_mul;
  1309    param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
  1310    param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
  1311    one (&thres, &param);
  1312    mul_toom32_to_toom43_threshold = thres * 17 / 24;
  1313    print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
  1314  
  1315    param.function = speed_mpn_toom32_for_toom53_mul;
  1316    param.function2 = speed_mpn_toom53_for_toom32_mul;
  1317    param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
  1318    param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
  1319    one (&thres, &param);
  1320    mul_toom32_to_toom53_threshold = thres * 19 / 30;
  1321    print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
  1322  
  1323    param.function = speed_mpn_toom42_for_toom53_mul;
  1324    param.function2 = speed_mpn_toom53_for_toom42_mul;
  1325    param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
  1326    param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
  1327    one (&thres, &param);
  1328    mul_toom42_to_toom53_threshold = thres * 11 / 20;
  1329    print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
  1330  
  1331    param.function = speed_mpn_toom42_mul;
  1332    param.function2 = speed_mpn_toom63_mul;
  1333    param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
  1334    param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
  1335    one (&thres, &param);
  1336    mul_toom42_to_toom63_threshold = thres / 2;
  1337    print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
  1338  
  1339    /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
  1340    param.function = speed_mpn_toom43_for_toom54_mul;
  1341    param.function2 = speed_mpn_toom54_for_toom43_mul;
  1342    param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
  1343    param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
  1344    one (&thres, &param);
  1345    mul_toom43_to_toom54_threshold = thres * 5 / 6;
  1346    print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
  1347  }
  1348  
  1349  
  1350  void
  1351  tune_mullo (void)
  1352  {
  1353    static struct param_t  param;
  1354  
  1355    param.function = speed_mpn_mullo_n;
  1356  
  1357    param.name = "MULLO_BASECASE_THRESHOLD";
  1358    param.min_size = 1;
  1359    param.min_is_always = 1;
  1360    param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
  1361    param.stop_factor = 1.5;
  1362    param.noprint = 1;
  1363    one (&mullo_basecase_threshold, &param);
  1364  
  1365    param.name = "MULLO_DC_THRESHOLD";
  1366    param.min_size = 8;
  1367    param.min_is_always = 0;
  1368    param.max_size = 1000;
  1369    one (&mullo_dc_threshold, &param);
  1370  
  1371    if (mullo_basecase_threshold >= mullo_dc_threshold)
  1372      {
  1373        print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
  1374        print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
  1375      }
  1376    else
  1377      {
  1378        print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
  1379        print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
  1380      }
  1381  
  1382    if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
  1383      {
  1384        param.name = "MULLO_MUL_N_THRESHOLD";
  1385        param.min_size = mullo_dc_threshold;
  1386        param.max_size = 2 * mul_fft_threshold;
  1387        param.noprint = 0;
  1388        param.step_factor = 0.03;
  1389        one (&mullo_mul_n_threshold, &param);
  1390      }
  1391    else
  1392      print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
  1393  			 "without FFT use mullo forever");
  1394  }
  1395  
  1396  void
  1397  tune_sqrlo (void)
  1398  {
  1399    static struct param_t  param;
  1400  
  1401    param.function = speed_mpn_sqrlo;
  1402  
  1403    param.name = "SQRLO_BASECASE_THRESHOLD";
  1404    param.min_size = 1;
  1405    param.min_is_always = 1;
  1406    param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
  1407    param.stop_factor = 1.5;
  1408    param.noprint = 1;
  1409    one (&sqrlo_basecase_threshold, &param);
  1410  
  1411    param.name = "SQRLO_DC_THRESHOLD";
  1412    param.min_size = 8;
  1413    param.min_is_always = 0;
  1414    param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
  1415    one (&sqrlo_dc_threshold, &param);
  1416  
  1417    if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
  1418      {
  1419        print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
  1420        print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
  1421      }
  1422    else
  1423      {
  1424        print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
  1425        print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
  1426      }
  1427  
  1428    if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
  1429      {
  1430        param.name = "SQRLO_SQR_THRESHOLD";
  1431        param.min_size = sqrlo_dc_threshold;
  1432        param.max_size = 2 * sqr_fft_threshold;
  1433        param.noprint = 0;
  1434        param.step_factor = 0.03;
  1435        one (&sqrlo_sqr_threshold, &param);
  1436      }
  1437    else
  1438      print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
  1439  			 "without FFT use sqrlo forever");
  1440  }
  1441  
  1442  void
  1443  tune_mulmid (void)
  1444  {
  1445    static struct param_t  param;
  1446  
  1447    param.name = "MULMID_TOOM42_THRESHOLD";
  1448    param.function = speed_mpn_mulmid_n;
  1449    param.min_size = 4;
  1450    param.max_size = 100;
  1451    one (&mulmid_toom42_threshold, &param);
  1452  }
  1453  
  1454  void
  1455  tune_mulmod_bnm1 (void)
  1456  {
  1457    static struct param_t  param;
  1458  
  1459    param.name = "MULMOD_BNM1_THRESHOLD";
  1460    param.function = speed_mpn_mulmod_bnm1;
  1461    param.min_size = 4;
  1462    param.max_size = 100;
  1463    one (&mulmod_bnm1_threshold, &param);
  1464  }
  1465  
  1466  void
  1467  tune_sqrmod_bnm1 (void)
  1468  {
  1469    static struct param_t  param;
  1470  
  1471    param.name = "SQRMOD_BNM1_THRESHOLD";
  1472    param.function = speed_mpn_sqrmod_bnm1;
  1473    param.min_size = 4;
  1474    param.max_size = 100;
  1475    one (&sqrmod_bnm1_threshold, &param);
  1476  }
  1477  
  1478  
  1479  /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
  1480     is faster only at size==2 then we don't want to bother with extra code
  1481     just for that.  Start karatsuba from 4 same as MUL above.  */
  1482  
  1483  void
  1484  tune_sqr (void)
  1485  {
  1486    /* disabled until tuned */
  1487    SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
  1488  
  1489    if (HAVE_NATIVE_mpn_sqr_basecase)
  1490      {
  1491        print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
  1492        sqr_basecase_threshold = 0;
  1493      }
  1494    else
  1495      {
  1496        static struct param_t  param;
  1497        param.name = "SQR_BASECASE_THRESHOLD";
  1498        param.function = speed_mpn_sqr;
  1499        param.min_size = 3;
  1500        param.min_is_always = 1;
  1501        param.max_size = TUNE_SQR_TOOM2_MAX;
  1502        param.noprint = 1;
  1503        one (&sqr_basecase_threshold, &param);
  1504      }
  1505  
  1506    {
  1507      static struct param_t  param;
  1508      param.name = "SQR_TOOM2_THRESHOLD";
  1509      param.function = speed_mpn_sqr;
  1510      param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
  1511      param.max_size = TUNE_SQR_TOOM2_MAX;
  1512      param.noprint = 1;
  1513      one (&sqr_toom2_threshold, &param);
  1514  
  1515      if (! HAVE_NATIVE_mpn_sqr_basecase
  1516          && sqr_toom2_threshold < sqr_basecase_threshold)
  1517        {
  1518          /* Karatsuba becomes faster than mul_basecase before
  1519             sqr_basecase does.  Arrange for the expression
  1520             "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
  1521             selects mpn_sqr_basecase in mpn_sqr to be false, by setting
  1522             SQR_TOOM2_THRESHOLD to zero, making
  1523             SQR_BASECASE_THRESHOLD the toom2 threshold.  */
  1524  
  1525          sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
  1526          SQR_TOOM2_THRESHOLD = 0;
  1527  
  1528          print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
  1529                               "toom2");
  1530          print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
  1531                               "never sqr_basecase");
  1532        }
  1533      else
  1534        {
  1535          if (! HAVE_NATIVE_mpn_sqr_basecase)
  1536            print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
  1537          print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
  1538        }
  1539    }
  1540  
  1541    {
  1542      static struct param_t  param;
  1543      mp_size_t next_toom_start;
  1544      int something_changed;
  1545  
  1546      param.function = speed_mpn_sqr;
  1547      param.noprint = 1;
  1548  
  1549    /* Threshold sequence loop.  Disable functions that would be used in a very
  1550       narrow range, re-measuring things when that happens.  */
  1551      something_changed = 1;
  1552      while (something_changed)
  1553        {
  1554  	something_changed = 0;
  1555  
  1556  	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
  1557  
  1558  	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
  1559  	param.name = "SQR_TOOM3_THRESHOLD";
  1560  	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
  1561  	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
  1562  	one (&sqr_toom3_threshold, &param);
  1563  
  1564  	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
  1565  
  1566  	if (sqr_toom4_threshold != 0)
  1567  	  {
  1568  	    param.name = "SQR_TOOM4_THRESHOLD";
  1569  	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
  1570  	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
  1571  	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
  1572  	    one (&sqr_toom4_threshold, &param);
  1573  
  1574  	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
  1575  	      {
  1576  		sqr_toom4_threshold = 0;
  1577  		something_changed = 1;
  1578  	      }
  1579  	  }
  1580  
  1581  	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
  1582  
  1583  	if (sqr_toom6_threshold != 0)
  1584  	  {
  1585  	    param.name = "SQR_TOOM6_THRESHOLD";
  1586  	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
  1587  	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
  1588  	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
  1589  	    one (&sqr_toom6_threshold, &param);
  1590  
  1591  	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
  1592  	      {
  1593  		sqr_toom6_threshold = 0;
  1594  		something_changed = 1;
  1595  	      }
  1596  	  }
  1597  
  1598  	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
  1599  
  1600  	if (sqr_toom8_threshold != 0)
  1601  	  {
  1602  	    param.name = "SQR_TOOM8_THRESHOLD";
  1603  	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
  1604  	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
  1605  	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
  1606  	    one (&sqr_toom8_threshold, &param);
  1607  
  1608  	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
  1609  	      {
  1610  		sqr_toom8_threshold = 0;
  1611  		something_changed = 1;
  1612  	      }
  1613  	  }
  1614        }
  1615  
  1616      print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
  1617      print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
  1618      print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
  1619      print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
  1620    }
  1621  }
  1622  
  1623  
  1624  void
  1625  tune_dc_div (void)
  1626  {
  1627    s.r = 0;		/* clear to make speed function do 2n/n */
  1628    {
  1629      static struct param_t  param;
  1630      param.name = "DC_DIV_QR_THRESHOLD";
  1631      param.function = speed_mpn_sbpi1_div_qr;
  1632      param.function2 = speed_mpn_dcpi1_div_qr;
  1633      param.min_size = 6;
  1634      one (&dc_div_qr_threshold, &param);
  1635    }
  1636    {
  1637      static struct param_t  param;
  1638      param.name = "DC_DIVAPPR_Q_THRESHOLD";
  1639      param.function = speed_mpn_sbpi1_divappr_q;
  1640      param.function2 = speed_mpn_dcpi1_divappr_q;
  1641      param.min_size = 6;
  1642      one (&dc_divappr_q_threshold, &param);
  1643    }
  1644  }
  1645  
  1646  static double
  1647  speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
  1648  {
  1649    if (s->size < DC_DIV_QR_THRESHOLD)
  1650      return speed_mpn_sbpi1_div_qr (s);
  1651    else
  1652      return speed_mpn_dcpi1_div_qr (s);
  1653  }
  1654  
  1655  void
  1656  tune_mu_div (void)
  1657  {
  1658    s.r = 0;		/* clear to make speed function do 2n/n */
  1659    {
  1660      static struct param_t  param;
  1661      param.name = "MU_DIV_QR_THRESHOLD";
  1662      param.function = speed_mpn_dcpi1_div_qr;
  1663      param.function2 = speed_mpn_mu_div_qr;
  1664      param.min_size = mul_toom22_threshold;
  1665      param.max_size = 5000;
  1666      param.step_factor = 0.02;
  1667      one (&mu_div_qr_threshold, &param);
  1668    }
  1669    {
  1670      static struct param_t  param;
  1671      param.name = "MU_DIVAPPR_Q_THRESHOLD";
  1672      param.function = speed_mpn_dcpi1_divappr_q;
  1673      param.function2 = speed_mpn_mu_divappr_q;
  1674      param.min_size = mul_toom22_threshold;
  1675      param.max_size = 5000;
  1676      param.step_factor = 0.02;
  1677      one (&mu_divappr_q_threshold, &param);
  1678    }
  1679    {
  1680      static struct param_t  param;
  1681      param.name = "MUPI_DIV_QR_THRESHOLD";
  1682      param.function = speed_mpn_sbordcpi1_div_qr;
  1683      param.function2 = speed_mpn_mupi_div_qr;
  1684      param.min_size = 6;
  1685      param.min_is_always = 1;
  1686      param.max_size = 1000;
  1687      param.step_factor = 0.02;
  1688      one (&mupi_div_qr_threshold, &param);
  1689    }
  1690  }
  1691  
  1692  void
  1693  tune_dc_bdiv (void)
  1694  {
  1695    s.r = 0;		/* clear to make speed function do 2n/n*/
  1696    {
  1697      static struct param_t  param;
  1698      param.name = "DC_BDIV_QR_THRESHOLD";
  1699      param.function = speed_mpn_sbpi1_bdiv_qr;
  1700      param.function2 = speed_mpn_dcpi1_bdiv_qr;
  1701      param.min_size = 4;
  1702      one (&dc_bdiv_qr_threshold, &param);
  1703    }
  1704    {
  1705      static struct param_t  param;
  1706      param.name = "DC_BDIV_Q_THRESHOLD";
  1707      param.function = speed_mpn_sbpi1_bdiv_q;
  1708      param.function2 = speed_mpn_dcpi1_bdiv_q;
  1709      param.min_size = 4;
  1710      one (&dc_bdiv_q_threshold, &param);
  1711    }
  1712  }
  1713  
  1714  void
  1715  tune_mu_bdiv (void)
  1716  {
  1717    s.r = 0;		/* clear to make speed function do 2n/n*/
  1718    {
  1719      static struct param_t  param;
  1720      param.name = "MU_BDIV_QR_THRESHOLD";
  1721      param.function = speed_mpn_dcpi1_bdiv_qr;
  1722      param.function2 = speed_mpn_mu_bdiv_qr;
  1723      param.min_size = dc_bdiv_qr_threshold;
  1724      param.max_size = 5000;
  1725      param.step_factor = 0.02;
  1726      one (&mu_bdiv_qr_threshold, &param);
  1727    }
  1728    {
  1729      static struct param_t  param;
  1730      param.name = "MU_BDIV_Q_THRESHOLD";
  1731      param.function = speed_mpn_dcpi1_bdiv_q;
  1732      param.function2 = speed_mpn_mu_bdiv_q;
  1733      param.min_size = dc_bdiv_q_threshold;
  1734      param.max_size = 5000;
  1735      param.step_factor = 0.02;
  1736      one (&mu_bdiv_q_threshold, &param);
  1737    }
  1738  }
  1739  
  1740  void
  1741  tune_invertappr (void)
  1742  {
  1743    static struct param_t  param;
  1744  
  1745    param.function = speed_mpn_ni_invertappr;
  1746    param.name = "INV_MULMOD_BNM1_THRESHOLD";
  1747    param.min_size = 5;
  1748    one (&inv_mulmod_bnm1_threshold, &param);
  1749  
  1750    param.function = speed_mpn_invertappr;
  1751    param.name = "INV_NEWTON_THRESHOLD";
  1752    param.min_size = 5;
  1753    one (&inv_newton_threshold, &param);
  1754  }
  1755  
  1756  void
  1757  tune_invert (void)
  1758  {
  1759    static struct param_t  param;
  1760  
  1761    param.function = speed_mpn_invert;
  1762    param.name = "INV_APPR_THRESHOLD";
  1763    param.min_size = 5;
  1764    one (&inv_appr_threshold, &param);
  1765  }
  1766  
  1767  void
  1768  tune_binvert (void)
  1769  {
  1770    static struct param_t  param;
  1771  
  1772    param.function = speed_mpn_binvert;
  1773    param.name = "BINV_NEWTON_THRESHOLD";
  1774    param.min_size = 8;		/* pointless with smaller operands */
  1775    one (&binv_newton_threshold, &param);
  1776  }
  1777  
  1778  void
  1779  tune_redc (void)
  1780  {
  1781  #define TUNE_REDC_2_MAX 100
  1782  #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
  1783  #define WANT_REDC_2 1
  1784  #endif
  1785  
  1786  #if WANT_REDC_2
  1787    {
  1788      static struct param_t  param;
  1789      param.name = "REDC_1_TO_REDC_2_THRESHOLD";
  1790      param.function = speed_mpn_redc_1;
  1791      param.function2 = speed_mpn_redc_2;
  1792      param.min_size = 1;
  1793      param.min_is_always = 1;
  1794      param.max_size = TUNE_REDC_2_MAX;
  1795      param.noprint = 1;
  1796      param.stop_factor = 1.5;
  1797      one (&redc_1_to_redc_2_threshold, &param);
  1798    }
  1799    {
  1800      static struct param_t  param;
  1801      param.name = "REDC_2_TO_REDC_N_THRESHOLD";
  1802      param.function = speed_mpn_redc_2;
  1803      param.function2 = speed_mpn_redc_n;
  1804      param.min_size = 16;
  1805      param.noprint = 1;
  1806      one (&redc_2_to_redc_n_threshold, &param);
  1807    }
  1808    if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
  1809      {
  1810        redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
  1811  
  1812        /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
  1813  	 REDC_1_TO_REDC_2_THRESHOLD.  */
  1814        {
  1815  	static struct param_t  param;
  1816  	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
  1817  	param.function = speed_mpn_redc_1;
  1818  	param.function2 = speed_mpn_redc_n;
  1819  	param.min_size = 16;
  1820  	param.noprint = 1;
  1821  	one (&redc_1_to_redc_2_threshold, &param);
  1822        }
  1823      }
  1824    print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
  1825    print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
  1826  #else
  1827    {
  1828      static struct param_t  param;
  1829      param.name = "REDC_1_TO_REDC_N_THRESHOLD";
  1830      param.function = speed_mpn_redc_1;
  1831      param.function2 = speed_mpn_redc_n;
  1832      param.min_size = 16;
  1833      one (&redc_1_to_redc_n_threshold, &param);
  1834    }
  1835  #endif
  1836  }
  1837  
  1838  void
  1839  tune_matrix22_mul (void)
  1840  {
  1841    static struct param_t  param;
  1842    param.name = "MATRIX22_STRASSEN_THRESHOLD";
  1843    param.function = speed_mpn_matrix22_mul;
  1844    param.min_size = 2;
  1845    one (&matrix22_strassen_threshold, &param);
  1846  }
  1847  
  1848  void
  1849  tune_hgcd (void)
  1850  {
  1851    static struct param_t  param;
  1852    param.name = "HGCD_THRESHOLD";
  1853    param.function = speed_mpn_hgcd;
  1854    /* We seem to get strange results for small sizes */
  1855    param.min_size = 30;
  1856    one (&hgcd_threshold, &param);
  1857  }
  1858  
  1859  void
  1860  tune_hgcd_appr (void)
  1861  {
  1862    static struct param_t  param;
  1863    param.name = "HGCD_APPR_THRESHOLD";
  1864    param.function = speed_mpn_hgcd_appr;
  1865    /* We seem to get strange results for small sizes */
  1866    param.min_size = 50;
  1867    param.stop_since_change = 150;
  1868    one (&hgcd_appr_threshold, &param);
  1869  }
  1870  
  1871  void
  1872  tune_hgcd_reduce (void)
  1873  {
  1874    static struct param_t  param;
  1875    param.name = "HGCD_REDUCE_THRESHOLD";
  1876    param.function = speed_mpn_hgcd_reduce;
  1877    param.min_size = 30;
  1878    param.max_size = 7000;
  1879    param.step_factor = 0.04;
  1880    one (&hgcd_reduce_threshold, &param);
  1881  }
  1882  
  1883  void
  1884  tune_gcd_dc (void)
  1885  {
  1886    static struct param_t  param;
  1887    param.name = "GCD_DC_THRESHOLD";
  1888    param.function = speed_mpn_gcd;
  1889    param.min_size = hgcd_threshold;
  1890    param.max_size = 3000;
  1891    param.step_factor = 0.02;
  1892    one (&gcd_dc_threshold, &param);
  1893  }
  1894  
  1895  void
  1896  tune_gcdext_dc (void)
  1897  {
  1898    static struct param_t  param;
  1899    param.name = "GCDEXT_DC_THRESHOLD";
  1900    param.function = speed_mpn_gcdext;
  1901    param.min_size = hgcd_threshold;
  1902    param.max_size = 3000;
  1903    param.step_factor = 0.02;
  1904    one (&gcdext_dc_threshold, &param);
  1905  }
  1906  
  1907  /* In tune_powm_sec we compute the table used by the win_size function.  The
  1908     cutoff points are in exponent bits, disregarding other operand sizes.  It is
  1909     not possible to use the one framework since it currently uses a granularity
  1910     of full limbs.
  1911  */
  1912  
  1913  /* This win_size replaces the variant in the powm code, allowing us to
  1914     control k in the k-ary algorithms.  */
  1915  int winsize;
  1916  int
  1917  win_size (mp_bitcnt_t eb)
  1918  {
  1919    return winsize;
  1920  }
  1921  
  1922  void
  1923  tune_powm_sec (void)
  1924  {
  1925    mp_size_t n;
  1926    int k, i;
  1927    mp_size_t itch;
  1928    mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
  1929    const int n_max = 3000 / GMP_NUMB_BITS;
  1930    const int n_measurements = 5;
  1931    mp_ptr rp, bp, ep, mp, tp;
  1932    double ttab[n_measurements], tk, tkp1;
  1933    TMP_DECL;
  1934    TMP_MARK;
  1935  
  1936    possible_nbits_cutoff = 0;
  1937  
  1938    k = 1;
  1939  
  1940    winsize = 10;			/* the itch function needs this */
  1941    itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
  1942  
  1943    rp = TMP_ALLOC_LIMBS (n_max);
  1944    bp = TMP_ALLOC_LIMBS (n_max);
  1945    ep = TMP_ALLOC_LIMBS (n_max);
  1946    mp = TMP_ALLOC_LIMBS (n_max);
  1947    tp = TMP_ALLOC_LIMBS (itch);
  1948  
  1949    mpn_random (bp, n_max);
  1950    mpn_random (mp, n_max);
  1951    mp[0] |= 1;
  1952  
  1953  /* How about taking the M operand size into account?
  1954  
  1955     An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
  1956     B = O(M)).
  1957  
  1958     Using k-ary and no sliding window, the precomputation will need time
  1959     O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
  1960     O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
  1961  
  1962     An operation R=powm_sec(B,E,N) will take time like powm.
  1963  
  1964     Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
  1965     main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
  1966     O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
  1967     table reads, respectively.  */
  1968  
  1969    printf ("#define POWM_SEC_TABLE  ");
  1970  
  1971    /* For nbits == 1, we should always use k == 1, so no need to tune
  1972       that. Starting with nbits == 2 also ensure that nbits always is
  1973       larger than the windowsize k+1. */
  1974    for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
  1975      {
  1976        n = (nbits - 1) / GMP_NUMB_BITS + 1;
  1977  
  1978        /* Generate E such that sliding-window for k and k+1 works equally
  1979  	 well/poorly (but sliding is not used in powm_sec, of course). */
  1980        for (i = 0; i < n; i++)
  1981  	ep[i] = ~CNST_LIMB(0);
  1982  
  1983        winsize = k;
  1984        for (i = 0; i < n_measurements; i++)
  1985  	{
  1986  	  speed_starttime ();
  1987  	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
  1988  	  ttab[i] = speed_endtime ();
  1989  	}
  1990        tk = median (ttab, n_measurements);
  1991  
  1992        winsize = k + 1;
  1993        speed_starttime ();
  1994        for (i = 0; i < n_measurements; i++)
  1995  	{
  1996  	  speed_starttime ();
  1997  	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
  1998  	  ttab[i] = speed_endtime ();
  1999  	}
  2000        tkp1 = median (ttab, n_measurements);
  2001  /*
  2002        printf ("testing: %ld, %d", nbits, k, ep[n-1]);
  2003        printf ("   %10.5f  %10.5f\n", tk, tkp1);
  2004  */
  2005        if (tkp1 < tk)
  2006  	{
  2007  	  if (possible_nbits_cutoff)
  2008  	    {
  2009  	      /* Two consecutive sizes indicate k increase, obey.  */
  2010  
  2011  	      /* Must always have x[k] >= k */
  2012  	      ASSERT_ALWAYS (possible_nbits_cutoff >= k);
  2013  
  2014  	      if (k > 1)
  2015  		printf (",");
  2016  	      printf ("%ld", (long) possible_nbits_cutoff);
  2017  	      k++;
  2018  	      possible_nbits_cutoff = 0;
  2019  	    }
  2020  	  else
  2021  	    {
  2022  	      /* One measurement indicate k increase, save nbits for further
  2023  		 consideration.  */
  2024  	      /* The new larger k gets used for sizes > the cutoff
  2025  		 value, hence the cutoff should be one less than the
  2026  		 smallest size where it gives a speedup. */
  2027  	      possible_nbits_cutoff = nbits - 1;
  2028  	    }
  2029  	}
  2030        else
  2031  	possible_nbits_cutoff = 0;
  2032  
  2033        nbits_next = nbits * 65 / 64;
  2034        nbits = nbits_next + (nbits_next == nbits);
  2035      }
  2036    printf ("\n");
  2037    TMP_FREE;
  2038  }
  2039  
  2040  
  2041  /* size_extra==1 reflects the fact that with high<divisor one division is
  2042     always skipped.  Forcing high<divisor while testing ensures consistency
  2043     while stepping through sizes, ie. that size-1 divides will be done each
  2044     time.
  2045  
  2046     min_size==2 and min_is_always are used so that if plain division is only
  2047     better at size==1 then don't bother including that code just for that
  2048     case, instead go with preinv always and get a size saving.  */
  2049  
  2050  #define DIV_1_PARAMS                    \
  2051    param.check_size = 256;               \
  2052    param.min_size = 2;                   \
  2053    param.min_is_always = 1;              \
  2054    param.data_high = DATA_HIGH_LT_R;     \
  2055    param.size_extra = 1;                 \
  2056    param.stop_factor = 2.0;
  2057  
  2058  
  2059  double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
  2060  
  2061  void
  2062  tune_divrem_1 (void)
  2063  {
  2064    /* plain version by default */
  2065    tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
  2066  
  2067    /* No support for tuning native assembler code, do that by hand and put
  2068       the results in the .asm file, there's no need for such thresholds to
  2069       appear in gmp-mparam.h.  */
  2070    if (HAVE_NATIVE_mpn_divrem_1)
  2071      return;
  2072  
  2073    if (GMP_NAIL_BITS != 0)
  2074      {
  2075        print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
  2076                             "no preinv with nails");
  2077        print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
  2078                             "no preinv with nails");
  2079        return;
  2080      }
  2081  
  2082    if (UDIV_PREINV_ALWAYS)
  2083      {
  2084        print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
  2085        print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
  2086        return;
  2087      }
  2088  
  2089    tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
  2090  
  2091    /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
  2092       a bit out for the fractional part, but that's too bad, the integer part
  2093       is more important. */
  2094    {
  2095      static struct param_t  param;
  2096      param.name = "DIVREM_1_NORM_THRESHOLD";
  2097      DIV_1_PARAMS;
  2098      s.r = randlimb_norm ();
  2099      param.function = speed_mpn_divrem_1_tune;
  2100      one (&divrem_1_norm_threshold, &param);
  2101    }
  2102    {
  2103      static struct param_t  param;
  2104      param.name = "DIVREM_1_UNNORM_THRESHOLD";
  2105      DIV_1_PARAMS;
  2106      s.r = randlimb_half ();
  2107      param.function = speed_mpn_divrem_1_tune;
  2108      one (&divrem_1_unnorm_threshold, &param);
  2109    }
  2110  }
  2111  
  2112  void
  2113  tune_div_qr_1 (void)
  2114  {
  2115    static struct param_t  param;
  2116    double            t1, t2;
  2117  
  2118    if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
  2119      {
  2120        static struct param_t  param;
  2121        double   t1, t2;
  2122  
  2123        s.size = 10;
  2124        s.r = randlimb_norm ();
  2125  
  2126        t1 = tuneup_measure (speed_mpn_div_qr_1n_pi1_1, &param, &s);
  2127        t2 = tuneup_measure (speed_mpn_div_qr_1n_pi1_2, &param, &s);
  2128  
  2129        if (t1 == -1.0 || t2 == -1.0)
  2130  	{
  2131  	  printf ("Oops, can't measure all mpn_div_qr_1n_pi1 methods at %ld\n",
  2132  		  (long) s.size);
  2133  	  abort ();
  2134  	}
  2135        div_qr_1n_pi1_method = (t1 < t2) ? 1 : 2;
  2136        print_define ("DIV_QR_1N_PI1_METHOD", div_qr_1n_pi1_method);
  2137      }
  2138  
  2139    {
  2140      static struct param_t  param;
  2141      param.name = "DIV_QR_1_NORM_THRESHOLD";
  2142      DIV_1_PARAMS;
  2143      param.min_size = 1;
  2144      param.min_is_always = 0;
  2145      s.r = randlimb_norm ();
  2146      param.function = speed_mpn_div_qr_1_tune;
  2147      one (&div_qr_1_norm_threshold, &param);
  2148    }
  2149    {
  2150      static struct param_t  param;
  2151      param.name = "DIV_QR_1_UNNORM_THRESHOLD";
  2152      DIV_1_PARAMS;
  2153      param.min_size = 1;
  2154      param.min_is_always = 0;
  2155      s.r = randlimb_half();
  2156      param.function = speed_mpn_div_qr_1_tune;
  2157      one (&div_qr_1_unnorm_threshold, &param);
  2158    }
  2159  }
  2160  
  2161  
  2162  void
  2163  tune_mod_1 (void)
  2164  {
  2165    /* No support for tuning native assembler code, do that by hand and put
  2166       the results in the .asm file, there's no need for such thresholds to
  2167       appear in gmp-mparam.h.  */
  2168    if (HAVE_NATIVE_mpn_mod_1)
  2169      return;
  2170  
  2171    if (GMP_NAIL_BITS != 0)
  2172      {
  2173        print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
  2174                             "no preinv with nails");
  2175        print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
  2176                             "no preinv with nails");
  2177        return;
  2178      }
  2179  
  2180    if (!HAVE_NATIVE_mpn_mod_1_1p)
  2181      {
  2182        static struct param_t  param;
  2183        double   t1, t2;
  2184  
  2185        s.size = 10;
  2186        s.r = randlimb_half ();
  2187  
  2188        t1 = tuneup_measure (speed_mpn_mod_1_1_1, &param, &s);
  2189        t2 = tuneup_measure (speed_mpn_mod_1_1_2, &param, &s);
  2190  
  2191        if (t1 == -1.0 || t2 == -1.0)
  2192  	{
  2193  	  printf ("Oops, can't measure all mpn_mod_1_1 methods at %ld\n",
  2194  		  (long) s.size);
  2195  	  abort ();
  2196  	}
  2197        mod_1_1p_method = (t1 < t2) ? 1 : 2;
  2198        print_define ("MOD_1_1P_METHOD", mod_1_1p_method);
  2199      }
  2200  
  2201    if (UDIV_PREINV_ALWAYS)
  2202      {
  2203        print_define ("MOD_1_NORM_THRESHOLD", 0L);
  2204        print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
  2205      }
  2206    else
  2207      {
  2208        {
  2209  	static struct param_t  param;
  2210  	param.name = "MOD_1_NORM_THRESHOLD";
  2211  	DIV_1_PARAMS;
  2212  	s.r = randlimb_norm ();
  2213  	param.function = speed_mpn_mod_1_tune;
  2214  	one (&mod_1_norm_threshold, &param);
  2215        }
  2216        {
  2217  	static struct param_t  param;
  2218  	param.name = "MOD_1_UNNORM_THRESHOLD";
  2219  	DIV_1_PARAMS;
  2220  	s.r = randlimb_half ();
  2221  	param.function = speed_mpn_mod_1_tune;
  2222  	one (&mod_1_unnorm_threshold, &param);
  2223        }
  2224      }
  2225    {
  2226      static struct param_t  param;
  2227  
  2228      param.check_size = 256;
  2229  
  2230      s.r = randlimb_norm ();
  2231      param.function = speed_mpn_mod_1_tune;
  2232  
  2233      param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
  2234      param.min_size = 2;
  2235      one (&mod_1n_to_mod_1_1_threshold, &param);
  2236    }
  2237  
  2238    {
  2239      static struct param_t  param;
  2240  
  2241      param.check_size = 256;
  2242      s.r = randlimb_half ();
  2243      param.noprint = 1;
  2244  
  2245      param.function = speed_mpn_mod_1_1;
  2246      param.function2 = speed_mpn_mod_1_2;
  2247      param.min_is_always = 1;
  2248      param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
  2249      param.min_size = 2;
  2250      one (&mod_1_1_to_mod_1_2_threshold, &param);
  2251  
  2252      param.function = speed_mpn_mod_1_2;
  2253      param.function2 = speed_mpn_mod_1_4;
  2254      param.min_is_always = 1;
  2255      param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
  2256      param.min_size = 1;
  2257      one (&mod_1_2_to_mod_1_4_threshold, &param);
  2258  
  2259      if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
  2260        {
  2261  	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
  2262  	mod_1_2_to_mod_1_4_threshold = 0;
  2263  
  2264  	param.function = speed_mpn_mod_1_1;
  2265  	param.function2 = speed_mpn_mod_1_4;
  2266  	param.min_is_always = 1;
  2267  	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
  2268  	param.min_size = 2;
  2269  	one (&mod_1_1_to_mod_1_2_threshold, &param);
  2270        }
  2271  
  2272      param.function = speed_mpn_mod_1_tune;
  2273      param.function2 = NULL;
  2274      param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
  2275      param.min_size = 2;
  2276      param.min_is_always = 0;
  2277      one (&mod_1u_to_mod_1_1_threshold, &param);
  2278  
  2279      if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
  2280        mod_1_1_to_mod_1_2_threshold = 0;
  2281      if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
  2282        mod_1_2_to_mod_1_4_threshold = 0;
  2283  
  2284      print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
  2285      print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
  2286  			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
  2287      print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
  2288  			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
  2289    }
  2290  
  2291    {
  2292      static struct param_t  param;
  2293  
  2294      param.check_size = 256;
  2295  
  2296      param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
  2297      s.r = randlimb_norm ();
  2298      param.function = speed_mpn_preinv_mod_1;
  2299      param.function2 = speed_mpn_mod_1_tune;
  2300      param.min_size = 1;
  2301      one (&preinv_mod_1_to_mod_1_threshold, &param);
  2302    }
  2303  }
  2304  
  2305  
  2306  /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
  2307     imply that udiv_qrnnd_preinv is worth using, but it seems most
  2308     straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
  2309     directly.  */
  2310  
  2311  void
  2312  tune_preinv_divrem_1 (void)
  2313  {
  2314    static struct param_t  param;
  2315    speed_function_t  divrem_1;
  2316    const char        *divrem_1_name;
  2317    double            t1, t2;
  2318  
  2319    if (GMP_NAIL_BITS != 0)
  2320      {
  2321        print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
  2322        return;
  2323      }
  2324  
  2325    /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
  2326       it's faster than mpn_divrem_1.  */
  2327    if (HAVE_NATIVE_mpn_preinv_divrem_1)
  2328      {
  2329        print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
  2330        return;
  2331      }
  2332  
  2333    /* If udiv_qrnnd_preinv is the only division method then of course
  2334       mpn_preinv_divrem_1 should be used.  */
  2335    if (UDIV_PREINV_ALWAYS)
  2336      {
  2337        print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
  2338        return;
  2339      }
  2340  
  2341    /* If we've got an assembler version of mpn_divrem_1, then compare against
  2342       that, not the mpn_divrem_1_div generic C.  */
  2343    if (HAVE_NATIVE_mpn_divrem_1)
  2344      {
  2345        divrem_1 = speed_mpn_divrem_1;
  2346        divrem_1_name = "mpn_divrem_1";
  2347      }
  2348    else
  2349      {
  2350        divrem_1 = speed_mpn_divrem_1_div;
  2351        divrem_1_name = "mpn_divrem_1_div";
  2352      }
  2353  
  2354    param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
  2355    s.size = 200;                     /* generous but not too big */
  2356    /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
  2357       since in general that's probably most common, though in fact for a
  2358       64-bit limb mp_bases[10].big_base is normalized.  */
  2359    s.r = urandom() & (GMP_NUMB_MASK >> 4);
  2360    if (s.r == 0) s.r = 123;
  2361  
  2362    t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
  2363    t2 = tuneup_measure (divrem_1, &param, &s);
  2364    if (t1 == -1.0 || t2 == -1.0)
  2365      {
  2366        printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
  2367                divrem_1_name, (long) s.size);
  2368        abort ();
  2369      }
  2370    if (option_trace >= 1)
  2371      printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
  2372              (long) s.size, t1, divrem_1_name, t2);
  2373  
  2374    print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
  2375  }
  2376  
  2377  
  2378  
  2379  void
  2380  tune_divrem_2 (void)
  2381  {
  2382    static struct param_t  param;
  2383  
  2384    /* No support for tuning native assembler code, do that by hand and put
  2385       the results in the .asm file, and there's no need for such thresholds
  2386       to appear in gmp-mparam.h.  */
  2387    if (HAVE_NATIVE_mpn_divrem_2)
  2388      return;
  2389  
  2390    if (GMP_NAIL_BITS != 0)
  2391      {
  2392        print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
  2393                             "no preinv with nails");
  2394        return;
  2395      }
  2396  
  2397    if (UDIV_PREINV_ALWAYS)
  2398      {
  2399        print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
  2400        return;
  2401      }
  2402  
  2403    /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
  2404       a bit out for the fractional part, but that's too bad, the integer part
  2405       is more important.
  2406  
  2407       min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
  2408       code space if plain division is better only at size==2 or size==3. */
  2409    param.name = "DIVREM_2_THRESHOLD";
  2410    param.check_size = 256;
  2411    param.min_size = 4;
  2412    param.min_is_always = 1;
  2413    param.size_extra = 2;      /* does qsize==nsize-2 divisions */
  2414    param.stop_factor = 2.0;
  2415  
  2416    s.r = randlimb_norm ();
  2417    param.function = speed_mpn_divrem_2;
  2418    one (&divrem_2_threshold, &param);
  2419  }
  2420  
  2421  void
  2422  tune_div_qr_2 (void)
  2423  {
  2424    static struct param_t  param;
  2425    param.name = "DIV_QR_2_PI2_THRESHOLD";
  2426    param.function = speed_mpn_div_qr_2n;
  2427    param.check_size = 500;
  2428    param.min_size = 4;
  2429    one (&div_qr_2_pi2_threshold, &param);
  2430  }
  2431  
  2432  /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
  2433     tune for that.  Its speed can differ on odd or even divisor, so take an
  2434     average threshold for the two.
  2435  
  2436     mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
  2437     might not vary that way, but don't test this since high<divisor isn't
  2438     expected to occur often with small divisors.  */
  2439  
  2440  void
  2441  tune_divexact_1 (void)
  2442  {
  2443    static struct param_t  param;
  2444    mp_size_t  thresh[2], average;
  2445    int        low, i;
  2446  
  2447    /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
  2448       full mpn_divrem_1.  */
  2449    if (HAVE_NATIVE_mpn_divexact_1)
  2450      {
  2451        print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
  2452        return;
  2453      }
  2454  
  2455    ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
  2456  
  2457    param.name = "DIVEXACT_1_THRESHOLD";
  2458    param.data_high = DATA_HIGH_GE_R;
  2459    param.check_size = 256;
  2460    param.min_size = 2;
  2461    param.stop_factor = 1.5;
  2462    param.function  = tuned_speed_mpn_divrem_1;
  2463    param.function2 = speed_mpn_divexact_1;
  2464    param.noprint = 1;
  2465  
  2466    print_define_start (param.name);
  2467  
  2468    for (low = 0; low <= 1; low++)
  2469      {
  2470        s.r = randlimb_half();
  2471        if (low == 0)
  2472          s.r |= 1;
  2473        else
  2474          s.r &= ~CNST_LIMB(7);
  2475  
  2476        one (&thresh[low], &param);
  2477        if (option_trace)
  2478          printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
  2479  
  2480        if (thresh[low] == MP_SIZE_T_MAX)
  2481          {
  2482            average = MP_SIZE_T_MAX;
  2483            goto divexact_1_done;
  2484          }
  2485      }
  2486  
  2487    if (option_trace)
  2488      {
  2489        printf ("average of:");
  2490        for (i = 0; i < numberof(thresh); i++)
  2491          printf (" %ld", (long) thresh[i]);
  2492        printf ("\n");
  2493      }
  2494  
  2495    average = 0;
  2496    for (i = 0; i < numberof(thresh); i++)
  2497      average += thresh[i];
  2498    average /= numberof(thresh);
  2499  
  2500    /* If divexact turns out to be better as early as 3 limbs, then use it
  2501       always, so as to reduce code size and conditional jumps.  */
  2502    if (average <= 3)
  2503      average = 0;
  2504  
  2505   divexact_1_done:
  2506    print_define_end (param.name, average);
  2507  }
  2508  
  2509  
  2510  /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
  2511     same as mpn_mod_1, but this might not be true of an assembler
  2512     implementation.  The threshold used is an average based on data where a
  2513     divide can be skipped and where it can't.
  2514  
  2515     If modexact turns out to be better as early as 3 limbs, then use it
  2516     always, so as to reduce code size and conditional jumps.  */
  2517  
  2518  void
  2519  tune_modexact_1_odd (void)
  2520  {
  2521    static struct param_t  param;
  2522    mp_size_t  thresh_lt, thresh_ge, average;
  2523  
  2524  #if 0
  2525    /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
  2526       of a full mpn_mod_1.  */
  2527    if (HAVE_NATIVE_mpn_modexact_1_odd)
  2528      {
  2529        print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
  2530        return;
  2531      }
  2532  #endif
  2533  
  2534    param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
  2535    param.check_size = 256;
  2536    param.min_size = 2;
  2537    param.stop_factor = 1.5;
  2538    param.function  = speed_mpn_modexact_1c_odd;
  2539    param.function2 = speed_mpn_mod_1_tune;
  2540    param.noprint = 1;
  2541    s.r = randlimb_half () | 1;
  2542  
  2543    print_define_start (param.name);
  2544  
  2545    param.data_high = DATA_HIGH_LT_R;
  2546    one (&thresh_lt, &param);
  2547    if (option_trace)
  2548      printf ("lt thresh %ld\n", (long) thresh_lt);
  2549  
  2550    average = thresh_lt;
  2551    if (thresh_lt != MP_SIZE_T_MAX)
  2552      {
  2553        param.data_high = DATA_HIGH_GE_R;
  2554        one (&thresh_ge, &param);
  2555        if (option_trace)
  2556          printf ("ge thresh %ld\n", (long) thresh_ge);
  2557  
  2558        if (thresh_ge != MP_SIZE_T_MAX)
  2559          {
  2560            average = (thresh_ge + thresh_lt) / 2;
  2561            if (thresh_ge <= 3)
  2562              average = 0;
  2563          }
  2564      }
  2565  
  2566    print_define_end (param.name, average);
  2567  }
  2568  
  2569  
  2570  void
  2571  tune_jacobi_base (void)
  2572  {
  2573    static struct param_t  param;
  2574    double   t1, t2, t3, t4;
  2575    int      method;
  2576  
  2577    s.size = GMP_LIMB_BITS * 3 / 4;
  2578  
  2579    t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
  2580    if (option_trace >= 1)
  2581      printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
  2582  
  2583    t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
  2584    if (option_trace >= 1)
  2585      printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
  2586  
  2587    t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
  2588    if (option_trace >= 1)
  2589      printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
  2590  
  2591    t4 = tuneup_measure (speed_mpn_jacobi_base_4, &param, &s);
  2592    if (option_trace >= 1)
  2593      printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4);
  2594  
  2595    if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0)
  2596      {
  2597        printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
  2598                (long) s.size);
  2599        abort ();
  2600      }
  2601  
  2602    if (t1 < t2 && t1 < t3 && t1 < t4)
  2603      method = 1;
  2604    else if (t2 < t3 && t2 < t4)
  2605      method = 2;
  2606    else if (t3 < t4)
  2607      method = 3;
  2608    else
  2609      method = 4;
  2610  
  2611    print_define ("JACOBI_BASE_METHOD", method);
  2612  }
  2613  
  2614  
  2615  void
  2616  tune_get_str (void)
  2617  {
  2618    /* Tune for decimal, it being most common.  Some rough testing suggests
  2619       other bases are different, but not by very much.  */
  2620    s.r = 10;
  2621    {
  2622      static struct param_t  param;
  2623      GET_STR_PRECOMPUTE_THRESHOLD = 0;
  2624      param.name = "GET_STR_DC_THRESHOLD";
  2625      param.function = speed_mpn_get_str;
  2626      param.min_size = 4;
  2627      param.max_size = GET_STR_THRESHOLD_LIMIT;
  2628      one (&get_str_dc_threshold, &param);
  2629    }
  2630    {
  2631      static struct param_t  param;
  2632      param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
  2633      param.function = speed_mpn_get_str;
  2634      param.min_size = GET_STR_DC_THRESHOLD;
  2635      param.max_size = GET_STR_THRESHOLD_LIMIT;
  2636      one (&get_str_precompute_threshold, &param);
  2637    }
  2638  }
  2639  
  2640  
  2641  double
  2642  speed_mpn_pre_set_str (struct speed_params *s)
  2643  {
  2644    unsigned char *str;
  2645    mp_ptr     wp;
  2646    mp_size_t  wn;
  2647    unsigned   i;
  2648    int        base;
  2649    double     t;
  2650    mp_ptr powtab_mem, tp;
  2651    powers_t powtab[GMP_LIMB_BITS];
  2652    mp_size_t un;
  2653    int chars_per_limb;
  2654    TMP_DECL;
  2655  
  2656    SPEED_RESTRICT_COND (s->size >= 1);
  2657  
  2658    base = s->r == 0 ? 10 : s->r;
  2659    SPEED_RESTRICT_COND (base >= 2 && base <= 256);
  2660  
  2661    TMP_MARK;
  2662  
  2663    str = (unsigned char *) TMP_ALLOC (s->size);
  2664    for (i = 0; i < s->size; i++)
  2665      str[i] = s->xp[i] % base;
  2666  
  2667    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
  2668    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
  2669  
  2670    /* use this during development to check wn is big enough */
  2671    /*
  2672    ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
  2673    */
  2674  
  2675    speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
  2676    speed_operand_dst (s, wp, wn);
  2677    speed_cache_fill (s);
  2678  
  2679    chars_per_limb = mp_bases[base].chars_per_limb;
  2680    un = s->size / chars_per_limb + 1;
  2681    powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
  2682    mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
  2683    tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
  2684  
  2685    speed_starttime ();
  2686    i = s->reps;
  2687    do
  2688      {
  2689        mpn_pre_set_str (wp, str, s->size, powtab, tp);
  2690      }
  2691    while (--i != 0);
  2692    t = speed_endtime ();
  2693  
  2694    TMP_FREE;
  2695    return t;
  2696  }
  2697  
  2698  void
  2699  tune_set_str (void)
  2700  {
  2701    s.r = 10;  /* decimal */
  2702    {
  2703      static struct param_t  param;
  2704      SET_STR_PRECOMPUTE_THRESHOLD = 0;
  2705      param.step_factor = 0.01;
  2706      param.name = "SET_STR_DC_THRESHOLD";
  2707      param.function = speed_mpn_pre_set_str;
  2708      param.min_size = 100;
  2709      param.max_size = 50000;
  2710      one (&set_str_dc_threshold, &param);
  2711    }
  2712    {
  2713      static struct param_t  param;
  2714      param.step_factor = 0.02;
  2715      param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
  2716      param.function = speed_mpn_set_str;
  2717      param.min_size = SET_STR_DC_THRESHOLD;
  2718      param.max_size = 100000;
  2719      one (&set_str_precompute_threshold, &param);
  2720    }
  2721  }
  2722  
  2723  
  2724  void
  2725  tune_fft_mul (void)
  2726  {
  2727    static struct fft_param_t  param;
  2728  
  2729    if (option_fft_max_size == 0)
  2730      return;
  2731  
  2732    param.table_name          = "MUL_FFT_TABLE3";
  2733    param.threshold_name      = "MUL_FFT_THRESHOLD";
  2734    param.p_threshold         = &mul_fft_threshold;
  2735    param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
  2736    param.p_modf_threshold    = &mul_fft_modf_threshold;
  2737    param.first_size          = MUL_TOOM33_THRESHOLD / 2;
  2738    param.max_size            = option_fft_max_size;
  2739    param.function            = speed_mpn_fft_mul;
  2740    param.mul_modf_function   = speed_mpn_mul_fft;
  2741    param.mul_function        = speed_mpn_mul_n;
  2742    param.sqr = 0;
  2743    fft (&param);
  2744  }
  2745  
  2746  
  2747  void
  2748  tune_fft_sqr (void)
  2749  {
  2750    static struct fft_param_t  param;
  2751  
  2752    if (option_fft_max_size == 0)
  2753      return;
  2754  
  2755    param.table_name          = "SQR_FFT_TABLE3";
  2756    param.threshold_name      = "SQR_FFT_THRESHOLD";
  2757    param.p_threshold         = &sqr_fft_threshold;
  2758    param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
  2759    param.p_modf_threshold    = &sqr_fft_modf_threshold;
  2760    param.first_size          = SQR_TOOM3_THRESHOLD / 2;
  2761    param.max_size            = option_fft_max_size;
  2762    param.function            = speed_mpn_fft_sqr;
  2763    param.mul_modf_function   = speed_mpn_mul_fft_sqr;
  2764    param.mul_function        = speed_mpn_sqr;
  2765    param.sqr = 1;
  2766    fft (&param);
  2767  }
  2768  
  2769  void
  2770  tune_fac_ui (void)
  2771  {
  2772    static struct param_t  param;
  2773  
  2774    param.function = speed_mpz_fac_ui_tune;
  2775  
  2776    param.name = "FAC_DSC_THRESHOLD";
  2777    param.min_size = 70;
  2778    param.max_size = FAC_DSC_THRESHOLD_LIMIT;
  2779    one (&fac_dsc_threshold, &param);
  2780  
  2781    param.name = "FAC_ODD_THRESHOLD";
  2782    param.min_size = 22;
  2783    param.stop_factor = 1.7;
  2784    param.min_is_always = 1;
  2785    one (&fac_odd_threshold, &param);
  2786  }
  2787  
  2788  void
  2789  all (void)
  2790  {
  2791    time_t  start_time, end_time;
  2792    TMP_DECL;
  2793  
  2794    TMP_MARK;
  2795    SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
  2796    SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
  2797  
  2798    mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
  2799    mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
  2800  
  2801    fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
  2802  
  2803    speed_time_init ();
  2804    fprintf (stderr, "Using: %s\n", speed_time_string);
  2805  
  2806    fprintf (stderr, "speed_precision %d", speed_precision);
  2807    if (speed_unittime == 1.0)
  2808      fprintf (stderr, ", speed_unittime 1 cycle");
  2809    else
  2810      fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
  2811    if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
  2812      fprintf (stderr, ", CPU freq unknown\n");
  2813    else
  2814      fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
  2815  
  2816    fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
  2817             DEFAULT_MAX_SIZE, (long) option_fft_max_size);
  2818    fprintf (stderr, "\n");
  2819  
  2820    time (&start_time);
  2821    {
  2822      struct tm  *tp;
  2823      tp = localtime (&start_time);
  2824      printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
  2825              tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
  2826  
  2827  #ifdef __GNUC__
  2828      /* gcc sub-minor version doesn't seem to come through as a define */
  2829      printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
  2830  #define PRINTED_COMPILER
  2831  #endif
  2832  #if defined (__SUNPRO_C)
  2833      printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
  2834  #define PRINTED_COMPILER
  2835  #endif
  2836  #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
  2837      /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
  2838      printf ("MIPSpro C %d.%d.%d */\n",
  2839  	    _COMPILER_VERSION / 100,
  2840  	    _COMPILER_VERSION / 10 % 10,
  2841  	    _COMPILER_VERSION % 10);
  2842  #define PRINTED_COMPILER
  2843  #endif
  2844  #if defined (__DECC) && defined (__DECC_VER)
  2845      printf ("DEC C %d */\n", __DECC_VER);
  2846  #define PRINTED_COMPILER
  2847  #endif
  2848  #if ! defined (PRINTED_COMPILER)
  2849      printf ("system compiler */\n");
  2850  #endif
  2851    }
  2852    printf ("\n");
  2853  
  2854    tune_divrem_1 ();
  2855    tune_mod_1 ();
  2856    tune_preinv_divrem_1 ();
  2857    tune_div_qr_1 ();
  2858  #if 0
  2859    tune_divrem_2 ();
  2860  #endif
  2861    tune_div_qr_2 ();
  2862    tune_divexact_1 ();
  2863    tune_modexact_1_odd ();
  2864    printf("\n");
  2865  
  2866    tune_mul_n ();
  2867    printf("\n");
  2868  
  2869    tune_mul ();
  2870    printf("\n");
  2871  
  2872    tune_sqr ();
  2873    printf("\n");
  2874  
  2875    tune_mulmid ();
  2876    printf("\n");
  2877  
  2878    tune_mulmod_bnm1 ();
  2879    tune_sqrmod_bnm1 ();
  2880    printf("\n");
  2881  
  2882    tune_fft_mul ();
  2883    printf("\n");
  2884  
  2885    tune_fft_sqr ();
  2886    printf ("\n");
  2887  
  2888    tune_mullo ();
  2889    tune_sqrlo ();
  2890    printf("\n");
  2891  
  2892    tune_dc_div ();
  2893    tune_dc_bdiv ();
  2894  
  2895    printf("\n");
  2896    tune_invertappr ();
  2897    tune_invert ();
  2898    printf("\n");
  2899  
  2900    tune_binvert ();
  2901    tune_redc ();
  2902    printf("\n");
  2903  
  2904    tune_mu_div ();
  2905    tune_mu_bdiv ();
  2906    printf("\n");
  2907  
  2908    tune_powm_sec ();
  2909    printf("\n");
  2910  
  2911    tune_get_str ();
  2912    tune_set_str ();
  2913    printf("\n");
  2914  
  2915    tune_fac_ui ();
  2916    printf("\n");
  2917  
  2918    tune_matrix22_mul ();
  2919    tune_hgcd ();
  2920    tune_hgcd_appr ();
  2921    tune_hgcd_reduce();
  2922    tune_gcd_dc ();
  2923    tune_gcdext_dc ();
  2924    tune_jacobi_base ();
  2925    printf("\n");
  2926  
  2927    time (&end_time);
  2928    printf ("/* Tuneup completed successfully, took %ld seconds */\n",
  2929            (long) (end_time - start_time));
  2930  
  2931    TMP_FREE;
  2932  }
  2933  
  2934  
  2935  int
  2936  main (int argc, char *argv[])
  2937  {
  2938    int  opt;
  2939  
  2940    /* Unbuffered so if output is redirected to a file it isn't lost if the
  2941       program is killed part way through.  */
  2942    setbuf (stdout, NULL);
  2943    setbuf (stderr, NULL);
  2944  
  2945    while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
  2946      {
  2947        switch (opt) {
  2948        case 'f':
  2949          if (optarg[0] == 't')
  2950            option_fft_trace = 2;
  2951          else
  2952            option_fft_max_size = atol (optarg);
  2953          break;
  2954        case 'o':
  2955          speed_option_set (optarg);
  2956          break;
  2957        case 'p':
  2958          speed_precision = atoi (optarg);
  2959          break;
  2960        case 't':
  2961          option_trace++;
  2962          break;
  2963        case '?':
  2964          exit(1);
  2965        }
  2966      }
  2967  
  2968    all ();
  2969    exit (0);
  2970  }