github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/mullo_n.c (about) 1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs 2 of their products. 3 4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 5 6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS 7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED 8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 9 10 Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc. 11 12 This file is part of the GNU MP Library. 13 14 The GNU MP Library is free software; you can redistribute it and/or modify 15 it under the terms of either: 16 17 * the GNU Lesser General Public License as published by the Free 18 Software Foundation; either version 3 of the License, or (at your 19 option) any later version. 20 21 or 22 23 * the GNU General Public License as published by the Free Software 24 Foundation; either version 2 of the License, or (at your option) any 25 later version. 26 27 or both in parallel, as here. 28 29 The GNU MP Library is distributed in the hope that it will be useful, but 30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 32 for more details. 33 34 You should have received copies of the GNU General Public License and the 35 GNU Lesser General Public License along with the GNU MP Library. If not, 36 see https://www.gnu.org/licenses/. */ 37 38 #include "gmp.h" 39 #include "gmp-impl.h" 40 41 42 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY 43 #define MAYBE_range_basecase 1 44 #define MAYBE_range_toom22 1 45 #else 46 #define MAYBE_range_basecase \ 47 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11)) 48 #define MAYBE_range_toom22 \ 49 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) ) 50 #endif 51 52 /* THINK: The DC strategy uses different constants in different Toom's 53 ranges. Something smoother? 54 */ 55 56 /* 57 Compute the least significant half of the product {xy,n}*{yp,n}, or 58 formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). 59 60 Above the given threshold, the Divide and Conquer strategy is used. 61 The operands are split in two, and a full product plus two mullo 62 are used to obtain the final result. The more natural strategy is to 63 split in two halves, but this is far from optimal when a 64 sub-quadratic multiplication is used. 65 66 Mulders suggests an unbalanced split in favour of the full product, 67 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. 68 69 To compute the value of a, we assume that the cost of mullo for a 70 given size ML(n) is a fraction of the cost of a full product with 71 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; 72 then we can write: 73 74 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e 75 76 Given a value for e, want to minimise the value of k, i.e. the 77 function k=(1-a)^e/(1-2*a^e). 78 79 With e=2, the exponent for schoolbook multiplication, the minimum is 80 given by the values a=1-a=1/2. 81 82 With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), 83 Mulders compute (1-a) = 0.694... and we approximate a with 11/36. 84 85 Other possible approximations follow: 86 e=log(5)/log(3) [Toom-3] -> a ~= 9/40 87 e=log(7)/log(4) [Toom-4] -> a ~= 7/39 88 e=log(11)/log(6) [Toom-6] -> a ~= 1/8 89 e=log(15)/log(8) [Toom-8] -> a ~= 1/10 90 91 The values above where obtained with the following trivial commands 92 in the gp-pari shell: 93 94 fun(e,a)=(1-a)^e/(1-2*a^e) 95 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))} 96 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5)) 97 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5)) 98 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5)) 99 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3)) 100 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3)) 101 102 , 103 |\ 104 | \ 105 +----, 106 | | 107 | | 108 | |\ 109 | | \ 110 +----+--` 111 ^ n2 ^n1^ 112 113 For an actual implementation, the assumption that M(n)=n^e is 114 incorrect, as a consequence also the assumption that ML(n)=k*M(n) 115 with a constant k is wrong. 116 117 But theory suggest us two things: 118 - the best the multiplication product is (lower e), the more k 119 approaches 1, and a approaches 0. 120 121 - A value for a smaller than optimal is probably less bad than a 122 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal 123 value, and k(a)=0.808_ the mul/mullo speed ratio. We get 124 k(a+1/6)=0.929_ but k(a-1/6)=0.865_. 125 */ 126 127 static mp_size_t 128 mpn_mullo_n_itch (mp_size_t n) 129 { 130 return 2*n; 131 } 132 133 /* 134 mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp. 135 It accepts tp == rp. 136 */ 137 static void 138 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp) 139 { 140 mp_size_t n2, n1; 141 ASSERT (n >= 2); 142 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); 143 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); 144 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); 145 146 /* Divide-and-conquer */ 147 148 /* We need fractional approximation of the value 0 < a <= 1/2 149 giving the minimum in the function k=(1-a)^e/(1-2*a^e). 150 */ 151 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11))) 152 n1 = n >> 1; 153 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11))) 154 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ 155 else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9))) 156 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ 157 else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9)) 158 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ 159 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ 160 else 161 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ 162 163 n2 = n - n1; 164 165 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0, 166 y = y1 2^(n2 GMP_NUMB_BITS) + y0 */ 167 168 /* x0 * y0 */ 169 mpn_mul_n (tp, xp, yp, n2); 170 MPN_COPY (rp, tp, n2); 171 172 /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */ 173 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) 174 mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1); 175 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) 176 mpn_mullo_basecase (tp + n, xp + n2, yp, n1); 177 else 178 mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n); 179 mpn_add_n (rp + n2, tp + n2, tp + n, n1); 180 181 /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */ 182 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) 183 mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1); 184 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) 185 mpn_mullo_basecase (tp + n, xp, yp + n2, n1); 186 else 187 mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n); 188 mpn_add_n (rp + n2, rp + n2, tp + n, n1); 189 } 190 191 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ 192 #define MUL_BASECASE_ALLOC \ 193 (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT) 194 195 /* FIXME: This function should accept a temporary area; dc_mullow_n 196 accepts a pointer tp, and handle the case tp == rp, do the same here. 197 Maybe recombine the two functions. 198 THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase 199 (typically thanks to mpn_addmul_2) should we unconditionally use 200 mpn_mul_n? 201 */ 202 203 void 204 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) 205 { 206 ASSERT (n >= 1); 207 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); 208 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); 209 210 if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD)) 211 { 212 /* Allocate workspace of fixed size on stack: fast! */ 213 mp_limb_t tp[MUL_BASECASE_ALLOC]; 214 mpn_mul_basecase (tp, xp, n, yp, n); 215 MPN_COPY (rp, tp, n); 216 } 217 else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD)) 218 { 219 mpn_mullo_basecase (rp, xp, yp, n); 220 } 221 else 222 { 223 mp_ptr tp; 224 TMP_DECL; 225 TMP_MARK; 226 tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n)); 227 if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD)) 228 { 229 mpn_dc_mullo_n (rp, xp, yp, n, tp); 230 } 231 else 232 { 233 /* For really large operands, use plain mpn_mul_n but throw away upper n 234 limbs of result. */ 235 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD) 236 mpn_fft_mul (tp, xp, n, yp, n); 237 #else 238 mpn_mul_n (tp, xp, yp, n); 239 #endif 240 MPN_COPY (rp, tp, n); 241 } 242 TMP_FREE; 243 } 244 }