github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/sqrlo.c (about) 1 /* mpn_sqrlo -- squares an n-limb number and returns the low n limbs 2 of the result. 3 4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 5 6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS 7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED 8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 9 10 Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc. 11 12 This file is part of the GNU MP Library. 13 14 The GNU MP Library is free software; you can redistribute it and/or modify 15 it under the terms of either: 16 17 * the GNU Lesser General Public License as published by the Free 18 Software Foundation; either version 3 of the License, or (at your 19 option) any later version. 20 21 or 22 23 * the GNU General Public License as published by the Free Software 24 Foundation; either version 2 of the License, or (at your option) any 25 later version. 26 27 or both in parallel, as here. 28 29 The GNU MP Library is distributed in the hope that it will be useful, but 30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 32 for more details. 33 34 You should have received copies of the GNU General Public License and the 35 GNU Lesser General Public License along with the GNU MP Library. If not, 36 see https://www.gnu.org/licenses/. */ 37 38 #include "gmp.h" 39 #include "gmp-impl.h" 40 41 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY 42 #define MAYBE_range_basecase 1 43 #define MAYBE_range_toom22 1 44 #else 45 #define MAYBE_range_basecase \ 46 ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11)) 47 #define MAYBE_range_toom22 \ 48 ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) ) 49 #endif 50 51 /* THINK: The DC strategy uses different constants in different Toom's 52 ranges. Something smoother? 53 */ 54 55 /* 56 Compute the least significant half of the product {xy,n}*{yp,n}, or 57 formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). 58 59 Above the given threshold, the Divide and Conquer strategy is used. 60 The operand is split in two, and a full square plus a mullo 61 is used to obtain the final result. The more natural strategy is to 62 split in two halves, but this is far from optimal when a 63 sub-quadratic multiplication is used. 64 65 Mulders suggests an unbalanced split in favour of the full product, 66 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. 67 68 To compute the value of a, we assume that the cost of mullo for a 69 given size ML(n) is a fraction of the cost of a full product with 70 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; 71 then we can write: 72 73 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e 74 75 Given a value for e, want to minimise the value of k, i.e. the 76 function k=(1-a)^e/(1-2*a^e). 77 78 With e=2, the exponent for schoolbook multiplication, the minimum is 79 given by the values a=1-a=1/2. 80 81 With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), 82 Mulders compute (1-a) = 0.694... and we approximate a with 11/36. 83 84 Other possible approximations follow: 85 e=log(5)/log(3) [Toom-3] -> a ~= 9/40 86 e=log(7)/log(4) [Toom-4] -> a ~= 7/39 87 e=log(11)/log(6) [Toom-6] -> a ~= 1/8 88 e=log(15)/log(8) [Toom-8] -> a ~= 1/10 89 90 The values above where obtained with the following trivial commands 91 in the gp-pari shell: 92 93 fun(e,a)=(1-a)^e/(1-2*a^e) 94 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))} 95 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5)) 96 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5)) 97 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5)) 98 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3)) 99 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3)) 100 101 , 102 |\ 103 | \ 104 +----, 105 | | 106 | | 107 | |\ 108 | | \ 109 +----+--` 110 ^ n2 ^n1^ 111 112 For an actual implementation, the assumption that M(n)=n^e is 113 incorrect, as a consequence also the assumption that ML(n)=k*M(n) 114 with a constant k is wrong. 115 116 But theory suggest us two things: 117 - the best the multiplication product is (lower e), the more k 118 approaches 1, and a approaches 0. 119 120 - A value for a smaller than optimal is probably less bad than a 121 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal 122 value, and k(a)=0.808_ the mul/mullo speed ratio. We get 123 k(a+1/6)=0.929_ but k(a-1/6)=0.865_. 124 */ 125 126 static mp_size_t 127 mpn_sqrlo_itch (mp_size_t n) 128 { 129 return 2*n; 130 } 131 132 /* 133 mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp. 134 It accepts tp == rp. 135 */ 136 static void 137 mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp) 138 { 139 mp_size_t n2, n1; 140 ASSERT (n >= 2); 141 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); 142 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); 143 144 /* Divide-and-conquer */ 145 146 /* We need fractional approximation of the value 0 < a <= 1/2 147 giving the minimum in the function k=(1-a)^e/(1-2*a^e). 148 */ 149 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11))) 150 n1 = n >> 1; 151 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11))) 152 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ 153 else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9))) 154 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ 155 else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9)) 156 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ 157 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ 158 else 159 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ 160 161 n2 = n - n1; 162 163 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */ 164 165 /* x0 ^ 2 */ 166 mpn_sqr (tp, xp, n2); 167 MPN_COPY (rp, tp, n2); 168 169 /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */ 170 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) 171 mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1); 172 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) 173 mpn_mullo_basecase (tp + n, xp + n2, xp, n1); 174 else 175 mpn_mullo_n (tp + n, xp + n2, xp, n1); 176 /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */ 177 #if HAVE_NATIVE_mpn_addlsh1_n 178 mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1); 179 #else 180 mpn_lshift (rp + n2, tp + n, n1, 1); 181 mpn_add_n (rp + n2, rp + n2, tp + n2, n1); 182 #endif 183 } 184 185 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ 186 #define SQR_BASECASE_ALLOC \ 187 (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT) 188 189 /* FIXME: This function should accept a temporary area; dc_sqrlo 190 accepts a pointer tp, and handle the case tp == rp, do the same here. 191 */ 192 193 void 194 mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n) 195 { 196 ASSERT (n >= 1); 197 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); 198 199 if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD)) 200 { 201 /* FIXME: smarter criteria? */ 202 #if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase 203 /* mullo computes as many products as sqr, but directly writes 204 on the result area. */ 205 mpn_mullo_basecase (rp, xp, xp, n); 206 #else 207 /* Allocate workspace of fixed size on stack: fast! */ 208 mp_limb_t tp[SQR_BASECASE_ALLOC]; 209 mpn_sqr_basecase (tp, xp, n); 210 MPN_COPY (rp, tp, n); 211 #endif 212 } 213 else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD)) 214 { 215 mpn_sqrlo_basecase (rp, xp, n); 216 } 217 else 218 { 219 mp_ptr tp; 220 TMP_DECL; 221 TMP_MARK; 222 tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n)); 223 if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD)) 224 { 225 mpn_dc_sqrlo (rp, xp, n, tp); 226 } 227 else 228 { 229 /* For really large operands, use plain mpn_mul_n but throw away upper n 230 limbs of result. */ 231 #if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD) 232 mpn_fft_mul (tp, xp, n, xp, n); 233 #else 234 mpn_sqr (tp, xp, n); 235 #endif 236 MPN_COPY (rp, tp, n); 237 } 238 TMP_FREE; 239 } 240 }