github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom32_mul.c

/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
        |_b1_|___b0_|
        <-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
  vinf=          a2 *     b1  # A(inf)*B(inf)
*/

#define TOOM32_MUL_N_REC(p, a, b, n, ws)        \
  do {                                          \
    mpn_mul_n (p, a, b, n);                     \
  } while (0)

void
mpn_toom32_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy;
  mp_limb_signed_t hi;
  mp_limb_t ap1_hi, bp1_hi;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define b0  bp
#define b1  (bp + n)

  /* Required, to ensure that s + t >= n. */
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);

  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);

  s = an - 2 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s + t >= n);

  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
#define ap1 (pp)                /* n, most significant limb in ap1_hi */
#define bp1 (pp + n)            /* n, most significant bit in bp1_hi */
#define am1 (pp + 2*n)          /* n, most significant bit in hi */
#define bm1 (pp + 3*n)          /* n */
#define v1 (scratch)            /* 2n + 1 */
#define vm1 (pp)                /* 2n + 1 */
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */

  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */

  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
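  /* Worked example (illustrative, with small integers standing in for the
     n-limb blocks): writing the product as A*B = x3 B^3 + x2 B^2 + x1 B + x0
     with B = 2^(n*GMP_NUMB_BITS), the evaluations satisfy v0 = x0,
     v1 = x0 + x1 + x2 + x3, vm1 = x0 - x1 + x2 - x3 (possibly negated, the
     sign being tracked in vm1_neg), and vinf = x3.  Hence
     (v1 + vm1)/2 = x0 + x2 and v1 - (x0 + x2) = x1 + x3, which is the
     interpolation the code below performs.  E.g. for coefficients
     (x0,x1,x2,x3) = (1,2,3,4): v1 = 10, vm1 = -2, (10 + (-2))/2 = 4 = 1 + 3
     and 10 - 4 = 6 = 2 + 4.  */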
  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
      hi = ap1_hi - (cy & 1);
      ap1_hi += (cy >> 1);
      vm1_neg = 0;
    }
#else
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
      vm1_neg = 0;
    }
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
#endif

  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
        }
      bp1_hi = cy >> 1;
#else
      bp1_hi = mpn_add_n (bp1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
        }
#endif
    }
  else
    {
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
      bp1_hi = mpn_add (bp1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
          MPN_ZERO (bm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
        }
    }

  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
  if (ap1_hi == 1)
    {
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
    }
  else if (ap1_hi == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
#else
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bp1_hi != 0)
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
  v1[2 * n] = cy;

  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
  if (hi)
    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);

  vm1[2*n] = hi;

  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
  if (vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
#else
      mpn_sub_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
#else
      mpn_add_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }

  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence

       y = x1 + x3 + (x0 + x2) * B
         = (x0 + x2) * B + (x0 + x2) - vm1.

     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2.  We store them as
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
     (already in place, except for carry propagation).

     We thus add

        B^3  B^2   B    1
         |    |    |    |
        +-----+----+
      + |  x0 + x2 |
        +----+-----+----+
      +      | x0 + x2  |
             +----------+
      -      |   vm1    |
      --+----++----+----+-
        | y2  | y1 | y0 |
        +-----+----+----+

     Since we store y0 at the same location as the low half of x0 + x2, we
     need to do the middle sum first. */
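  /* Illustrative check of the sum above, reusing the coefficients
     (x0,x1,x2,x3) = (1,2,3,4) from the earlier worked example: x0 + x2 = 4
     and vm1 = -2, so y = (x0 + x2) * B + (x0 + x2) - vm1
     = 4*B + 4 - (-2) = 4*B + 6, i.e. y1 = 4 and y0 = 6 = x1 + x3,
     matching y = x1 + x3 + (x0 + x2) * B.  */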
  hi = vm1[2*n];
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);

  /* FIXME: Can we get rid of this second vm1_neg conditional by
     swapping the location of +1 and -1 values? */
  if (vm1_neg)
    {
      cy = mpn_add_n (v1, v1, vm1, n);
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_INCR_U (v1 + n, n+1, hi);
    }
  else
    {
      cy = mpn_sub_n (v1, v1, vm1, n);
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_DECR_U (v1 + n, n+1, hi);
    }

  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
  else        mpn_mul (pp+3*n, b1, t, a2, s);

  /* Remaining interpolation.

     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
     = y0 B + y1 B^2 + y2 B^3 + Lx0 + H x0 B
       + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4

          B^4       B^3       B^2        B         1
   |         |         |         |         |         |
        +-------+                   +---------+---------+
        |  Hx3  |                   | Hx0-Lx3 |   Lx0   |
        +------+----------+---------+---------+---------+
               |    y2    |    y1   |   y0    |
               ++---------+---------+---------+
              -| Hx0-Lx3  |   - Lx0 |
               +----------+---------+
                          |  - Hx3  |
                          +---------+

     We must take into account the carry from Hx0 - Lx3.
  */

  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
  hi = scratch[2*n] + cy;

  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);

  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);

  /* FIXME: Is support for s + t == n needed? */
  if (LIKELY (s + t > n))
    {
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);

      if (hi < 0)
        MPN_DECR_U (pp + 4*n, s+t-n, -hi);
      else
        MPN_INCR_U (pp + 4*n, s+t-n, hi);
    }
  else
    ASSERT (hi == 0);
}
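/* Usage sketch (illustrative only, not part of GMP): mpn_toom32_mul is an
   internal function, so this would have to be built inside the GMP source
   tree.  The operand sizes 6 and 4 satisfy the entry assertion
   bn + 2 <= an && an + 6 <= 3*bn and give n = 2, so the scratch need of
   2*n + 1 limbs stated above is 5 limbs; the buffer below is deliberately
   generous.  The result is cross-checked against the public mpn_mul.  */
#if 0
#include <stdio.h>

int
main (void)
{
  mp_limb_t ap[6] = { 1, 2, 3, 4, 5, 6 };	/* an = 6 limbs */
  mp_limb_t bp[4] = { 7, 8, 9, 10 };		/* bn = 4 limbs */
  mp_limb_t pp[6 + 4];				/* product area, an + bn limbs */
  mp_limb_t ref[6 + 4];				/* reference product */
  mp_limb_t scratch[16];			/* >= 2*n + 1 limbs; generous */

  mpn_toom32_mul (pp, ap, 6, bp, 4, scratch);

  /* Compare against the generic multiplication entry point.  */
  mpn_mul (ref, ap, 6, bp, 4);
  printf (mpn_cmp (pp, ref, 6 + 4) == 0 ? "match\n" : "MISMATCH\n");
  return 0;
}
#endif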