github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom44_mul.c (about) 1 /* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in 2 size. Or more accurately, bn <= an < (4/3)bn. 3 4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 5 6 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 7 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 8 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 9 10 Copyright 2006-2008, 2013 Free Software Foundation, Inc. 11 12 This file is part of the GNU MP Library. 13 14 The GNU MP Library is free software; you can redistribute it and/or modify 15 it under the terms of either: 16 17 * the GNU Lesser General Public License as published by the Free 18 Software Foundation; either version 3 of the License, or (at your 19 option) any later version. 20 21 or 22 23 * the GNU General Public License as published by the Free Software 24 Foundation; either version 2 of the License, or (at your option) any 25 later version. 26 27 or both in parallel, as here. 28 29 The GNU MP Library is distributed in the hope that it will be useful, but 30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 32 for more details. 33 34 You should have received copies of the GNU General Public License and the 35 GNU Lesser General Public License along with the GNU MP Library. If not, 36 see https://www.gnu.org/licenses/. */ 37 38 39 #include "gmp.h" 40 #include "gmp-impl.h" 41 42 /* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf 43 44 <-s--><--n--><--n--><--n--> 45 ____ ______ ______ ______ 46 |_a3_|___a2_|___a1_|___a0_| 47 |b3_|___b2_|___b1_|___b0_| 48 <-t-><--n--><--n--><--n--> 49 50 v0 = a0 * b0 # A(0)*B(0) 51 v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3 52 vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1 53 v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14 54 vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) # A(2)*B(2) ah <= 9 |bh| <= 9 55 vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14 56 vinf= a3 * b2 # A(inf)*B(inf) 57 */ 58 59 #if TUNE_PROGRAM_BUILD 60 #define MAYBE_mul_basecase 1 61 #define MAYBE_mul_toom22 1 62 #define MAYBE_mul_toom44 1 63 #else 64 #define MAYBE_mul_basecase \ 65 (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD) 66 #define MAYBE_mul_toom22 \ 67 (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD) 68 #define MAYBE_mul_toom44 \ 69 (MUL_TOOM6H_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD) 70 #endif 71 72 #define TOOM44_MUL_N_REC(p, a, b, n, ws) \ 73 do { \ 74 if (MAYBE_mul_basecase \ 75 && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ 76 mpn_mul_basecase (p, a, n, b, n); \ 77 else if (MAYBE_mul_toom22 \ 78 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \ 79 mpn_toom22_mul (p, a, n, b, n, ws); \ 80 else if (! MAYBE_mul_toom44 \ 81 || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) \ 82 mpn_toom33_mul (p, a, n, b, n, ws); \ 83 else \ 84 mpn_toom44_mul (p, a, n, b, n, ws); \ 85 } while (0) 86 87 /* Use of scratch space. In the product area, we store 88 89 ___________________ 90 |vinf|____|_v1_|_v0_| 91 s+t 2n-1 2n+1 2n 92 93 The other recursive products, vm1, v2, vm2, vh are stored in the 94 scratch area. When computing them, we use the product area for 95 intermediate values. 96 97 Next, we compute v1. We can store the intermediate factors at v0 98 and at vh + 2n + 2. 99 100 Finally, for v0 and vinf, factors are parts of the input operands, 101 and we need scratch space only for the recursive multiplication. 102 103 In all, if S(an) is the scratch need, the needed space is bounded by 104 105 S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1) 106 107 which should give S(n) = 8 n/3 + c log(n) for some constant c. 108 */ 109 110 void 111 mpn_toom44_mul (mp_ptr pp, 112 mp_srcptr ap, mp_size_t an, 113 mp_srcptr bp, mp_size_t bn, 114 mp_ptr scratch) 115 { 116 mp_size_t n, s, t; 117 mp_limb_t cy; 118 enum toom7_flags flags; 119 120 #define a0 ap 121 #define a1 (ap + n) 122 #define a2 (ap + 2*n) 123 #define a3 (ap + 3*n) 124 #define b0 bp 125 #define b1 (bp + n) 126 #define b2 (bp + 2*n) 127 #define b3 (bp + 3*n) 128 129 ASSERT (an >= bn); 130 131 n = (an + 3) >> 2; 132 133 s = an - 3 * n; 134 t = bn - 3 * n; 135 136 ASSERT (0 < s && s <= n); 137 ASSERT (0 < t && t <= n); 138 ASSERT (s >= t); 139 140 /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the 141 * following limb, so these must be computed in order, and we need a 142 * one limb gap to tp. */ 143 #define v0 pp /* 2n */ 144 #define v1 (pp + 2 * n) /* 2n+1 */ 145 #define vinf (pp + 6 * n) /* s+t */ 146 #define v2 scratch /* 2n+1 */ 147 #define vm2 (scratch + 2 * n + 1) /* 2n+1 */ 148 #define vh (scratch + 4 * n + 2) /* 2n+1 */ 149 #define vm1 (scratch + 6 * n + 3) /* 2n+1 */ 150 #define tp (scratch + 8*n + 5) 151 152 /* apx and bpx must not overlap with v1 */ 153 #define apx pp /* n+1 */ 154 #define amx (pp + n + 1) /* n+1 */ 155 #define bmx (pp + 2*n + 2) /* n+1 */ 156 #define bpx (pp + 4*n + 2) /* n+1 */ 157 158 /* Total scratch need: 8*n + 5 + scratch for recursive calls. This 159 gives roughly 32 n/3 + log term. */ 160 161 /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */ 162 flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp)); 163 164 /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3. */ 165 flags = (enum toom7_flags) (flags ^ (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp))); 166 167 TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp); /* v2, 2n+1 limbs */ 168 TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp); /* vm2, 2n+1 limbs */ 169 170 /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */ 171 #if HAVE_NATIVE_mpn_addlsh1_n 172 cy = mpn_addlsh1_n (apx, a1, a0, n); 173 cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n); 174 if (s < n) 175 { 176 mp_limb_t cy2; 177 cy2 = mpn_addlsh1_n (apx, a3, apx, s); 178 apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1); 179 MPN_INCR_U (apx + s, n+1-s, cy2); 180 } 181 else 182 apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n); 183 #else 184 cy = mpn_lshift (apx, a0, n, 1); 185 cy += mpn_add_n (apx, apx, a1, n); 186 cy = 2*cy + mpn_lshift (apx, apx, n, 1); 187 cy += mpn_add_n (apx, apx, a2, n); 188 cy = 2*cy + mpn_lshift (apx, apx, n, 1); 189 apx[n] = cy + mpn_add (apx, apx, n, a3, s); 190 #endif 191 192 /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */ 193 #if HAVE_NATIVE_mpn_addlsh1_n 194 cy = mpn_addlsh1_n (bpx, b1, b0, n); 195 cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n); 196 if (t < n) 197 { 198 mp_limb_t cy2; 199 cy2 = mpn_addlsh1_n (bpx, b3, bpx, t); 200 bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1); 201 MPN_INCR_U (bpx + t, n+1-t, cy2); 202 } 203 else 204 bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n); 205 #else 206 cy = mpn_lshift (bpx, b0, n, 1); 207 cy += mpn_add_n (bpx, bpx, b1, n); 208 cy = 2*cy + mpn_lshift (bpx, bpx, n, 1); 209 cy += mpn_add_n (bpx, bpx, b2, n); 210 cy = 2*cy + mpn_lshift (bpx, bpx, n, 1); 211 bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t); 212 #endif 213 214 ASSERT (apx[n] < 15); 215 ASSERT (bpx[n] < 15); 216 217 TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp); /* vh, 2n+1 limbs */ 218 219 /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */ 220 flags = (enum toom7_flags) (flags | (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp))); 221 222 /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3. */ 223 flags = (enum toom7_flags) (flags ^ (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp))); 224 225 TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp); /* vm1, 2n+1 limbs */ 226 /* Clobbers amx, bmx. */ 227 TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp); /* v1, 2n+1 limbs */ 228 229 TOOM44_MUL_N_REC (v0, a0, b0, n, tp); 230 if (s > t) 231 mpn_mul (vinf, a3, s, b3, t); 232 else 233 TOOM44_MUL_N_REC (vinf, a3, b3, s, tp); /* vinf, s+t limbs */ 234 235 mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp); 236 }