github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom42_mulmid.c (about) 1 /* mpn_toom42_mulmid -- toom42 middle product 2 3 Contributed by David Harvey. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2011 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of either: 15 16 * the GNU Lesser General Public License as published by the Free 17 Software Foundation; either version 3 of the License, or (at your 18 option) any later version. 19 20 or 21 22 * the GNU General Public License as published by the Free Software 23 Foundation; either version 2 of the License, or (at your option) any 24 later version. 25 26 or both in parallel, as here. 27 28 The GNU MP Library is distributed in the hope that it will be useful, but 29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31 for more details. 32 33 You should have received copies of the GNU General Public License and the 34 GNU Lesser General Public License along with the GNU MP Library. If not, 35 see https://www.gnu.org/licenses/. */ 36 37 38 #include "gmp.h" 39 #include "gmp-impl.h" 40 41 42 43 /* 44 Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}. 45 46 Neither ap nor bp may overlap rp. 47 48 Must have n >= 4. 49 50 Amount of scratch space required is given by mpn_toom42_mulmid_itch(). 51 52 FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact 53 requirements should be clarified. 54 */ 55 void 56 mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, 57 mp_ptr scratch) 58 { 59 mp_limb_t cy, e[12], zh, zl; 60 mp_size_t m; 61 int neg; 62 63 ASSERT (n >= 4); 64 ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1)); 65 ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n)); 66 67 ap += n & 1; /* handle odd row and diagonal later */ 68 m = n / 2; 69 70 /* (e0h:e0l) etc are correction terms, in 2's complement */ 71 #define e0l (e[0]) 72 #define e0h (e[1]) 73 #define e1l (e[2]) 74 #define e1h (e[3]) 75 #define e2l (e[4]) 76 #define e2h (e[5]) 77 #define e3l (e[6]) 78 #define e3h (e[7]) 79 #define e4l (e[8]) 80 #define e4h (e[9]) 81 #define e5l (e[10]) 82 #define e5h (e[11]) 83 84 #define s (scratch + 2) 85 #define t (rp + m + 2) 86 #define p0 rp 87 #define p1 scratch 88 #define p2 (rp + m) 89 #define next_scratch (scratch + 3*m + 1) 90 91 /* 92 rp scratch 93 |---------|-----------| |---------|---------|----------| 94 0 m 2m+2 0 m 2m 3m+1 95 <----p2----> <-------------s-------------> 96 <----p0----><---t----> <----p1----> 97 */ 98 99 /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */ 100 cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0); 101 cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l, 102 bp + m, bp, m, cy); 103 mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy); 104 105 /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */ 106 if (mpn_cmp (bp + m, bp, m) < 0) 107 { 108 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l, 109 ap + m - 1, ap + 2*m - 1, m, 0)); 110 neg = 1; 111 } 112 else 113 { 114 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l, 115 ap + m - 1, ap + 2*m - 1, m, 0)); 116 neg = 0; 117 } 118 119 /* recursive middle products. The picture is: 120 121 b[2m-1] A A A B B B - - - - - 122 ... - A A A B B B - - - - 123 b[m] - - A A A B B B - - - 124 b[m-1] - - - C C C D D D - - 125 ... - - - - C C C D D D - 126 b[0] - - - - - C C C D D D 127 a[0] ... a[m] ... a[2m] ... a[4m-2] 128 */ 129 130 if (m < MULMID_TOOM42_THRESHOLD) 131 { 132 /* A + B */ 133 mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m); 134 /* accumulate high limbs of p0 into e1 */ 135 ADDC_LIMB (cy, e1l, e1l, p0[m]); 136 e1h += p0[m + 1] + cy; 137 /* (-1)^neg * (B - C) (overwrites first m limbs of s) */ 138 mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m); 139 /* C + D (overwrites t) */ 140 mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m); 141 } 142 else 143 { 144 /* as above, but use toom42 instead */ 145 mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch); 146 ADDC_LIMB (cy, e1l, e1l, p0[m]); 147 e1h += p0[m + 1] + cy; 148 mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch); 149 mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch); 150 } 151 152 /* apply error terms */ 153 154 /* -e0 at rp[0] */ 155 SUBC_LIMB (cy, rp[0], rp[0], e0l); 156 SUBC_LIMB (cy, rp[1], rp[1], e0h + cy); 157 if (UNLIKELY (cy)) 158 { 159 cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1; 160 SUBC_LIMB (cy, e1l, e1l, cy); 161 e1h -= cy; 162 } 163 164 /* z = e1 - e2 + high(p0) */ 165 SUBC_LIMB (cy, zl, e1l, e2l); 166 zh = e1h - e2h - cy; 167 168 /* z at rp[m] */ 169 ADDC_LIMB (cy, rp[m], rp[m], zl); 170 zh = (zh + cy) & GMP_NUMB_MASK; 171 ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh); 172 cy -= (zh >> (GMP_NUMB_BITS - 1)); 173 if (UNLIKELY (cy)) 174 { 175 if (cy == 1) 176 mpn_add_1 (rp + m + 2, rp + m + 2, m, 1); 177 else /* cy == -1 */ 178 mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1); 179 } 180 181 /* e3 at rp[2*m] */ 182 ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l); 183 rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK; 184 185 /* e4 at p1[0] */ 186 ADDC_LIMB (cy, p1[0], p1[0], e4l); 187 ADDC_LIMB (cy, p1[1], p1[1], e4h + cy); 188 if (UNLIKELY (cy)) 189 mpn_add_1 (p1 + 2, p1 + 2, m, 1); 190 191 /* -e5 at p1[m] */ 192 SUBC_LIMB (cy, p1[m], p1[m], e5l); 193 p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK; 194 195 /* adjustment if p1 ends up negative */ 196 cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1)); 197 198 /* add (-1)^neg * (p1 - B^m * p1) to output */ 199 if (neg) 200 { 201 mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy); 202 mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ 203 mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */ 204 } 205 else 206 { 207 mpn_add_1 (rp + m + 2, rp + m + 2, m, cy); 208 mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ 209 mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */ 210 } 211 212 /* odd row and diagonal */ 213 if (n & 1) 214 { 215 /* 216 Products marked E are already done. We need to do products marked O. 217 218 OOOOO---- 219 -EEEEO--- 220 --EEEEO-- 221 ---EEEEO- 222 ----EEEEO 223 */ 224 225 /* first row of O's */ 226 cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]); 227 ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy); 228 229 /* O's on diagonal */ 230 /* FIXME: should probably define an interface "mpn_mulmid_diag_1" 231 that can handle the sum below. Currently we're relying on 232 mulmid_basecase being pretty fast for a diagonal sum like this, 233 which is true at least for the K8 asm version, but surely false 234 for the generic version. */ 235 mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1); 236 mpn_add_n (rp + n - 1, rp + n - 1, e, 3); 237 } 238 }