github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/matrix22_mul.c (about) 1 /* matrix22_mul.c. 2 3 Contributed by Niels Möller and Marco Bodrato. 4 5 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 6 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of either: 15 16 * the GNU Lesser General Public License as published by the Free 17 Software Foundation; either version 3 of the License, or (at your 18 option) any later version. 19 20 or 21 22 * the GNU General Public License as published by the Free Software 23 Foundation; either version 2 of the License, or (at your option) any 24 later version. 25 26 or both in parallel, as here. 27 28 The GNU MP Library is distributed in the hope that it will be useful, but 29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31 for more details. 32 33 You should have received copies of the GNU General Public License and the 34 GNU Lesser General Public License along with the GNU MP Library. If not, 35 see https://www.gnu.org/licenses/. */ 36 37 #include "gmp.h" 38 #include "gmp-impl.h" 39 #include "longlong.h" 40 41 #define MUL(rp, ap, an, bp, bn) do { \ 42 if (an >= bn) \ 43 mpn_mul (rp, ap, an, bp, bn); \ 44 else \ 45 mpn_mul (rp, bp, bn, ap, an); \ 46 } while (0) 47 48 /* Inputs are unsigned. */ 49 static int 50 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) 51 { 52 int c; 53 MPN_CMP (c, ap, bp, n); 54 if (c >= 0) 55 { 56 mpn_sub_n (rp, ap, bp, n); 57 return 0; 58 } 59 else 60 { 61 mpn_sub_n (rp, bp, ap, n); 62 return 1; 63 } 64 } 65 66 static int 67 add_signed_n (mp_ptr rp, 68 mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n) 69 { 70 if (as != bs) 71 return as ^ abs_sub_n (rp, ap, bp, n); 72 else 73 { 74 ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n)); 75 return as; 76 } 77 } 78 79 mp_size_t 80 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn) 81 { 82 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) 83 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) 84 return 3*rn + 2*mn; 85 else 86 return 3*(rn + mn) + 5; 87 } 88 89 /* Algorithm: 90 91 / s0 \ / 1 0 0 0 \ / r0 \ 92 | s1 | | 0 1 0 1 | | r1 | 93 | s2 | | 0 0 -1 1 | | r2 | 94 | s3 | = | 0 1 -1 1 | \ r3 / 95 | s4 | | -1 1 -1 1 | 96 | s5 | | 0 1 0 0 | 97 \ s6 / \ 0 0 1 0 / 98 99 / t0 \ / 1 0 0 0 \ / m0 \ 100 | t1 | | 0 1 0 1 | | m1 | 101 | t2 | | 0 0 -1 1 | | m2 | 102 | t3 | = | 0 1 -1 1 | \ m3 / 103 | t4 | | -1 1 -1 1 | 104 | t5 | | 0 1 0 0 | 105 \ t6 / \ 0 0 1 0 / 106 107 Note: the two matrices above are the same, but s_i and t_i are used 108 in the same product, only for i<4, see "A Strassen-like Matrix 109 Multiplication suited for squaring and higher power computation" by 110 M. Bodrato, in Proceedings of ISSAC 2010. 111 112 / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \ 113 | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 | 114 | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 | 115 \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 | 116 | s4*t5 | 117 | s5*t6 | 118 \ s6*t4 / 119 120 The scheduling uses two temporaries U0 and U1 to store products, and 121 two, S0 and T0, to store combinations of entries of the two 122 operands. 123 */ 124 125 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3). 126 * 127 * Resulting elements are of size up to rn + mn + 1. 128 * 129 * Temporary storage: 3 rn + 3 mn + 5. */ 130 void 131 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, 132 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, 133 mp_ptr tp) 134 { 135 mp_ptr s0, t0, u0, u1; 136 int r1s, r3s, s0s, t0s, u1s; 137 s0 = tp; tp += rn + 1; 138 t0 = tp; tp += mn + 1; 139 u0 = tp; tp += rn + mn + 1; 140 u1 = tp; /* rn + mn + 2 */ 141 142 MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */ 143 r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */ 144 if (r3s) 145 { 146 r1s = abs_sub_n (r1, r1, r3, rn); 147 r1[rn] = 0; 148 } 149 else 150 { 151 r1[rn] = mpn_add_n (r1, r1, r3, rn); 152 r1s = 0; /* r1 - r2 + r3 */ 153 } 154 if (r1s) 155 { 156 s0[rn] = mpn_add_n (s0, r1, r0, rn); 157 s0s = 0; 158 } 159 else if (r1[rn] != 0) 160 { 161 s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn); 162 s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */ 163 /* Reverse sign! */ 164 } 165 else 166 { 167 s0s = abs_sub_n (s0, r0, r1, rn); 168 s0[rn] = 0; 169 } 170 MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */ 171 r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn); 172 ASSERT (r0[rn+mn] < 2); /* u0 + u5 */ 173 174 t0s = abs_sub_n (t0, m3, m2, mn); 175 u1s = r3s^t0s^1; /* Reverse sign! */ 176 MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */ 177 u1[rn+mn] = 0; 178 if (t0s) 179 { 180 t0s = abs_sub_n (t0, m1, t0, mn); 181 t0[mn] = 0; 182 } 183 else 184 { 185 t0[mn] = mpn_add_n (t0, t0, m1, mn); 186 } 187 188 /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs 189 at r3. I'd expect that for matrices of random size, the high 190 words t0[mn] and r1[rn] are non-zero with a pretty small 191 probability. If that can be confirmed this should be done as an 192 unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn])) 193 add_n. */ 194 if (t0[mn] != 0) 195 { 196 MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */ 197 ASSERT (r1[rn] < 2); 198 if (r1[rn] != 0) 199 mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1); 200 } 201 else 202 { 203 MUL (r3, r1, rn + 1, t0, mn); 204 } 205 206 ASSERT (r3[rn+mn] < 4); 207 208 u0[rn+mn] = 0; 209 if (r1s^t0s) 210 { 211 r3s = abs_sub_n (r3, u0, r3, rn + mn + 1); 212 } 213 else 214 { 215 ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1)); 216 r3s = 0; /* u3 + u5 */ 217 } 218 219 if (t0s) 220 { 221 t0[mn] = mpn_add_n (t0, t0, m0, mn); 222 } 223 else if (t0[mn] != 0) 224 { 225 t0[mn] -= mpn_sub_n (t0, t0, m0, mn); 226 } 227 else 228 { 229 t0s = abs_sub_n (t0, t0, m0, mn); 230 } 231 MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */ 232 ASSERT (u0[rn+mn] < 2); 233 if (r1s) 234 { 235 ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn)); 236 } 237 else 238 { 239 r1[rn] += mpn_add_n (r1, r1, r2, rn); 240 } 241 rn++; 242 t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn); 243 /* u3 + u5 + u6 */ 244 ASSERT (r2[rn+mn-1] < 4); 245 r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn); 246 /* -u2 + u3 + u5 */ 247 ASSERT (r3[rn+mn-1] < 3); 248 MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */ 249 ASSERT (u0[rn+mn-1] < 2); 250 t0[mn] = mpn_add_n (t0, m3, m1, mn); 251 MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */ 252 mn += rn; 253 ASSERT (u1[mn-1] < 4); 254 ASSERT (u1[mn] == 0); 255 ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn)); 256 /* -u2 + u3 - u4 + u5 */ 257 ASSERT (r1[mn-1] < 2); 258 if (r3s) 259 { 260 ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn)); 261 } 262 else 263 { 264 ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn)); 265 /* u1 + u2 - u3 - u5 */ 266 } 267 ASSERT (r3[mn-1] < 2); 268 if (t0s) 269 { 270 ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn)); 271 } 272 else 273 { 274 ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn)); 275 /* u1 - u3 - u5 - u6 */ 276 } 277 ASSERT (r2[mn-1] < 2); 278 } 279 280 void 281 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, 282 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, 283 mp_ptr tp) 284 { 285 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) 286 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) 287 { 288 mp_ptr p0, p1; 289 unsigned i; 290 291 /* Temporary storage: 3 rn + 2 mn */ 292 p0 = tp + rn; 293 p1 = p0 + rn + mn; 294 295 for (i = 0; i < 2; i++) 296 { 297 MPN_COPY (tp, r0, rn); 298 299 if (rn >= mn) 300 { 301 mpn_mul (p0, r0, rn, m0, mn); 302 mpn_mul (p1, r1, rn, m3, mn); 303 mpn_mul (r0, r1, rn, m2, mn); 304 mpn_mul (r1, tp, rn, m1, mn); 305 } 306 else 307 { 308 mpn_mul (p0, m0, mn, r0, rn); 309 mpn_mul (p1, m3, mn, r1, rn); 310 mpn_mul (r0, m2, mn, r1, rn); 311 mpn_mul (r1, m1, mn, tp, rn); 312 } 313 r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn); 314 r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn); 315 316 r0 = r2; r1 = r3; 317 } 318 } 319 else 320 mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn, 321 m0, m1, m2, m3, mn, tp); 322 }