github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/toom_interpolate_12pts.c (about) 1 /* Interpolation for the algorithm Toom-Cook 6.5-way. 2 3 Contributed to the GNU project by Marco Bodrato. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of either: 15 16 * the GNU Lesser General Public License as published by the Free 17 Software Foundation; either version 3 of the License, or (at your 18 option) any later version. 19 20 or 21 22 * the GNU General Public License as published by the Free Software 23 Foundation; either version 2 of the License, or (at your option) any 24 later version. 25 26 or both in parallel, as here. 27 28 The GNU MP Library is distributed in the hope that it will be useful, but 29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31 for more details. 32 33 You should have received copies of the GNU General Public License and the 34 GNU Lesser General Public License along with the GNU MP Library. If not, 35 see https://www.gnu.org/licenses/. */ 36 37 38 #include "gmp.h" 39 #include "gmp-impl.h" 40 41 42 #if HAVE_NATIVE_mpn_sublsh_n 43 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s) 44 #else 45 static mp_limb_t 46 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) 47 { 48 #if USE_MUL_1 && 0 49 return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s)); 50 #else 51 mp_limb_t __cy; 52 __cy = mpn_lshift(ws,src,n,s); 53 return __cy + mpn_sub_n(dst,dst,ws,n); 54 #endif 55 } 56 #endif 57 58 #if HAVE_NATIVE_mpn_addlsh_n 59 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s) 60 #else 61 static mp_limb_t 62 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) 63 { 64 #if USE_MUL_1 && 0 65 return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s)); 66 #else 67 mp_limb_t __cy; 68 __cy = mpn_lshift(ws,src,n,s); 69 return __cy + mpn_add_n(dst,dst,ws,n); 70 #endif 71 } 72 #endif 73 74 #if HAVE_NATIVE_mpn_subrsh 75 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s) 76 #else 77 /* FIXME: This is not a correct definition, it assumes no carry */ 78 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \ 79 do { \ 80 mp_limb_t __cy; \ 81 MPN_DECR_U (dst, nd, src[0] >> s); \ 82 __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \ 83 MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \ 84 } while (0) 85 #endif 86 87 88 #if GMP_NUMB_BITS < 21 89 #error Not implemented: Both sublsh_n(,,,20) should be corrected. 90 #endif 91 92 #if GMP_NUMB_BITS < 16 93 #error Not implemented: divexact_by42525 needs splitting. 94 #endif 95 96 #if GMP_NUMB_BITS < 12 97 #error Not implemented: Hard to adapt... 98 #endif 99 100 /* FIXME: tuneup should decide the best variant */ 101 #ifndef AORSMUL_FASTER_AORS_AORSLSH 102 #define AORSMUL_FASTER_AORS_AORSLSH 1 103 #endif 104 #ifndef AORSMUL_FASTER_AORS_2AORSLSH 105 #define AORSMUL_FASTER_AORS_2AORSLSH 1 106 #endif 107 #ifndef AORSMUL_FASTER_2AORSLSH 108 #define AORSMUL_FASTER_2AORSLSH 1 109 #endif 110 #ifndef AORSMUL_FASTER_3AORSLSH 111 #define AORSMUL_FASTER_3AORSLSH 1 112 #endif 113 114 #define BINVERT_9 \ 115 ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) 116 117 #define BINVERT_255 \ 118 (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8))) 119 120 /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */ 121 #if GMP_LIMB_BITS == 32 122 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B)) 123 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35)) 124 #else 125 #if GMP_LIMB_BITS == 64 126 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B)) 127 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35)) 128 #endif 129 #endif 130 131 #ifndef mpn_divexact_by255 132 #if GMP_NUMB_BITS % 8 == 0 133 #define mpn_divexact_by255(dst,src,size) \ 134 (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255))) 135 #else 136 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 137 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0) 138 #else 139 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)) 140 #endif 141 #endif 142 #endif 143 144 #ifndef mpn_divexact_by9x4 145 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 146 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2) 147 #else 148 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2) 149 #endif 150 #endif 151 152 #ifndef mpn_divexact_by42525 153 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525) 154 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0) 155 #else 156 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)) 157 #endif 158 #endif 159 160 #ifndef mpn_divexact_by2835x4 161 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835) 162 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2) 163 #else 164 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2) 165 #endif 166 #endif 167 168 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation 169 points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely, 170 we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of 171 degree 11 (or 10), given the 12 (rsp. 11) values: 172 173 r0 = limit at infinity of f(x) / x^7, 174 r1 = f(4),f(-4), 175 r2 = f(2),f(-2), 176 r3 = f(1),f(-1), 177 r4 = f(1/4),f(-1/4), 178 r5 = f(1/2),f(-1/2), 179 r6 = f(0). 180 181 All couples of the form f(n),f(-n) must be already mixed with 182 toom_couple_handling(f(n),...,f(-n),...) 183 184 The result is stored in {pp, spt + 7*n (or 6*n)}. 185 At entry, r6 is stored at {pp, 2n}, 186 r4 is stored at {pp + 3n, 3n + 1}. 187 r2 is stored at {pp + 7n, 3n + 1}. 188 r0 is stored at {pp +11n, spt}. 189 190 The other values are 3n+1 limbs each (with most significant limbs small). 191 192 Negative intermediate results are stored two-complemented. 193 Inputs are destroyed. 194 */ 195 196 void 197 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, 198 mp_size_t n, mp_size_t spt, int half, mp_ptr wsi) 199 { 200 mp_limb_t cy; 201 mp_size_t n3; 202 mp_size_t n3p1; 203 n3 = 3 * n; 204 n3p1 = n3 + 1; 205 206 #define r4 (pp + n3) /* 3n+1 */ 207 #define r2 (pp + 7 * n) /* 3n+1 */ 208 #define r0 (pp +11 * n) /* s+t <= 2*n */ 209 210 /******************************* interpolation *****************************/ 211 if (half != 0) { 212 cy = mpn_sub_n (r3, r3, r0, spt); 213 MPN_DECR_U (r3 + spt, n3p1 - spt, cy); 214 215 cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi); 216 MPN_DECR_U (r2 + spt, n3p1 - spt, cy); 217 DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi); 218 219 cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi); 220 MPN_DECR_U (r1 + spt, n3p1 - spt, cy); 221 DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi); 222 }; 223 224 r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi); 225 DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi); 226 227 #if HAVE_NATIVE_mpn_add_n_sub_n 228 mpn_add_n_sub_n (r1, r4, r4, r1, n3p1); 229 #else 230 ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1)); 231 mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */ 232 MP_PTR_SWAP(r1, wsi); 233 #endif 234 235 r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi); 236 DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi); 237 238 #if HAVE_NATIVE_mpn_add_n_sub_n 239 mpn_add_n_sub_n (r2, r5, r5, r2, n3p1); 240 #else 241 mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */ 242 ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1)); 243 MP_PTR_SWAP(r5, wsi); 244 #endif 245 246 r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n); 247 248 #if AORSMUL_FASTER_AORS_AORSLSH 249 mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */ 250 #else 251 mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */ 252 DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */ 253 #endif 254 /* A division by 2835x4 follows. Warning: the operand can be negative! */ 255 mpn_divexact_by2835x4(r4, r4, n3p1); 256 if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0) 257 r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2)); 258 259 #if AORSMUL_FASTER_2AORSLSH 260 mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */ 261 #else 262 DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */ 263 DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */ 264 #endif 265 mpn_divexact_by255(r5, r5, n3p1); 266 267 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi)); 268 269 #if AORSMUL_FASTER_3AORSLSH 270 ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100)); 271 #else 272 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi)); 273 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi)); 274 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi)); 275 #endif 276 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi)); 277 mpn_divexact_by42525(r1, r1, n3p1); 278 279 #if AORSMUL_FASTER_AORS_2AORSLSH 280 ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225)); 281 #else 282 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1)); 283 ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi)); 284 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi)); 285 #endif 286 mpn_divexact_by9x4(r2, r2, n3p1); 287 288 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1)); 289 290 mpn_sub_n (r4, r2, r4, n3p1); 291 ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1)); 292 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1)); 293 294 mpn_add_n (r5, r5, r1, n3p1); 295 ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1)); 296 297 /* last interpolation steps... */ 298 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1)); 299 ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1)); 300 /* ... could be mixed with recomposition 301 ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1| 302 */ 303 304 /***************************** recomposition *******************************/ 305 /* 306 pp[] prior to operations: 307 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp 308 309 summation scheme for remaining operations: 310 |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp 311 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp 312 ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| 313 */ 314 315 cy = mpn_add_n (pp + n, pp + n, r5, n); 316 cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy); 317 #if HAVE_NATIVE_mpn_add_nc 318 cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy); 319 #else 320 MPN_INCR_U (r5 + 2 * n, n + 1, cy); 321 cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n); 322 #endif 323 MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy); 324 325 pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n); 326 cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]); 327 #if HAVE_NATIVE_mpn_add_nc 328 cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy); 329 #else 330 MPN_INCR_U (r3 + 2 * n, n + 1, cy); 331 cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n); 332 #endif 333 MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy); 334 335 pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n); 336 if (half) { 337 cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]); 338 #if HAVE_NATIVE_mpn_add_nc 339 if (LIKELY (spt > n)) { 340 cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy); 341 MPN_INCR_U (pp + 4 * n3, spt - n, cy); 342 } else { 343 ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy)); 344 } 345 #else 346 MPN_INCR_U (r1 + 2 * n, n + 1, cy); 347 if (LIKELY (spt > n)) { 348 cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n); 349 MPN_INCR_U (pp + 4 * n3, spt - n, cy); 350 } else { 351 ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt)); 352 } 353 #endif 354 } else { 355 ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n])); 356 } 357 358 #undef r0 359 #undef r2 360 #undef r4 361 }