github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/mod_1_1.c

/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008-2011, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask. */
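/* That is, add_mssaaaa (m, s1, s0, a1, a0, b1, b0) computes the two-limb sum
 * {s1,s0} = {a1,a0} + {b1,b0} truncated to two limbs, and sets m to all ones
 * when the addition carries out of the high limb and to zero otherwise, as
 * the generic C fallback further down spells out. */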

#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                    \
  __asm__ (  "add    %6, %k2\n\t"                                 \
             "adc    %4, %k1\n\t"                                 \
             "sbb    %k0, %k0"                                    \
           : "=r" (m), "=r" (s1), "=&r" (s0)                      \
           : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),           \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                    \
  __asm__ (  "add    %6, %q2\n\t"                                 \
             "adc    %4, %q1\n\t"                                 \
             "sbb    %q0, %q0"                                    \
           : "=r" (m), "=r" (s1), "=&r" (s0)                      \
           : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),         \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                    \
  __asm__ (  "addcc  %r5, %6, %2\n\t"                             \
             "addxcc %r3, %4, %1\n\t"                             \
             "subx   %%g0, %%g0, %0"                              \
           : "=r" (m), "=r" (sh), "=&r" (sl)                      \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)          \
             __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                    \
  __asm__ (  "addcc  %r5, %6, %2\n\t"                             \
             "addccc %r7, %8, %%g0\n\t"                           \
             "addccc %r3, %4, %1\n\t"                             \
             "clr    %0\n\t"                                      \
             "movcs  %%xcc, -1, %0"                               \
           : "=r" (m), "=r" (sh), "=&r" (sl)                      \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),         \
             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                 \
             __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                    \
  __asm__ (  "addcc   %r5, %6, %2\n\t"                            \
             "addxccc %r3, %4, %1\n\t"                            \
             "clr     %0\n\t"                                     \
             "movcs   %%xcc, -1, %0"                              \
           : "=r" (m), "=r" (sh), "=&r" (sl)                      \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)          \
             __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                    \
  __asm__ (  "add%I6c %2, %5, %6\n\t"                             \
             "adde    %1, %3, %4\n\t"                             \
             "subfe   %0, %0, %0\n\t"                             \
             "nor     %0, %0, %0"                                 \
           : "=r" (m), "=r" (s1), "=&r" (s0)                      \
           : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                    \
  __asm__ (  "algr   %2, %6\n\t"                                  \
             "alcgr  %1, %4\n\t"                                  \
             "lghi   %0, 0\n\t"                                   \
             "alcgr  %0, %0\n\t"                                  \
             "lcgr   %0, %0"                                      \
           : "=r" (m), "=r" (s1), "=&r" (s0)                      \
           : "1"  ((UDItype)(a1)), "r" ((UDItype)(b1)),           \
             "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                    \
  __asm__ (  "adds   %2, %5, %6\n\t"                              \
             "adcs   %1, %3, %4\n\t"                              \
             "movcc  %0, #0\n\t"                                  \
             "movcs  %0, #-1"                                     \
           : "=r" (m), "=r" (sh), "=&r" (sl)                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                    \
  do {                                                            \
    UWtype __s0, __s1, __c0, __c1;                                \
    __s0 = (a0) + (b0);                                           \
    __s1 = (a1) + (b1);                                           \
    __c0 = __s0 < (a0);                                           \
    __c1 = __s1 < (a1);                                           \
    (s0) = __s0;                                                  \
    __s1 = __s1 + __c0;                                           \
    (s1) = __s1;                                                  \
    (m) = - (__c1 + (__s1 < __c0));                               \
  } while (0)
#endif
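
/* The 4-limb cps/bmodb scratch area, filled in by mpn_mod_1_1p_cps and read
   back by mpn_mod_1_1p (layout inferred from the code below):
     cps[0]  invert_limb inverse of the normalized divisor b << cnt
     cps[1]  cnt, the number of leading zero bits of b
     cps[2]  a value congruent to B mod b    (B = 2^GMP_LIMB_BITS)
     cps[3]  a value congruent to B^2 mod b
   The two MOD_1_1P_METHOD variants store cps[2] and cps[3] with different
   shifts, so a table built by one variant must not be fed to the other. */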

#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);    /* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);    // NB: equality iff b = B/2
   */
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);    /* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]                  < B
            + LO(rr) * (B mod b)    <= (B-1)(b-1)
            + HI(rr) * (B^2 mod b)  <= (B-1)(b-1)
      */
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }

  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 1 */
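
/* The MOD_1_1P_METHOD == 2 variant below needs only one umul_ppmm per limb
   in its main loop: each step's carry out, produced as a mask by add_mssaaaa,
   is folded back in on the next iteration instead of being reduced right
   away, whereas method 1 above spends two multiplies per limb.  Which variant
   is faster depends on the CPU, which is presumably why both are kept behind
   the MOD_1_1P_METHOD switch. */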

#if MOD_1_1P_METHOD == 2
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);    /* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);    // NB: equality iff b = B/2
  cps[3] = B2modb;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi, B1modb;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);    /* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);

      for (j = n-4; j >= 0; j--)
        {
          mp_limb_t cy;
          /* mp_limb_t t = r0 + B2mb; */
          umul_ppmm (p1, p0, r1, B2modb);

          ADDC_LIMB (cy, r0, r0, r2 & B2modb);
          /* Alternative, for cmov: if (cy) r0 = t; */
          r0 -= (-cy) & b;
          add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
        }

      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */
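
#if 0
/* Usage sketch (not part of GMP; the wrapper name below is invented for
   illustration).  A divisor is preprocessed once with mpn_mod_1_1p_cps, and
   then any operand {ap,n} with n >= 2 can be reduced with mpn_mod_1_1p.
   Note that mpn_mod_1_1p expects the normalized divisor b << cps[1]: the
   code above compares the residue against b after shifting and hands b,
   together with the inverse of the normalized divisor, straight to
   udiv_rnnd_preinv.  */
static mp_limb_t
mod_1_1_example (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  mp_limb_t cps[4];

  ASSERT (n >= 2);
  mpn_mod_1_1p_cps (cps, b);    /* inverse, shift count, B and B^2 mod b */
  return mpn_mod_1_1p (ap, n, b << cps[1], cps);
}
#endif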