github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/perfsqr.c (about) 1 /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, 2 zero otherwise. 3 4 Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software 5 Foundation, Inc. 6 7 This file is part of the GNU MP Library. 8 9 The GNU MP Library is free software; you can redistribute it and/or modify 10 it under the terms of either: 11 12 * the GNU Lesser General Public License as published by the Free 13 Software Foundation; either version 3 of the License, or (at your 14 option) any later version. 15 16 or 17 18 * the GNU General Public License as published by the Free Software 19 Foundation; either version 2 of the License, or (at your option) any 20 later version. 21 22 or both in parallel, as here. 23 24 The GNU MP Library is distributed in the hope that it will be useful, but 25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27 for more details. 28 29 You should have received copies of the GNU General Public License and the 30 GNU Lesser General Public License along with the GNU MP Library. If not, 31 see https://www.gnu.org/licenses/. */ 32 33 #include <stdio.h> /* for NULL */ 34 #include "gmp.h" 35 #include "gmp-impl.h" 36 #include "longlong.h" 37 38 #include "perfsqr.h" 39 40 41 /* change this to "#define TRACE(x) x" for diagnostics */ 42 #define TRACE(x) 43 44 45 46 /* PERFSQR_MOD_* detects non-squares using residue tests. 47 48 A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes 49 {up,usize} modulo a selected modulus to get a remainder r. For 32-bit or 50 64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34, 51 or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP 52 used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this 53 case too. 54 55 PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or 56 PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table 57 data indicating residues and non-residues modulo those divisors. The 58 table data is in 1 or 2 limbs worth of bits respectively, per the size of 59 each d. 60 61 A "modexact" style remainder is taken to reduce r modulo d. 62 PERFSQR_MOD_IDX implements this, producing an index "idx" for use with 63 the table data. Notice there's just one multiplication by a constant 64 "inv", for each d. 65 66 The modexact doesn't produce a true r%d remainder, instead idx satisfies 67 "-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor 68 -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is 69 accounted for by having the table data suitably permuted. 70 71 The remainder r fits within PERFSQR_MOD_BITS which is less than a limb. 72 In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit 73 each divisor d meaning the modexact multiply can take place entirely 74 within one limb, giving the compiler the chance to optimize it, in a way 75 that say umul_ppmm would not give. 76 77 There's no need for the divisors d to be prime, in fact gen-psqr.c makes 78 a deliberate effort to combine factors so as to reduce the number of 79 separate tests done on r. But such combining is limited to d <= 80 2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs. 81 82 Alternatives: 83 84 It'd be possible to use bigger divisors d, and more than 2 limbs of table 85 data, but this doesn't look like it would be of much help to the prime 86 factors in the usual moduli 2^24-1 or 2^48-1. 87 88 The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're 89 just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime 90 factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have 91 fewer prime factors. 92 93 The nails case usually ends up using mpn_mod_1, which is a lot slower 94 than mpn_mod_34lsub1. Perhaps other such special moduli could be found 95 for the nails case. Two-term things like 2^30-2^15-1 might be 96 candidates. Or at worst some on-the-fly de-nailing would allow the plain 97 2^24-1 to be used. Currently nails are too preliminary to be worried 98 about. 99 100 */ 101 102 #define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1) 103 104 #define MOD34_BITS (GMP_NUMB_BITS / 4 * 3) 105 #define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1) 106 107 #define PERFSQR_MOD_34(r, up, usize) \ 108 do { \ 109 (r) = mpn_mod_34lsub1 (up, usize); \ 110 (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \ 111 } while (0) 112 113 /* FIXME: The %= here isn't good, and might destroy any savings from keeping 114 the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm). 115 Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor 116 and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our 117 normal case, so lets not worry too much about mod_1. */ 118 #define PERFSQR_MOD_PP(r, up, usize) \ 119 do { \ 120 if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \ 121 { \ 122 (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \ 123 PERFSQR_PP_INVERTED); \ 124 (r) %= PERFSQR_PP; \ 125 } \ 126 else \ 127 { \ 128 (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \ 129 } \ 130 } while (0) 131 132 #define PERFSQR_MOD_IDX(idx, r, d, inv) \ 133 do { \ 134 mp_limb_t q; \ 135 ASSERT ((r) <= PERFSQR_MOD_MASK); \ 136 ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \ 137 ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \ 138 \ 139 q = ((r) * (inv)) & PERFSQR_MOD_MASK; \ 140 ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \ 141 (idx) = (q * (d)) >> PERFSQR_MOD_BITS; \ 142 } while (0) 143 144 #define PERFSQR_MOD_1(r, d, inv, mask) \ 145 do { \ 146 unsigned idx; \ 147 ASSERT ((d) <= GMP_LIMB_BITS); \ 148 PERFSQR_MOD_IDX(idx, r, d, inv); \ 149 TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \ 150 d, r%d, idx)); \ 151 if ((((mask) >> idx) & 1) == 0) \ 152 { \ 153 TRACE (printf (" non-square\n")); \ 154 return 0; \ 155 } \ 156 } while (0) 157 158 /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the 159 sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */ 160 #define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \ 161 do { \ 162 mp_limb_t m; \ 163 unsigned idx; \ 164 ASSERT ((d) <= 2*GMP_LIMB_BITS); \ 165 \ 166 PERFSQR_MOD_IDX (idx, r, d, inv); \ 167 TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \ 168 d, r%d, idx)); \ 169 m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \ 170 idx %= GMP_LIMB_BITS; \ 171 if (((m >> idx) & 1) == 0) \ 172 { \ 173 TRACE (printf (" non-square\n")); \ 174 return 0; \ 175 } \ 176 } while (0) 177 178 179 int 180 mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) 181 { 182 ASSERT (usize >= 1); 183 184 TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize)); 185 186 /* The first test excludes 212/256 (82.8%) of the perfect square candidates 187 in O(1) time. */ 188 { 189 unsigned idx = up[0] % 0x100; 190 if (((sq_res_0x100[idx / GMP_LIMB_BITS] 191 >> (idx % GMP_LIMB_BITS)) & 1) == 0) 192 return 0; 193 } 194 195 #if 0 196 /* Check that we have even multiplicity of 2, and then check that the rest is 197 a possible perfect square. Leave disabled until we can determine this 198 really is an improvement. It it is, it could completely replace the 199 simple probe above, since this should throw out more non-squares, but at 200 the expense of somewhat more cycles. */ 201 { 202 mp_limb_t lo; 203 int cnt; 204 lo = up[0]; 205 while (lo == 0) 206 up++, lo = up[0], usize--; 207 count_trailing_zeros (cnt, lo); 208 if ((cnt & 1) != 0) 209 return 0; /* return of not even multiplicity of 2 */ 210 lo >>= cnt; /* shift down to align lowest non-zero bit */ 211 lo >>= 1; /* shift away lowest non-zero bit */ 212 if ((lo & 3) != 0) 213 return 0; 214 } 215 #endif 216 217 218 /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares 219 according to their residues modulo small primes (or powers of 220 primes). See perfsqr.h. */ 221 PERFSQR_MOD_TEST (up, usize); 222 223 224 /* For the third and last test, we finally compute the square root, 225 to make sure we've really got a perfect square. */ 226 { 227 mp_ptr root_ptr; 228 int res; 229 TMP_DECL; 230 231 TMP_MARK; 232 root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2); 233 234 /* Iff mpn_sqrtrem returns zero, the square is perfect. */ 235 res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); 236 TMP_FREE; 237 238 return res; 239 } 240 }