github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpz/n_pow_ui.c

/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
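
/* The bsize==1 case below starts by powering within a single limb: blimb is
   squared while it still fits in GMP_NUMB_HALFMAX, and each low bit of e
   consumed along the way is folded into the accumulator rl.  The following
   is a minimal sketch of that idea (a hypothetical helper, not part of
   GMP); the invariant is b^e0 == rl * blimb^e at the top of each iteration,
   so if e reaches 0 the whole power is already in rl.  */
#if 0
static void
example_prepower_1 (mp_limb_t *blimb, unsigned long *e, mp_limb_t *rl)
{
  *rl = 1;
  while (*blimb <= GMP_NUMB_HALFMAX)
    {
      if ((*e & 1) != 0)
        *rl *= *blimb;        /* fold the low bit of e into rl */
      *e >>= 1;
      if (*e == 0)
        return;               /* b^e0 is now entirely in rl */
      *blimb *= *blimb;       /* blimb becomes b^(2^k), still one limb */
    }
}
#endif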


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
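
  /* Worked example of the factors-of-2 bookkeeping above (illustrative
     numbers only): with GMP_NUMB_BITS == 64 and b == c * 2^70, c odd, the
     limb-stripping loop adds e zero limbs and leaves btwos == 6.  For
     e == 11 that is 11 whole zero limbs plus e*btwos == 66 further zero
     bits, i.e. one more whole limb (66 / 64) and rtwos_bits == 2 bits to be
     applied by the final mpn_lshift; indeed 70*11 == 770 == 12*64 + 2.  */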

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }
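
  /* For illustration of the bsize==2 right shift above (hypothetical values
     only, never executed): with GMP_NUMB_BITS == 8 and no nails, a base
     b == {0x30, 0x12} has btwos == 4, so the low limb becomes
     (0x30 >> 4) | (0x12 << 4) == 0x23 and the high limb 0x12 >> 4 == 0x01,
     i.e. 0x1230 >> 4 == 0x123.  Had the high limb been 0x0c instead, its
     shifted value would be 0 and the base would collapse to the single limb
     0xc3, handled by the bsize_1 code.  */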

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
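
      /* A scalar model of the powering loops below (illustrative only,
         never compiled): the running power starts as the base itself, is
         squared once per remaining bit of e, and is multiplied by the base
         whenever that bit is 1.  E.g. for e == 11 == 0b1011, i starts at 2
         and the steps give b^2, then b^4*b == b^5, then b^10*b == b^11.  */
#if 0
      {
        unsigned long model_b = 3, model_r = model_b;  /* hypothetical base */
        int j;
        for (j = i; j >= 0; j--)
          {
            model_r = model_r * model_r;               /* square */
            if ((e & (1L << j)) != 0)
              model_r = model_r * model_b;             /* multiply */
          }
        /* model_r == model_b raised to e, ignoring overflow */
      }
#endif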

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
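
/* For reference, a sketch of how a public entry point such as mpz_pow_ui
   can drive this internal routine: it hands over the limb pointer and the
   signed limb count of the base.  Illustrative only; see GMP's
   mpz/pow_ui.c for the real wrapper.  */
#if 0
void
mpz_pow_ui (mpz_ptr r, mpz_srcptr b, unsigned long int e)
{
  mpz_n_pow_ui (r, PTR(b), (mp_size_t) SIZ(b), e);
}
#endif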