github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/gcdext.c

/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n))

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
	 (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
   sufficient for both operations.

*/

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	u1[0] = 1;

	ctx.u0 = u0;
	ctx.u1 = u1;
	ctx.tp = tp + n; /* ualloc */
	ctx.un = 1;

	/* Temporary storage n */
	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	if (n == 0)
	  {
	    TMP_FREE;
	    return ctx.gn;
	  }

	un = ctx.un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  ctx.u0 = u0;
	  ctx.u1 = u1;
	  ctx.tp = tp + n; /* ualloc */
	  ctx.un = un;

	  /* Temporary storage n */
	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return ctx.gn;
	    }

	  un = ctx.un;
	  ASSERT (un < ualloc);
	}
    }
  /* We have A = ... a + ... b
	     B =  u0 a +  u1 b

	     a =  u1 A + ... B
	     b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u0 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
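
For reference, a minimal standalone sketch of how the public mpn_gcdext entry point defined above might be called on single-limb operands; it is not part of gcdext.c. It assumes the documented mpn conventions (an >= n >= 1, bp[n-1] != 0, both source operands clobbered, gp needing n limbs and up needing n + 1 limbs) and a build linked against libgmp; the function name example_gcdext and the sample values are purely illustrative.

#include <stdio.h>
#include <gmp.h>

static void
example_gcdext (void)
{
  /* One significant limb each, plus a spare limb of scratch, since the
     operand areas are used as workspace and destroyed. */
  mp_limb_t a[2] = { 240, 0 };	/* A = 240 */
  mp_limb_t b[2] = { 46, 0 };	/* B = 46  */
  mp_limb_t g[1];		/* gcd: at most n = 1 limbs */
  mp_limb_t u[2];		/* cofactor: n + 1 limbs */
  mp_size_t usize;
  mp_size_t gn;

  /* g = u A + v B for some v; |u| is stored at u, its sign in usize. */
  gn = mpn_gcdext (g, u, &usize, a, 1, b, 1);

  /* gcd (240, 46) = 2, and the returned cofactor satisfies
     u * 240 == 2 (mod 46); the small solution is u = -9
     (a negative cofactor is encoded as usize < 0). */
  printf ("gn = %ld, g = %lu, u = %s%lu\n",
	  (long) gn, (unsigned long) g[0],
	  usize < 0 ? "-" : "", (unsigned long) u[0]);
}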