github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/generic/matrix22_mul.c (about)

     1  /* matrix22_mul.c.
     2  
     3     Contributed by Niels Möller and Marco Bodrato.
     4  
     5     THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
     6     SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     7     GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     8  
     9  Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
    10  
    11  This file is part of the GNU MP Library.
    12  
    13  The GNU MP Library is free software; you can redistribute it and/or modify
    14  it under the terms of either:
    15  
    16    * the GNU Lesser General Public License as published by the Free
    17      Software Foundation; either version 3 of the License, or (at your
    18      option) any later version.
    19  
    20  or
    21  
    22    * the GNU General Public License as published by the Free Software
    23      Foundation; either version 2 of the License, or (at your option) any
    24      later version.
    25  
    26  or both in parallel, as here.
    27  
    28  The GNU MP Library is distributed in the hope that it will be useful, but
    29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    31  for more details.
    32  
    33  You should have received copies of the GNU General Public License and the
    34  GNU Lesser General Public License along with the GNU MP Library.  If not,
    35  see https://www.gnu.org/licenses/.  */
    36  
    37  #include "gmp.h"
    38  #include "gmp-impl.h"
    39  #include "longlong.h"
    40  
    41  #define MUL(rp, ap, an, bp, bn) do {		\
    42    if (an >= bn)					\
    43      mpn_mul (rp, ap, an, bp, bn);		\
    44    else						\
    45      mpn_mul (rp, bp, bn, ap, an);		\
    46  } while (0)
    47  
    48  /* Inputs are unsigned. */
    49  static int
    50  abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
    51  {
    52    int c;
    53    MPN_CMP (c, ap, bp, n);
    54    if (c >= 0)
    55      {
    56        mpn_sub_n (rp, ap, bp, n);
    57        return 0;
    58      }
    59    else
    60      {
    61        mpn_sub_n (rp, bp, ap, n);
    62        return 1;
    63      }
    64  }
    65  
    66  static int
    67  add_signed_n (mp_ptr rp,
    68  	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
    69  {
    70    if (as != bs)
    71      return as ^ abs_sub_n (rp, ap, bp, n);
    72    else
    73      {
    74        ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
    75        return as;
    76      }
    77  }
    78  
    79  mp_size_t
    80  mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
    81  {
    82    if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
    83        || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    84      return 3*rn + 2*mn;
    85    else
    86      return 3*(rn + mn) + 5;
    87  }
    88  
    89  /* Algorithm:
    90  
    91      / s0 \   /  1  0  0  0 \ / r0 \
    92      | s1 |   |  0  1  0  1 | | r1 |
    93      | s2 |   |  0  0 -1  1 | | r2 |
    94      | s3 | = |  0  1 -1  1 | \ r3 /
    95      | s4 |   | -1  1 -1  1 |
    96      | s5 |   |  0  1  0  0 |
    97      \ s6 /   \  0  0  1  0 /
    98  
    99      / t0 \   /  1  0  0  0 \ / m0 \
   100      | t1 |   |  0  1  0  1 | | m1 |
   101      | t2 |   |  0  0 -1  1 | | m2 |
   102      | t3 | = |  0  1 -1  1 | \ m3 /
   103      | t4 |   | -1  1 -1  1 |
   104      | t5 |   |  0  1  0  0 |
   105      \ t6 /   \  0  0  1  0 /
   106  
   107    Note: the two matrices above are the same, but s_i and t_i are used
   108    in the same product, only for i<4, see "A Strassen-like Matrix
   109    Multiplication suited for squaring and higher power computation" by
   110    M. Bodrato, in Proceedings of ISSAC 2010.
   111  
   112      / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \
   113      | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |
   114      | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |
   115      \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |
   116  				    | s4*t5 |
   117  				    | s5*t6 |
   118  				    \ s6*t4 /
   119  
   120    The scheduling uses two temporaries U0 and U1 to store products, and
   121    two, S0 and T0, to store combinations of entries of the two
   122    operands.
   123  */
   124  
   125  /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
   126   *
   127   * Resulting elements are of size up to rn + mn + 1.
   128   *
   129   * Temporary storage: 3 rn + 3 mn + 5. */
   130  void
   131  mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
   132  			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
   133  			   mp_ptr tp)
   134  {
   135    mp_ptr s0, t0, u0, u1;
   136    int r1s, r3s, s0s, t0s, u1s;
   137    s0 = tp; tp += rn + 1;
   138    t0 = tp; tp += mn + 1;
   139    u0 = tp; tp += rn + mn + 1;
   140    u1 = tp; /* rn + mn + 2 */
   141  
   142    MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
   143    r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
   144    if (r3s)
   145      {
   146        r1s = abs_sub_n (r1, r1, r3, rn);
   147        r1[rn] = 0;
   148      }
   149    else
   150      {
   151        r1[rn] = mpn_add_n (r1, r1, r3, rn);
   152        r1s = 0;				/* r1 - r2 + r3  */
   153      }
   154    if (r1s)
   155      {
   156        s0[rn] = mpn_add_n (s0, r1, r0, rn);
   157        s0s = 0;
   158      }
   159    else if (r1[rn] != 0)
   160      {
   161        s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
   162        s0s = 1;				/* s4 = -r0 + r1 - r2 + r3 */
   163  					/* Reverse sign! */
   164      }
   165    else
   166      {
   167        s0s = abs_sub_n (s0, r0, r1, rn);
   168        s0[rn] = 0;
   169      }
   170    MUL (u1, r0, rn, m0, mn);		/* u0 = s0 * t0 */
   171    r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
   172    ASSERT (r0[rn+mn] < 2);		/* u0 + u5 */
   173  
   174    t0s = abs_sub_n (t0, m3, m2, mn);
   175    u1s = r3s^t0s^1;			/* Reverse sign! */
   176    MUL (u1, r3, rn, t0, mn);		/* u2 = s2 * t2 */
   177    u1[rn+mn] = 0;
   178    if (t0s)
   179      {
   180        t0s = abs_sub_n (t0, m1, t0, mn);
   181        t0[mn] = 0;
   182      }
   183    else
   184      {
   185        t0[mn] = mpn_add_n (t0, t0, m1, mn);
   186      }
   187  
   188    /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
   189       at r3. I'd expect that for matrices of random size, the high
   190       words t0[mn] and r1[rn] are non-zero with a pretty small
   191       probability. If that can be confirmed this should be done as an
   192       unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
   193       add_n. */
   194    if (t0[mn] != 0)
   195      {
   196        MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
   197        ASSERT (r1[rn] < 2);
   198        if (r1[rn] != 0)
   199  	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
   200      }
   201    else
   202      {
   203        MUL (r3, r1, rn + 1, t0, mn);
   204      }
   205  
   206    ASSERT (r3[rn+mn] < 4);
   207  
   208    u0[rn+mn] = 0;
   209    if (r1s^t0s)
   210      {
   211        r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
   212      }
   213    else
   214      {
   215        ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
   216        r3s = 0;				/* u3 + u5 */
   217      }
   218  
   219    if (t0s)
   220      {
   221        t0[mn] = mpn_add_n (t0, t0, m0, mn);
   222      }
   223    else if (t0[mn] != 0)
   224      {
   225        t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
   226      }
   227    else
   228      {
   229        t0s = abs_sub_n (t0, t0, m0, mn);
   230      }
   231    MUL (u0, r2, rn, t0, mn + 1);		/* u6 = s6 * t4 */
   232    ASSERT (u0[rn+mn] < 2);
   233    if (r1s)
   234      {
   235        ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
   236      }
   237    else
   238      {
   239        r1[rn] += mpn_add_n (r1, r1, r2, rn);
   240      }
   241    rn++;
   242    t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
   243  					/* u3 + u5 + u6 */
   244    ASSERT (r2[rn+mn-1] < 4);
   245    r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
   246  					/* -u2 + u3 + u5  */
   247    ASSERT (r3[rn+mn-1] < 3);
   248    MUL (u0, s0, rn, m1, mn);		/* u4 = s4 * t5 */
   249    ASSERT (u0[rn+mn-1] < 2);
   250    t0[mn] = mpn_add_n (t0, m3, m1, mn);
   251    MUL (u1, r1, rn, t0, mn + 1);		/* u1 = s1 * t1 */
   252    mn += rn;
   253    ASSERT (u1[mn-1] < 4);
   254    ASSERT (u1[mn] == 0);
   255    ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
   256  					/* -u2 + u3 - u4 + u5  */
   257    ASSERT (r1[mn-1] < 2);
   258    if (r3s)
   259      {
   260        ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
   261      }
   262    else
   263      {
   264        ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
   265  					/* u1 + u2 - u3 - u5  */
   266      }
   267    ASSERT (r3[mn-1] < 2);
   268    if (t0s)
   269      {
   270        ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
   271      }
   272    else
   273      {
   274        ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
   275  					/* u1 - u3 - u5 - u6  */
   276      }
   277    ASSERT (r2[mn-1] < 2);
   278  }
   279  
   280  void
   281  mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
   282  		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
   283  		  mp_ptr tp)
   284  {
   285    if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
   286        || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
   287      {
   288        mp_ptr p0, p1;
   289        unsigned i;
   290  
   291        /* Temporary storage: 3 rn + 2 mn */
   292        p0 = tp + rn;
   293        p1 = p0 + rn + mn;
   294  
   295        for (i = 0; i < 2; i++)
   296  	{
   297  	  MPN_COPY (tp, r0, rn);
   298  
   299  	  if (rn >= mn)
   300  	    {
   301  	      mpn_mul (p0, r0, rn, m0, mn);
   302  	      mpn_mul (p1, r1, rn, m3, mn);
   303  	      mpn_mul (r0, r1, rn, m2, mn);
   304  	      mpn_mul (r1, tp, rn, m1, mn);
   305  	    }
   306  	  else
   307  	    {
   308  	      mpn_mul (p0, m0, mn, r0, rn);
   309  	      mpn_mul (p1, m3, mn, r1, rn);
   310  	      mpn_mul (r0, m2, mn, r1, rn);
   311  	      mpn_mul (r1, m1, mn, tp, rn);
   312  	    }
   313  	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
   314  	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
   315  
   316  	  r0 = r2; r1 = r3;
   317  	}
   318      }
   319    else
   320      mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
   321  			       m0, m1, m2, m3, mn, tp);
   322  }