github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

// func gfpUnmarshal(out *gfP, in *[32]byte)
TEXT ·gfpUnmarshal(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	gfpInternalEndianSwap<>(SB)

// func gfpMarshal(out *[32]byte, in *gfP)
TEXT ·gfpMarshal(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	gfpInternalEndianSwap<>(SB)

TEXT gfpInternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for the byte-reversed (MOVDBR) loads
#ifdef GOARCH_ppc64le
	MOVD	$8, R9
	MOVD	$16, R10
	MOVD	$24, R14

	MOVDBR	(R0)(R4), R5
	MOVDBR	(R9)(R4), R6
	MOVDBR	(R10)(R4), R7
	MOVDBR	(R14)(R4), R8

	MOVD	R8, 0(R3)
	MOVD	R7, 8(R3)
	MOVD	R6, 16(R3)
	MOVD	R5, 24(R3)
#else
	MOVD	$16, R10
	LXVD2X (R4)(R0), V0
	LXVD2X (R4)(R10), V1

	XXPERMDI V0, V0, $2, V0
	XXPERMDI V1, V1, $2, V1

	STXVD2X V1, (R0+R3)
	STXVD2X V0, (R10+R3)
#endif
	RET

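// For reference, on ppc64le the swap above matches this Go sketch (a
// hypothetical helper, assuming gfP is four little-endian uint64 limbs;
// on big-endian targets only the limb order needs reversing, since the
// bytes within each limb are already stored big-endian):
//
//	func swapEndian(out, in *[4]uint64) {
//		for i := 0; i < 4; i++ {
//			out[i] = bits.ReverseBytes64(in[3-i])
//		}
//	}
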
#define X1L   V0
#define X1H   V1
#define Y1L   V2
#define Y1H   V3
#define T1L   V4
#define T1H   V5
#define T0    V4
#define T1    V5
#define T2    V6
#define SEL1  V7
#define ZERO  V8
#define CAR1  V9
#define CAR2  V10
#define TT0   V11
#define TT1   V12

#define PL    V30
#define PH    V31

#define gfpSubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZERO           \ // VZERO
	VSUBCUQ  X0, Y0, CAR1       \
	VSUBUQM  X0, Y0, T0         \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1   \
	VSUBUQM  ZERO, SEL1, SEL1   \ // VSQ
	                            \
	VADDCUQ  T0, PL, CAR1       \ // VACCQ
	VADDUQM  T0, PL, TT0        \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
	                            \
	VSEL     TT0, T0, SEL1, T0  \
	VSEL     TT1, T1, SEL1, T1

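// gfpSubInternal computes X - Y mod p without branches: subtract with
// borrow across the two 128-bit halves, add p back, and keep the re-added
// value only when the subtraction underflowed. A Go reference sketch of
// the same idea (hypothetical helper; gfP assumed to be four uint64 limbs):
//
//	func gfpSubRef(c, a, b, p *[4]uint64) {
//		var borrow, carry uint64
//		var t, u [4]uint64
//		for i := 0; i < 4; i++ { // t = a - b, tracking the borrow
//			t[i], borrow = bits.Sub64(a[i], b[i], borrow)
//		}
//		for i := 0; i < 4; i++ { // u = t + p, the wrapped-around candidate
//			u[i], carry = bits.Add64(t[i], p[i], carry)
//		}
//		mask := -borrow // all ones iff a < b
//		for i := 0; i < 4; i++ {
//			c[i] = (u[i] & mask) | (t[i] &^ mask)
//		}
//	}
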
TEXT ·gfpNeg(SB),0,$0-16
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4

	MOVD $16, R5
	LXVD2X (R4)(R0), Y1L
	LXVD2X (R4)(R5), Y1H

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	MOVD $·p2+0(SB), R6
	LXVD2X (R6)(R0), PL
	LXVD2X (R6)(R5), PH

	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, X1L
	gfpSubInternal(T1, T0, X1L, X1L, Y1H, Y1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R5+R3)
	RET

TEXT ·gfpSub(SB),0,$0-24
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4
	MOVD b+16(FP), R5

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	LXVD2X (R5)(R0), Y1L
	LXVD2X (R5)(R6), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	gfpSubInternal(T1, T0, X1H, X1L, Y1H, Y1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

#define gfpAddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1         \
	VADDUQM  X0, Y0, T0           \
	VADDECUQ X1, Y1, CAR1, T2     \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1     \
	                              \
	VSUBCUQ  T0, PL, CAR1         \ // VSCBIQ
	VSUBUQM  T0, PL, TT0          \
	VSUBECUQ T1, PH, CAR1, CAR2   \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1    \ // VSBIQ
	VSUBEUQM T2, ZERO, CAR2, SEL1 \
	                              \
	VSEL     TT0, T0, SEL1, T0    \
	VSEL     TT1, T1, SEL1, T1

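// gfpAddInternal is the matching branch-free addition: T = X + Y, then
// subtract p and keep the reduced value unless it underflowed. A Go sketch
// (hypothetical helper, same limb layout as above):
//
//	func gfpAddRef(c, a, b, p *[4]uint64) {
//		var carry, borrow uint64
//		var t, u [4]uint64
//		for i := 0; i < 4; i++ { // t = a + b (carry is the 2^256 bit)
//			t[i], carry = bits.Add64(a[i], b[i], carry)
//		}
//		for i := 0; i < 4; i++ { // u = t - p
//			u[i], borrow = bits.Sub64(t[i], p[i], borrow)
//		}
//		// keep t only when t < p, i.e. the subtraction borrowed
//		// and the addition did not carry out of 2^256
//		mask := -(borrow &^ carry)
//		for i := 0; i < 4; i++ {
//			c[i] = (t[i] & mask) | (u[i] &^ mask)
//		}
//	}
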
TEXT ·gfpAdd(SB),0,$0-24
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4
	MOVD b+16(FP), R5

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	LXVD2X (R5)(R0), Y1L
	LXVD2X (R5)(R6), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, ZERO

	gfpAddInternal(T1, T0, X1H, X1L, Y1H, Y1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

TEXT ·gfpDouble(SB),0,$0-16
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, ZERO

	gfpAddInternal(T1, T0, X1H, X1L, X1H, X1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

TEXT ·gfpTriple(SB),0,$0-16
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, ZERO

	gfpAddInternal(T1, T0, X1H, X1L, X1H, X1L)
	gfpAddInternal(T1, T0, T1, T0, X1H, X1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef T0
#undef T1
#undef T2
#undef SEL1
#undef ZERO
#undef CAR1
#undef CAR2
#undef TT0
#undef TT1
#undef PL
#undef PH

// Vector multiply word; emulates the s390x equivalents:
//
//	VMLF  x0, x1, out_low
//	VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low

// Vector multiply add word; emulates the s390x equivalents:
//
//	VMALF  x0, x1, y, out_low
//	VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW  y, one, TMP1; \
	VMULOUW  y, one, TMP2; \
	VMULEUW  x1, x2, out_hi; \
	VMULOUW  x1, x2, out_low; \
	VADDUDM  TMP1, out_hi, TMP1; \
	VADDUDM  TMP2, out_low, TMP2; \
	VMRGEW   TMP1, TMP2, out_hi; \
	VMRGOW   TMP1, TMP2, out_low

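// Per 32-bit lane of the vector registers, the two macros compute a plain
// widening multiply and a widening multiply-accumulate; in Go terms
// (illustrative only):
//
//	func vmult(x1, x2 uint32) (lo, hi uint32) {
//		p := uint64(x1) * uint64(x2)
//		return uint32(p), uint32(p >> 32)
//	}
//
//	func vmultAdd(x1, x2, y uint32) (lo, hi uint32) {
//		p := uint64(x1)*uint64(x2) + uint64(y) // fits: < 2^64
//		return uint32(p), uint32(p >> 32)
//	}
//
// The ONE operand lets VMULT_ADD widen y to 64 bits (y*1) so it can be
// added to the even/odd 64-bit products before VMRGEW/VMRGOW re-interleave
// them into hi/lo word vectors.
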
// ---------------------------------------
// gfpMulInternal
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define M1    V4
#define M0    V5
#define T0    V6
#define T1    V7
#define T2    V8
#define YDIG  V9

#define ADD1  V16
#define ADD1H V17
#define ADD2  V18
#define ADD2H V19
#define RED1  V20
#define RED1H V21
#define RED2  V22
#define RED2H V23
#define CAR1  V24
#define CAR1M V25

#define MK0   V30
#define K0    V31

// TMP1, TMP2 used in
// VMULT macros
#define TMP1  V13
#define TMP2  V27
#define ONE   V29 // 1s splatted by word

TEXT gfpMulInternal<>(SB), NOSPLIT, $0
	// ---------------------------------------------------------------------------/
	//	VREPF $3, Y0, YDIG
	VSPLTW $3, Y0, YDIG
	VSPLTISW $1, ONE

	//	VMLF  X0, YDIG, ADD1
	//	VMLF  X1, YDIG, ADD2
	//	VMLHF X0, YDIG, ADD1H
	//	VMLHF X1, YDIG, ADD2H
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSPLTISB $0, T2 // VZERO T2

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
/* *
 * ---+--------+--------+
 *  T2|   T1   |   T0   |
 * ---+--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   X1   |   X0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |  YDIG  |  YDIG  |
 *    +--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   M1   |   M0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |   MK0  |   MK0  |
 *    +--------+--------+
 *
 *   ---------------------
 *
 *    +--------+--------+
 *    |  ADD2  |  ADD1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | ADD2H  | ADD1H  |
 *  +--------+--------+
 *    +--------+--------+
 *    |  RED2  |  RED1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | RED2H  | RED1H  |
 *  +--------+--------+
 */
	//	VREPF $2, Y0, YDIG
	VSPLTW $2, Y0, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $1, Y0, YDIG
	VSPLTW $1, Y0, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $0, Y0, YDIG
	VSPLTW $0, Y0, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $3, Y1, YDIG
	VSPLTW $3, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $2, Y1, YDIG
	VSPLTW $2, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $1, Y1, YDIG
	VSPLTW $1, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $0, Y1, YDIG
	VSPLTW $0, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------

	//	VZERO   RED1
	//	VSCBIQ  M0, T0, CAR1
	//	VSQ     M0, T0, ADD1
	//	VSBCBIQ T1, M1, CAR1, CAR1M
	//	VSBIQ   T1, M1, CAR1, ADD2
	//	VSBIQ   T2, RED1, CAR1M, T2
	VSPLTISB $0, RED1 // VZERO RED1
	VSUBCUQ  T0, M0, CAR1        // VSCBIQ
	VSUBUQM  T0, M0, ADD1        // VSQ
	VSUBECUQ T1, M1, CAR1, CAR1M // VSBCBIQ
	VSUBEUQM T1, M1, CAR1, ADD2  // VSBIQ
	VSUBEUQM T2, RED1, CAR1M, T2 // VSBIQ

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL ADD1, T0, T2, T0
	VSEL ADD2, T1, T2, T1
	RET

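// The eight unrolled rounds above implement word-by-word Montgomery
// multiplication with 32-bit digits: T = X*Y*2^-256 mod p. A plain Go
// sketch of the same algorithm (a hypothetical reference, not a
// line-by-line transcription; x, y, p are little-endian 8x32-bit digit
// arrays and k0 = -p^-1 mod 2^32):
//
//	func montMulRef(x, y, p *[8]uint32, k0 uint32) (out [8]uint32) {
//		var t [10]uint32
//		for i := 0; i < 8; i++ {
//			var c uint64
//			for j := 0; j < 8; j++ { // t += x * y[i]
//				v := uint64(t[j]) + uint64(x[j])*uint64(y[i]) + c
//				t[j], c = uint32(v), v>>32
//			}
//			v := uint64(t[8]) + c
//			t[8], t[9] = uint32(v), t[9]+uint32(v>>32)
//
//			m := t[0] * k0 // MK0: makes t + m*p divisible by 2^32
//			c = 0
//			for j := 0; j < 8; j++ {
//				v := uint64(t[j]) + uint64(m)*uint64(p[j]) + c
//				t[j], c = uint32(v), v>>32
//			}
//			v = uint64(t[8]) + c
//			t[8], t[9] = uint32(v), t[9]+uint32(v>>32)
//			copy(t[:], t[1:]) // divide by 2^32 (t[0] is zero now)
//			t[9] = 0
//		}
//		// final conditional subtraction, as the VSEL pair above
//		var b uint64
//		var u [8]uint32
//		for j := 0; j < 8; j++ {
//			v := uint64(t[j]) - uint64(p[j]) - b
//			u[j], b = uint32(v), (v>>32)&1
//		}
//		if uint64(t[8]) >= b { // t >= p: keep the reduced value
//			copy(out[:], u[:])
//		} else {
//			copy(out[:], t[:8])
//		}
//		return
//	}
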
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0
#undef TMP1
#undef TMP2
#undef ONE

// func gfpMul(c, a, b *gfP)
#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define N     R8

#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define M0    V5
#define M1    V4
#define T0    V6
#define T1    V7
#define K0    V31

TEXT ·gfpMul(SB),NOSPLIT,$0
	MOVD	c+0(FP), res_ptr
	MOVD	a+8(FP), x_ptr
	MOVD	b+16(FP), y_ptr

	MOVD $16, R16

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	MOVD $·p2+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), M0
	LXVD2X (CPOOL)(R16), M1

	XXPERMDI M0, M0, $2, M0
	XXPERMDI M1, M1, $2, M1

	MOVD $·np+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), K0
	VSPLTW $1, K0, K0

	CALL gfpMulInternal<>(SB)

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)

	RET

// func gfpSqr(res, in *gfP, n int)
TEXT ·gfpSqr(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD n+16(FP), N
	MOVD $16, R16

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	MOVD $·p2+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), M0
	LXVD2X (CPOOL)(R16), M1

	XXPERMDI M0, M0, $2, M0
	XXPERMDI M1, M1, $2, M1

	MOVD $·np+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), K0
	VSPLTW $1, K0, K0

sqrLoop:
	// Squaring uses the same value for both multiplicands
	VOR	X0, X0, Y0
	VOR	X1, X1, Y1
	CALL gfpMulInternal<>(SB)

	ADD	$-1, N
	CMP	$0, N
	BEQ	done

	VOR	T0, T0, X0
	VOR	T1, T1, X1
	BR	sqrLoop

done:
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET

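// gfpSqr performs n Montgomery squarings in sequence, i.e. it computes
// in^(2^n) in Montgomery form. In terms of the montMulRef sketch above
// (hypothetical helper):
//
//	func gfpSqrRef(res, in, p *[8]uint32, k0 uint32, n int) {
//		t := *in
//		for i := 0; i < n; i++ {
//			t = montMulRef(&t, &t, p, k0)
//		}
//		*res = t
//	}
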
#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef N
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0

/* ---------------------------------------*/
#define res_ptr R3
#define x_ptr R4
#define CPOOL R7

#define M0    V5
#define M1    V4
#define T0    V6
#define T1    V7
#define T2    V8

#define ADD1  V16
#define ADD1H V17
#define ADD2  V18
#define ADD2H V19
#define RED1  V20
#define RED1H V21
#define RED2  V22
#define RED2H V23
#define CAR1  V24
#define CAR1M V25

#define MK0   V30
#define K0    V31

// TMP1, TMP2 used in
// VMULT macros
#define TMP1  V13
#define TMP2  V27
#define ONE   V29 // 1s splatted by word

// func gfpFromMont(res, in *gfP)
TEXT ·gfpFromMont(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16

	LXVD2X (R0)(x_ptr), T0
	LXVD2X (R16)(x_ptr), T1

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	MOVD $·p2+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), M0
	LXVD2X (CPOOL)(R16), M1

	XXPERMDI M0, M0, $2, M0
	XXPERMDI M1, M1, $2, M1

	MOVD $·np+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), K0
	VSPLTW $1, K0, K0

	// ---------------------------------------------------------------------------/
	VSPLTISW $1, ONE
	VSPLTISB $0, T2 // VZERO T2

	MOVD $8, R5
	MOVD R5, CTR

loop:
	VMULUWM T0, K0, MK0
	VSPLTW $3, MK0, MK0

	VMULT_ADD(M0, MK0, T0, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, T1, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1H, RED1, CAR1M // VACCQ
	VADDUQM RED1H, RED1, T0    // VAQ

	// << ready for next MK0

	VADDECUQ RED2H, RED2, CAR1M, T2 // VACCCQ
	VADDEUQM RED2H, RED2, CAR1M, T1 // VACQ

	BDNZ loop

	// ---------------------------------------------------
	VSPLTISB $0, RED1 // VZERO RED1
	VSUBCUQ  T0, M0, CAR1        // VSCBIQ
	VSUBUQM  T0, M0, ADD1        // VSQ
	VSUBECUQ T1, M1, CAR1, CAR1M // VSBCBIQ
	VSUBEUQM T1, M1, CAR1, ADD2  // VSBIQ
	VSUBEUQM T2, RED1, CAR1M, T2 // VSBIQ

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL ADD1, T0, T2, T0
	VSEL ADD2, T1, T2, T1

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
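
// Converting out of Montgomery form is a Montgomery multiplication by 1,
// so the loop above only needs the reduction half of gfpMulInternal, run
// once per 32-bit digit. A Go sketch (hypothetical reference, same
// conventions as montMulRef above):
//
//	func fromMontRef(t, p *[8]uint32, k0 uint32) {
//		for i := 0; i < 8; i++ {
//			m := t[0] * k0 // MK0
//			var c uint64
//			for j := 0; j < 8; j++ {
//				v := uint64(t[j]) + uint64(m)*uint64(p[j]) + c
//				t[j], c = uint32(v), v>>32
//			}
//			copy(t[:7], t[1:]) // t[0] is zero: divide by 2^32
//			t[7] = uint32(c)
//		}
//		// followed by the same conditional subtraction of p as above
//	}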