github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp2_g1_arm64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #define res_ptr R0  // base register of the x3out/y3out/z3out address macros
     6  #define a_ptr R1  // base register of the x1in/y1in/z1in address macros
     7  #define b_ptr R2  // base register of the x2in/y2in/z2in address macros
     8  
     9  #define acc0 R3  // acc0-acc7: accumulator / scratch limbs of the field routines
    10  #define acc1 R4
    11  #define acc2 R5
    12  #define acc3 R6
    13  
    14  #define acc4 R7
    15  #define acc5 R8
    16  #define acc6 R9
    17  #define acc7 R10
    18  #define t0 R11  // t0, t1: short-lived temporaries
    19  #define t1 R12
    20  #define const0 R13  // const0-const3: four 64-bit words the exported routines load from ·p2
    21  #define const1 R14
    22  #define const2 R15
    23  #define const3 R16
    24  
    25  #define hlp0 R17  // scratch: carry word in the inline macros, per-step multiplier in the reduction steps
    26  #define hlp1 res_ptr  // same register as res_ptr (R0); loaded from ·np before any multiplication
    27  
    28  #define x0 R19  // x0-x3: first 256-bit operand, x0 is the least significant limb
    29  #define x1 R20
    30  #define x2 R21
    31  #define x3 R22
    32  #define y0 R23  // y0-y3: second 256-bit operand, y0 is the least significant limb
    33  #define y1 R24
    34  #define y2 R25
    35  #define y3 R26
    36  
    37  /* ---------------------------------------*/
    38  // gfpSubInternal: (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
        //
        // The 256-bit difference y - x is computed first; if that subtraction
        // borrows, (const3, const2, const1, const0) is added back once.
        //
        // In:   x0-x3, y0-y3 (limb 0 least significant), const0-const3.
        // Out:  x0-x3.
        // Uses: acc0-acc7, t0 and the condition flags. y0-y3 are not modified.
    39  TEXT gfpSubInternal(SB),NOSPLIT,$0
    40  	SUBS	x0, y0, acc0  // acc0..acc3 = y - x, borrow propagated through the flags
    41  	SBCS	x1, y1, acc1
    42  	SBCS	x2, y2, acc2
    43  	SBCS	x3, y3, acc3
    44  	SBC	$0, ZR, t0  // t0 = 0 - 0 - borrow: non-zero only when y < x
    45  
    46  	ADDS	const0, acc0, acc4  // acc4..acc7 = (y - x) + const, the value needed after a borrow
    47  	ADCS	const1, acc1, acc5
    48  	ADCS	const2, acc2, acc6
    49  	ADC	    const3, acc3, acc7
    50  
    51  	ANDS	$1, t0  // EQ <=> low bit of t0 clear <=> no borrow
    52  	CSEL	EQ, acc0, acc4, x0  // x = no borrow ? (y - x) : (y - x) + const
    53  	CSEL	EQ, acc1, acc5, x1
    54  	CSEL	EQ, acc2, acc6, x2
    55  	CSEL	EQ, acc3, acc7, x3
    56  
    57  	RET
    58  
    59  /* ---------------------------------------*/
    60  // gfpMulInternal: (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
        //
        // Four rounds, one per limb of y. Each round adds y[i] * x to the running
        // accumulator and is followed by a reduction step: hlp0 = (lowest live
        // accumulator limb) * hlp1, then hlp0 * (const3..const0) is added in.
        // The high words produced by the reduction steps are kept in acc0-acc3
        // (recycled in rotation), the high words of the products in acc4-acc7;
        // the two halves are summed at the end and const is subtracted once if
        // the sum is not smaller than const.
        // NOTE(review): this looks like word-by-word Montgomery multiplication with
        // hlp1 = -p^-1 mod 2^64 (the callers load hlp1 from ·np and const from ·p2),
        // which would make the lowest limb vanish in every step - confirm against
        // the definitions of ·np and ·p2.
        //
        // In:   x0-x3, y0-y3 (limb 0 least significant), const0-const3, hlp1.
        // Out:  y0-y3.
        // Uses: acc0-acc7, t0, t1, hlp0 and the condition flags. y0 and y1 are also
        //       used as scratch once their limb has been consumed. x0-x3,
        //       const0-const3 and hlp1 are not modified.
    61  TEXT gfpMulInternal(SB),NOSPLIT,$0
    62  	// y[0] * x
    63  	MUL	y0, x0, acc0
    64  	UMULH	y0, x0, acc1
    65  
    66  	MUL	y0, x1, t0
    67  	ADDS	t0, acc1
    68  	UMULH	y0, x1, acc2
    69  
    70  	MUL	y0, x2, t0
    71  	ADCS	t0, acc2
    72  	UMULH	y0, x2, acc3
    73  
    74  	MUL	y0, x3, t0
    75  	ADCS	t0, acc3
    76  	UMULH	y0, x3, acc4
    77  	ADC	$0, acc4
    78  	// First reduction step
    79  	MUL	acc0, hlp1, hlp0  // hlp0 = low 64 bits of acc0 * hlp1: multiplier for this step
    80  
    81  	MUL	const0, hlp0, t0
    82  	ADDS	t0, acc0, acc0
    83  	UMULH	const0, hlp0, t1
    84  
    85  	MUL	const1, hlp0, t0
    86  	ADCS	t0, acc1, acc1
    87  	UMULH	const1, hlp0, y0  // y0 is free: y[0] was fully consumed above
    88  
    89  	MUL	const2, hlp0, t0
    90  	ADCS	t0, acc2, acc2
    91  	UMULH	const2, hlp0, acc0  // acc0 is reused as a temporary for this high word
    92  
    93  	MUL	const3, hlp0, t0
    94  	ADCS	t0, acc3, acc3
    95  
    96  	UMULH	const3, hlp0, hlp0
    97  	ADC	$0, acc4  // carry of the low-word chain goes into acc4
    98  
    99  	ADDS	t1, acc1, acc1  // now add the high words, shifted up by one limb
   100  	ADCS	y0, acc2, acc2
   101  	ADCS	acc0, acc3, acc3
   102  	ADC	$0, hlp0, acc0  // acc0 becomes the top limb produced by this step
   103  
   104  	// y[1] * x
   105  	MUL	y1, x0, t0
   106  	ADDS	t0, acc1
   107  	UMULH	y1, x0, t1
   108  
   109  	MUL	y1, x1, t0
   110  	ADCS	t0, acc2
   111  	UMULH	y1, x1, y0
   112  
   113  	MUL	y1, x2, t0
   114  	ADCS	t0, acc3
   115  	UMULH	y1, x2, hlp0
   116  
   117  	MUL	y1, x3, t0
   118  	ADCS	t0, acc4
   119  	UMULH	y1, x3, y1  // last use of y[1]; y1 now holds a high word
   120  	ADC	$0, ZR, acc5
   121  
   122  	ADDS	t1, acc2
   123  	ADCS	y0, acc3
   124  	ADCS	hlp0, acc4
   125  	ADC	y1, acc5
   126  	// Second reduction step
   127  	MUL	acc1, hlp1, hlp0
   128  
   129  	MUL	const0, hlp0, t0
   130  	ADDS	t0, acc1, acc1
   131  	UMULH	const0, hlp0, t1
   132  
   133  	MUL	const1, hlp0, t0
   134  	ADCS	t0, acc2, acc2
   135  	UMULH	const1, hlp0, y0
   136  
   137  	MUL	const2, hlp0, t0
   138  	ADCS	t0, acc3, acc3
   139  	UMULH	const2, hlp0, acc1
   140  
   141  	MUL	const3, hlp0, t0
   142  	ADCS	t0, acc0, acc0
   143  
   144  	UMULH	const3, hlp0, hlp0
   145  	ADC	$0, acc5
   146  
   147  	ADDS	t1, acc2, acc2
   148  	ADCS	y0, acc3, acc3
   149  	ADCS	acc1, acc0, acc0
   150  	ADC	$0, hlp0, acc1  // acc1 becomes the top limb produced by this step
   151  
   152  	// y[2] * x
   153  	MUL	y2, x0, t0
   154  	ADDS	t0, acc2
   155  	UMULH	y2, x0, t1
   156  
   157  	MUL	y2, x1, t0
   158  	ADCS	t0, acc3
   159  	UMULH	y2, x1, y0
   160  
   161  	MUL	y2, x2, t0
   162  	ADCS	t0, acc4
   163  	UMULH	y2, x2, y1
   164  
   165  	MUL	y2, x3, t0
   166  	ADCS	t0, acc5
   167  	UMULH	y2, x3, hlp0
   168  	ADC	$0, ZR, acc6
   169  
   170  	ADDS	t1, acc3
   171  	ADCS	y0, acc4
   172  	ADCS	y1, acc5
   173  	ADC	hlp0, acc6
   174  	// Third reduction step
   175  	MUL	acc2, hlp1, hlp0
   176  
   177  	MUL	const0, hlp0, t0
   178  	ADDS	t0, acc2, acc2
   179  	UMULH	const0, hlp0, t1
   180  
   181  	MUL	const1, hlp0, t0
   182  	ADCS	t0, acc3, acc3
   183  	UMULH	const1, hlp0, y0
   184  
   185  	MUL	const2, hlp0, t0
   186  	ADCS	t0, acc0, acc0
   187  	UMULH	const2, hlp0, acc2
   188  
   189  	MUL	const3, hlp0, t0
   190  	ADCS	t0, acc1, acc1
   191  
   192  	UMULH	const3, hlp0, hlp0
   193  	ADC	$0, acc6
   194  
   195  	ADDS	t1, acc3, acc3
   196  	ADCS	y0, acc0, acc0
   197  	ADCS	acc2, acc1, acc1
   198  	ADC	$0, hlp0, acc2  // acc2 becomes the top limb produced by this step
   199  	// y[3] * x
   200  	MUL	y3, x0, t0
   201  	ADDS	t0, acc3
   202  	UMULH	y3, x0, t1
   203  
   204  	MUL	y3, x1, t0
   205  	ADCS	t0, acc4
   206  	UMULH	y3, x1, y0
   207  
   208  	MUL	y3, x2, t0
   209  	ADCS	t0, acc5
   210  	UMULH	y3, x2, y1
   211  
   212  	MUL	y3, x3, t0
   213  	ADCS	t0, acc6
   214  	UMULH	y3, x3, hlp0
   215  	ADC	$0, ZR, acc7
   216  
   217  	ADDS	t1, acc4
   218  	ADCS	y0, acc5
   219  	ADCS	y1, acc6
   220  	ADC	hlp0, acc7
   221  	// Last reduction step
   222  	MUL	acc3, hlp1, hlp0
   223  
   224  	MUL	const0, hlp0, t0
   225  	ADDS	t0, acc3, acc3
   226  	UMULH	const0, hlp0, t1
   227  
   228  	MUL	const1, hlp0, t0
   229  	ADCS	t0, acc0, acc0
   230  	UMULH	const1, hlp0, y0
   231  
   232  	MUL	const2, hlp0, t0
   233  	ADCS	t0, acc1, acc1
   234  	UMULH	const2, hlp0, acc3
   235  
   236  	MUL	const3, hlp0, t0
   237  	ADCS	t0, acc2, acc2
   238  
   239  	UMULH	const3, hlp0, hlp0
   240  	ADC	$0, acc7
   241  
   242  	ADDS	t1, acc0, acc0
   243  	ADCS	y0, acc1, acc1
   244  	ADCS	acc3, acc2, acc2
   245  	ADC	$0, hlp0, acc3  // acc3 becomes the top limb produced by this step
   246  
   247  	// Add bits [511:256] of the mul result
   248  	ADDS	acc4, acc0, acc0
   249  	ADCS	acc5, acc1, acc1
   250  	ADCS	acc6, acc2, acc2
   251  	ADCS	acc7, acc3, acc3
   252  	ADC	$0, ZR, acc4  // acc4 = carry out of the 256-bit sum (fifth limb)
   253  
   254  	SUBS	const0, acc0, t0  // trial subtraction: (acc4:acc3..acc0) - const
   255  	SBCS	const1, acc1, t1
   256  	SBCS	const2, acc2, acc6
   257  	SBCS	const3, acc3, acc7
   258  	SBCS	$0, acc4, acc4  // the borrow out of the fifth limb drives the selects below
   259  
   260  	CSEL	CS, t0, acc0, y0  // CS (no borrow): keep the subtracted value, else the original sum
   261  	CSEL	CS, t1, acc1, y1
   262  	CSEL	CS, acc6, acc2, y2
   263  	CSEL	CS, acc7, acc3, y3
   264      
   265      RET
   266  
   267  /* ---------------------------------------*/
   268  // gfpSqrInternal: (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
        //
        // The full 512-bit square is built first in acc0-acc7: the products
        // x[i]*x[j] with i < j are accumulated once and doubled, then the squares
        // x[i]*x[i] are added. Four reduction steps follow, each computing
        // hlp0 = (lowest live limb) * hlp1 and adding hlp0 * (const3..const0);
        // their high words are kept in acc0-acc3 (recycled in rotation). Finally
        // acc4-acc7 are added and const is subtracted once if the sum is not
        // smaller than const.
        // NOTE(review): presumably a Montgomery squaring with hlp1 = -p^-1 mod 2^64
        // (callers load hlp1 from ·np and const from ·p2) - confirm.
        //
        // In:   x0-x3 (x0 least significant), const0-const3, hlp1.
        // Out:  y0-y3.
        // Uses: acc0-acc7, t0, t1, hlp0 and the condition flags; y0 doubles as
        //       scratch during the reduction steps. x0-x3, const0-const3 and hlp1
        //       are not modified.
   269  TEXT gfpSqrInternal(SB),NOSPLIT,$0
   270  	// x[1:] * x[0]
   271  	MUL	x0, x1, acc1
   272  	UMULH	x0, x1, acc2
   273  
   274  	MUL	x0, x2, t0
   275  	ADDS	t0, acc2, acc2
   276  	UMULH	x0, x2, acc3
   277  
   278  	MUL	x0, x3, t0
   279  	ADCS	t0, acc3, acc3
   280  	UMULH	x0, x3, acc4
   281  	ADC	$0, acc4, acc4
   282  	// x[2:] * x[1]
   283  	MUL	x1, x2, t0
   284  	ADDS	t0, acc3
   285  	UMULH	x1, x2, t1
   286  	ADCS	t1, acc4
   287  	ADC	$0, ZR, acc5
   288  
   289  	MUL	x1, x3, t0
   290  	ADDS	t0, acc4
   291  	UMULH	x1, x3, t1
   292  	ADC	t1, acc5
   293  	// x[3] * x[2]
   294  	MUL	x2, x3, t0
   295  	ADDS	t0, acc5
   296  	UMULH	x2, x3, acc6
   297  	ADC	$0, acc6
   298  
   299  	MOVD	$0, acc7
   300  	// *2
   301  	ADDS	acc1, acc1
   302  	ADCS	acc2, acc2
   303  	ADCS	acc3, acc3
   304  	ADCS	acc4, acc4
   305  	ADCS	acc5, acc5
   306  	ADCS	acc6, acc6
   307  	ADC	$0, acc7  // acc7 = bit shifted out of acc6 by the doubling
   308  	// Missing products
   309  	MUL	x0, x0, acc0
   310  	UMULH	x0, x0, t0
   311  	ADDS	t0, acc1, acc1
   312  
   313  	MUL	x1, x1, t0
   314  	ADCS	t0, acc2, acc2
   315  	UMULH	x1, x1, t1
   316  	ADCS	t1, acc3, acc3
   317  
   318  	MUL	x2, x2, t0
   319  	ADCS	t0, acc4, acc4
   320  	UMULH	x2, x2, t1
   321  	ADCS	t1, acc5, acc5
   322  
   323  	MUL	x3, x3, t0
   324  	ADCS	t0, acc6, acc6
   325  	UMULH	x3, x3, t1
   326  	ADCS	t1, acc7, acc7
   327  	// First reduction step
   328  	MUL	acc0, hlp1, hlp0  // hlp0 = low 64 bits of acc0 * hlp1: multiplier for this step
   329  
   330  	MUL	const0, hlp0, t0
   331  	ADDS	t0, acc0, acc0
   332  	UMULH	const0, hlp0, t1
   333  
   334  	MUL	const1, hlp0, t0
   335  	ADCS	t0, acc1, acc1
   336  	UMULH	const1, hlp0, y0  // y0 is scratch here; the result is written only at the end
   337  
   338  	MUL	const2, hlp0, t0
   339  	ADCS	t0, acc2, acc2
   340  	UMULH	const2, hlp0, acc0  // acc0 is reused as a temporary for this high word
   341  
   342  	MUL	const3, hlp0, t0
   343  	ADCS	t0, acc3, acc3
   344  
   345  	UMULH	const3, hlp0, hlp0
   346  	ADC	$0, hlp0  // carry of the low-word chain goes into the top high word
   347  
   348  	ADDS	t1, acc1, acc1  // now add the high words, shifted up by one limb
   349  	ADCS	y0, acc2, acc2
   350  	ADCS	acc0, acc3, acc3
   351  	ADC	$0, hlp0, acc0  // acc0 becomes the top limb produced by this step
   352  	// Second reduction step
   353  	MUL	acc1, hlp1, hlp0
   354  
   355  	MUL	const0, hlp0, t0
   356  	ADDS	t0, acc1, acc1
   357  	UMULH	const0, hlp0, t1
   358  
   359  	MUL	const1, hlp0, t0
   360  	ADCS	t0, acc2, acc2
   361  	UMULH	const1, hlp0, y0
   362  
   363  	MUL	const2, hlp0, t0
   364  	ADCS	t0, acc3, acc3
   365  	UMULH	const2, hlp0, acc1
   366  
   367  	MUL	const3, hlp0, t0
   368  	ADCS	t0, acc0, acc0
   369  
   370  	UMULH	const3, hlp0, hlp0
   371  	ADC	$0, hlp0
   372  
   373  	ADDS	t1, acc2, acc2
   374  	ADCS	y0, acc3, acc3
   375  	ADCS	acc1, acc0, acc0
   376  	ADC	$0, hlp0, acc1  // acc1 becomes the top limb produced by this step
   377  	// Third reduction step
   378  	MUL	acc2, hlp1, hlp0
   379  
   380  	MUL	const0, hlp0, t0
   381  	ADDS	t0, acc2, acc2
   382  	UMULH	const0, hlp0, t1
   383  
   384  	MUL	const1, hlp0, t0
   385  	ADCS	t0, acc3, acc3
   386  	UMULH	const1, hlp0, y0
   387  
   388  	MUL	const2, hlp0, t0
   389  	ADCS	t0, acc0, acc0
   390  	UMULH	const2, hlp0, acc2
   391  
   392  	MUL	const3, hlp0, t0
   393  	ADCS	t0, acc1, acc1
   394  
   395  	UMULH	const3, hlp0, hlp0
   396  	ADC	$0, hlp0
   397  
   398  	ADDS	t1, acc3, acc3
   399  	ADCS	y0, acc0, acc0
   400  	ADCS	acc2, acc1, acc1
   401  	ADC	$0, hlp0, acc2  // acc2 becomes the top limb produced by this step
   402  
   403  	// Last reduction step
   404  	MUL	acc3, hlp1, hlp0
   405  
   406  	MUL	const0, hlp0, t0
   407  	ADDS	t0, acc3, acc3
   408  	UMULH	const0, hlp0, t1
   409  
   410  	MUL	const1, hlp0, t0
   411  	ADCS	t0, acc0, acc0
   412  	UMULH	const1, hlp0, y0
   413  
   414  	MUL	const2, hlp0, t0
   415  	ADCS	t0, acc1, acc1
   416  	UMULH	const2, hlp0, acc3
   417  
   418  	MUL	const3, hlp0, t0
   419  	ADCS	t0, acc2, acc2
   420  
   421  	UMULH	const3, hlp0, hlp0
   422  	ADC	$0, acc7  // unlike the three steps above the carry goes to acc7; hlp0 and acc7 are both summed into acc3 below
   423  
   424  	ADDS	t1, acc0, acc0
   425  	ADCS	y0, acc1, acc1
   426  	ADCS	acc3, acc2, acc2
   427  	ADC	$0, hlp0, acc3  // acc3 becomes the top limb produced by this step
   428  	// Add bits [511:256] of the sqr result
   429  	ADDS	acc4, acc0, acc0
   430  	ADCS	acc5, acc1, acc1
   431  	ADCS	acc6, acc2, acc2
   432  	ADCS	acc7, acc3, acc3
   433  	ADC	$0, ZR, acc4  // acc4 = carry out of the 256-bit sum (fifth limb)
   434  
   435  	SUBS	const0, acc0, t0  // trial subtraction: (acc4:acc3..acc0) - const
   436  	SBCS	const1, acc1, t1
   437  	SBCS	const2, acc2, acc6
   438  	SBCS	const3, acc3, acc7
   439  	SBCS	$0, acc4, acc4  // the borrow out of the fifth limb drives the selects below
   440  
   441  	CSEL	CS, t0, acc0, y0  // CS (no borrow): keep the subtracted value, else the original sum
   442  	CSEL	CS, t1, acc1, y1
   443  	CSEL	CS, acc6, acc2, y2
   444  	CSEL	CS, acc7, acc3, y3
   445      RET
   446  
   447  /* ---------------------------------------*/
   448  // gfpMulBy2Inline: (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
        //
        // Doubles y into x with the carry captured in hlp0, then subtracts
        // (const3..const0) once unless that subtraction borrows out of the
        // five-limb value (CC = borrow keeps the unreduced sum).
        // In:   y0-y3, const0-const3.   Out: x0-x3.
        // Uses: acc0-acc3, hlp0 and the condition flags. y0-y3 are not modified.
        // Comments are kept above the #define on purpose: a // comment inside a
        // continuation line would swallow the trailing backslash.
   449  #define gfpMulBy2Inline       \
   450  	ADDS	y0, y0, x0;    \
   451  	ADCS	y1, y1, x1;    \
   452  	ADCS	y2, y2, x2;    \
   453  	ADCS	y3, y3, x3;    \
   454  	ADC	$0, ZR, hlp0;  \
   455  	SUBS	const0, x0, acc0;   \
   456  	SBCS	const1, x1, acc1;\
   457  	SBCS	const2, x2, acc2;    \
   458  	SBCS	const3, x3, acc3;\
   459  	SBCS	$0, hlp0, hlp0;\
   460  	CSEL	CC, x0, acc0, x0;\
   461  	CSEL	CC, x1, acc1, x1;\
   462  	CSEL	CC, x2, acc2, x2;\
   463  	CSEL	CC, x3, acc3, x3;    
   464  
   465  // gfpMulBy2Inline2: (y3, y2, y1, y0) = 2(y3, y2, y1, y0)
        //
        // Same computation as gfpMulBy2Inline, but the selected result is written
        // back to y0-y3. x0-x3 serve as scratch and are left holding the
        // unreduced low four limbs of 2*y, so any previous value in x is lost.
        // In:   y0-y3, const0-const3.   Out: y0-y3.
        // Uses: x0-x3, acc0-acc3, hlp0 and the condition flags.
        // Comments are kept above the #define on purpose: a // comment inside a
        // continuation line would swallow the trailing backslash.
   466  #define gfpMulBy2Inline2       \
   467  	ADDS	y0, y0, x0;    \
   468  	ADCS	y1, y1, x1;    \
   469  	ADCS	y2, y2, x2;    \
   470  	ADCS	y3, y3, x3;    \
   471  	ADC	$0, ZR, hlp0;  \
   472  	SUBS	const0, x0, acc0;   \
   473  	SBCS	const1, x1, acc1;\
   474  	SBCS	const2, x2, acc2;    \
   475  	SBCS	const3, x3, acc3;\
   476  	SBCS	$0, hlp0, hlp0;\
   477  	CSEL	CC, x0, acc0, y0;\
   478  	CSEL	CC, x1, acc1, y1;\
   479  	CSEL	CC, x2, acc2, y2;\
   480  	CSEL	CC, x3, acc3, y3;    
   481  
   482  /* ---------------------------------------*/
   483  // gfpAddInline: (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
        //
        // Adds y to x with the carry captured in hlp0, then subtracts
        // (const3..const0) once unless that subtraction borrows out of the
        // five-limb value (CC = borrow keeps the unreduced sum).
        // In:   x0-x3, y0-y3, const0-const3.   Out: x0-x3.
        // Uses: acc0-acc3, hlp0 and the condition flags. y0-y3 are not modified.
        // Comments are kept above the #define on purpose: a // comment inside a
        // continuation line would swallow the trailing backslash.
   484  #define gfpAddInline          \
   485  	ADDS	y0, x0, x0;    \
   486  	ADCS	y1, x1, x1;    \
   487  	ADCS	y2, x2, x2;    \
   488  	ADCS	y3, x3, x3;    \
   489  	ADC	$0, ZR, hlp0;  \
   490  	SUBS	const0, x0, acc0;   \
   491  	SBCS	const1, x1, acc1;\
   492  	SBCS	const2, x2, acc2;    \
   493  	SBCS	const3, x3, acc3;\
   494  	SBCS	$0, hlp0, hlp0;\
   495  	CSEL	CC, x0, acc0, x0;\
   496  	CSEL	CC, x1, acc1, x1;\
   497  	CSEL	CC, x2, acc2, x2;\
   498  	CSEL	CC, x3, acc3, x3;
   499  
   500  /* ---------------------------------------*/
        // Operand addressing. Each of a_ptr, b_ptr and res_ptr addresses up to
        // three consecutive 32-byte field elements, named x, y and z, at byte
        // offsets 0, 32 and 64; `off` is a byte offset inside one element.
   501  #define x1in(off) (off)(a_ptr)
   502  #define y1in(off) (off + 32)(a_ptr)
   503  #define z1in(off) (off + 64)(a_ptr)
   504  #define x2in(off) (off)(b_ptr)
   505  #define y2in(off) (off + 32)(b_ptr)
   506  #define z2in(off) (off + 64)(b_ptr)
   507  #define x3out(off) (off)(res_ptr)
   508  #define y3out(off) (off + 32)(res_ptr)
   509  #define z3out(off) (off + 64)(res_ptr)
        // LDx/LDy load and STx/STy store the four limbs of x0-x3 / y0-y3; `src` is
        // the name of one of the address macros above or of a stack slot macro.
   510  #define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
   511  #define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
   512  #define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
   513  #define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
        // y2x copies y0-y3 into x0-x3; x2y copies x0-x3 into y0-y3.
   514  #define y2x      MOVD y0, x0; MOVD y1, x1; MOVD y2, x2; MOVD y3, x3
   515  #define x2y      MOVD x0, y0; MOVD x1, y1; MOVD x2, y2; MOVD x3, y3
   516  
   517  /* ---------------------------------------*/
        // 32-byte scratch slots in the stack frame of the routine that uses them;
        // slot i starts at 32*i + 8 from RSP.
        // NOTE(review): the 8 bytes skipped at 0(RSP) are presumably the saved
        // link register - confirm against the Go arm64 frame layout.
   518  #define tmp0(off)	(32*0 + 8 + off)(RSP)
   519  #define tmp1(off)	(32*1 + 8 + off)(RSP)
   520  #define tmp2(off) (32*2 + 8 + off)(RSP)
   521  
   522  // func gfp2Mul(c, a, b *gfP2)
        //
        // With .x the 32-byte element at offset 0 and .y the one at offset 32 of
        // each argument (the x*in / y*in macros), this stores
        //   c.x = a.x*b.y + a.y*b.x
        //   c.y = a.y*b.y - 2*a.x*b.x
        // using three gfpMulInternal calls: a.y*b.y, a.x*b.x and
        // (a.x + a.y)*(b.x + b.y). All products and sums are whatever
        // gfpMulInternal / gfpAddInline / gfpSubInternal compute for const = ·p2.
        // All loads from a and b happen before the first store to c.
        // Frame ($104 = 8 + 3*32): tmp0 = a.y*b.y, tmp1 = a.x*b.x, tmp2 = a.x + a.y.
   523  TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
   524  	MOVD	in1+8(FP), a_ptr
   525  	MOVD	in2+16(FP), b_ptr
   526  
   527  	MOVD	·np+0x00(SB), hlp1  // hlp1 shares R0 with res_ptr, so res is loaded only after the last multiplication
   528  	LDP	·p2+0x00(SB), (const0, const1)
   529  	LDP	·p2+0x10(SB), (const2, const3)
   530  	
   531  	LDx (y1in)
   532  	LDy (y2in)
   533  	CALL gfpMulInternal(SB)
   534  	STy (tmp0)  // tmp0 = a.y * b.y
   535  
   536  	LDx (x1in)
   537  	LDy (x2in)
   538  	CALL gfpMulInternal(SB)
   539  	STy (tmp1)  // tmp1 = a.x * b.x
   540  
   541  	LDx (x1in)
   542  	LDy (y1in)
   543  	gfpAddInline
   544  	STx (tmp2)  // tmp2 = a.x + a.y
   545  
   546  	LDx (x2in)
   547  	LDy (y2in)
   548  	gfpAddInline  // x = b.x + b.y
   549  	LDy (tmp2)
   550  	CALL gfpMulInternal(SB)  // y = (a.x + a.y) * (b.x + b.y)
   551  
   552  	LDx (tmp0)
   553  	CALL gfpSubInternal(SB)  // x = y - tmp0
   554  	x2y
   555  	LDx (tmp1)
   556  	CALL gfpSubInternal(SB)  // x = y - tmp1 = a.x*b.y + a.y*b.x
   557  	MOVD	res+0(FP), res_ptr  // not use hlp1 any more
   558  	STx (x3out)
   559  
   560  	LDy (tmp1)
   561  	gfpMulBy2Inline  // x = 2 * tmp1
   562  	LDy (tmp0)
   563  	CALL gfpSubInternal(SB)  // x = tmp0 - 2*tmp1 = a.y*b.y - 2*a.x*b.x
   564  	STx (y3out)
   565  
   566  	RET
   567  
   568  // func gfp2MulU(c, a, b *gfP2)
        //
        // With .x the 32-byte element at offset 0 and .y the one at offset 32 of
        // each argument, this stores
        //   c.x = a.y*b.y - 2*a.x*b.x
        //   c.y = -2*(a.x*b.y + a.y*b.x)
        // i.e. the same two intermediate values gfp2Mul computes, with the cross
        // term doubled and negated (0 - 2*t) and the two outputs swapped.
        // All loads from a and b happen before the first store to c.
        // Frame ($104 = 8 + 3*32): tmp0 = a.y*b.y, tmp1 = a.x*b.x, tmp2 = a.x + a.y.
   569  TEXT ·gfp2MulU(SB),NOSPLIT,$104-24
   570  	MOVD	in1+8(FP), a_ptr
   571  	MOVD	in2+16(FP), b_ptr
   572  
   573  	MOVD	·np+0x00(SB), hlp1  // hlp1 shares R0 with res_ptr, so res is loaded only after the last multiplication
   574  	LDP	·p2+0x00(SB), (const0, const1)
   575  	LDP	·p2+0x10(SB), (const2, const3)
   576  
   577  	LDx (y1in)
   578  	LDy (y2in)
   579  	CALL gfpMulInternal(SB)
   580  	STy (tmp0)  // tmp0 = a.y * b.y
   581  
   582  	LDx (x1in)
   583  	LDy (x2in)
   584  	CALL gfpMulInternal(SB)
   585  	STy (tmp1)  // tmp1 = a.x * b.x
   586  
   587  	LDx (x1in)
   588  	LDy (y1in)
   589  	gfpAddInline
   590  	STx (tmp2)  // tmp2 = a.x + a.y
   591  
   592  	LDx (x2in)
   593  	LDy (y2in)
   594  	gfpAddInline  // x = b.x + b.y
   595  	LDy (tmp2)
   596  	CALL gfpMulInternal(SB)  // y = (a.x + a.y) * (b.x + b.y)
   597  
   598  	LDx (tmp0)
   599  	CALL gfpSubInternal(SB)  // x = y - tmp0
   600  	x2y
   601  	LDx (tmp1)
   602  	CALL gfpSubInternal(SB)  // x = y - tmp1 = a.x*b.y + a.y*b.x
   603  	x2y
   604  	gfpMulBy2Inline  // x = 2 * (a.x*b.y + a.y*b.x)
   605  	MOVD	$0, y0 
   606  	MOVD	$0, y1 
   607  	MOVD	$0, y2 
   608  	MOVD	$0, y3
   609  	CALL gfpSubInternal(SB)  // x = 0 - x: negation
   610  	MOVD	res+0(FP), res_ptr    // not use hlp1 any more
   611  	STx (y3out)
   612  
   613  	LDy (tmp1)
   614  	gfpMulBy2Inline  // x = 2 * tmp1
   615  	LDy (tmp0)
   616  	CALL gfpSubInternal(SB)  // x = tmp0 - 2*tmp1 = a.y*b.y - 2*a.x*b.x
   617  	STx (x3out)
   618  
   619  	RET
   620  
   621  // func gfp2MulU1(c, a *gfP2)
        //
        // With .x the 32-byte element at offset 0 and .y the one at offset 32,
        // this stores
        //   c.x = a.y          (32-byte copy through V0/V1)
        //   c.y = -2 * a.x     (doubling, then 0 - x)
        // No multiplication is performed, so ·np is not loaded. Here b_ptr holds
        // the result pointer, so the x2in/y2in macros address c.
        // a.x is read before anything is written.
        // Uses V0 and V1 in addition to the general-purpose registers.
   622  TEXT ·gfp2MulU1(SB),NOSPLIT,$0-16
   623  	MOVD	res+0(FP), b_ptr
   624  	MOVD	in1+8(FP), a_ptr
   625  
   626  	LDP	·p2+0x00(SB), (const0, const1)
   627  	LDP	·p2+0x10(SB), (const2, const3)
   628  
   629  	LDy (x1in)
   630  	gfpMulBy2Inline  // x = 2 * a.x
   631  	MOVD	$0, y0 
   632  	MOVD	$0, y1 
   633  	MOVD	$0, y2 
   634  	MOVD	$0, y3
   635  	CALL gfpSubInternal(SB)  // x = 0 - 2*a.x
   636  	
   637  	ADD $32, a_ptr, a_ptr  // a_ptr now addresses a.y
   638  	VLD1 (a_ptr), [V0.B16, V1.B16]
   639  	VST1 [V0.B16, V1.B16], (b_ptr)  // c.x = a.y
   640  	STx (y2in)  // c.y = -2 * a.x
   641  
   642  	RET
   643  
   644  // func gfp2Square(c, a *gfP2)
        //
        // With .x the 32-byte element at offset 0 and .y the one at offset 32,
        // this stores
        //   c.y = (a.y - 2*a.x)*(a.x + a.y) + a.x*a.y   (expands to a.y^2 - 2*a.x^2)
        //   c.x = 2*a.x*a.y
        // using two gfpMulInternal calls. Here b_ptr holds the result pointer, so
        // the x2in/y2in macros address c. Both stores to c come after the last
        // load from a.
        // Frame ($72): only tmp0 is used (first a.x + a.y, then the first product).
   645  TEXT ·gfp2Square(SB),NOSPLIT,$72-16
   646  	MOVD	res+0(FP), b_ptr
   647  	MOVD	in1+8(FP), a_ptr
   648  
   649  	MOVD	·np+0x00(SB), hlp1
   650  	LDP	·p2+0x00(SB), (const0, const1)
   651  	LDP	·p2+0x10(SB), (const2, const3)
   652  
   653  	LDx (y1in)
   654  	LDy (x1in)
   655  	gfpAddInline
   656  	STx (tmp0)  // tmp0 = a.x + a.y
   657  	gfpMulBy2Inline  // x = 2 * a.x (y still holds a.x)
   658  	LDy (y1in)
   659  	CALL gfpSubInternal(SB)  // x = a.y - 2*a.x
   660  	LDy (tmp0)
   661  	CALL gfpMulInternal(SB)  // y = (a.y - 2*a.x) * (a.x + a.y)
   662  	STy (tmp0)
   663  
   664  	LDx (y1in)
   665  	LDy (x1in)
   666  	CALL gfpMulInternal(SB)  // y = a.x * a.y
   667  	//STy (tmp1)
   668  	LDx (tmp0)
   669  	gfpAddInline  // x = tmp0 + a.x*a.y
   670  	STx (y2in)  // c.y
   671  
   672  	//LDy (tmp1)
   673  	gfpMulBy2Inline  // x = 2 * a.x*a.y (y survived gfpAddInline)
   674  	STx (x2in)  // c.x
   675  
   676  	RET
   677  
   678  // func gfp2SquareU(c, a *gfP2)
        //
        // With .x the 32-byte element at offset 0 and .y the one at offset 32,
        // this stores
        //   c.x = (a.y - 2*a.x)*(a.x + a.y) + a.x*a.y   (expands to a.y^2 - 2*a.x^2)
        //   c.y = -4*a.x*a.y
        // i.e. the two values gfp2Square computes, with the product term doubled
        // once more and negated, and the two outputs swapped. Here b_ptr holds the
        // result pointer, so the x2in/y2in macros address c. Both stores to c come
        // after the last load from a.
        // Frame ($72): only tmp0 is used (first a.x + a.y, then the first product).
   679  TEXT ·gfp2SquareU(SB),NOSPLIT,$72-16
   680  	MOVD	res+0(FP), b_ptr
   681  	MOVD	in1+8(FP), a_ptr
   682  
   683  	MOVD	·np+0x00(SB), hlp1
   684  	LDP	·p2+0x00(SB), (const0, const1)
   685  	LDP	·p2+0x10(SB), (const2, const3)
   686  
   687  	LDx (y1in)
   688  	LDy (x1in)
   689  	gfpAddInline
   690  	STx (tmp0)  // tmp0 = a.x + a.y
   691  	gfpMulBy2Inline  // x = 2 * a.x (y still holds a.x)
   692  	LDy (y1in)
   693  	CALL gfpSubInternal(SB)  // x = a.y - 2*a.x
   694  	LDy (tmp0)
   695  	CALL gfpMulInternal(SB)  // y = (a.y - 2*a.x) * (a.x + a.y)
   696  	STy (tmp0)
   697  
   698  	LDx (y1in)
   699  	LDy (x1in)
   700  	CALL gfpMulInternal(SB)  // y = a.x * a.y
   701  	//STy (tmp1)
   702  	LDx (tmp0)
   703  	gfpAddInline  // x = tmp0 + a.x*a.y
   704  	STx (x2in)  // c.x
   705  
   706  	//LDy (tmp1)
   707  	gfpMulBy2Inline2  // y = 2 * a.x*a.y (x is clobbered; it was stored above)
   708  	gfpMulBy2Inline  // x = 4 * a.x*a.y
   709  	MOVD	$0, y0 
   710  	MOVD	$0, y1 
   711  	MOVD	$0, y2 
   712  	MOVD	$0, y3
   713  	CALL gfpSubInternal(SB)  // x = 0 - 4*a.x*a.y
   714  	STx (y2in)  // c.y
   715  
   716  	RET
   717  
   718  /* ---------------------------------------*/
        // Frame slots for curvePointDoubleComplete. tmp2 is dropped and its slot
        // (index 2) is reused as x3t; y3t and z3t take slots 3 and 4. tmp0 and tmp1
        // keep their earlier definitions.
   719  #undef tmp2
   720  #define x3t(off) (32*2 + 8 + off)(RSP)
   721  #define y3t(off) (32*3 + 8 + off)(RSP)
   722  #define z3t(off) (32*4 + 8 + off)(RSP)
   723  
   724  // func curvePointDoubleComplete(c, a *curvePoint)
        //
        // Doubles the point a = (X, Y, Z), whose coordinates are the 32-byte
        // elements at offsets 0, 32 and 64, and stores (X3, Y3, Z3) into c.
        // The step comments use the variable names t0, t1, t2, X3, Y3, Z3.
        // "3b * t" is carried out as 16*t - t = 15*t (four doublings, one subtraction).
        // NOTE(review): the step sequence appears to follow the complete doubling
        // formulas of Renes-Costello-Batina for curves with a = 0 - confirm.
        // Here b_ptr holds the result pointer, so x2in/y2in/z2in address c.
        // Frame ($168 = 8 + 5*32): tmp0 = t0, tmp1 = t2, x3t/y3t/z3t = X3/Y3/Z3
        // while they are being built.
   725  TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
   726  	MOVD	res+0(FP), b_ptr
   727  	MOVD	in1+8(FP), a_ptr
   728  
   729  	MOVD	·np+0x00(SB), hlp1
   730  	LDP	·p2+0x00(SB), (const0, const1)
   731  	LDP	·p2+0x10(SB), (const2, const3)
   732  
   733  	LDx (y1in)
   734  	CALL gfpSqrInternal(SB) // t0 := Y^2
   735  	STy (tmp0)
   736  
   737  	gfpMulBy2Inline2        // Z3 := t0 + t0
   738  	gfpMulBy2Inline2        // Z3 := Z3 + Z3
   739  	gfpMulBy2Inline         // Z3 := Z3 + Z3
   740  	STx (z3t)               // z3t = 8 * t0
   741  	
   742  	LDx (z1in)
   743  	CALL gfpSqrInternal(SB) // t2 := Z^2
   744  	STy (tmp1)
   745  	gfpMulBy2Inline2        // four doublings: y = 16 * t2
   746  	gfpMulBy2Inline2
   747  	gfpMulBy2Inline2
   748  	gfpMulBy2Inline2
   749  	LDx (tmp1)
   750  	CALL gfpSubInternal(SB) // t2 := 3b * t2 = 3bZ^2
   751  	STx (tmp1)
   752  	LDy (z3t)
   753  	CALL gfpMulInternal(SB) // X3 := t2 * Z3
   754  	STy (x3t)
   755  
   756  	LDx (tmp0)
   757  	LDy (tmp1)
   758  	gfpAddInline            // Y3 := t0 + t2
   759  	STx (y3t)
   760  	gfpMulBy2Inline         // x = t2 + t2 (y still holds t2)
   761  	gfpAddInline            // t2 := t2 + t2 + t2
   762  	STx (tmp1)
   763  	LDy (tmp0)
   764  	CALL gfpSubInternal(SB) // t0 := t0 - t2
   765  	STx (tmp0)    
   766  	LDy (y3t)
   767  	CALL gfpMulInternal(SB) // Y3 := t0 * Y3
   768  	LDx (x3t)
   769  	gfpAddInline            // Y3 := X3 + Y3
   770  	STx (y3t)
   771  
   772  	LDx (y1in)
   773  	LDy (z1in)
   774  	CALL gfpMulInternal(SB) // t1 := YZ
   775  	LDx (z3t)
   776  	CALL gfpMulInternal(SB) // Z3 := t1 * Z3
   777  	STy (z2in)              // Store Z3
   778  
   779  	LDx (x1in)
   780  	LDy (y1in)
   781  	CALL gfpMulInternal(SB) // t1 := XY
   782  	LDx (tmp0)
   783  	CALL gfpMulInternal(SB) // X3 := t0 * t1
   784  	gfpMulBy2Inline         // X3 := X3 + X3
   785  	STx (x2in)              // Store X3
   786  	// Store Y3
   787  	LDx (y3t)
   788  	STx (y2in)
   789  
   790  	RET
   791  
   792  /* ---------------------------------------*/
        // Frame slots for curvePointAddComplete: five temporaries tmp0-tmp4
        // (tmp0 and tmp1 keep their earlier definitions) followed by x3t, y3t and
        // z3t, which move from slots 2-4 to slots 5-7.
   793  #undef x3t
   794  #undef y3t
   795  #undef z3t
   796  
   797  #define tmp2(off) (32*2 + 8 + off)(RSP)
   798  #define tmp3(off) (32*3 + 8 + off)(RSP)
   799  #define tmp4(off) (32*4 + 8 + off)(RSP)
   800  #define x3t(off) (32*5 + 8 + off)(RSP)
   801  #define y3t(off) (32*6 + 8 + off)(RSP)
   802  #define z3t(off) (32*7 + 8 + off)(RSP)
   803  
   804  // func curvePointAddComplete(c, a, b *curvePoint)
        //
        // Adds the points a = (X1, Y1, Z1) and b = (X2, Y2, Z2), whose coordinates
        // are the 32-byte elements at offsets 0, 32 and 64, and stores
        // (X3, Y3, Z3) into c. The step comments use the variable names t0-t4,
        // X3, Y3, Z3. "3b * t" is carried out as 16*t - t = 15*t (four doublings,
        // one subtraction).
        // NOTE(review): the step sequence appears to follow the complete addition
        // formulas of Renes-Costello-Batina for curves with a = 0 - confirm.
        // b_ptr addresses b at first and is reloaded with the result pointer once
        // a and b are no longer read; from then on x2in/y2in/z2in address c.
        // Unlike the other routines in this file the TEXT flag is 0, not NOSPLIT.
        // Frame ($264 = 8 + 8*32): tmp0-tmp4 = t0-t4, x3t/y3t/z3t = X3/Y3/Z3 while
        // they are being built.
   805  TEXT ·curvePointAddComplete(SB),0,$264-24
   806  	MOVD	in1+8(FP), a_ptr
   807  	MOVD	in2+16(FP), b_ptr
   808  
   809  	MOVD	·np+0x00(SB), hlp1
   810  	LDP	·p2+0x00(SB), (const0, const1)
   811  	LDP	·p2+0x10(SB), (const2, const3)
   812  
   813  	LDx (x1in)
   814  	LDy (x2in)
   815  	CALL gfpMulInternal(SB)         // t0 := X1X2
   816  	STy (tmp0)
   817  	LDx (y1in)
   818  	LDy (y2in)
   819  	CALL gfpMulInternal(SB)         // t1 := Y1Y2
   820  	STy (tmp1)
   821  	LDx (z1in)
   822  	LDy (z2in)
   823  	CALL gfpMulInternal(SB)         // t2 := Z1Z2
   824  	STy (tmp2)
   825  
   826  	LDx (x1in)
   827  	LDy (y1in)
   828  	gfpAddInline                    // t3 := X1 + Y1
   829  	STx (tmp3)
   830  
   831  	LDx (x2in)
   832  	LDy (y2in)
   833  	gfpAddInline                    // t4 := X2 + Y2
   834  	LDy (tmp3)
   835  	CALL gfpMulInternal(SB)         // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
   836  	STy (tmp3)
   837  
   838  	LDx (tmp0)
   839  	LDy (tmp1)
   840  	gfpAddInline                    // t4 := t0 + t1
   841  	LDy (tmp3)
   842  	CALL gfpSubInternal(SB)         // t3 := t3 - t4 = X1Y2 + X2Y1
   843  	STx (tmp3)
   844  
   845  	LDx (y1in)
   846  	LDy (z1in)
   847  	gfpAddInline                    // t4 := Y1 + Z1
   848  	STx (tmp4)
   849  
   850  	LDx (y2in)
   851  	LDy (z2in)
   852  	gfpAddInline                    // X3 := Y2 + Z2
   853  	LDy (tmp4)
   854  	CALL gfpMulInternal(SB)         // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
   855  	STy (tmp4)
   856  
   857  	LDx (tmp1)
   858  	LDy (tmp2)
   859  	gfpAddInline                    // X3 := t1 + t2
   860  	LDy (tmp4)
   861  	CALL gfpSubInternal(SB)         // t4 := t4 - X3 = Y1Z2 + Y2Z1
   862  	STx (tmp4)
   863  
   864  	LDx (x1in)
   865  	LDy (z1in)
   866  	gfpAddInline                    // X3 := X1 + Z1
   867  	STx (x3t)
   868  
   869  	LDx (x2in)
   870  	LDy (z2in)
   871  	gfpAddInline                    // Y3 := X2 + Z2
   872  	LDy (x3t)
   873  	CALL gfpMulInternal(SB)         // X3 := X3 * Y3
   874  	STy (x3t)
   875  
   876  	LDx (tmp0)
   877  	LDy (tmp2)
   878  	gfpAddInline                    // Y3 := t0 + t2
   879  	LDy (x3t)
   880  	CALL gfpSubInternal(SB)         // Y3 := X3 - Y3 = X1Z2 + X2Z1
   881  	STx (y3t)
   882  
   883  	LDy (tmp0)
   884  	gfpMulBy2Inline                 // x = t0 + t0 (y still holds t0)
   885  	gfpAddInline                    // t0 := t0 + t0 + t0 = 3X1X2
   886  	STx (tmp0)
   887  
   888  	LDy (tmp2)
   889  	gfpMulBy2Inline2                // four doublings: y = 16 * t2
   890  	gfpMulBy2Inline2
   891  	gfpMulBy2Inline2
   892  	gfpMulBy2Inline2
   893  	LDx (tmp2)
   894  	CALL gfpSubInternal(SB)        // t2 := 3b * t2 = 3bZ1Z2
   895  	STx (tmp2)
   896  
   897  	LDy (tmp1)
   898  	gfpAddInline                   // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
   899  	STx (z3t)
   900  
   901  	LDx (tmp2)
   902  	CALL gfpSubInternal(SB)        // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
   903  	STx (tmp1)
   904  
   905  	LDy (y3t)
   906  	gfpMulBy2Inline2               // four doublings: y = 16 * Y3
   907  	gfpMulBy2Inline2
   908  	gfpMulBy2Inline2
   909  	gfpMulBy2Inline2
   910  	LDx (y3t)
   911  	CALL gfpSubInternal(SB)        // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
   912  	STx (y3t)
   913  
   914  	LDy (tmp4)
   915  	CALL gfpMulInternal(SB)        // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
   916  	STy (x3t)
   917  
   918  	MOVD res+0(FP), b_ptr          // a and b are not read past this point; b_ptr now addresses c
   919  
   920  	LDx (tmp3)
   921  	LDy (tmp1)
   922  	CALL gfpMulInternal(SB)        // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
   923  	LDx (x3t)
   924  	CALL gfpSubInternal(SB)        // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
   925  	STx (x2in)
   926  
   927  	LDy (y3t)
   928  	LDx (tmp0)
   929  	CALL gfpMulInternal(SB)        // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
   930  	STy (y3t)
   931  
   932  	LDx (tmp1)
   933  	LDy (z3t)
   934  	CALL gfpMulInternal(SB)        // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
   935  	LDx (y3t)
   936  	gfpAddInline                   // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
   937  	STx (y2in)
   938  
   939  	LDx (tmp0)
   940  	LDy (tmp3)
   941  	CALL gfpMulInternal(SB)        // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
   942  	STy (tmp0)
   943  
   944  	LDx (tmp4)
   945  	LDy (z3t)
   946  	CALL gfpMulInternal(SB)        // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
   947  	LDx (tmp0)
   948  	gfpAddInline                   // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
   949  	STx (z2in)
   950  
   951  	RET