github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp2_g1_amd64.s (about)

     1  //go:build !(purego || plugin)
     2  
     3  #include "textflag.h"
     4  
     5  /* ---------------------------------------*/
     6  #define mul0 AX  // implicit operand and low result half of MULQ; also a MULXQ scratch output
     7  #define mul1 DX  // high result half of MULQ; implicit multiplicand of MULXQ
     8  #define acc0 BX  // acc0..acc3: low limbs of the 512-bit product T, later reduction scratch
     9  #define acc1 CX
    10  #define acc2 R8
    11  #define acc3 R9
    12  #define acc4 R10  // acc4..acc7: first operand on entry and result on exit (acc4 = least significant limb)
    13  #define acc5 R11
    14  #define acc6 R12
    15  #define acc7 R13
    16  #define t0 R14  // t0..t3: second operand (t0 = least significant limb)
    17  #define t1 R15
    18  #define t2 DI
    19  #define t3 SI
    20  #define hlp BP  // extra scratch; NOTE(review): presumably the $8 frames of the mul/sqr routines exist so BP gets saved - confirm
    21  /* ---------------------------------------*/
    22  // (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) - (t3, t2, t1, t0)
        // Modular subtraction: if the 256-bit subtraction borrows, the modulus ·p2 is added back.
        // Branch-free (selection is done with CMOV). Clobbers mul0, acc0-acc3 and flags;
        // t0-t3 are only read, so a caller may subtract the same value twice in a row.
        // NOTE(review): a fully reduced result presumably requires both inputs < p2 - confirm with callers.
    23  TEXT gfpSubInternal(SB),NOSPLIT,$0
    24  	XORQ mul0, mul0  // mul0 will capture the final borrow
    25  	SUBQ t0, acc4
    26  	SBBQ t1, acc5
    27  	SBBQ t2, acc6
    28  	SBBQ t3, acc7
    29  	SBBQ $0, mul0  // mul0 = 0 when no borrow, all ones when the subtraction borrowed
    30  
    31  	MOVQ acc4, acc0  // keep the raw difference in acc3..acc0
    32  	MOVQ acc5, acc1
    33  	MOVQ acc6, acc2
    34  	MOVQ acc7, acc3
    35  
    36  	ADDQ ·p2+0(SB), acc4  // candidate result: raw difference + p2
    37  	ADCQ ·p2+8(SB), acc5
    38  	ADCQ ·p2+16(SB), acc6
    39  	ADCQ ·p2+24(SB), acc7
    40  	ANDQ $1, mul0  // sets ZF = 1 exactly when there was no borrow
    41  
    42  	// CMOVQEQ: Move if equal (ZF == 1)
    43  	CMOVQEQ acc0, acc4  // no borrow: take the raw difference back
    44  	CMOVQEQ acc1, acc5
    45  	CMOVQEQ acc2, acc6
    46  	CMOVQEQ acc3, acc7
    47  
    48  	RET
    49  
    50  /* ---------------------------------------*/
    51  // (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) * (t3, t2, t1, t0)
    52  // t0, t1 will be overwritten after this function call
        // The 512-bit schoolbook product T is built in [acc7..acc0]; four word-wise reduction steps
        // (each using ·np and ·p2) clear the low four limbs, the folded value is added to the high
        // half, and p is subtracted once if that does not go negative.
        // NOTE(review): this looks like Montgomery multiplication (result = a*b*2^-256 mod p), which
        // assumes ·np == -p^-1 mod 2^64; ·np and ·p2 are defined outside this file - confirm.
        // Also clobbers mul0, mul1, acc0-acc3, hlp and flags; t2 and t3 are only read.
        // Two code paths: MULXQ/ADCXQ/ADOXQ when ·supportADX is non-zero, plain MULQ otherwise.
    53  TEXT gfpMulInternal(SB),NOSPLIT,$8
    54  	CMPB ·supportADX(SB), $0
    55  	JE   noAdxMul  // flag is 0: use the MULQ-only path
    56  
    57  	// [t3, t2, t1, t0] * acc4
    58  	MOVQ acc4, mul1  // MULXQ multiplies its source by DX (mul1)
    59  	MULXQ t0, acc0, acc1
    60  
    61  	MULXQ t1, mul0, acc2
    62  	ADDQ mul0, acc1
    63  
    64  	MULXQ t2, mul0, acc3
    65  	ADCQ mul0, acc2
    66  
    67  	MULXQ t3, mul0, acc4  // acc4 can be reused: its old value is in mul1
    68  	ADCQ mul0, acc3
    69  	ADCQ $0, acc4
    70  
    71  	// [t3, t2, t1, t0] * acc5
    72  	MOVQ acc5, mul1
    73  	MULXQ t0, mul0, hlp
    74  	ADDQ mul0, acc1
    75  	ADCQ hlp, acc2
    76  
    77  	MULXQ t1, mul0, hlp  // MULXQ leaves flags alone, so the pending carry survives
    78  	ADCQ $0, hlp  // fold the pending carry into the high half
    79  	ADDQ mul0, acc2
    80  	ADCQ hlp, acc3
    81  
    82  	MULXQ t2, mul0, hlp
    83  	ADCQ $0, hlp
    84  	ADDQ mul0, acc3
    85  	ADCQ hlp, acc4
    86  
    87  	MULXQ t3, mul0, acc5
    88  	ADCQ $0, acc5
    89  	ADDQ mul0, acc4
    90  	ADCQ $0, acc5
    91  
    92  	// [t3, t2, t1, t0] * acc6
    93  	MOVQ acc6, mul1
    94  	MULXQ t0, mul0, hlp
    95  	ADDQ mul0, acc2
    96  	ADCQ hlp, acc3
    97  
    98  	MULXQ t1, mul0, hlp
    99  	ADCQ $0, hlp
   100  	ADDQ mul0, acc3
   101  	ADCQ hlp, acc4
   102  
   103  	MULXQ t2, mul0, hlp
   104  	ADCQ $0, hlp
   105  	ADDQ mul0, acc4
   106  	ADCQ hlp, acc5
   107  
   108  	MULXQ t3, mul0, acc6
   109  	ADCQ $0, acc6
   110  	ADDQ mul0, acc5
   111  	ADCQ $0, acc6
   112  
   113  	// [t3, t2, t1, t0] * acc7
   114  	MOVQ acc7, mul1
   115  	MULXQ t0, mul0, hlp
   116  	ADDQ mul0, acc3
   117  	ADCQ hlp, acc4
   118  
   119  	MULXQ t1, mul0, hlp
   120  	ADCQ $0, hlp
   121  	ADDQ mul0, acc4
   122  	ADCQ hlp, acc5
   123  
   124  	MULXQ t2, mul0, hlp
   125  	ADCQ $0, hlp
   126  	ADDQ mul0, acc5
   127  	ADCQ hlp, acc6
   128  
   129  	MULXQ t3, mul0, acc7
   130  	ADCQ $0, acc7
   131  	ADDQ mul0, acc6
   132  	ADCQ $0, acc7
   133  
   134  	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
   135  	// First reduction step
   136  	XORQ t1, t1  // t1 = 0; also clears CF and OF for the ADCXQ/ADOXQ chains below
   137  	MOVQ acc0, mul1
   138  	MULXQ ·np+0x00(SB), mul1, mul0  // mul1 = m = low 64 bits of acc0 * np (high half in mul0 is discarded)
   139  
   140  	MULXQ ·p2+0x00(SB), mul0, t0
   141  	ADOXQ mul0, acc0               // acc0 += low64(m * p2[0]); the carry of this chain lives in OF
   142  
   143  	MULXQ ·p2+0x08(SB), mul0, hlp
   144  	ADCXQ t0, mul0  // CF chain: add the previous high half to this low half
   145  	ADOXQ mul0, acc1
   146  
   147  	MULXQ ·p2+0x10(SB), mul0, t0
   148  	ADCXQ hlp, mul0
   149  	ADOXQ mul0, acc2
   150  	
   151  	MULXQ ·p2+0x18(SB), mul0, acc0  // acc0 is recycled as the new top limb of this step
   152  	ADCXQ t0, mul0
   153  	ADOXQ mul0, acc3
   154  	ADCXQ t1, acc0  // absorb the final CF ...
   155  	ADOXQ t1, acc0  // ... and the final OF into the top limb
   156  
   157  	// Second reduction step
   158  	MOVQ acc1, mul1
   159  	MULXQ ·np+0x00(SB), mul1, mul0
   160  
   161  	MULXQ ·p2+0x00(SB), mul0, t0
   162  	ADOXQ mul0, acc1
   163  
   164  	MULXQ ·p2+0x08(SB), mul0, hlp
   165  	ADCXQ t0, mul0
   166  	ADOXQ mul0, acc2
   167  
   168  	MULXQ ·p2+0x10(SB), mul0, t0
   169  	ADCXQ hlp, mul0
   170  	ADOXQ mul0, acc3
   171  
   172  	MULXQ ·p2+0x18(SB), mul0, acc1
   173  	ADCXQ t0, mul0
   174  	ADOXQ mul0, acc0
   175  	ADCXQ t1, acc1
   176  	ADOXQ t1, acc1
   177  
   178  	// Third reduction step
   179  	MOVQ acc2, mul1
   180  	MULXQ ·np+0x00(SB), mul1, mul0
   181  
   182  	MULXQ ·p2+0x00(SB), mul0, t0
   183  	ADOXQ mul0, acc2
   184  
   185  	MULXQ ·p2+0x08(SB), mul0, hlp
   186  	ADCXQ t0, mul0
   187  	ADOXQ mul0, acc3
   188  
   189  	MULXQ ·p2+0x10(SB), mul0, t0
   190  	ADCXQ hlp, mul0
   191  	ADOXQ mul0, acc0
   192  
   193  	MULXQ ·p2+0x18(SB), mul0, acc2
   194  	ADCXQ t0, mul0
   195  	ADOXQ mul0, acc1
   196  	ADCXQ t1, acc2
   197  	ADOXQ t1, acc2
   198  
   199  	// Last reduction step
   200  	MOVQ acc3, mul1
   201  	MULXQ ·np+0x00(SB), mul1, mul0
   202  
   203  	MULXQ ·p2+0x00(SB), mul0, t0
   204  	ADOXQ mul0, acc3
   205  
   206  	MULXQ ·p2+0x08(SB), mul0, hlp
   207  	ADCXQ t0, mul0
   208  	ADOXQ mul0, acc0
   209  
   210  	MULXQ ·p2+0x10(SB), mul0, t0
   211  	ADCXQ hlp, mul0
   212  	ADOXQ mul0, acc1
   213  
   214  	MULXQ ·p2+0x18(SB), mul0, acc3
   215  	ADCXQ t0, mul0
   216  	ADOXQ mul0, acc2
   217  	ADCXQ t1, acc3
   218  	ADOXQ t1, acc3
   219  
   220  	MOVQ $0, hlp  // hlp collects the carry out of the 256-bit addition below
   221  	// Add bits [511:256] of the result
   222  	ADDQ acc0, acc4
   223  	ADCQ acc1, acc5
   224  	ADCQ acc2, acc6
   225  	ADCQ acc3, acc7
   226  	ADCQ $0, hlp
   227  	// Copy result
   228  	MOVQ acc4, acc0
   229  	MOVQ acc5, acc1
   230  	MOVQ acc6, acc2
   231  	MOVQ acc7, acc3
   232  	// Subtract p
   233  	SUBQ ·p2+0(SB), acc4
   234  	SBBQ ·p2+8(SB), acc5
   235  	SBBQ ·p2+16(SB), acc6
   236  	SBBQ ·p2+24(SB), acc7
   237  	SBBQ $0, hlp  // CF = 1 only if the 257-bit value (hlp:acc) was smaller than p
   238  	// If the result of the subtraction is negative, restore the previous result
   239  	CMOVQCS acc0, acc4
   240  	CMOVQCS acc1, acc5
   241  	CMOVQCS acc2, acc6
   242  	CMOVQCS acc3, acc7
   243  
   244  	RET
   245  
   246  noAdxMul:
        	// Same computation using only MULQ (AX * operand -> DX:AX) and ADDQ/ADCQ carry chains.
   247  	// [t3, t2, t1, t0] * acc4
   248  	MOVQ acc4, mul0
   249  	MULQ t0
   250  	MOVQ mul0, acc0
   251  	MOVQ mul1, acc1
   252  
   253  	MOVQ acc4, mul0
   254  	MULQ t1
   255  	ADDQ mul0, acc1
   256  	ADCQ $0, mul1
   257  	MOVQ mul1, acc2
   258  
   259  	MOVQ acc4, mul0
   260  	MULQ t2
   261  	ADDQ mul0, acc2
   262  	ADCQ $0, mul1
   263  	MOVQ mul1, acc3
   264  
   265  	MOVQ acc4, mul0
   266  	MULQ t3
   267  	ADDQ mul0, acc3
   268  	ADCQ $0, mul1
   269  	MOVQ mul1, acc4
   270  
   271  	// [t3, t2, t1, t0] * acc5
   272  	MOVQ acc5, mul0
   273  	MULQ t0
   274  	ADDQ mul0, acc1
   275  	ADCQ $0, mul1
   276  	MOVQ mul1, hlp  // hlp carries the high half into the next column
   277  
   278  	MOVQ acc5, mul0
   279  	MULQ t1
   280  	ADDQ hlp, acc2
   281  	ADCQ $0, mul1
   282  	ADDQ mul0, acc2
   283  	ADCQ $0, mul1
   284  	MOVQ mul1, hlp
   285  
   286  	MOVQ acc5, mul0
   287  	MULQ t2
   288  	ADDQ hlp, acc3
   289  	ADCQ $0, mul1
   290  	ADDQ mul0, acc3
   291  	ADCQ $0, mul1
   292  	MOVQ mul1, hlp
   293  
   294  	MOVQ acc5, mul0
   295  	MULQ t3
   296  	ADDQ hlp, acc4
   297  	ADCQ $0, mul1
   298  	ADDQ mul0, acc4
   299  	ADCQ $0, mul1
   300  	MOVQ mul1, acc5
   301  
   302  	// [t3, t2, t1, t0] * acc6
   303  	MOVQ acc6, mul0
   304  	MULQ t0
   305  	ADDQ mul0, acc2
   306  	ADCQ $0, mul1
   307  	MOVQ mul1, hlp
   308  
   309  	MOVQ acc6, mul0
   310  	MULQ t1
   311  	ADDQ hlp, acc3
   312  	ADCQ $0, mul1
   313  	ADDQ mul0, acc3
   314  	ADCQ $0, mul1
   315  	MOVQ mul1, hlp
   316  
   317  	MOVQ acc6, mul0
   318  	MULQ t2
   319  	ADDQ hlp, acc4
   320  	ADCQ $0, mul1
   321  	ADDQ mul0, acc4
   322  	ADCQ $0, mul1
   323  	MOVQ mul1, hlp
   324  
   325  	MOVQ acc6, mul0
   326  	MULQ t3
   327  	ADDQ hlp, acc5
   328  	ADCQ $0, mul1
   329  	ADDQ mul0, acc5
   330  	ADCQ $0, mul1
   331  	MOVQ mul1, acc6
   332  
   333  	// [t3, t2, t1, t0] * acc7
   334  	MOVQ acc7, mul0
   335  	MULQ t0
   336  	ADDQ mul0, acc3
   337  	ADCQ $0, mul1
   338  	MOVQ mul1, hlp
   339  
   340  	MOVQ acc7, mul0
   341  	MULQ t1
   342  	ADDQ hlp, acc4
   343  	ADCQ $0, mul1
   344  	ADDQ mul0, acc4
   345  	ADCQ $0, mul1
   346  	MOVQ mul1, hlp
   347  
   348  	MOVQ acc7, mul0
   349  	MULQ t2
   350  	ADDQ hlp, acc5
   351  	ADCQ $0, mul1
   352  	ADDQ mul0, acc5
   353  	ADCQ $0, mul1
   354  	MOVQ mul1, hlp
   355  
   356  	MOVQ acc7, mul0
   357  	MULQ t3
   358  	ADDQ hlp, acc6
   359  	ADCQ $0, mul1
   360  	ADDQ mul0, acc6
   361  	ADCQ $0, mul1
   362  	MOVQ mul1, acc7
   363  	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
   364  	// First reduction step
   365  	MOVQ acc0, mul0
   366  	MULQ ·np+0x00(SB)
   367  	MOVQ mul0, hlp  // hlp = m = low 64 bits of acc0 * np
   368  
   369  	MOVQ ·p2+0x00(SB), mul0
   370  	MULQ hlp
   371  	ADDQ mul0, acc0
   372  	ADCQ $0, mul1
   373  	MOVQ mul1, t0  // t0 (second operand, no longer needed) carries into the next column
   374  	XORQ acc0, acc0  // acc0 is recycled as the new top limb of this step
   375  
   376  	MOVQ ·p2+0x08(SB), mul0
   377  	MULQ hlp
   378  	ADDQ t0, acc1
   379  	ADCQ $0, mul1
   380  	ADDQ mul0, acc1
   381  	ADCQ $0, mul1
   382  	MOVQ mul1, t0
   383  
   384  	MOVQ ·p2+0x10(SB), mul0
   385  	MULQ hlp
   386  	ADDQ t0, acc2
   387  	ADCQ $0, mul1
   388  	ADDQ mul0, acc2
   389  	ADCQ $0, mul1
   390  	MOVQ mul1, t0
   391  
   392  	MOVQ ·p2+0x18(SB), mul0
   393  	MULQ hlp
   394  	ADDQ t0, acc3
   395  	ADCQ $0, mul1
   396  	ADDQ mul0, acc3
   397  	ADCQ mul1, acc0
   398  
   399  	// Second reduction step
   400  	MOVQ acc1, mul0
   401  	MULQ ·np+0x00(SB)
   402  	MOVQ mul0, hlp
   403  
   404  	MOVQ ·p2+0x00(SB), mul0
   405  	MULQ hlp
   406  	ADDQ mul0, acc1
   407  	ADCQ $0, mul1
   408  	MOVQ mul1, t0
   409  	XORQ acc1, acc1
   410  
   411  	MOVQ ·p2+0x08(SB), mul0
   412  	MULQ hlp
   413  	ADDQ t0, acc2
   414  	ADCQ $0, mul1
   415  	ADDQ mul0, acc2
   416  	ADCQ $0, mul1
   417  	MOVQ mul1, t0
   418  
   419  	MOVQ ·p2+0x10(SB), mul0
   420  	MULQ hlp
   421  	ADDQ t0, acc3
   422  	ADCQ $0, mul1
   423  	ADDQ mul0, acc3
   424  	ADCQ $0, mul1
   425  	MOVQ mul1, t0
   426  
   427  	MOVQ ·p2+0x18(SB), mul0
   428  	MULQ hlp
   429  	ADDQ t0, acc0
   430  	ADCQ $0, mul1
   431  	ADDQ mul0, acc0
   432  	ADCQ mul1, acc1
   433  
   434  	// Third reduction step
   435  	MOVQ acc2, mul0
   436  	MULQ ·np+0x00(SB)
   437  	MOVQ mul0, hlp
   438  
   439  	MOVQ ·p2+0x00(SB), mul0
   440  	MULQ hlp
   441  	ADDQ mul0, acc2
   442  	ADCQ $0, mul1
   443  	MOVQ mul1, t0
   444  	XORQ acc2, acc2
   445  
   446  	MOVQ ·p2+0x08(SB), mul0
   447  	MULQ hlp
   448  	ADDQ t0, acc3
   449  	ADCQ $0, mul1
   450  	ADDQ mul0, acc3
   451  	ADCQ $0, mul1
   452  	MOVQ mul1, t0
   453  
   454  	MOVQ ·p2+0x10(SB), mul0
   455  	MULQ hlp
   456  	ADDQ t0, acc0
   457  	ADCQ $0, mul1
   458  	ADDQ mul0, acc0
   459  	ADCQ $0, mul1
   460  	MOVQ mul1, t0
   461  
   462  	MOVQ ·p2+0x18(SB), mul0
   463  	MULQ hlp
   464  	ADDQ t0, acc1
   465  	ADCQ $0, mul1
   466  	ADDQ mul0, acc1
   467  	ADCQ mul1, acc2
   468  
   469  	// Last reduction step
   470  	MOVQ acc3, mul0
   471  	MULQ ·np+0x00(SB)
   472  	MOVQ mul0, hlp
   473  
   474  	MOVQ ·p2+0x00(SB), mul0
   475  	MULQ hlp
   476  	ADDQ mul0, acc3
   477  	ADCQ $0, mul1
   478  	MOVQ mul1, t0
   479  	XORQ acc3, acc3
   480  
   481  	MOVQ ·p2+0x08(SB), mul0
   482  	MULQ hlp
   483  	ADDQ t0, acc0
   484  	ADCQ $0, mul1
   485  	ADDQ mul0, acc0
   486  	ADCQ $0, mul1
   487  	MOVQ mul1, t0
   488  
   489  	MOVQ ·p2+0x10(SB), mul0
   490  	MULQ hlp
   491  	ADDQ t0, acc1
   492  	ADCQ $0, mul1
   493  	ADDQ mul0, acc1
   494  	ADCQ $0, mul1
   495  	MOVQ mul1, t0
   496  
   497  	MOVQ ·p2+0x18(SB), mul0
   498  	MULQ hlp
   499  	ADDQ t0, acc2
   500  	ADCQ $0, mul1
   501  	ADDQ mul0, acc2
   502  	ADCQ mul1, acc3
   503  
   504  	MOVQ $0, hlp  // hlp collects the carry out of the 256-bit addition below
   505  	// Add bits [511:256] of the result
   506  	ADDQ acc0, acc4
   507  	ADCQ acc1, acc5
   508  	ADCQ acc2, acc6
   509  	ADCQ acc3, acc7
   510  	ADCQ $0, hlp
   511  	// Copy result
   512  	MOVQ acc4, acc0
   513  	MOVQ acc5, acc1
   514  	MOVQ acc6, acc2
   515  	MOVQ acc7, acc3
   516  	// Subtract p
   517  	SUBQ ·p2+0(SB), acc4
   518  	SBBQ ·p2+8(SB), acc5
   519  	SBBQ ·p2+16(SB), acc6
   520  	SBBQ ·p2+24(SB), acc7
   521  	SBBQ $0, hlp  // CF = 1 only if the 257-bit value (hlp:acc) was smaller than p
   522  	// If the result of the subtraction is negative, restore the previous result
   523  	CMOVQCS acc0, acc4
   524  	CMOVQCS acc1, acc5
   525  	CMOVQCS acc2, acc6
   526  	CMOVQCS acc3, acc7
   527  
   528  	RET
   529  
   530  /* ---------------------------------------*/
   531  // (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) ^ 2
        // Squaring variant of the multiply: each off-diagonal limb product is computed once and the
        // partial sum is doubled, then the four limb squares ("missing products") are added.
        // The 512-bit value T then goes through four word-wise reduction steps using ·np and ·p2
        // and one conditional subtraction of p.
        // NOTE(review): as with the multiply, this looks like Montgomery squaring (a*a*2^-256 mod p);
        // ·np and ·p2 are defined outside this file - confirm.
        // Clobbers mul0, mul1, acc0-acc3, t0-t3, hlp and flags.
   532  TEXT gfpSqrInternal(SB),NOSPLIT,$8
   533  	CMPB ·supportADX(SB), $0
   534  	JE   noAdxSqr  // flag is 0: use the MULQ-only path
   535  
   536  	XORQ t3, t3  // t3 = 0; also clears CF and OF for the ADCXQ/ADOXQ chains
   537  
   538  	// [acc7, acc6, acc5] * acc4
   539  	MOVQ acc4, mul1
   540  	MULXQ acc5, acc1, acc2
   541  
   542  	MULXQ acc6, mul0, acc3
   543  	ADOXQ mul0, acc2
   544  
   545  	MULXQ acc7, mul0, t0
   546  	ADOXQ mul0, acc3
   547  	ADOXQ t3, t0
   548  
   549  	// [acc7, acc6] * acc5
   550  	MOVQ acc5, mul1
   551  	MULXQ acc6, mul0, hlp
   552  	ADOXQ mul0, acc3
   553  
   554  	MULXQ acc7, mul0, t1
   555  	ADCXQ hlp, mul0
   556  	ADOXQ mul0, t0
   557  	ADCXQ t3, t1
   558  
   559  	// acc7 * acc6
   560  	MOVQ acc6, mul1
   561  	MULXQ acc7, mul0, t2
   562  	ADOXQ mul0, t1
   563  	ADOXQ t3, t2
   564  	
   565  	// *2
   566  	ADOXQ acc1, acc1  // doubling runs on the OF carry chain
   567  	ADOXQ acc2, acc2
   568  	ADOXQ acc3, acc3
   569  	ADOXQ t0, t0
   570  	ADOXQ t1, t1
   571  	ADOXQ t2, t2
   572  	ADOXQ t3, t3  // t3 was 0, so it ends up holding only the carry out of the doubling
   573  
   574  	// Missing products
   575  	MOVQ acc4, mul1
   576  	MULXQ mul1, acc0, acc4  // acc4 is free: its value was copied to mul1 on the previous line
   577  	ADCXQ acc4, acc1
   578  
   579  	MOVQ acc5, mul1
   580  	MULXQ mul1, mul0, acc4
   581  	ADCXQ mul0, acc2
   582  	ADCXQ acc4, acc3
   583  
   584  	MOVQ acc6, mul1
   585  	MULXQ mul1, mul0, acc4
   586  	ADCXQ mul0, t0
   587  	ADCXQ acc4, t1
   588  
   589  	MOVQ acc7, mul1
   590  	MULXQ mul1, mul0, acc4
   591  	ADCXQ mul0, t2
   592  	ADCXQ acc4, t3
   593  
   594  	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
   595  	// First reduction step
   596  	XORQ acc5, acc5  // acc5 = 0 (its input value is no longer needed); clears CF and OF
   597  	MOVQ acc0, mul1
   598  	MULXQ ·np+0x00(SB), mul1, mul0  // mul1 = m = low 64 bits of acc0 * np (high half in mul0 is discarded)
   599  
   600  	MULXQ ·p2+0x00(SB), mul0, acc4
   601  	ADOXQ mul0, acc0               // acc0 += low64(m * p2[0]); the carry of this chain lives in OF
   602  
   603  	MULXQ ·p2+0x08(SB), mul0, hlp
   604  	ADCXQ acc4, mul0
   605  	ADOXQ mul0, acc1
   606  
   607  	MULXQ ·p2+0x10(SB), mul0, acc4
   608  	ADCXQ hlp, mul0
   609  	ADOXQ mul0, acc2
   610  	
   611  	MULXQ ·p2+0x18(SB), mul0, acc0  // acc0 is recycled as the new top limb of this step
   612  	ADCXQ acc4, mul0
   613  	ADOXQ mul0, acc3
   614  	ADCXQ acc5, acc0
   615  	ADOXQ acc5, acc0
   616  
   617  	// Second reduction step
   618  	MOVQ acc1, mul1
   619  	MULXQ ·np+0x00(SB), mul1, mul0
   620  
   621  	MULXQ ·p2+0x00(SB), mul0, acc4
   622  	ADOXQ mul0, acc1
   623  
   624  	MULXQ ·p2+0x08(SB), mul0, hlp
   625  	ADCXQ acc4, mul0
   626  	ADOXQ mul0, acc2
   627  
   628  	MULXQ ·p2+0x10(SB), mul0, acc4
   629  	ADCXQ hlp, mul0
   630  	ADOXQ mul0, acc3
   631  
   632  	MULXQ ·p2+0x18(SB), mul0, acc1
   633  	ADCXQ acc4, mul0
   634  	ADOXQ mul0, acc0
   635  	ADCXQ acc5, acc1
   636  	ADOXQ acc5, acc1
   637  
   638  	// Third reduction step
   639  	MOVQ acc2, mul1
   640  	MULXQ ·np+0x00(SB), mul1, mul0
   641  
   642  	MULXQ ·p2+0x00(SB), mul0, acc4
   643  	ADOXQ mul0, acc2
   644  
   645  	MULXQ ·p2+0x08(SB), mul0, hlp
   646  	ADCXQ acc4, mul0
   647  	ADOXQ mul0, acc3
   648  
   649  	MULXQ ·p2+0x10(SB), mul0, acc4
   650  	ADCXQ hlp, mul0
   651  	ADOXQ mul0, acc0
   652  
   653  	MULXQ ·p2+0x18(SB), mul0, acc2
   654  	ADCXQ acc4, mul0
   655  	ADOXQ mul0, acc1
   656  	ADCXQ acc5, acc2
   657  	ADOXQ acc5, acc2
   658  
   659  	// Last reduction step
   660  	MOVQ acc3, mul1
   661  	MULXQ ·np+0x00(SB), mul1, mul0
   662  
   663  	MULXQ ·p2+0x00(SB), mul0, acc4
   664  	ADOXQ mul0, acc3
   665  
   666  	MULXQ ·p2+0x08(SB), mul0, hlp
   667  	ADCXQ acc4, mul0
   668  	ADOXQ mul0, acc0
   669  
   670  	MULXQ ·p2+0x10(SB), mul0, acc4
   671  	ADCXQ hlp, mul0
   672  	ADOXQ mul0, acc1
   673  
   674  	MULXQ ·p2+0x18(SB), mul0, acc3
   675  	ADCXQ acc4, mul0
   676  	ADOXQ mul0, acc2
   677  	ADCXQ acc5, acc3
   678  	ADOXQ acc5, acc3
   679  
   680  	MOVQ $0, hlp  // hlp collects the carry out of the 256-bit addition below
   681  	// Add bits [511:256] of the result
   682  	ADDQ acc0, t0
   683  	ADCQ acc1, t1
   684  	ADCQ acc2, t2
   685  	ADCQ acc3, t3
   686  	ADCQ $0, hlp
   687  	// Copy result
   688  	MOVQ t0, acc4
   689  	MOVQ t1, acc5
   690  	MOVQ t2, acc6
   691  	MOVQ t3, acc7
   692  	// Subtract p
   693  	SUBQ ·p2+0(SB), acc4
   694  	SBBQ ·p2+8(SB), acc5
   695  	SBBQ ·p2+16(SB), acc6
   696  	SBBQ ·p2+24(SB), acc7
   697  	SBBQ $0, hlp  // CF = 1 only if the 257-bit value (hlp:t) was smaller than p
   698  	// If the result of the subtraction is negative, restore the previous result
   699  	CMOVQCS t0, acc4
   700  	CMOVQCS t1, acc5
   701  	CMOVQCS t2, acc6
   702  	CMOVQCS t3, acc7
   703  
   704  	RET
   705  
   706  noAdxSqr:
        	// Same computation using only MULQ (AX * operand -> DX:AX) and ADDQ/ADCQ carry chains.
   707  	MOVQ acc4, mul0
   708  	MULQ acc5
   709  	MOVQ mul0, acc1
   710  	MOVQ mul1, acc2
   711  
   712  	MOVQ acc4, mul0
   713  	MULQ acc6
   714  	ADDQ mul0, acc2
   715  	ADCQ $0, mul1
   716  	MOVQ mul1, acc3
   717  
   718  	MOVQ acc4, mul0
   719  	MULQ acc7
   720  	ADDQ mul0, acc3
   721  	ADCQ $0, mul1
   722  	MOVQ mul1, t0
   723  
   724  	MOVQ acc5, mul0
   725  	MULQ acc6
   726  	ADDQ mul0, acc3
   727  	ADCQ $0, mul1
   728  	MOVQ mul1, hlp
   729  
   730  	MOVQ acc5, mul0
   731  	MULQ acc7
   732  	ADDQ hlp, t0
   733  	ADCQ $0, mul1
   734  	ADDQ mul0, t0
   735  	ADCQ $0, mul1
   736  	MOVQ mul1, t1
   737  
   738  	MOVQ acc6, mul0
   739  	MULQ acc7
   740  	ADDQ mul0, t1
   741  	ADCQ $0, mul1
   742  	MOVQ mul1, t2
   743  	XORQ t3, t3
   744  	// *2
   745  	ADDQ acc1, acc1
   746  	ADCQ acc2, acc2
   747  	ADCQ acc3, acc3
   748  	ADCQ t0, t0
   749  	ADCQ t1, t1
   750  	ADCQ t2, t2
   751  	ADCQ $0, t3  // t3 = carry out of the doubling
   752  	// Missing products
   753  	MOVQ acc4, mul0
   754  	MULQ mul0
   755  	MOVQ mul0, acc0
   756  	MOVQ DX, acc4  // acc4 now serves as the carry between the squares (DX is mul1)
   757  
   758  	MOVQ acc5, mul0
   759  	MULQ mul0
   760  	ADDQ acc4, acc1
   761  	ADCQ mul0, acc2
   762  	ADCQ $0, DX
   763  	MOVQ DX, acc4
   764  
   765  	MOVQ acc6, mul0
   766  	MULQ mul0
   767  	ADDQ acc4, acc3
   768  	ADCQ mul0, t0
   769  	ADCQ $0, DX
   770  	MOVQ DX, acc4
   771  
   772  	MOVQ acc7, mul0
   773  	MULQ mul0
   774  	ADDQ acc4, t1
   775  	ADCQ mul0, t2
   776  	ADCQ DX, t3
   777  	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
   778  	// First reduction step
   779  	MOVQ acc0, mul0
   780  	MULQ ·np+0x00(SB)
   781  	MOVQ mul0, hlp  // hlp = m = low 64 bits of acc0 * np
   782  
   783  	MOVQ ·p2+0x00(SB), mul0
   784  	MULQ hlp
   785  	ADDQ mul0, acc0
   786  	ADCQ $0, mul1
   787  	MOVQ mul1, acc5  // acc5 (input no longer needed) carries into the next column
   788  	XORQ acc0, acc0  // acc0 is recycled as the new top limb of this step
   789  
   790  	MOVQ ·p2+0x08(SB), mul0
   791  	MULQ hlp
   792  	ADDQ acc5, acc1
   793  	ADCQ $0, mul1
   794  	ADDQ mul0, acc1
   795  	ADCQ $0, mul1
   796  	MOVQ mul1, acc5
   797  
   798  	MOVQ ·p2+0x10(SB), mul0
   799  	MULQ hlp
   800  	ADDQ acc5, acc2
   801  	ADCQ $0, mul1
   802  	ADDQ mul0, acc2
   803  	ADCQ $0, mul1
   804  	MOVQ mul1, acc5
   805  
   806  	MOVQ ·p2+0x18(SB), mul0
   807  	MULQ hlp
   808  	ADDQ acc5, acc3
   809  	ADCQ $0, mul1
   810  	ADDQ mul0, acc3
   811  	ADCQ mul1, acc0
   812  
   813  	// Second reduction step
   814  	MOVQ acc1, mul0
   815  	MULQ ·np+0x00(SB)
   816  	MOVQ mul0, hlp
   817  
   818  	MOVQ ·p2+0x00(SB), mul0
   819  	MULQ hlp
   820  	ADDQ mul0, acc1
   821  	ADCQ $0, mul1
   822  	MOVQ mul1, acc5
   823  	XORQ acc1, acc1
   824  
   825  	MOVQ ·p2+0x08(SB), mul0
   826  	MULQ hlp
   827  	ADDQ acc5, acc2
   828  	ADCQ $0, mul1
   829  	ADDQ mul0, acc2
   830  	ADCQ $0, mul1
   831  	MOVQ mul1, acc5
   832  
   833  	MOVQ ·p2+0x10(SB), mul0
   834  	MULQ hlp
   835  	ADDQ acc5, acc3
   836  	ADCQ $0, mul1
   837  	ADDQ mul0, acc3
   838  	ADCQ $0, mul1
   839  	MOVQ mul1, acc5
   840  
   841  	MOVQ ·p2+0x18(SB), mul0
   842  	MULQ hlp
   843  	ADDQ acc5, acc0
   844  	ADCQ $0, mul1
   845  	ADDQ mul0, acc0
   846  	ADCQ mul1, acc1
   847  
   848  	// Third reduction step
   849  	MOVQ acc2, mul0
   850  	MULQ ·np+0x00(SB)
   851  	MOVQ mul0, hlp
   852  
   853  	MOVQ ·p2+0x00(SB), mul0
   854  	MULQ hlp
   855  	ADDQ mul0, acc2
   856  	ADCQ $0, mul1
   857  	MOVQ mul1, acc5
   858  	XORQ acc2, acc2
   859  
   860  	MOVQ ·p2+0x08(SB), mul0
   861  	MULQ hlp
   862  	ADDQ acc5, acc3
   863  	ADCQ $0, mul1
   864  	ADDQ mul0, acc3
   865  	ADCQ $0, mul1
   866  	MOVQ mul1, acc5
   867  
   868  	MOVQ ·p2+0x10(SB), mul0
   869  	MULQ hlp
   870  	ADDQ acc5, acc0
   871  	ADCQ $0, mul1
   872  	ADDQ mul0, acc0
   873  	ADCQ $0, mul1
   874  	MOVQ mul1, acc5
   875  
   876  	MOVQ ·p2+0x18(SB), mul0
   877  	MULQ hlp
   878  	ADDQ acc5, acc1
   879  	ADCQ $0, mul1
   880  	ADDQ mul0, acc1
   881  	ADCQ mul1, acc2
   882  
   883  	// Last reduction step
   884  	MOVQ acc3, mul0
   885  	MULQ ·np+0x00(SB)
   886  	MOVQ mul0, hlp
   887  
   888  	MOVQ ·p2+0x00(SB), mul0
   889  	MULQ hlp
   890  	ADDQ mul0, acc3
   891  	ADCQ $0, mul1
   892  	MOVQ mul1, acc5
   893  	XORQ acc3, acc3
   894  
   895  	MOVQ ·p2+0x08(SB), mul0
   896  	MULQ hlp
   897  	ADDQ acc5, acc0
   898  	ADCQ $0, mul1
   899  	ADDQ mul0, acc0
   900  	ADCQ $0, mul1
   901  	MOVQ mul1, acc5
   902  
   903  	MOVQ ·p2+0x10(SB), mul0
   904  	MULQ hlp
   905  	ADDQ acc5, acc1
   906  	ADCQ $0, mul1
   907  	ADDQ mul0, acc1
   908  	ADCQ $0, mul1
   909  	MOVQ mul1, acc5
   910  
   911  	MOVQ ·p2+0x18(SB), mul0
   912  	MULQ hlp
   913  	ADDQ acc5, acc2
   914  	ADCQ $0, mul1
   915  	ADDQ mul0, acc2
   916  	ADCQ mul1, acc3
   917  
   918  	MOVQ $0, hlp  // hlp collects the carry out of the 256-bit addition below
   919  	// Add bits [511:256] of the result
   920  	ADDQ acc0, t0
   921  	ADCQ acc1, t1
   922  	ADCQ acc2, t2
   923  	ADCQ acc3, t3
   924  	ADCQ $0, hlp
   925  	// Copy result
   926  	MOVQ t0, acc4
   927  	MOVQ t1, acc5
   928  	MOVQ t2, acc6
   929  	MOVQ t3, acc7
   930  	// Subtract p
   931  	SUBQ ·p2+0(SB), acc4
   932  	SBBQ ·p2+8(SB), acc5
   933  	SBBQ ·p2+16(SB), acc6
   934  	SBBQ ·p2+24(SB), acc7
   935  	SBBQ $0, hlp  // CF = 1 only if the 257-bit value (hlp:t) was smaller than p
   936  	// If the result of the subtraction is negative, restore the previous result
   937  	CMOVQCS t0, acc4
   938  	CMOVQCS t1, acc5
   939  	CMOVQCS t2, acc6
   940  	CMOVQCS t3, acc7
   941  
   942  	RET
   943  
   944  /* ---------------------------------------*/
   945  // (t3, t2, t1, t0) = 2(acc7, acc6, acc5, acc4)
        // Doubling with one conditional subtraction of ·p2; the reduced result is left in t3..t0.
        // Side effects: acc7..acc4 are overwritten with the unreduced doubling (low 256 bits),
        // mul0 and flags are clobbered.
   946  #define gfpMulBy2Inline \
   947  	XORQ mul0, mul0;\
   948  	ADDQ acc4, acc4;\
   949  	ADCQ acc5, acc5;\
   950  	ADCQ acc6, acc6;\
   951  	ADCQ acc7, acc7;\
   952  	ADCQ $0, mul0;\
   953  	MOVQ acc4, t0;\
   954  	MOVQ acc5, t1;\
   955  	MOVQ acc6, t2;\
   956  	MOVQ acc7, t3;\
   957  	SUBQ ·p2+0(SB), t0;\
   958  	SBBQ ·p2+8(SB), t1;\
   959  	SBBQ ·p2+16(SB), t2;\
   960  	SBBQ ·p2+24(SB), t3;\
   961  	SBBQ $0, mul0;\
   962  	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
   963  	CMOVQCS acc5, t1;\
   964  	CMOVQCS acc6, t2;\
   965  	CMOVQCS acc7, t3;
   966  
   967  // (acc7, acc6, acc5, acc4) = 2(acc7, acc6, acc5, acc4)
        // Same doubling as gfpMulBy2Inline, but the reduced result is left in acc7..acc4.
        // Side effects: t3..t0 are overwritten with the unreduced doubling (low 256 bits),
        // mul0 and flags are clobbered.
   968  #define gfpMulBy2Inline2 \
   969  	XORQ mul0, mul0;\
   970  	ADDQ acc4, acc4;\
   971  	ADCQ acc5, acc5;\
   972  	ADCQ acc6, acc6;\
   973  	ADCQ acc7, acc7;\
   974  	ADCQ $0, mul0;\
   975  	MOVQ acc4, t0;\
   976  	MOVQ acc5, t1;\
   977  	MOVQ acc6, t2;\
   978  	MOVQ acc7, t3;\
   979  	SUBQ ·p2+0(SB), acc4;\
   980  	SBBQ ·p2+8(SB), acc5;\
   981  	SBBQ ·p2+16(SB), acc6;\
   982  	SBBQ ·p2+24(SB), acc7;\
   983  	SBBQ $0, mul0;\
   984  	CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
   985  	CMOVQCS t1, acc5;\
   986  	CMOVQCS t2, acc6;\
   987  	CMOVQCS t3, acc7;	
   988  
   989  /* ---------------------------------------*/
   990  // (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0)
        // Addition with one conditional subtraction of ·p2; the reduced result is left in t3..t0.
        // Side effects: acc7..acc4 are overwritten with the unreduced sum (low 256 bits),
        // mul0 and flags are clobbered.
   991  #define gfpAddInline \
   992  	XORQ mul0, mul0;\
   993  	ADDQ t0, acc4;\
   994  	ADCQ t1, acc5;\
   995  	ADCQ t2, acc6;\
   996  	ADCQ t3, acc7;\
   997  	ADCQ $0, mul0;\
   998  	MOVQ acc4, t0;\
   999  	MOVQ acc5, t1;\
  1000  	MOVQ acc6, t2;\
  1001  	MOVQ acc7, t3;\
  1002  	SUBQ ·p2+0(SB), t0;\
  1003  	SBBQ ·p2+8(SB), t1;\
  1004  	SBBQ ·p2+16(SB), t2;\
  1005  	SBBQ ·p2+24(SB), t3;\
  1006  	SBBQ $0, mul0;\
  1007  	CMOVQCS acc4, t0;\
  1008  	CMOVQCS acc5, t1;\
  1009  	CMOVQCS acc6, t2;\
  1010  	CMOVQCS acc7, t3;
  1011  
  1012  /* ---------------------------------------*/
  1013  #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7  // acc7..acc4 <- four limbs of slot src (src is an offset-taking slot macro)
  1014  #define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3  // t3..t0 <- four limbs of slot src
  1015  #define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)  // slot dst <- acc7..acc4
  1016  #define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)  // slot dst <- t3..t0
  1017  #define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3  // t3..t0 <- acc7..acc4
  1018  #define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7  // acc7..acc4 <- t3..t0
  1019  
  1020  /* ---------------------------------------*/
  1021  #define axin(off) (32*0 + off)(SP)  // copy of in1 bytes 0..31 (the coordinate the store comments call x)
  1022  #define ayin(off) (32*1 + off)(SP)  // copy of in1 bytes 32..63 (the coordinate the store comments call y)
  1023  #define bxin(off) (32*2 + off)(SP)  // copy of in2 bytes 0..31
  1024  #define byin(off) (32*3 + off)(SP)  // copy of in2 bytes 32..63
  1025  #define tmp0(off) (32*4 + off)(SP)  // holds ay*by
  1026  #define tmp1(off) (32*5 + off)(SP)  // holds ax*bx
  1027  #define cxout(off) (32*6 + off)(SP)  // holds ax+ay
  1028  #define rptr	  (32*7)(SP)  // saved result pointer; the layout ends at byte 232 of the 256-byte frame
  1029  
  1030  TEXT ·gfp2Mul(SB),NOSPLIT,$256-24
        	// Arguments: res+0(FP), in1+8(FP), in2+16(FP) - three pointers to 64-byte values made of
        	// two 32-byte coordinates (x at offset 0, y at offset 32).
        	// With a = in1, b = in2 and every product taken via gfpMulInternal, this computes
        	//   res.x = (ax+ay)*(bx+by) - ay*by - ax*bx   (algebraically ax*by + ay*bx)
        	//   res.y = ay*by - 2*ax*bx
        	// Inputs are copied to the stack first, so res may alias in1 or in2.
        	// NOTE(review): presumably func gfp2Mul(res, in1, in2 *gfP2) on the Go side - confirm.
  1031  	// Move input to stack in order to free registers
  1032  	MOVQ res+0(FP), CX
  1033  	MOVQ in1+8(FP), AX
  1034  	MOVQ in2+16(FP), BX
  1035  
  1036  	MOVOU (16*0)(AX), X0
  1037  	MOVOU (16*1)(AX), X1
  1038  	MOVOU (16*2)(AX), X2
  1039  	MOVOU (16*3)(AX), X3
  1040  
  1041  	MOVOU X0, axin(16*0)
  1042  	MOVOU X1, axin(16*1)
  1043  	MOVOU X2, ayin(16*0)
  1044  	MOVOU X3, ayin(16*1)
  1045  
  1046  	MOVOU (16*0)(BX), X0
  1047  	MOVOU (16*1)(BX), X1
  1048  	MOVOU (16*2)(BX), X2
  1049  	MOVOU (16*3)(BX), X3
  1050  
  1051  	MOVOU X0, bxin(16*0)
  1052  	MOVOU X1, bxin(16*1)
  1053  	MOVOU X2, byin(16*0)
  1054  	MOVOU X3, byin(16*1)
  1055  
  1056  	// Store pointer to result
  1057  	MOVQ CX, rptr
  1058  
  1059  	LDacc (ayin)
  1060  	LDt (byin)
  1061  	CALL gfpMulInternal(SB)
  1062  	ST (tmp0)  // tmp0 = ay*by
  1063  
  1064  	LDacc (axin)
  1065  	LDt (bxin)
  1066  	CALL gfpMulInternal(SB)
  1067  	ST (tmp1)  // tmp1 = ax*bx
  1068  
  1069  	LDacc (axin)
  1070  	LDt (ayin)
  1071  	gfpAddInline
  1072  	STt (cxout)  // cxout = ax+ay
  1073  
  1074  	LDacc (bxin)
  1075  	LDt (byin)
  1076  	gfpAddInline  // t = bx+by
  1077  
  1078  	LDacc (cxout)
  1079  	CALL gfpMulInternal(SB)  // acc = (ax+ay)*(bx+by)
  1080  	LDt (tmp0)
  1081  	CALL gfpSubInternal(SB)  // acc -= ay*by
  1082  	LDt (tmp1)
  1083  	CALL gfpSubInternal(SB)  // acc -= ax*bx
  1084  
  1085  	// Store x	
  1086  	MOVQ rptr, AX
  1087  	MOVQ acc4, (16*0 + 8*0)(AX)
  1088  	MOVQ acc5, (16*0 + 8*1)(AX)
  1089  	MOVQ acc6, (16*0 + 8*2)(AX)
  1090  	MOVQ acc7, (16*0 + 8*3)(AX)
  1091  
  1092  	LDacc (tmp0)
  1093  	// LDt (tmp1) is not needed: t3..t0 still hold tmp1, gfpSubInternal does not modify them
  1094  	CALL gfpSubInternal(SB)  // acc = tmp0 - tmp1
  1095  	CALL gfpSubInternal(SB)  // acc = tmp0 - 2*tmp1
  1096  	MOVQ rptr, AX
  1097  	///////////////////////
  1098  	MOVQ $0, rptr	
  1099  	// Store y
  1100  	MOVQ acc4, (16*2 + 8*0)(AX)
  1101  	MOVQ acc5, (16*2 + 8*1)(AX)
  1102  	MOVQ acc6, (16*2 + 8*2)(AX)
  1103  	MOVQ acc7, (16*2 + 8*3)(AX)
  1104  
  1105  	RET
  1106  
  1107  TEXT ·gfp2MulU(SB),NOSPLIT,$256-24
        	// Arguments: res+0(FP), in1+8(FP), in2+16(FP) - same layout as gfp2Mul.
        	// With a = in1, b = in2 and every product taken via gfpMulInternal, this computes
        	//   res.y = -2*((ax+ay)*(bx+by) - ay*by - ax*bx)   (algebraically -2*(ax*by + ay*bx))
        	//   res.x = ay*by - 2*ax*bx
        	// i.e. the y value gfp2Mul would produce goes to x, and -2 times its x value goes to y.
        	// NOTE(review): presumably this is (in1*in2)*u with u^2 = -2 - confirm against the Go code.
  1108  	// Move input to stack in order to free registers
  1109  	MOVQ res+0(FP), CX
  1110  	MOVQ in1+8(FP), AX
  1111  	MOVQ in2+16(FP), BX
  1112  
  1113  	MOVOU (16*0)(AX), X0
  1114  	MOVOU (16*1)(AX), X1
  1115  	MOVOU (16*2)(AX), X2
  1116  	MOVOU (16*3)(AX), X3
  1117  
  1118  	MOVOU X0, axin(16*0)
  1119  	MOVOU X1, axin(16*1)
  1120  	MOVOU X2, ayin(16*0)
  1121  	MOVOU X3, ayin(16*1)
  1122  
  1123  	MOVOU (16*0)(BX), X0
  1124  	MOVOU (16*1)(BX), X1
  1125  	MOVOU (16*2)(BX), X2
  1126  	MOVOU (16*3)(BX), X3
  1127  
  1128  	MOVOU X0, bxin(16*0)
  1129  	MOVOU X1, bxin(16*1)
  1130  	MOVOU X2, byin(16*0)
  1131  	MOVOU X3, byin(16*1)
  1132  
  1133  	// Store pointer to result
  1134  	MOVQ CX, rptr
  1135  
  1136  	LDacc (ayin)
  1137  	LDt (byin)
  1138  	CALL gfpMulInternal(SB)
  1139  	ST (tmp0)  // tmp0 = ay*by
  1140  
  1141  	LDacc (axin)
  1142  	LDt (bxin)
  1143  	CALL gfpMulInternal(SB)
  1144  	ST (tmp1)  // tmp1 = ax*bx
  1145  
  1146  	LDacc (axin)
  1147  	LDt (ayin)
  1148  	gfpAddInline
  1149  	STt (cxout)  // cxout = ax+ay
  1150  
  1151  	LDacc (bxin)
  1152  	LDt (byin)
  1153  	gfpAddInline  // t = bx+by
  1154  
  1155  	LDacc (cxout)
  1156  	CALL gfpMulInternal(SB)  // acc = (ax+ay)*(bx+by)
  1157  	LDt (tmp0)
  1158  	CALL gfpSubInternal(SB)  // acc -= ay*by
  1159  	LDt (tmp1)
  1160  	CALL gfpSubInternal(SB)  // acc -= ax*bx
  1161  	gfpMulBy2Inline  // t = 2*acc
  1162  	XORQ acc4, acc4  // acc = 0 ...
  1163  	XORQ acc5, acc5
  1164  	XORQ acc6, acc6
  1165  	XORQ acc7, acc7
  1166  	CALL gfpSubInternal(SB)  // ... so acc = 0 - t, the negation of the doubled value
  1167  
  1168  	// Store y
  1169  	MOVQ rptr, AX
  1170  	MOVQ acc4, (16*2 + 8*0)(AX)
  1171  	MOVQ acc5, (16*2 + 8*1)(AX)
  1172  	MOVQ acc6, (16*2 + 8*2)(AX)
  1173  	MOVQ acc7, (16*2 + 8*3)(AX)
  1174  
  1175  	LDacc (tmp0)
  1176  	LDt (tmp1)
  1177  	CALL gfpSubInternal(SB)  // acc = tmp0 - tmp1
  1178  	CALL gfpSubInternal(SB)  // acc = tmp0 - 2*tmp1 (t is preserved by gfpSubInternal)
  1179  	MOVQ rptr, AX
  1180  	///////////////////////
  1181  	MOVQ $0, rptr	
  1182  	// Store x
  1183  	MOVQ acc4, (16*0 + 8*0)(AX)
  1184  	MOVQ acc5, (16*0 + 8*1)(AX)
  1185  	MOVQ acc6, (16*0 + 8*2)(AX)
  1186  	MOVQ acc7, (16*0 + 8*3)(AX)
  1187  
  1188  	RET
  1189  
  1190  #undef axin  // drop the gfp2Mul/gfp2MulU frame layout; the names are redefined further down
  1191  #undef ayin
  1192  #undef bxin
  1193  #undef byin
  1194  #undef tmp0
  1195  #undef tmp1
  1196  #undef cxout
  1197  #undef rptr
  1198  
  1199  TEXT ·gfp2MulU1(SB),NOSPLIT,$0-16
        	// Arguments: res+0(FP), in1+8(FP) - pointers to 64-byte values (x at offset 0, y at offset 32).
        	// Computes res.x = in1.y and res.y = -2*in1.x (doubling and negation are modular).
        	// Both halves of in1 are read before anything is written, so res may alias in1.
        	// NOTE(review): presumably this is in1*u with u^2 = -2 - confirm against the Go code.
  1200  	// Load the argument pointers; this routine has no stack frame ($0), so everything stays in registers
  1201  	MOVQ res+0(FP), mul1  // DX is not touched by gfpMulBy2Inline or gfpSubInternal
  1202  	MOVQ in1+8(FP), AX
  1203  
  1204  	//LDacc (axin)
  1205  	MOVOU (16*2)(AX), X2  // X2:X3 = in1.y, passed through unchanged
  1206  	MOVOU (16*3)(AX), X3
  1207  	MOVQ (16*0 + 8*0)(AX), acc4  // acc = in1.x
  1208  	MOVQ (16*0 + 8*1)(AX), acc5
  1209  	MOVQ (16*0 + 8*2)(AX), acc6
  1210  	MOVQ (16*0 + 8*3)(AX), acc7
  1211  
  1212  	gfpMulBy2Inline  // t = 2*in1.x (this clobbers AX, which is no longer needed)
  1213  	XORQ acc4, acc4  // acc = 0 ...
  1214  	XORQ acc5, acc5
  1215  	XORQ acc6, acc6
  1216  	XORQ acc7, acc7
  1217  	CALL gfpSubInternal(SB)  // ... so acc = 0 - t = -2*in1.x
  1218  
  1219  	MOVOU X2, (16*0)(mul1)  // res.x = in1.y
  1220  	MOVOU X3, (16*1)(mul1)
  1221  	MOVQ acc4, (16*2 + 8*0)(mul1)  // res.y = -2*in1.x
  1222  	MOVQ acc5, (16*2 + 8*1)(mul1)
  1223  	MOVQ acc6, (16*2 + 8*2)(mul1)
  1224  	MOVQ acc7, (16*2 + 8*3)(mul1)
  1225  
  1226  	RET
  1227  
  1228  #define axin(off) (32*0 + off)(SP)  // copy of in1 bytes 0..31 (the coordinate the store comments call x)
  1229  #define ayin(off) (32*1 + off)(SP)  // copy of in1 bytes 32..63 (the coordinate the store comments call y)
  1230  #define cxout(off) (32*2 + off)(SP)  // 32-byte scratch slot
  1231  #define cyout(off) (32*3 + off)(SP)  // 32-byte scratch slot
  1232  #define rptr	  (32*4)(SP)  // saved result pointer; the layout ends at byte 136 of the 160-byte frame
  1233  
  1234  TEXT ·gfp2Square(SB),NOSPLIT,$160-16
        	// Arguments: res+0(FP), in1+8(FP) - pointers to 64-byte values (x at offset 0, y at offset 32).
        	// With a = in1 and every product taken via gfpMulInternal, this computes
        	//   res.y = (ax+ay)*(ay-2*ax) + ax*ay   (algebraically ay*ay - 2*ax*ax)
        	//   res.x = 2*ax*ay
        	// using two multiplications. The input is copied to the stack first, so res may alias in1.
        	// NOTE(review): presumably func gfp2Square(res, in1 *gfP2) on the Go side - confirm.
  1235  	// Move input to stack in order to free registers
  1236  	MOVQ res+0(FP), AX
  1237  	MOVQ in1+8(FP), BX
  1238  
  1239  	MOVOU (16*0)(BX), X0
  1240  	MOVOU (16*1)(BX), X1
  1241  	MOVOU (16*2)(BX), X2
  1242  	MOVOU (16*3)(BX), X3
  1243  
  1244  	MOVOU X0, axin(16*0)
  1245  	MOVOU X1, axin(16*1)
  1246  	MOVOU X2, ayin(16*0)
  1247  	MOVOU X3, ayin(16*1)
  1248  
  1249  	// Store pointer to result
  1250  	MOVQ AX, rptr
  1251  	
  1252  	LDacc (axin)
  1253  	LDt (ayin)
  1254  	gfpAddInline
  1255  	STt (cyout)  // cyout = ax+ay
  1256  
  1257  	LDacc (axin)
  1258  	gfpMulBy2Inline  // t = 2*ax
  1259  	LDacc (ayin)
  1260  	CALL gfpSubInternal(SB)  // acc = ay - 2*ax
  1261  
  1262  	LDt (cyout)
  1263  	CALL gfpMulInternal(SB)
  1264  	ST (cyout)  // cyout = (ay-2*ax)*(ax+ay)
  1265  
  1266  	LDacc (axin)
  1267  	LDt (ayin)
  1268  	CALL gfpMulInternal(SB)
  1269  	ST (cxout)  // cxout = ax*ay
  1270  
  1271  	LDt (cyout)
  1272  	gfpAddInline  // t = ax*ay + cyout
  1273  	// Store y
  1274  	MOVQ rptr, AX
  1275  	MOVQ t0, (16*2 + 8*0)(AX)
  1276  	MOVQ t1, (16*2 + 8*1)(AX)
  1277  	MOVQ t2, (16*2 + 8*2)(AX)
  1278  	MOVQ t3, (16*2 + 8*3)(AX)
  1279  
  1280  	LDacc (cxout)
  1281  	gfpMulBy2Inline  // t = 2*ax*ay (the macro clobbers AX, hence the reload of rptr below)
  1282  	// Store x
  1283  	MOVQ rptr, AX
  1284  	///////////////////////
  1285  	MOVQ $0, rptr	
  1286  	MOVQ t0, (16*0 + 8*0)(AX)
  1287  	MOVQ t1, (16*0 + 8*1)(AX)
  1288  	MOVQ t2, (16*0 + 8*2)(AX)
  1289  	MOVQ t3, (16*0 + 8*3)(AX)
  1290  
  1291  	RET
  1292  
  1293  TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
        	// Arguments: res+0(FP) is the destination pointer, in1+8(FP) the source
        	// pointer. The source is read as two 32-byte halves (called x at offset 0
        	// and y at offset 32 below); the destination is written at the same offsets.
        	//
        	// NOTE(review): LDacc, LDt, ST, STt, gfpAddInline, gfpMulBy2Inline and
        	// gfpMulBy2Inline2 are macros defined outside this section. Judging from
        	// how their results are consumed here, LDacc/ST move a 32-byte slot
        	// to/from acc4..acc7, LDt/STt move a slot to/from t0..t3, gfpAddInline
        	// leaves acc + t in t0..t3, gfpMulBy2Inline leaves 2*acc in t0..t3 and
        	// gfpMulBy2Inline2 doubles acc4..acc7 in place -- confirm against the
        	// macros. Under that reading the routine stores
        	//   x_out = x*y + (y - 2x)*(x + y)   (which expands to y^2 - 2*x^2)
        	//   y_out = 0 - 4*x*y
  1294  	// Move input to stack in order to free registers
  1295  	MOVQ res+0(FP), AX
  1296  	MOVQ in1+8(FP), BX
  1297  
  1298  	MOVOU (16*0)(BX), X0
  1299  	MOVOU (16*1)(BX), X1
  1300  	MOVOU (16*2)(BX), X2
  1301  	MOVOU (16*3)(BX), X3
  1302  
  1303  	MOVOU X0, axin(16*0)
  1304  	MOVOU X1, axin(16*1)
  1305  	MOVOU X2, ayin(16*0)
  1306  	MOVOU X3, ayin(16*1)
  1307  
  1308  	// Store pointer to result
  1309  	MOVQ AX, rptr
  1310  	
  1311  	LDacc (axin)
  1312  	LDt (ayin)
  1313  	gfpAddInline // presumably t := x + y
  1314  	STt (cxout) // cxout := x + y
  1315  
  1316  	LDacc (axin)
  1317  	gfpMulBy2Inline // presumably t := 2x
  1318  
  1319  	LDacc (ayin)
  1320  	CALL gfpSubInternal(SB) // acc := acc - t = y - 2x
  1321  
  1322  	LDt (cxout)
  1323  	CALL gfpMulInternal(SB) // acc := acc * t = (y - 2x) * (x + y)
  1324  	ST (cxout)
  1325  
  1326  	LDacc (axin)
  1327  	LDt (ayin)
  1328  	CALL gfpMulInternal(SB) // acc := x * y
  1329  	ST (cyout)
  1330  
  1331  	LDt (cxout)
  1332  	gfpAddInline // presumably t := x*y + (y - 2x)*(x + y)
  1333  
  1334  	// Store x
  1335  	MOVQ rptr, AX
  1336  	MOVQ t0, (16*0 + 8*0)(AX)
  1337  	MOVQ t1, (16*0 + 8*1)(AX)
  1338  	MOVQ t2, (16*0 + 8*2)(AX)
  1339  	MOVQ t3, (16*0 + 8*3)(AX)
  1340  
  1341  	LDacc (cyout)
  1342  	gfpMulBy2Inline2 // presumably acc := 2 * (x*y)
  1343  	gfpMulBy2Inline // presumably t := 4 * (x*y)
  1344  	XORQ acc4, acc4 // acc := 0, so the subtraction below negates t
  1345  	XORQ acc5, acc5
  1346  	XORQ acc6, acc6
  1347  	XORQ acc7, acc7
  1348  	CALL gfpSubInternal(SB) // acc := 0 - t
  1349  
  1350  	// Store y
  1351  	MOVQ rptr, AX
  1352  	///////////////////////
  1353  	MOVQ $0, rptr	// overwrite the stack slot that held the result pointer with zero
  1354  	MOVQ acc4, (16*2 + 8*0)(AX)
  1355  	MOVQ acc5, (16*2 + 8*1)(AX)
  1356  	MOVQ acc6, (16*2 + 8*2)(AX)
  1357  	MOVQ acc7, (16*2 + 8*3)(AX)
  1358  
  1359  	RET
  1360  
  1361  #undef axin // drop the gfp2Square/gfp2SquareU slot names; rptr is defined again with a different offset further down
  1362  #undef ayin
  1363  #undef cxout
  1364  #undef cyout
  1365  #undef rptr
  1366  
  1367  /* ---------------------------------------*/
  1368  #define xin(off) (32*0 + off)(SP) // frame bytes 0..31: copy of input bytes 0..31 (X)
  1369  #define yin(off) (32*1 + off)(SP) // frame bytes 32..63: copy of input bytes 32..63 (Y)
  1370  #define zin(off) (32*2 + off)(SP) // frame bytes 64..95: copy of input bytes 64..95 (Z)
  1371  
  1372  #define xout(off) (32*3 + off)(SP) // 32-byte slot for the X3 intermediate
  1373  #define yout(off) (32*4 + off)(SP) // 32-byte slot for the Y3 intermediate
  1374  #define zout(off) (32*5 + off)(SP) // 32-byte slot for the Z3 intermediate
  1375  #define tmp0(off) (32*6 + off)(SP) // 32-byte slot for the value the step comments call t0
  1376  #define tmp2(off) (32*7 + off)(SP) // 32-byte slot for the value the step comments call t2
  1377  #define rptr	  (32*8)(SP) // frame offset 256: 8-byte slot holding the result pointer
  1378  
  1379  // func curvePointDoubleComplete(c, a *curvePoint)
  1380  TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
        	// Reads 96 bytes from the pointer at in+8(FP) as three 32-byte values
        	// (X at offset 0, Y at 32, Z at 64), copies them to the stack, and writes
        	// X3, Y3, Z3 to the same offsets of the pointer at res+0(FP).
        	// The names t0, t1, t2, X3, Y3, Z3 in the step comments are the variables
        	// of the doubling formula, not the t0..t3 registers.
        	//
        	// NOTE(review): LDacc, LDt, ST, STt, gfpAddInline, gfpMulBy2Inline,
        	// gfpMulBy2Inline2 and gfpSqrInternal are defined outside this section.
        	// From their use here: LDacc/ST move a slot to/from acc4..acc7, LDt/STt
        	// move a slot to/from t0..t3, gfpAddInline and gfpMulBy2Inline leave
        	// their result in t0..t3, and gfpMulBy2Inline2 doubles acc4..acc7 in
        	// place -- confirm against the definitions.
        	// NOTE(review): the FP names res/in differ from the parameter names c/a
        	// in the signature comment above; the assembler ignores these names but
        	// go vet (asmdecl) compares them -- confirm against the Go declaration.
  1381  	MOVQ res+0(FP), AX
  1382  	MOVQ in+8(FP), BX
  1383  
  1384  	MOVOU (16*0)(BX), X0
  1385  	MOVOU (16*1)(BX), X1
  1386  	MOVOU (16*2)(BX), X2
  1387  	MOVOU (16*3)(BX), X3
  1388  	MOVOU (16*4)(BX), X4
  1389  	MOVOU (16*5)(BX), X5
  1390  	
  1391  	MOVOU X0, xin(16*0)
  1392  	MOVOU X1, xin(16*1)
  1393  	MOVOU X2, yin(16*0)
  1394  	MOVOU X3, yin(16*1)
  1395  	MOVOU X4, zin(16*0)
  1396  	MOVOU X5, zin(16*1)
  1397  
  1398  	// Store pointer to result
  1399  	MOVQ AX, rptr
  1400  
  1401  	LDacc (yin)
  1402  	CALL gfpSqrInternal(SB) // t0 := Y^2
  1403  	ST (tmp0)
  1404  
  1405  	gfpMulBy2Inline2        // Z3 := t0 + t0
  1406  	gfpMulBy2Inline2        // Z3 := Z3 + Z3
  1407  	gfpMulBy2Inline         // Z3 := Z3 + Z3
  1408  	STt (zout)	// zout := 8 * t0 after the three doublings above
  1409  
  1410  	LDacc (zin)
  1411  	CALL gfpSqrInternal(SB) // t2 := Z^2
  1412  	MOVQ acc4, acc0 // keep a copy of t2 in acc0..acc3 across the doublings
  1413  	MOVQ acc5, acc1
  1414  	MOVQ acc6, acc2
  1415  	MOVQ acc7, acc3
  1416  	gfpMulBy2Inline2 // four doublings: presumably acc := 16 * t2
  1417  	gfpMulBy2Inline2
  1418  	gfpMulBy2Inline2
  1419  	gfpMulBy2Inline2
  1420  	MOVQ acc0, t0 // t := saved copy of t2
  1421  	MOVQ acc1, t1
  1422  	MOVQ acc2, t2
  1423  	MOVQ acc3, t3	
        	// acc - t = 16*t2 - t2 = 15*t2, i.e. the factor 3b is 15 here
  1424  	CALL gfpSubInternal(SB)  // t2 := 3b * t2
  1425  	ST (tmp2)
  1426  	LDt (zout)
  1427  	CALL gfpMulInternal(SB)  // X3 := Z3 * t2
  1428  	ST (xout)
  1429  
  1430  	LDacc (tmp0)
  1431  	LDt (tmp2)
  1432  	gfpAddInline             // Y3 := t0 + t2
  1433  	STt (yout)
  1434  
  1435  	LDacc (yin)
  1436  	LDt (zin)
  1437  	CALL gfpMulInternal(SB)  // t1 := YZ
  1438  	LDt (zout)
  1439  	CALL gfpMulInternal(SB)  // Z3 := t1 * Z3
  1440  	MOVQ rptr, AX
  1441  	// Store Z
  1442  	MOVQ acc4, (16*4 + 8*0)(AX)
  1443  	MOVQ acc5, (16*4 + 8*1)(AX)
  1444  	MOVQ acc6, (16*4 + 8*2)(AX)
  1445  	MOVQ acc7, (16*4 + 8*3)(AX)	
  1446  
  1447  	LDacc (tmp2) 
  1448  	gfpMulBy2Inline
  1449  	LDacc (tmp2)
  1450  	gfpAddInline            // t2 := t2 + t2 + t2
  1451  	LDacc (tmp0)
  1452  	CALL gfpSubInternal(SB) // t0 := t0 - t2
  1453  	ST (tmp0)
  1454  	LDt (yout)
  1455  	CALL gfpMulInternal(SB) // Y3 = t0 * Y3
  1456  	LDt (xout)
  1457  	gfpAddInline            // Y3 := X3 + Y3
  1458  	MOVQ rptr, AX
  1459  	// Store y
  1460  	MOVQ t0, (16*2 + 8*0)(AX)
  1461  	MOVQ t1, (16*2 + 8*1)(AX)
  1462  	MOVQ t2, (16*2 + 8*2)(AX)
  1463  	MOVQ t3, (16*2 + 8*3)(AX)
  1464  
  1465  	LDacc (xin)
  1466  	LDt (yin)
  1467  	CALL gfpMulInternal(SB) // t1 := XY
  1468  	LDt (tmp0)
  1469  	CALL gfpMulInternal(SB) // X3 := t0 * t1
  1470  	gfpMulBy2Inline         // X3 := X3 + X3
  1471  	MOVQ rptr, AX
  1472  	MOVQ $0, rptr // overwrite the stack slot that held the result pointer with zero
  1473  	// Store x
  1474  	MOVQ t0, (16*0 + 8*0)(AX)
  1475  	MOVQ t1, (16*0 + 8*1)(AX)
  1476  	MOVQ t2, (16*0 + 8*2)(AX)
  1477  	MOVQ t3, (16*0 + 8*3)(AX)
  1478  
  1479  	RET
  1480  
  1481  #undef xin // drop the curvePointDoubleComplete slot names; xout, yout, zout, tmp0, tmp2 and rptr are defined again below
  1482  #undef yin
  1483  #undef zin
  1484  #undef xout
  1485  #undef yout
  1486  #undef zout
  1487  #undef tmp0
  1488  #undef tmp2
  1489  #undef rptr
  1490  
  1491  /* ---------------------------------------*/
  1492  #define x1in(off) (32*0 + off)(SP) // slots 0..2: stack copy of the first input point (X1, Y1, Z1), 32 bytes each
  1493  #define y1in(off) (32*1 + off)(SP)
  1494  #define z1in(off) (32*2 + off)(SP)
  1495  #define x2in(off) (32*3 + off)(SP) // slots 3..5: stack copy of the second input point (X2, Y2, Z2)
  1496  #define y2in(off) (32*4 + off)(SP)
  1497  #define z2in(off) (32*5 + off)(SP)
  1498  #define xout(off) (32*6 + off)(SP) // slots 6..8: intermediates named X3, Y3, Z3 in the step comments
  1499  #define yout(off) (32*7 + off)(SP)
  1500  #define zout(off) (32*8 + off)(SP)
  1501  #define tmp0(off) (32*9 + off)(SP) // slots 9..13: intermediates named t0..t4 in the step comments
  1502  #define tmp1(off) (32*10 + off)(SP)
  1503  #define tmp2(off) (32*11 + off)(SP)
  1504  #define tmp3(off) (32*12 + off)(SP)
  1505  #define tmp4(off) (32*13 + off)(SP)
  1506  #define rptr      (32*14)(SP) // frame offset 448: 8-byte slot holding the result pointer
  1507  
  1508  #define curvePointAddCompleteInline \
        	\// Expects the two input points already copied to the stack slots
        	\// x1in/y1in/z1in and x2in/y2in/z2in and the destination pointer in rptr.
        	\// Writes X3, Y3 and Z3 to offsets 0, 32 and 64 of that destination and
        	\// finally overwrites rptr with zero.
        	\// The names t0..t4, X3, Y3, Z3 in the step comments are the variables of
        	\// the addition formula (kept in the tmp0..tmp4 / xout / yout / zout
        	\// slots), not the t0..t3 registers.
        	\// 3b*v is formed as 16*v - v (four gfpMulBy2Inline2 steps followed by a
        	\// subtraction of v), i.e. 3b = 15.
        	\// NOTE(review): LDacc, LDt, ST, STt, gfpAddInline, gfpMulBy2Inline and
        	\// gfpMulBy2Inline2 are defined outside this section. From their use here:
        	\// LDacc/ST move a slot to/from acc4..acc7, LDt/STt move a slot to/from
        	\// t0..t3, gfpAddInline and gfpMulBy2Inline leave their result in t0..t3,
        	\// and gfpMulBy2Inline2 doubles acc4..acc7 in place -- confirm.
        	\// NOTE(review): the step sequence looks like the complete projective
        	\// addition for curves with a = 0 by Renes, Costello and Batina -- confirm.
  1509  	LDacc (x1in) \
  1510  	LDt (x2in)   \
  1511  	CALL gfpMulInternal(SB) \ // t0 := X1X2
  1512  	ST (tmp0)    \
  1513  	LDacc (y1in) \
  1514  	LDt (y2in)   \
  1515  	CALL gfpMulInternal(SB) \ // t1 := Y1Y2
  1516  	ST (tmp1)    \
  1517  	LDacc (z1in) \
  1518  	LDt (z2in)   \
  1519  	CALL gfpMulInternal(SB) \ // t2 := Z1Z2
  1520  	ST (tmp2)    \
  1521  	\
  1522  	LDacc (x1in) \
  1523  	LDt (y1in)   \
  1524  	gfpAddInline \
  1525  	STt (tmp3)   \            // t3 := X1 + Y1
  1526  	LDacc (x2in) \
  1527  	LDt (y2in)   \
  1528  	gfpAddInline \
  1529  	LDacc (tmp3) \
  1530  	CALL gfpMulInternal(SB) \ // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
  1531  	ST (tmp3)    \
  1532  	LDacc (tmp0) \
  1533  	LDt (tmp1)   \
  1534  	gfpAddInline \
  1535  	LDacc (tmp3) \
  1536  	CALL gfpSubInternal(SB) \ // t3 := t3 - t4 = X1Y2 + X2Y1
  1537  	ST (tmp3)    \
  1538  	\
  1539  	LDacc (y1in) \
  1540  	LDt (z1in)   \
  1541  	gfpAddInline \            // t4 := Y1 + Z1
  1542  	STt (tmp4)   \
  1543  	LDacc (y2in) \ 
  1544  	LDt (z2in)   \
  1545  	gfpAddInline \
  1546  	LDacc (tmp4) \
  1547  	CALL gfpMulInternal(SB) \ // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
  1548  	ST (tmp4)    \
  1549  	LDacc (tmp1) \
  1550  	LDt (tmp2)   \
  1551  	gfpAddInline \
  1552  	LDacc (tmp4) \
  1553  	CALL gfpSubInternal(SB) \ // t4 := t4 - X3 = Y1Z2 + Y2Z1
  1554  	ST (tmp4)    \
  1555  	\
  1556  	LDacc (z1in) \
  1557  	LDt (x1in)   \
  1558  	gfpAddInline \            // X3 := X1 + Z1
  1559  	STt (xout)   \
  1560  	LDacc (z2in) \
  1561  	LDt (x2in)   \
  1562  	gfpAddInline \
  1563  	LDacc (xout) \
  1564  	CALL gfpMulInternal(SB) \ // X3 := X3 * Y3
  1565  	ST (xout)    \
  1566  	LDacc (tmp0) \
  1567  	LDt (tmp2)   \
  1568  	gfpAddInline \
  1569  	LDacc (xout) \
  1570  	CALL gfpSubInternal(SB) \ // Y3 := X3 - Y3 = X1Z2 + X2Z1
  1571  	ST (yout)    \
  1572  	\
  1573  	LDacc (tmp0) \
  1574  	gfpMulBy2Inline \
  1575  	LDacc (tmp0)    \
  1576  	gfpAddInline    \         // t0 := t0 + t0 + t0 = 3X1X2
  1577  	STt (tmp0)   \
  1578  	\
  1579  	LDacc (tmp2) \
  1580  	gfpMulBy2Inline2  \
  1581  	gfpMulBy2Inline2  \
  1582  	gfpMulBy2Inline2  \
  1583  	gfpMulBy2Inline2  \
  1584  	LDt (tmp2)       \
  1585  	CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2
  1586  	ST (tmp2)        \  
  1587  	\
  1588  	LDt (tmp1)       \  
  1589  	gfpAddInline     \        // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
  1590  	STt (zout)       \
  1591  	\
  1592  	LDacc (tmp1)     \
  1593  	LDt (tmp2)       \
  1594  	CALL gfpSubInternal(SB) \ // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
  1595  	ST (tmp1)        \
  1596  	\
  1597  	LDacc (yout)     \
  1598  	gfpMulBy2Inline2  \
  1599  	gfpMulBy2Inline2  \
  1600  	gfpMulBy2Inline2  \
  1601  	gfpMulBy2Inline2  \
  1602  	LDt (yout)       \
  1603  	CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
  1604  	ST (yout)        \
  1605  	\
  1606  	LDt (tmp4)       \
  1607  	CALL gfpMulInternal(SB) \ // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
  1608  	ST (xout)        \
  1609  	\
  1610  	LDacc (tmp1)     \
  1611  	LDt (tmp3)       \
  1612  	CALL gfpMulInternal(SB) \ // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
  1613  	LDt (xout)       \
  1614  	CALL gfpSubInternal(SB) \ // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
  1615  	MOVQ rptr, AX    \
  1616  	\// Store x
  1617  	MOVQ acc4, (16*0 + 8*0)(AX) \
  1618  	MOVQ acc5, (16*0 + 8*1)(AX) \
  1619  	MOVQ acc6, (16*0 + 8*2)(AX) \
  1620  	MOVQ acc7, (16*0 + 8*3)(AX) \
  1621  	\
  1622  	LDacc (yout)     \
  1623  	LDt (tmp0)       \
  1624  	CALL gfpMulInternal(SB) \ // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
  1625  	ST (yout)        \ 
  1626  	\
  1627  	LDacc (tmp1)     \ 
  1628  	LDt (zout)       \
  1629  	CALL gfpMulInternal(SB) \ // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
  1630  	LDt (yout)       \  
  1631  	gfpAddInline     \        // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
  1632  	MOVQ rptr, AX    \
  1633  	\// Store y
  1634  	MOVQ t0, (16*2 + 8*0)(AX) \
  1635  	MOVQ t1, (16*2 + 8*1)(AX) \
  1636  	MOVQ t2, (16*2 + 8*2)(AX) \
  1637  	MOVQ t3, (16*2 + 8*3)(AX) \
  1638  	\
  1639  	LDacc (tmp0)     \    
  1640  	LDt (tmp3)       \
  1641  	CALL gfpMulInternal(SB) \ // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
  1642  	ST (tmp0)        \
  1643  	LDacc (zout)     \
  1644  	LDt (tmp4)       \
  1645  	CALL gfpMulInternal(SB) \ // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
  1646  	LDt (tmp0)       \
  1647  	gfpAddInline     \        // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
  1648  	MOVQ rptr, AX    \
  1649  	MOVQ $0, rptr    \
  1650  	\// Store z
  1651  	MOVQ t0, (16*4 + 8*0)(AX) \
  1652  	MOVQ t1, (16*4 + 8*1)(AX) \
  1653  	MOVQ t2, (16*4 + 8*2)(AX) \
  1654  	MOVQ t3, (16*4 + 8*3)(AX) \
  1655  
  1656  // func curvePointAddComplete(c, a, b *curvePoint)
  1657  TEXT ·curvePointAddComplete(SB),0,$480-24
        	// Copies the two 96-byte inputs at in1+8(FP) and in2+16(FP) into the
        	// stack slots x1in..z1in and x2in..z2in, saves the pointer at res+0(FP)
        	// in rptr, and then expands curvePointAddCompleteInline, which writes
        	// the three 32-byte results through that pointer.
        	// The two paths below differ only in how the inputs are copied:
        	// 16-byte MOVOU moves, or 32-byte VMOVDQU moves when the supportAVX2
        	// byte equals 1.
        	// NOTE(review): the FP names res/in1/in2 differ from the parameter names
        	// c/a/b in the signature comment above; the assembler ignores these
        	// names but go vet (asmdecl) compares them -- confirm against the Go
        	// declaration.
  1658  	// Move input to stack in order to free registers
  1659  	MOVQ res+0(FP), AX
  1660  	MOVQ in1+8(FP), BX
  1661  	MOVQ in2+16(FP), CX
  1662  
  1663  	CMPB ·supportAVX2+0(SB), $0x01
  1664  	JEQ  pointadd_avx2 // supportAVX2 == 1: take the YMM copy path
  1665  
  1666  	MOVOU (16*0)(BX), X0
  1667  	MOVOU (16*1)(BX), X1
  1668  	MOVOU (16*2)(BX), X2
  1669  	MOVOU (16*3)(BX), X3
  1670  	MOVOU (16*4)(BX), X4
  1671  	MOVOU (16*5)(BX), X5
  1672  
  1673  	MOVOU X0, x1in(16*0)
  1674  	MOVOU X1, x1in(16*1)
  1675  	MOVOU X2, y1in(16*0)
  1676  	MOVOU X3, y1in(16*1)
  1677  	MOVOU X4, z1in(16*0)
  1678  	MOVOU X5, z1in(16*1)
  1679  
  1680  	MOVOU (16*0)(CX), X0
  1681  	MOVOU (16*1)(CX), X1
  1682  	MOVOU (16*2)(CX), X2
  1683  	MOVOU (16*3)(CX), X3
  1684  	MOVOU (16*4)(CX), X4
  1685  	MOVOU (16*5)(CX), X5
  1686  
  1687  	MOVOU X0, x2in(16*0)
  1688  	MOVOU X1, x2in(16*1)
  1689  	MOVOU X2, y2in(16*0)
  1690  	MOVOU X3, y2in(16*1)
  1691  	MOVOU X4, z2in(16*0)
  1692  	MOVOU X5, z2in(16*1)
  1693  	// Store pointer to result
  1694  	MOVQ AX, rptr
  1695  	
  1696  	curvePointAddCompleteInline
  1697  
  1698  	RET
  1699  	
  1700  pointadd_avx2:	
        	// Same copy as above, one 32-byte coordinate per move.
  1701  	VMOVDQU (32*0)(BX), Y0
  1702  	VMOVDQU (32*1)(BX), Y1
  1703  	VMOVDQU (32*2)(BX), Y2
  1704  
  1705  	VMOVDQU Y0, x1in(32*0)
  1706  	VMOVDQU Y1, y1in(32*0)
  1707  	VMOVDQU Y2, z1in(32*0)
  1708  
  1709  	VMOVDQU (32*0)(CX), Y0
  1710  	VMOVDQU (32*1)(CX), Y1
  1711  	VMOVDQU (32*2)(CX), Y2
  1712  
  1713  	VMOVDQU Y0, x2in(32*0)
  1714  	VMOVDQU Y1, y2in(32*0)
  1715  	VMOVDQU Y2, z2in(32*0)
  1716  
  1717  	// Store pointer to result
  1718  	MOVQ AX, rptr
  1719  	curvePointAddCompleteInline
  1720  
  1721  	VZEROUPPER // clear the upper YMM halves written by the VMOVDQU loads before returning
  1722  	RET
  1723  
  1724  #undef x1in // drop the stack-slot names defined for curvePointAddComplete
  1725  #undef y1in
  1726  #undef z1in
  1727  #undef x2in
  1728  #undef y2in
  1729  #undef z2in
  1730  #undef xout
  1731  #undef yout
  1732  #undef zout
  1733  #undef tmp0
  1734  #undef tmp1
  1735  #undef tmp2
  1736  #undef tmp3
  1737  #undef tmp4
  1738  #undef rptr
  1739  
  1740  /* ---------------------------------------*/
  1741  /*
  1742  // gfpIsZero returns 1 in AX if [acc4..acc7] represents zero and zero
  1743  // otherwise. It writes to [acc4..acc7], t0 and t1.
  1744  TEXT gfpIsZero(SB),NOSPLIT,$0
  1745  	// AX contains a flag that is set if the input is zero.
  1746  	XORQ AX, AX
  1747  	MOVQ $1, t1
  1748  
  1749  	// Check whether [acc4..acc7] are all zero.
  1750  	MOVQ acc4, t0
  1751  	ORQ acc5, t0
  1752  	ORQ acc6, t0
  1753  	ORQ acc7, t0
  1754  
  1755  	// Set the flag in AX if so. (CMOV of a constant to a register doesn't
  1756  	// appear to be supported in Go, hence the constant 1 is kept in t1.)
  1757  	CMOVQEQ t1, AX
  1758  
  1759  	// XOR [acc4..acc7] with P and compare with zero again.
  1760  	XORQ ·p2+0(SB), acc4
  1761  	XORQ ·p2+8(SB), acc5
  1762  	XORQ ·p2+16(SB), acc6
  1763  	XORQ ·p2+24(SB), acc7
  1764  	ORQ acc5, acc4
  1765  	ORQ acc6, acc4
  1766  	ORQ acc7, acc4
  1767  
  1768  	// Set the flag in AX if so.
  1769  	CMOVQEQ t1, AX
  1770  	RET
  1771  
  1772  #define x1in(off) (32*0 + off)(SP)
  1773  #define y1in(off) (32*1 + off)(SP)
  1774  #define z1in(off) (32*2 + off)(SP)
  1775  #define x2in(off) (32*3 + off)(SP)
  1776  #define y2in(off) (32*4 + off)(SP)
  1777  #define z2in(off) (32*5 + off)(SP)
  1778  
  1779  #define xout(off) (32*6 + off)(SP)
  1780  #define yout(off) (32*7 + off)(SP)
  1781  #define zout(off) (32*8 + off)(SP)
  1782  
  1783  #define u1(off)    (32*9 + off)(SP)
  1784  #define u2(off)    (32*10 + off)(SP)
  1785  #define s1(off)    (32*11 + off)(SP)
  1786  #define s2(off)    (32*12 + off)(SP)
  1787  #define z1sqr(off) (32*13 + off)(SP)
  1788  #define z2sqr(off) (32*14 + off)(SP)
  1789  #define h(off)     (32*15 + off)(SP)
  1790  #define r(off)     (32*16 + off)(SP)
  1791  #define hsqr(off)  (32*17 + off)(SP)
  1792  #define rsqr(off)  (32*18 + off)(SP)
  1793  #define hcub(off)  (32*19 + off)(SP)
  1794  #define rptr       (32*20)(SP)
  1795  #define points_eq  (32*20+8)(SP)
  1796  
  1797  #define curvePointAddInline \
  1798  	\// Begin point add
  1799  	LDacc (z2in)                 \
  1800  	CALL gfpSqrInternal(SB)	 \// z2ˆ2
  1801  	ST (z2sqr)                   \
  1802  	LDt (z2in)                   \
  1803  	CALL gfpMulInternal(SB)	 \// z2ˆ3
  1804  	LDt (y1in)                   \
  1805  	CALL gfpMulInternal(SB)	 \// s1 = z2ˆ3*y1
  1806  	ST (s1)                      \
  1807  	\
  1808  	LDacc (z1in)                 \ 
  1809  	CALL gfpSqrInternal(SB)	 \// z1ˆ2
  1810  	ST (z1sqr)                   \
  1811  	LDt (z1in)                   \
  1812  	CALL gfpMulInternal(SB)	 \// z1ˆ3
  1813  	LDt (y2in)                   \
  1814  	CALL gfpMulInternal(SB)	 \// s2 = z1ˆ3*y2
  1815  	ST (s2)                      \ 
  1816  	\
  1817  	LDt (s1)                     \
  1818  	CALL gfpSubInternal(SB)	 \// r = s2 - s1
  1819  	ST (r)                       \
  1820  	CALL gfpIsZero(SB)       \
  1821  	MOVQ AX, points_eq           \
  1822  	\
  1823  	LDacc (z2sqr)                \
  1824  	LDt (x1in)                   \
  1825  	CALL gfpMulInternal(SB)	 \// u1 = x1 * z2ˆ2
  1826  	ST (u1)                      \
  1827  	LDacc (z1sqr)                \
  1828  	LDt (x2in)                   \ 
  1829  	CALL gfpMulInternal(SB)	 \// u2 = x2 * z1ˆ2
  1830  	ST (u2)                      \
  1831  	\
  1832  	LDt (u1)                     \ 
  1833  	CALL gfpSubInternal(SB)	 \// h = u2 - u1
  1834  	ST (h)                       \
  1835  	CALL gfpIsZero(SB)       \
  1836  	ANDQ points_eq, AX           \
  1837  	MOVQ AX, points_eq           \
  1838  	\
  1839  	LDacc (r)                    \
  1840  	CALL gfpSqrInternal(SB)	 \// rsqr = rˆ2
  1841  	ST (rsqr)                    \
  1842  	\
  1843  	LDacc (h)                    \
  1844  	CALL gfpSqrInternal(SB)	 \// hsqr = hˆ2
  1845  	ST (hsqr)                    \
  1846  	\
  1847  	LDt (h)                      \
  1848  	CALL gfpMulInternal(SB)	 \// hcub = hˆ3
  1849  	ST (hcub)                    \
  1850  	\
  1851  	LDt (s1)                     \
  1852  	CALL gfpMulInternal(SB)  \
  1853  	ST (s2)                      \
  1854  	\
  1855  	LDacc (z1in)                 \
  1856  	LDt (z2in)                   \
  1857  	CALL gfpMulInternal(SB)	 \// z1 * z2
  1858  	LDt (h)                      \
  1859  	CALL gfpMulInternal(SB)	 \// z1 * z2 * h
  1860  	ST (zout)                    \
  1861  	\
  1862  	LDacc (hsqr)                 \
  1863  	LDt (u1)                     \
  1864  	CALL gfpMulInternal(SB)	 \// hˆ2 * u1
  1865  	ST (u2)                      \
  1866  	\
  1867  	gfpMulBy2Inline	         \// u1 * hˆ2 * 2, inline
  1868  	LDacc (rsqr)                 \
  1869  	CALL gfpSubInternal(SB)	 \// rˆ2 - u1 * hˆ2 * 2
  1870  	\
  1871  	LDt (hcub)                   \
  1872  	CALL gfpSubInternal(SB)  \
  1873  	ST (xout)                    \
  1874  	\
  1875  	MOVQ acc4, t0                \
  1876  	MOVQ acc5, t1                \
  1877  	MOVQ acc6, t2                \
  1878  	MOVQ acc7, t3                \
  1879  	LDacc (u2)                   \
  1880  	CALL gfpSubInternal(SB)  \
  1881  	\
  1882  	LDt (r)                      \
  1883  	CALL gfpMulInternal(SB)  \
  1884  	\
  1885  	LDt (s2)                     \
  1886  	CALL gfpSubInternal(SB)  \
  1887  	ST (yout)                    \
  1888  
  1889  // func curvePointAdd(c, a, b *curvePoint) int
  1890  TEXT ·curvePointAdd(SB),0,$680-32
  1891  	// Move input to stack in order to free registers
  1892  	MOVQ res+0(FP), AX
  1893  	MOVQ in1+8(FP), BX
  1894  	MOVQ in2+16(FP), CX
  1895  
  1896  	MOVOU (16*0)(BX), X0
  1897  	MOVOU (16*1)(BX), X1
  1898  	MOVOU (16*2)(BX), X2
  1899  	MOVOU (16*3)(BX), X3
  1900  	MOVOU (16*4)(BX), X4
  1901  	MOVOU (16*5)(BX), X5
  1902  
  1903  	MOVOU X0, x1in(16*0)
  1904  	MOVOU X1, x1in(16*1)
  1905  	MOVOU X2, y1in(16*0)
  1906  	MOVOU X3, y1in(16*1)
  1907  	MOVOU X4, z1in(16*0)
  1908  	MOVOU X5, z1in(16*1)
  1909  
  1910  	MOVOU (16*0)(CX), X0
  1911  	MOVOU (16*1)(CX), X1
  1912  	MOVOU (16*2)(CX), X2
  1913  	MOVOU (16*3)(CX), X3
  1914  	MOVOU (16*4)(CX), X4
  1915  	MOVOU (16*5)(CX), X5
  1916  
  1917  	MOVOU X0, x2in(16*0)
  1918  	MOVOU X1, x2in(16*1)
  1919  	MOVOU X2, y2in(16*0)
  1920  	MOVOU X3, y2in(16*1)
  1921  	MOVOU X4, z2in(16*0)
  1922  	MOVOU X5, z2in(16*1)
  1923  	// Store pointer to result
  1924  	MOVQ AX, rptr
  1925  
  1926  	curvePointAddInline
  1927  
  1928  	MOVOU xout(16*0), X0
  1929  	MOVOU xout(16*1), X1
  1930  	MOVOU yout(16*0), X2
  1931  	MOVOU yout(16*1), X3
  1932  	MOVOU zout(16*0), X4
  1933  	MOVOU zout(16*1), X5
  1934  	// Finally output the result
  1935  	MOVQ rptr, AX
  1936  	MOVQ $0, rptr
  1937  	MOVOU X0, (16*0)(AX)
  1938  	MOVOU X1, (16*1)(AX)
  1939  	MOVOU X2, (16*2)(AX)
  1940  	MOVOU X3, (16*3)(AX)
  1941  	MOVOU X4, (16*4)(AX)
  1942  	MOVOU X5, (16*5)(AX)
  1943  
  1944  	MOVQ points_eq, AX
  1945  	MOVQ AX, ret+24(FP)
  1946  
  1947  	RET
  1948  */