github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_arm64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
        // Register aliases used by gfpMul, gfpSqr, gfpFromMont and gfpMarshal.
        //
        // Pointer arguments. gfpSqr also loads its iteration count n into b_ptr.
     5  #define res_ptr R0
     6  #define a_ptr R1
     7  #define b_ptr R2
     8  
        // acc0..acc3: accumulator limbs that the reduction steps rotate through.
     9  #define acc0 R3
    10  #define acc1 R4
    11  #define acc2 R5
    12  #define acc3 R6
    13  
        // acc4..acc7: upper accumulator limbs (added onto acc0..acc3 at the end of
        // gfpMul and gfpSqr).
    14  #define acc4 R7
    15  #define acc5 R8
    16  #define acc6 R9
    17  #define acc7 R10
        // t0..t3: scratch registers.
    18  #define t0 R11
    19  #define t1 R12
    20  #define t2 R13
    21  #define t3 R14
        // const0/const1: the two low 64-bit limbs loaded from ·p2.
    22  #define const0 R15
    23  #define const1 R16
    24  
        // hlp0: per-step reduction multiplier.
        // hlp1 is the SAME register as res_ptr (R0); it holds ·np during the
        // arithmetic, which is why the result pointer is only loaded from the
        // frame right before the final stores.
    25  #define hlp0 R17
    26  #define hlp1 res_ptr
    27  
        // x0..x3 and y0..y3: operand limbs, least-significant limb first. y0..y3 are
        // also reused as scratch once their operand value is no longer needed.
    28  #define x0 R19
    29  #define x1 R20
    30  #define x2 R21
    31  #define x3 R22
    32  #define y0 R23
    33  #define y1 R24
    34  #define y2 R25
    35  #define y3 R26
    36  
        // const2/const3 (the two high limbs loaded from ·p2) share registers with
        // t2/t3, so t2 and t3 must not be written while the modulus is still needed.
    37  #define const2 t2
    38  #define const3 t3
    39  
    40  #define storeBlock(a0,a1,a2,a3, r) \
    41  	MOVD a0,  0+r \
    42  	MOVD a1,  8+r \
    43  	MOVD a2, 16+r \
    44  	MOVD a3, 24+r
    45  
    46  #define loadBlock(r, a0,a1,a2,a3) \
    47  	MOVD  0+r, a0 \
    48  	MOVD  8+r, a1 \
    49  	MOVD 16+r, a2 \
    50  	MOVD 24+r, a3
    51  
    52  #define loadModulus(p0,p1,p2,p3) \
    53  	MOVD ·p2+0(SB), p0 \
    54  	MOVD ·p2+8(SB), p1 \
    55  	MOVD ·p2+16(SB), p2 \
    56  	MOVD ·p2+24(SB), p3
    57  
    58  TEXT ·gfpNeg(SB),0,$0-16
    59  	MOVD a+8(FP), R0
    60  	loadBlock(0(R0), R1,R2,R3,R4)
    61  	loadModulus(R5,R6,R7,R8)
    62  
    63  	SUBS R1, R5, R1
    64  	SBCS R2, R6, R2
    65  	SBCS R3, R7, R3
    66  	SBCS R4, R8, R4
    67  
    68  	SUBS R5, R1, R5
    69  	SBCS R6, R2, R6
    70  	SBCS R7, R3, R7
    71  	SBCS R8, R4, R8
    72  
    73  	CSEL CS, R5, R1, R1
    74  	CSEL CS, R6, R2, R2
    75  	CSEL CS, R7, R3, R3
    76  	CSEL CS, R8, R4, R4
    77  
    78  	MOVD c+0(FP), R0
    79  	storeBlock(R1,R2,R3,R4, 0(R0))
    80  	RET
    81  
    82  TEXT ·gfpAdd(SB),0,$0-24
    83  	MOVD a+8(FP), R0
    84  	loadBlock(0(R0), R1,R2,R3,R4)
    85  	MOVD b+16(FP), R0
    86  	loadBlock(0(R0), R5,R6,R7,R8)
    87  	loadModulus(R9,R10,R11,R12)
    88  	MOVD ZR, R0
    89  
    90  	ADDS R5, R1
    91  	ADCS R6, R2
    92  	ADCS R7, R3
    93  	ADCS R8, R4
    94  	ADCS ZR, R0
    95  
    96  	SUBS  R9, R1, R5
    97  	SBCS R10, R2, R6
    98  	SBCS R11, R3, R7
    99  	SBCS R12, R4, R8
   100  	SBCS  ZR, R0, R0
   101  
   102  	CSEL CS, R5, R1, R1
   103  	CSEL CS, R6, R2, R2
   104  	CSEL CS, R7, R3, R3
   105  	CSEL CS, R8, R4, R4
   106  
   107  	MOVD c+0(FP), R0
   108  	storeBlock(R1,R2,R3,R4, 0(R0))
   109  	RET
   110  
   111  TEXT ·gfpDouble(SB),0,$0-16
   112  	MOVD a+8(FP), R0
   113  	loadBlock(0(R0), R1,R2,R3,R4)
   114  	loadModulus(R9,R10,R11,R12)
   115  	MOVD ZR, R0
   116  
   117  	ADDS R1, R1
   118  	ADCS R2, R2
   119  	ADCS R3, R3
   120  	ADCS R4, R4
   121  	ADCS ZR, R0
   122  
   123  	SUBS  R9, R1, R5
   124  	SBCS R10, R2, R6
   125  	SBCS R11, R3, R7
   126  	SBCS R12, R4, R8
   127  	SBCS  ZR, R0, R0
   128  
   129  	CSEL CS, R5, R1, R1
   130  	CSEL CS, R6, R2, R2
   131  	CSEL CS, R7, R3, R3
   132  	CSEL CS, R8, R4, R4
   133  
   134  	MOVD c+0(FP), R0
   135  	storeBlock(R1,R2,R3,R4, 0(R0))
   136  	RET
   137  
   138  TEXT ·gfpTriple(SB),0,$0-16
   139  	MOVD a+8(FP), R0
   140  	loadBlock(0(R0), R1,R2,R3,R4)
   141  	MOVD R1, R19
   142  	MOVD R2, R20
   143  	MOVD R3, R21
   144  	MOVD R4, R22
   145  	loadModulus(R9,R10,R11,R12)
   146  	MOVD ZR, R0
   147  
   148  	ADDS R1, R1
   149  	ADCS R2, R2
   150  	ADCS R3, R3
   151  	ADCS R4, R4
   152  	ADCS ZR, R0
   153  
   154  	SUBS  R9, R1, R5
   155  	SBCS R10, R2, R6
   156  	SBCS R11, R3, R7
   157  	SBCS R12, R4, R8
   158  	SBCS  ZR, R0, R0
   159  
   160  	CSEL CS, R5, R1, R1
   161  	CSEL CS, R6, R2, R2
   162  	CSEL CS, R7, R3, R3
   163  	CSEL CS, R8, R4, R4
   164  
   165  	MOVD ZR, R0
   166  
   167  	ADDS R19, R1
   168  	ADCS R20, R2
   169  	ADCS R21, R3
   170  	ADCS R22, R4
   171  	ADCS ZR, R0
   172  
   173  	SUBS  R9, R1, R5
   174  	SBCS R10, R2, R6
   175  	SBCS R11, R3, R7
   176  	SBCS R12, R4, R8
   177  	SBCS  ZR, R0, R0
   178  
   179  	CSEL CS, R5, R1, R1
   180  	CSEL CS, R6, R2, R2
   181  	CSEL CS, R7, R3, R3
   182  	CSEL CS, R8, R4, R4
   183  
   184  	MOVD c+0(FP), R0
   185  	storeBlock(R1,R2,R3,R4, 0(R0))
   186  	RET
   187  
   188  TEXT ·gfpSub(SB),0,$0-24
   189  	MOVD a+8(FP), R0
   190  	loadBlock(0(R0), R1,R2,R3,R4)
   191  	MOVD b+16(FP), R0
   192  	loadBlock(0(R0), R5,R6,R7,R8)
   193  	loadModulus(R9,R10,R11,R12)
   194  
   195  	SUBS R5, R1
   196  	SBCS R6, R2
   197  	SBCS R7, R3
   198  	SBCS R8, R4
   199  
   200  	CSEL CS, ZR,  R9,  R9
   201  	CSEL CS, ZR, R10, R10
   202  	CSEL CS, ZR, R11, R11
   203  	CSEL CS, ZR, R12, R12
   204  
   205  	ADDS  R9, R1
   206  	ADCS R10, R2
   207  	ADCS R11, R3
   208  	ADCS R12, R4
   209  
   210  	MOVD c+0(FP), R0
   211  	storeBlock(R1,R2,R3,R4, 0(R0))
   212  	RET
   213  
        // gfpMul multiplies two 4-limb (256-bit) values with interleaved word-wise
        // reduction and writes the 4-limb result.
        //   res+0(FP):  destination pointer
        //   in1+8(FP):  first operand pointer  (limbs loaded into x0..x3)
        //   in2+16(FP): second operand pointer (limbs loaded into y0..y3)
        // Each of the four rounds adds one product row y[i]*x and then runs one
        // reduction step that multiplies the lowest live limb by ·np and adds that
        // multiple of the modulus (·p2), which lets the lowest limb be dropped.
        // Reduction results rotate through acc0..acc3 while the product tail
        // collects in acc4..acc7; both are summed before one conditional
        // subtraction of the modulus.
        // NOTE(review): this is the shape of word-by-word Montgomery multiplication,
        // i.e. res = in1*in2*2^-256 mod p provided ·np == -p^-1 mod 2^64 for the p
        // stored in ·p2. Neither constant is defined in this file - confirm.
   214  TEXT ·gfpMul(SB),NOSPLIT,$0
   215  	MOVD	in1+8(FP), a_ptr
   216  	MOVD	in2+16(FP), b_ptr
   217  
        	// hlp1 is R0, the same register as res_ptr, so the result pointer is
        	// only fetched at the very end.
   218  	MOVD	·np+0x00(SB), hlp1
   219  	LDP	·p2+0x00(SB), (const0, const1)
   220  	LDP	·p2+0x10(SB), (const2, const3)
   221  
   222  	LDP	0*16(a_ptr), (x0, x1)
   223  	LDP	1*16(a_ptr), (x2, x3)
   224  	LDP	0*16(b_ptr), (y0, y1)
   225  	LDP	1*16(b_ptr), (y2, y3)
   226  
   227  	// y[0] * x
        	// acc4:acc3:acc2:acc1:acc0 = y0 * (x3:x2:x1:x0)
   228  	MUL	y0, x0, acc0
   229  	UMULH	y0, x0, acc1
   230  
   231  	MUL	y0, x1, t0
   232  	ADDS	t0, acc1
   233  	UMULH	y0, x1, acc2
   234  
   235  	MUL	y0, x2, t0
   236  	ADCS	t0, acc2
   237  	UMULH	y0, x2, acc3
   238  
   239  	MUL	y0, x3, t0
   240  	ADCS	t0, acc3
   241  	UMULH	y0, x3, acc4
   242  	ADC	$0, acc4
   243  	// First reduction step
        	// hlp0 = low64(acc0 * np). Low halves of hlp0*p are added first (carry
        	// chain 1, overflow into acc4); the high halves follow in carry chain 2.
        	// y0 is free from here on and is reused as scratch.
   244  	MUL	acc0, hlp1, hlp0
   245  
   246  	MUL	const0, hlp0, t0
   247  	ADDS	t0, acc0, acc0
   248  	UMULH	const0, hlp0, t1
   249  
   250  	MUL	const1, hlp0, t0
   251  	ADCS	t0, acc1, acc1
   252  	UMULH	const1, hlp0, y0
   253  
   254  	MUL	const2, hlp0, t0
   255  	ADCS	t0, acc2, acc2
        	// acc0's sum is no longer needed; the register now carries high64(p[2]*hlp0).
   256  	UMULH	const2, hlp0, acc0
   257  
   258  	MUL	const3, hlp0, t0
   259  	ADCS	t0, acc3, acc3
   260  
   261  	UMULH	const3, hlp0, hlp0
   262  	ADC	$0, acc4
   263  
        	// Carry chain 2: add the high halves one limb up. acc0 becomes the new
        	// top limb of the reduced value (live limbs: acc1, acc2, acc3, acc0).
   264  	ADDS	t1, acc1, acc1
   265  	ADCS	y0, acc2, acc2
   266  	ADCS	acc0, acc3, acc3
   267  	ADC	$0, hlp0, acc0
   268  	// y[1] * x
        	// Low halves go into acc1..acc4, then high halves into acc2..acc5.
        	// y1 is overwritten with high64(y1*x3) by its last multiply.
   269  	MUL	y1, x0, t0
   270  	ADDS	t0, acc1
   271  	UMULH	y1, x0, t1
   272  
   273  	MUL	y1, x1, t0
   274  	ADCS	t0, acc2
   275  	UMULH	y1, x1, hlp0
   276  
   277  	MUL	y1, x2, t0
   278  	ADCS	t0, acc3
   279  	UMULH	y1, x2, y0
   280  
   281  	MUL	y1, x3, t0
   282  	ADCS	t0, acc4
   283  	UMULH	y1, x3, y1
   284  	ADC	$0, ZR, acc5
   285  
   286  	ADDS	t1, acc2
   287  	ADCS	hlp0, acc3
   288  	ADCS	y0, acc4
   289  	ADC	y1, acc5
   290  	// Second reduction step
        	// Same pattern on live limbs acc1, acc2, acc3, acc0; overflow goes to acc5
        	// and acc1 becomes the new top limb.
   291  	MUL	acc1, hlp1, hlp0
   292  
   293  	MUL	const0, hlp0, t0
   294  	ADDS	t0, acc1, acc1
   295  	UMULH	const0, hlp0, t1
   296  
   297  	MUL	const1, hlp0, t0
   298  	ADCS	t0, acc2, acc2
   299  	UMULH	const1, hlp0, y0
   300  
   301  	MUL	const2, hlp0, t0
   302  	ADCS	t0, acc3, acc3
   303  	UMULH	const2, hlp0, acc1
   304  
   305  	MUL	const3, hlp0, t0
   306  	ADCS	t0, acc0, acc0
   307  
   308  	UMULH	const3, hlp0, hlp0
   309  	ADC	$0, acc5
   310  
   311  	ADDS	t1, acc2, acc2
   312  	ADCS	y0, acc3, acc3
   313  	ADCS	acc1, acc0, acc0
   314  	ADC	$0, hlp0, acc1
   315  	// y[2] * x
        	// Low halves go into acc2..acc5, then high halves into acc3..acc6.
        	// y1 is scratch here (its operand value was consumed in the previous row).
   316  	MUL	y2, x0, t0
   317  	ADDS	t0, acc2
   318  	UMULH	y2, x0, t1
   319  
   320  	MUL	y2, x1, t0
   321  	ADCS	t0, acc3
   322  	UMULH	y2, x1, hlp0
   323  
   324  	MUL	y2, x2, t0
   325  	ADCS	t0, acc4
   326  	UMULH	y2, x2, y0
   327  
   328  	MUL	y2, x3, t0
   329  	ADCS	t0, acc5
   330  	UMULH	y2, x3, y1
   331  	ADC	$0, ZR, acc6
   332  
   333  	ADDS	t1, acc3
   334  	ADCS	hlp0, acc4
   335  	ADCS	y0, acc5
   336  	ADC	y1, acc6
   337  	// Third reduction step
        	// Live limbs acc2, acc3, acc0, acc1; overflow goes to acc6 and acc2
        	// becomes the new top limb.
   338  	MUL	acc2, hlp1, hlp0
   339  
   340  	MUL	const0, hlp0, t0
   341  	ADDS	t0, acc2, acc2
   342  	UMULH	const0, hlp0, t1
   343  
   344  	MUL	const1, hlp0, t0
   345  	ADCS	t0, acc3, acc3
   346  	UMULH	const1, hlp0, y0
   347  
   348  	MUL	const2, hlp0, t0
   349  	ADCS	t0, acc0, acc0
   350  	UMULH	const2, hlp0, acc2
   351  
   352  	MUL	const3, hlp0, t0
   353  	ADCS	t0, acc1, acc1
   354  
   355  	UMULH	const3, hlp0, hlp0
   356  	ADC	$0, acc6
   357  
   358  	ADDS	t1, acc3, acc3
   359  	ADCS	y0, acc0, acc0
   360  	ADCS	acc2, acc1, acc1
   361  	ADC	$0, hlp0, acc2
   362  	// y[3] * x
        	// Low halves go into acc3..acc6, then high halves into acc4..acc7.
   363  	MUL	y3, x0, t0
   364  	ADDS	t0, acc3
   365  	UMULH	y3, x0, t1
   366  
   367  	MUL	y3, x1, t0
   368  	ADCS	t0, acc4
   369  	UMULH	y3, x1, hlp0
   370  
   371  	MUL	y3, x2, t0
   372  	ADCS	t0, acc5
   373  	UMULH	y3, x2, y0
   374  
   375  	MUL	y3, x3, t0
   376  	ADCS	t0, acc6
   377  	UMULH	y3, x3, y1
   378  	ADC	$0, ZR, acc7
   379  
   380  	ADDS	t1, acc4
   381  	ADCS	hlp0, acc5
   382  	ADCS	y0, acc6
   383  	ADC	y1, acc7
   384  	// Last reduction step
        	// Live limbs acc3, acc0, acc1, acc2; overflow goes to acc7 and acc3
        	// becomes the new top limb, restoring the natural acc0..acc3 order.
   385  	MUL	acc3, hlp1, hlp0
   386  
   387  	MUL	const0, hlp0, t0
   388  	ADDS	t0, acc3, acc3
   389  	UMULH	const0, hlp0, t1
   390  
   391  	MUL	const1, hlp0, t0
   392  	ADCS	t0, acc0, acc0
   393  	UMULH	const1, hlp0, y0
   394  
   395  	MUL	const2, hlp0, t0
   396  	ADCS	t0, acc1, acc1
   397  	UMULH	const2, hlp0, acc3
   398  
   399  	MUL	const3, hlp0, t0
   400  	ADCS	t0, acc2, acc2
   401  
   402  	UMULH	const3, hlp0, hlp0
   403  	ADC	$0, acc7
   404  
   405  	ADDS	t1, acc0, acc0
   406  	ADCS	y0, acc1, acc1
   407  	ADCS	acc3, acc2, acc2
   408  	ADC	$0, hlp0, acc3
   409  
        	// Merge the product tail (acc4..acc7) into the reduced limbs; acc4 is then
        	// reused as the fifth (carry) limb of the sum.
   410  	ADDS	acc4, acc0, acc0
   411  	ADCS	acc5, acc1, acc1
   412  	ADCS	acc6, acc2, acc2
   413  	ADCS	acc7, acc3, acc3
   414  	ADC	$0, ZR, acc4
   415  
        	// Trial subtraction of the modulus across all five limbs. const2/const3
        	// live in t2/t3, so the third and fourth SBCS overwrite the modulus limb
        	// they have just read; the modulus is not needed afterwards.
   416  	SUBS	const0, acc0, t0
   417  	SBCS	const1, acc1, t1
   418  	SBCS	const2, acc2, t2
   419  	SBCS	const3, acc3, t3
   420  	SBCS	$0, acc4, acc4
   421  
        	// Carry set means no borrow (value >= modulus): keep the subtracted limbs.
   422  	CSEL	CS, t0, acc0, acc0
   423  	CSEL	CS, t1, acc1, acc1
   424  	CSEL	CS, t2, acc2, acc2
   425  	CSEL	CS, t3, acc3, acc3
   426  
        	// R0 held ·np until now; load the destination pointer into it.
   427  	MOVD	res+0(FP), res_ptr
   428  	STP	(acc0, acc1), 0*16(res_ptr)
   429  	STP	(acc2, acc3), 1*16(res_ptr)
   430  
   431  	RET
   432  
   433  // func gfpSqr(res, in *gfP, n int)
   434  TEXT ·gfpSqr(SB),NOSPLIT,$0
   435  	MOVD	in+8(FP), a_ptr
   436  	MOVD	n+16(FP), b_ptr
   437  
   438  	MOVD	·np+0x00(SB), hlp1
   439  	LDP	·p2+0x00(SB), (const0, const1)
   440  	LDP	·p2+0x10(SB), (const2, const3)
   441  
   442  	LDP	0*16(a_ptr), (x0, x1)
   443  	LDP	1*16(a_ptr), (x2, x3)
   444  
   445  ordSqrLoop:
   446  	SUB	$1, b_ptr
   447  
   448  	// x[1:] * x[0]
   449  	MUL	x0, x1, acc1
   450  	UMULH	x0, x1, acc2
   451  
   452  	MUL	x0, x2, t0
   453  	ADDS	t0, acc2, acc2
   454  	UMULH	x0, x2, acc3
   455  
   456  	MUL	x0, x3, t0
   457  	ADCS	t0, acc3, acc3
   458  	UMULH	x0, x3, acc4
   459  	ADC	$0, acc4, acc4
   460  	// x[2:] * x[1]
   461  	MUL	x1, x2, t0
   462  	ADDS	t0, acc3
   463  	UMULH	x1, x2, t1
   464  	ADCS	t1, acc4
   465  	ADC	$0, ZR, acc5
   466  
   467  	MUL	x1, x3, t0
   468  	ADDS	t0, acc4
   469  	UMULH	x1, x3, t1
   470  	ADC	t1, acc5
   471  	// x[3] * x[2]
   472  	MUL	x2, x3, t0
   473  	ADDS	t0, acc5
   474  	UMULH	x2, x3, acc6
   475  	ADC	$0, acc6
   476  
   477  	MOVD	$0, acc7
   478  	// *2
   479  	ADDS	acc1, acc1
   480  	ADCS	acc2, acc2
   481  	ADCS	acc3, acc3
   482  	ADCS	acc4, acc4
   483  	ADCS	acc5, acc5
   484  	ADCS	acc6, acc6
   485  	ADC	$0, acc7
   486  	// Missing products
   487  	MUL	x0, x0, acc0
   488  	UMULH	x0, x0, t0
   489  	ADDS	t0, acc1, acc1
   490  
   491  	MUL	x1, x1, t0
   492  	ADCS	t0, acc2, acc2
   493  	UMULH	x1, x1, t1
   494  	ADCS	t1, acc3, acc3
   495  
   496  	MUL	x2, x2, t0
   497  	ADCS	t0, acc4, acc4
   498  	UMULH	x2, x2, t1
   499  	ADCS	t1, acc5, acc5
   500  
   501  	MUL	x3, x3, t0
   502  	ADCS	t0, acc6, acc6
   503  	UMULH	x3, x3, t1
   504  	ADC	t1, acc7, acc7
   505  	// First reduction step
   506  	MUL	acc0, hlp1, hlp0
   507  
   508  	MUL	const0, hlp0, t0
   509  	ADDS	t0, acc0, acc0
   510  	UMULH	const0, hlp0, t1
   511  
   512  	MUL	const1, hlp0, t0
   513  	ADCS	t0, acc1, acc1
   514  	UMULH	const1, hlp0, y0
   515  
   516  	MUL	const2, hlp0, t0
   517  	ADCS	t0, acc2, acc2
   518  	UMULH	const2, hlp0, acc0
   519  
   520  	MUL	const3, hlp0, t0
   521  	ADCS	t0, acc3, acc3
   522  
   523  	UMULH	const3, hlp0, hlp0
   524  	ADC	$0, hlp0
   525  
   526  	ADDS	t1, acc1, acc1
   527  	ADCS	y0, acc2, acc2
   528  	ADCS	acc0, acc3, acc3
   529  	ADC	$0, hlp0, acc0
   530  	// Second reduction step
   531  	MUL	acc1, hlp1, hlp0
   532  
   533  	MUL	const0, hlp0, t0
   534  	ADDS	t0, acc1, acc1
   535  	UMULH	const0, hlp0, t1
   536  
   537  	MUL	const1, hlp0, t0
   538  	ADCS	t0, acc2, acc2
   539  	UMULH	const1, hlp0, y0
   540  
   541  	MUL	const2, hlp0, t0
   542  	ADCS	t0, acc3, acc3
   543  	UMULH	const2, hlp0, acc1
   544  
   545  	MUL	const3, hlp0, t0
   546  	ADCS	t0, acc0, acc0
   547  
   548  	UMULH	const3, hlp0, hlp0
   549  	ADC	$0, hlp0
   550  
   551  	ADDS	t1, acc2, acc2
   552  	ADCS	y0, acc3, acc3
   553  	ADCS	acc1, acc0, acc0
   554  	ADC	$0, hlp0, acc1
   555  	// Third reduction step
   556  	MUL	acc2, hlp1, hlp0
   557  
   558  	MUL	const0, hlp0, t0
   559  	ADDS	t0, acc2, acc2
   560  	UMULH	const0, hlp0, t1
   561  
   562  	MUL	const1, hlp0, t0
   563  	ADCS	t0, acc3, acc3
   564  	UMULH	const1, hlp0, y0
   565  
   566  	MUL	const2, hlp0, t0
   567  	ADCS	t0, acc0, acc0
   568  	UMULH	const2, hlp0, acc2
   569  
   570  	MUL	const3, hlp0, t0
   571  	ADCS	t0, acc1, acc1
   572  
   573  	UMULH	const3, hlp0, hlp0
   574  	ADC	$0, hlp0
   575  
   576  	ADDS	t1, acc3, acc3
   577  	ADCS	y0, acc0, acc0
   578  	ADCS	acc2, acc1, acc1
   579  	ADC	$0, hlp0, acc2
   580  
   581  	// Last reduction step
   582  	MUL	acc3, hlp1, hlp0
   583  
   584  	MUL	const0, hlp0, t0
   585  	ADDS	t0, acc3, acc3
   586  	UMULH	const0, hlp0, t1
   587  
   588  	MUL	const1, hlp0, t0
   589  	ADCS	t0, acc0, acc0
   590  	UMULH	const1, hlp0, y0
   591  
   592  	MUL	const2, hlp0, t0
   593  	ADCS	t0, acc1, acc1
   594  	UMULH	const2, hlp0, acc3
   595  
   596  	MUL	const3, hlp0, t0
   597  	ADCS	t0, acc2, acc2
   598  
   599  	UMULH	const3, hlp0, hlp0
   600  	ADC	$0, acc7
   601  
   602  	ADDS	t1, acc0, acc0
   603  	ADCS	y0, acc1, acc1
   604  	ADCS	acc3, acc2, acc2
   605  	ADC	$0, hlp0, acc3
   606  
   607  	ADDS	acc4, acc0, acc0
   608  	ADCS	acc5, acc1, acc1
   609  	ADCS	acc6, acc2, acc2
   610  	ADCS	acc7, acc3, acc3
   611  	ADC	$0, ZR, acc4
   612  
   613  	SUBS	const0, acc0, y0
   614  	SBCS	const1, acc1, y1
   615  	SBCS	const2, acc2, y2
   616  	SBCS	const3, acc3, y3
   617  	SBCS	$0, acc4, acc4
   618  
   619  	CSEL	CS, y0, acc0, x0
   620  	CSEL	CS, y1, acc1, x1
   621  	CSEL	CS, y2, acc2, x2
   622  	CSEL	CS, y3, acc3, x3
   623  
   624  	CBNZ	b_ptr, ordSqrLoop
   625  
   626  	MOVD	res+0(FP), res_ptr
   627  	STP	(x0, x1), 0*16(res_ptr)
   628  	STP	(x2, x3), 1*16(res_ptr)
   629  
   630  	RET
   631  
   632  /* ---------------------------------------*/
   633  // func gfpFromMont(res, in *gfP)
        //
        // Runs the four word-wise reduction steps on the 4-limb value at in, with no
        // multiplication by a second operand, then conditionally subtracts the
        // modulus (·p2) once and stores the 4-limb result to res.
        // NOTE(review): with ·np == -p^-1 mod 2^64 this computes in * 2^-256 mod p,
        // i.e. conversion out of the Montgomery domain; ·np and ·p2 are defined
        // outside this file - confirm.
   634  TEXT ·gfpFromMont(SB),NOSPLIT,$0
   635  	MOVD	in+8(FP), a_ptr
   636  
        	// hlp1 is R0, the same register as res_ptr, so the result pointer is
        	// only fetched right before the final stores.
   637  	MOVD	·np+0x00(SB), hlp1
   638  	LDP	·p2+0x00(SB), (const0, const1)
   639  	LDP	·p2+0x10(SB), (const2, const3)
   640  
   641  	LDP	0*16(a_ptr), (acc0, acc1)
   642  	LDP	1*16(a_ptr), (acc2, acc3)
   643  	// Only reduce, no multiplications are needed
   644  	// First reduction step
        	// hlp0 = low64(acc0 * np) is the multiple of the modulus to add.
   645  	MUL	acc0, hlp1, hlp0
   646  
        	// NOTE(review): this multiplies by hlp1 (np), not hlp0, unlike gfpMul and
        	// gfpSqr. Only the carry of the following ADDS is consumed (acc0 is
        	// overwritten below). If np == -p^-1 mod 2^64 then t0 == 2^64-1 here, and
        	// acc0 + (2^64-1) carries exactly when acc0 != 0 - the same carry that
        	// acc0 + low64(p[0]*hlp0) produces. Confirm this form is intentional.
   647  	MUL	const0, hlp1, t0
   648  	ADDS	t0, acc0, acc0
   649  	UMULH	const0, hlp0, t1
   650  
   651  	MUL	const1, hlp0, t0
   652  	ADCS	t0, acc1, acc1
   653  	UMULH	const1, hlp0, y0
   654  
   655  	MUL	const2, hlp0, t0
   656  	ADCS	t0, acc2, acc2
   657  	UMULH	const2, hlp0, acc0
   658  
   659  	MUL	const3, hlp0, t0
   660  	ADCS	t0, acc3, acc3
   661  
   662  	UMULH	const3, hlp0, hlp0
   663  	ADC	$0, hlp0
   664  
        	// Add the high halves one limb up; acc0 is recycled as the new top limb
        	// (live limbs: acc1, acc2, acc3, acc0).
   665  	ADDS	t1, acc1, acc1
   666  	ADCS	y0, acc2, acc2
   667  	ADCS	acc0, acc3, acc3
   668  	ADC	$0, hlp0, acc0
   669  	// Second reduction step
        	// Same pattern on live limbs acc1, acc2, acc3, acc0.
   670  	MUL	acc1, hlp1, hlp0
   671  
   672  	MUL	const0, hlp1, t0
   673  	ADDS	t0, acc1, acc1
   674  	UMULH	const0, hlp0, t1
   675  
   676  	MUL	const1, hlp0, t0
   677  	ADCS	t0, acc2, acc2
   678  	UMULH	const1, hlp0, y0
   679  
   680  	MUL	const2, hlp0, t0
   681  	ADCS	t0, acc3, acc3
   682  	UMULH	const2, hlp0, acc1
   683  
   684  	MUL	const3, hlp0, t0
   685  	ADCS	t0, acc0, acc0
   686  
   687  	UMULH	const3, hlp0, hlp0
   688  	ADC	$0, hlp0
   689  
   690  	ADDS	t1, acc2, acc2
   691  	ADCS	y0, acc3, acc3
   692  	ADCS	acc1, acc0, acc0
   693  	ADC	$0, hlp0, acc1
   694  	// Third reduction step
        	// Same pattern on live limbs acc2, acc3, acc0, acc1.
   695  	MUL	acc2, hlp1, hlp0
   696  
   697  	MUL	const0, hlp1, t0
   698  	ADDS	t0, acc2, acc2
   699  	UMULH	const0, hlp0, t1
   700  
   701  	MUL	const1, hlp0, t0
   702  	ADCS	t0, acc3, acc3
   703  	UMULH	const1, hlp0, y0
   704  
   705  	MUL	const2, hlp0, t0
   706  	ADCS	t0, acc0, acc0
   707  	UMULH	const2, hlp0, acc2
   708  
   709  	MUL	const3, hlp0, t0
   710  	ADCS	t0, acc1, acc1
   711  
   712  	UMULH	const3, hlp0, hlp0
   713  	ADC	$0, hlp0
   714  
   715  	ADDS	t1, acc3, acc3
   716  	ADCS	y0, acc0, acc0
   717  	ADCS	acc2, acc1, acc1
   718  	ADC	$0, hlp0, acc2
   719  
   720  	// Last reduction step
        	// Same pattern on live limbs acc3, acc0, acc1, acc2; afterwards the value
        	// is back in natural order acc0..acc3.
   721  	MUL	acc3, hlp1, hlp0
   722  
   723  	MUL	const0, hlp1, t0
   724  	ADDS	t0, acc3, acc3
   725  	UMULH	const0, hlp0, t1
   726  
   727  	MUL	const1, hlp0, t0
   728  	ADCS	t0, acc0, acc0
   729  	UMULH	const1, hlp0, y0
   730  
   731  	MUL	const2, hlp0, t0
   732  	ADCS	t0, acc1, acc1
   733  	UMULH	const2, hlp0, acc3
   734  
   735  	MUL	const3, hlp0, t0
   736  	ADCS	t0, acc2, acc2
   737  
   738  	UMULH	const3, hlp0, hlp0
   739  	ADC	$0, hlp0
   740  
   741  	ADDS	t1, acc0, acc0
   742  	ADCS	y0, acc1, acc1
   743  	ADCS	acc3, acc2, acc2
   744  	ADC	$0, hlp0, acc3
   745  
        	// Trial subtraction of the modulus over the four limbs (no fifth carry limb
        	// is tracked in this routine).
   746  	SUBS	const0, acc0, y0
   747  	SBCS	const1, acc1, y1
   748  	SBCS	const2, acc2, y2
   749  	SBCS	const3, acc3, y3
   750  
        	// Carry set means no borrow (value >= modulus): keep the subtracted limbs.
   751  	CSEL	CS, y0, acc0, x0
   752  	CSEL	CS, y1, acc1, x1
   753  	CSEL	CS, y2, acc2, x2
   754  	CSEL	CS, y3, acc3, x3
   755  
        	// R0 held ·np until now; load the destination pointer into it.
   756  	MOVD	res+0(FP), res_ptr
   757  	STP	(x0, x1), 0*16(res_ptr)
   758  	STP	(x2, x3), 1*16(res_ptr)
   759  
   760  	RET
   761  
   762  /* ---------------------------------------*/
   763  // func gfpUnmarshal(res *gfP, in *[32]byte)
   764  TEXT ·gfpUnmarshal(SB),NOSPLIT,$0
   765  	JMP	·gfpMarshal(SB)
   766  
   767  /* ---------------------------------------*/
   768  // func gfpMarshal(res *[32]byte, in *gfP)
   769  TEXT ·gfpMarshal(SB),NOSPLIT,$0
   770  	MOVD	res+0(FP), res_ptr
   771  	MOVD	in+8(FP), a_ptr
   772  
   773  	LDP	0*16(a_ptr), (acc0, acc1)
   774  	LDP	1*16(a_ptr), (acc2, acc3)
   775  
   776  	REV	acc0, acc0
   777  	REV	acc1, acc1
   778  	REV	acc2, acc2
   779  	REV	acc3, acc3
   780  
   781  	STP	(acc3, acc2), 0*16(res_ptr)
   782  	STP	(acc1, acc0), 1*16(res_ptr)
   783  	RET