gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm2/p256_asm_arm64.s

gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm2/p256_asm_arm64.s (about)

     1  // This file contains constant-time, 64-bit assembly implementation of
     2  // P256. The optimizations performed here are described in detail in:
     3  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     4  //                          256-bit primes"
     5  // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
     6  // https://eprint.iacr.org/2013/816.pdf
     7  
     8  #include "textflag.h"
     9  
        // Register aliases shared by the routines in this file.
        //   res_ptr / a_ptr / b_ptr  pointer arguments (b_ptr is reused as the
        //                            iteration counter in p256Sqr and p256OrdSqr)
        //   acc0-acc7                accumulator limbs
        //   t0-t3                    temporaries
        //   const0-const3            the four limbs of the modulus currently in use
        //   hlp0 / hlp1              extra scratch registers
        //   x0-x3, y0-y3             operand / result limbs of the internal helpers
        // Aliasing to keep in mind when editing:
        //   hlp1   is res_ptr (R0)
        //   const2 is t2      (R13)
        //   const3 is t3      (R14)
    10  #define res_ptr R0
    11  #define a_ptr R1
    12  #define b_ptr R2
    13  
    14  #define acc0 R3
    15  #define acc1 R4
    16  #define acc2 R5
    17  #define acc3 R6
    18  
    19  #define acc4 R7
    20  #define acc5 R8
    21  #define acc6 R9
    22  #define acc7 R10
    23  #define t0 R11
    24  #define t1 R12
    25  #define t2 R13
    26  #define t3 R14
    27  #define const0 R15
    28  #define const1 R16
    29  
        // hlp1 shares R0 with res_ptr: a routine that uses hlp1 has to load the
        // result pointer only after it is done with hlp1.
    30  #define hlp0 R17
    31  #define hlp1 res_ptr
    32  
    33  #define x0 R19
    34  #define x1 R20
    35  #define x2 R21
    36  #define x3 R22
    37  #define y0 R23
    38  #define y1 R24
    39  #define y2 R25
    40  #define y3 R26
    41  
        // const2/const3 share registers with t2/t3: writing t2 or t3 destroys the
        // upper two limbs of the loaded modulus.
    42  #define const2 t2
    43  #define const3 t3
    44  
        // Read-only constants. Each 256-bit value is stored as four 64-bit limbs,
        // least significant limb at offset 0.
        //   p256p      the field prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1
        //   p256ord    modulus loaded by p256OrdSqr / p256OrdMul
        //              (presumably the group order of the curve - TODO confirm)
        //   p256ordK0  single-limb multiplier used in the p256ord reduction steps
        //              (presumably -ord^-1 mod 2^64 - TODO confirm)
        //   p256one    2^256 - p, i.e. the value 1 in Montgomery form for p256p
    45  DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
    46  DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
    47  DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
    48  DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
    49  DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
    50  DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
    51  DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
    52  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    53  DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
    54  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    55  DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
    56  DATA p256one<>+0x10(SB)/8, $0x0000000000000000
    57  DATA p256one<>+0x18(SB)/8, $0x0000000100000000
        // Symbol sizes: 32 bytes for the 256-bit values, 8 bytes for K0.
    58  GLOBL p256p<>(SB), RODATA, $32
    59  GLOBL p256ordK0<>(SB), RODATA, $8
    60  GLOBL p256ord<>(SB), RODATA, $32
    61  GLOBL p256one<>(SB), RODATA, $32
    62  
    63  /* ---------------------------------------*/
    64  // func p256LittleToBig(res []byte, in []uint64)
        // Tail-jumps into p256BigToLittle. Both functions take two slice headers at
        // the same frame offsets, and the transformation performed there (reverse
        // the order of the four 64-bit words and byte-swap each one) is its own
        // inverse, so a single body serves both directions.
    65  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    66  	JMP	·p256BigToLittle(SB)
    67  /* ---------------------------------------*/
    68  // func p256BigToLittle(res []uint64, in []byte)
        // Reads 32 bytes from in, byte-swaps each 64-bit word and writes the four
        // words to res in reverse order, i.e. a full 32-byte reversal.
        // Only the slice data pointers are read; the slice lengths are not checked
        // here, so both buffers must hold at least 32 bytes.
    69  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    70  	MOVD	res+0(FP), res_ptr
    71  	MOVD	in+24(FP), a_ptr
    72  
    73  	LDP	0*16(a_ptr), (acc0, acc1)
    74  	LDP	1*16(a_ptr), (acc2, acc3)
    75  
        	// Reverse the byte order inside each word.
    76  	REV	acc0, acc0
    77  	REV	acc1, acc1
    78  	REV	acc2, acc2
    79  	REV	acc3, acc3
    80  
        	// Reverse the word order: the last input word becomes res[0].
    81  	STP	(acc3, acc2), 0*16(res_ptr)
    82  	STP	(acc1, acc0), 1*16(res_ptr)
    83  	RET
    84  /* ---------------------------------------*/
    85  // func p256MovCond(res, a, b []uint64, cond int)
    86  // If cond == 0 res=b, else res=a
        // Copies 12 words (96 bytes) without branching: both sources are always
        // loaded in full and CSEL picks one of them per word.
        // Scratch registers: R3-R9, R16, R17, R19-R22.
    87  TEXT ·p256MovCond(SB),NOSPLIT,$0
    88  	MOVD	res+0(FP), res_ptr
    89  	MOVD	a+24(FP), a_ptr
    90  	MOVD	b+48(FP), b_ptr
    91  	MOVD	cond+72(FP), R3
    92  
        	// The flags set here are consumed by every CSEL below; the loads
        	// and stores in between do not modify them.
    93  	CMP	$0, R3
    94  	// Two remarks:
    95  	// 1) Will want to revisit NEON, when support is better
    96  	// 2) CSEL might not be constant time on all ARM processors
        	// First half: words 0..5.
    97  	LDP	0*16(a_ptr), (R4, R5)
    98  	LDP	1*16(a_ptr), (R6, R7)
    99  	LDP	2*16(a_ptr), (R8, R9)
   100  	LDP	0*16(b_ptr), (R16, R17)
   101  	LDP	1*16(b_ptr), (R19, R20)
   102  	LDP	2*16(b_ptr), (R21, R22)
        	// EQ (cond == 0) takes the word from b, otherwise keeps the word from a.
   103  	CSEL	EQ, R16, R4, R4
   104  	CSEL	EQ, R17, R5, R5
   105  	CSEL	EQ, R19, R6, R6
   106  	CSEL	EQ, R20, R7, R7
   107  	CSEL	EQ, R21, R8, R8
   108  	CSEL	EQ, R22, R9, R9
   109  	STP	(R4, R5), 0*16(res_ptr)
   110  	STP	(R6, R7), 1*16(res_ptr)
   111  	STP	(R8, R9), 2*16(res_ptr)
   112  
        	// Second half: words 6..11, same selection.
   113  	LDP	3*16(a_ptr), (R4, R5)
   114  	LDP	4*16(a_ptr), (R6, R7)
   115  	LDP	5*16(a_ptr), (R8, R9)
   116  	LDP	3*16(b_ptr), (R16, R17)
   117  	LDP	4*16(b_ptr), (R19, R20)
   118  	LDP	5*16(b_ptr), (R21, R22)
   119  	CSEL	EQ, R16, R4, R4
   120  	CSEL	EQ, R17, R5, R5
   121  	CSEL	EQ, R19, R6, R6
   122  	CSEL	EQ, R20, R7, R7
   123  	CSEL	EQ, R21, R8, R8
   124  	CSEL	EQ, R22, R9, R9
   125  	STP	(R4, R5), 3*16(res_ptr)
   126  	STP	(R6, R7), 4*16(res_ptr)
   127  	STP	(R8, R9), 5*16(res_ptr)
   128  
   129  	RET
   130  /* ---------------------------------------*/
   131  // func p256NegCond(val []uint64, cond int)
        // In place: if cond != 0, val = p - val; if cond == 0, val is rewritten
        // unchanged. The subtraction is always performed and CSEL picks the result,
        // so there is no branch on cond.
        // NOTE: p - val is stored as computed; for val == 0 and cond != 0 the stored
        // result is p itself rather than 0.
   132  TEXT ·p256NegCond(SB),NOSPLIT,$0
   133  	MOVD	val+0(FP), a_ptr
   134  	MOVD	cond+24(FP), hlp0
        	// The result is written back over the input.
   135  	MOVD	a_ptr, res_ptr
   136  	// acc = poly
   137  	LDP	p256p<>+0x00(SB), (acc0, acc1)
   138  	LDP	p256p<>+0x10(SB), (acc2, acc3)
   139  	
   140  	// Load the original value
   141  	LDP	0*16(a_ptr), (t0, t1)
   142  	LDP	1*16(a_ptr), (t2, t3)
   143  	// Speculatively subtract
        	// acc = p - val; the borrow out of the top limb is discarded.
   144  	SUBS	t0, acc0
   145  	SBCS	t1, acc1
   146  	SBCS	t2, acc2
   147  	SBC	t3, acc3
   148  	// If condition is 0, keep original value
   149  	CMP	$0, hlp0
   150  	CSEL	EQ, t0, acc0, acc0
   151  	CSEL	EQ, t1, acc1, acc1
   152  	CSEL	EQ, t2, acc2, acc2
   153  	CSEL	EQ, t3, acc3, acc3
   154  	// Store result
   155  	STP	(acc0, acc1), 0*16(res_ptr)
   156  	STP	(acc2, acc3), 1*16(res_ptr)
   157  
   158  	RET
   159  /* ---------------------------------------*/
   160  // func p256Sqr(res, in []uint64, n int)
        // Applies sm2P256SqrInternal n times to the 4-limb value at in and stores
        // the final 4 limbs at res. The modulus loaded into const0-const3 is p256p.
        // NOTE: the counter is decremented before it is tested, so n is expected to
        // be >= 1; n == 0 would wrap around instead of skipping the loop.
   161  TEXT ·p256Sqr(SB),NOSPLIT,$0
   162  	MOVD	res+0(FP), res_ptr
   163  	MOVD	in+24(FP), a_ptr
        	// b_ptr is used as the remaining-iterations counter.
   164  	MOVD	n+48(FP), b_ptr
   165  
   166  	LDP	p256p<>+0x00(SB), (const0, const1)
   167  	LDP	p256p<>+0x10(SB), (const2, const3)
   168  
   169  	LDP	0*16(a_ptr), (x0, x1)
   170  	LDP	1*16(a_ptr), (x2, x3)
   171  
   172  sqrLoop:
   173  	SUB	$1, b_ptr
   174  	CALL	sm2P256SqrInternal<>(SB)
        	// The helper returns its result in y0-y3; feed it back as the next input.
   175  	MOVD	y0, x0
   176  	MOVD	y1, x1
   177  	MOVD	y2, x2
   178  	MOVD	y3, x3
   179  	CBNZ	b_ptr, sqrLoop
   180  
   181  	STP	(y0, y1), 0*16(res_ptr)
   182  	STP	(y2, y3), 1*16(res_ptr)
   183  	RET
   184  /* ---------------------------------------*/
   185  // func p256Mul(res, in1, in2 []uint64)
        // Loads the 4-limb operands in1 (into x0-x3) and in2 (into y0-y3), calls
        // sm2P256MulInternal with p256p in const0-const3, and stores the 4-limb
        // result returned in y0-y3 to res.
   186  TEXT ·p256Mul(SB),NOSPLIT,$0
   187  	MOVD	res+0(FP), res_ptr
   188  	MOVD	in1+24(FP), a_ptr
   189  	MOVD	in2+48(FP), b_ptr
   190  
   191  	LDP	p256p<>+0x00(SB), (const0, const1)
   192  	LDP	p256p<>+0x10(SB), (const2, const3)
   193  
   194  	LDP	0*16(a_ptr), (x0, x1)
   195  	LDP	1*16(a_ptr), (x2, x3)
   196  
   197  	LDP	0*16(b_ptr), (y0, y1)
   198  	LDP	1*16(b_ptr), (y2, y3)
   199  
   200  	CALL	sm2P256MulInternal<>(SB)
   201  
   202  	STP	(y0, y1), 0*16(res_ptr)
   203  	STP	(y2, y3), 1*16(res_ptr)
   204  	RET
   205  /* ---------------------------------------*/
   206  // func p256FromMont(res, in []uint64)
        // Runs four word-wise reduction steps on the 4-limb input followed by one
        // conditional subtraction of p, i.e. a Montgomery reduction of the input
        // with an all-zero upper half, and stores the 4-limb result to res.
        //
        // Each step eliminates the lowest live limb a. Because the low limb of p is
        // 2^64 - 1, the multiple of p to add is a itself, and since
        //   a*p + a = a*2^256 - a*2^224 - a*2^96 + a*2^64
        // the step needs only shifts, additions and subtractions. After the step
        // the register that held a becomes the new top limb, so the limb order
        // rotates: (acc1,acc2,acc3,acc0), then (acc2,acc3,acc0,acc1), and so on.
   207  TEXT ·p256FromMont(SB),NOSPLIT,$0
   208  	MOVD	res+0(FP), res_ptr
   209  	MOVD	in+24(FP), a_ptr
   210  
   211  	LDP	p256p<>+0x00(SB), (const0, const1)
   212  	LDP	p256p<>+0x10(SB), (const2, const3)
   213  
   214  	LDP	0*16(a_ptr), (acc0, acc1)
   215  	LDP	1*16(a_ptr), (acc2, acc3)
   216  	// Only reduce, no multiplications are needed
   217  	// First reduction step
        	// y1:y0 = acc0 << 32, split over two registers.
   218  	LSL $32, acc0, y0
   219  	LSR	$32, acc0, y1
   220  
        	// Add acc0 at the lowest and at the new top limb position.
   221  	ADDS acc0, acc1, acc1
   222  	ADCS $0, acc2, acc2
   223  	ADCS $0, acc3, acc3
   224  	ADC $0, acc0, acc0
   225  	
        	// Subtract (acc0 << 32) at limb positions 0 and 2.
   226  	SUBS y0, acc1
   227  	SBCS y1, acc2
   228  	SBCS y0, acc3
   229  	SBC y1, acc0	
   230  	// Second reduction step
   231  	LSL $32, acc1, y0
   232  	LSR	$32, acc1, y1
   233  
   234  	ADDS acc1, acc2, acc2
   235  	ADCS $0, acc3, acc3
   236  	ADCS $0, acc0, acc0
   237  	ADC $0, acc1, acc1
   238  	
   239  	SUBS y0, acc2
   240  	SBCS y1, acc3
   241  	SBCS y0, acc0
   242  	SBC y1, acc1	
   243  	// Third reduction step
   244  	LSL $32, acc2, y0
   245  	LSR	$32, acc2, y1
   246  
   247  	ADDS acc2, acc3, acc3
   248  	ADCS $0, acc0, acc0
   249  	ADCS $0, acc1, acc1
   250  	ADC $0, acc2, acc2
   251  	
   252  	SUBS y0, acc3
   253  	SBCS y1, acc0
   254  	SBCS y0, acc1
   255  	SBC y1, acc2
   256  	// Last reduction step
   257  	LSL $32, acc3, y0
   258  	LSR	$32, acc3, y1
   259  
   260  	ADDS acc3, acc0, acc0
   261  	ADCS $0, acc1, acc1
   262  	ADCS $0, acc2, acc2
   263  	ADC $0, acc3, acc3
   264  	
   265  	SUBS y0, acc0
   266  	SBCS y1, acc1
   267  	SBCS y0, acc2
   268  	SBC y1, acc3
   269  
        	// t = acc - p. The last two subtractions overwrite const2/const3
        	// (t2/t3 are the same registers); the modulus is not needed afterwards.
   270  	SUBS	const0, acc0, t0
   271  	SBCS	const1, acc1, t1
   272  	SBCS	const2, acc2, t2
   273  	SBCS	const3, acc3, t3
   274  
        	// Carry set means the subtraction did not borrow (acc >= p): keep t.
   275  	CSEL	CS, t0, acc0, acc0
   276  	CSEL	CS, t1, acc1, acc1
   277  	CSEL	CS, t2, acc2, acc2
   278  	CSEL	CS, t3, acc3, acc3
   279  
   280  	STP	(acc0, acc1), 0*16(res_ptr)
   281  	STP	(acc2, acc3), 1*16(res_ptr)
   282  
   283  	RET
   284  /* ---------------------------------------*/
   285  // Constant time point access to arbitrary point table.
   286  // Indexed from 1 to 15, with -1 offset
   287  // (index 0 is implicitly point at infinity)
   288  // func p256Select(point, table []uint64, idx int)
        // Walks the whole table, 96 bytes (12 words) per entry, and keeps the entry
        // whose 1-based position equals idx. Every entry is loaded regardless of
        // idx; the choice is made with CSEL only. If no position matches (for
        // example idx == 0) the 12 output words stay zero.
        // NOTE(review): the loop below scans 16 entries, while the comment above
        // says "1 to 15" - confirm the intended table size.
   289  TEXT ·p256Select(SB),NOSPLIT,$0
   290  	MOVD	idx+48(FP), const0
   291  	MOVD	table+24(FP), b_ptr
   292  	MOVD	point+0(FP), res_ptr
   293  
        	// Clear the 12 result words.
   294  	EOR	x0, x0, x0
   295  	EOR	x1, x1, x1
   296  	EOR	x2, x2, x2
   297  	EOR	x3, x3, x3
   298  	EOR	y0, y0, y0
   299  	EOR	y1, y1, y1
   300  	EOR	y2, y2, y2
   301  	EOR	y3, y3, y3
   302  	EOR	t0, t0, t0
   303  	EOR	t1, t1, t1
   304  	EOR	t2, t2, t2
   305  	EOR	t3, t3, t3
   306  
        	// const1 counts the entries visited so far.
   307  	MOVD	$0, const1
   308  
   309  loop_select:
   310  		ADD	$1, const1
        		// EQ stays valid for the whole entry: the post-incrementing
        		// loads and the CSELs below do not modify the flags.
   311  		CMP	const0, const1
   312  		LDP.P	16(b_ptr), (acc0, acc1)
   313  		CSEL	EQ, acc0, x0, x0
   314  		CSEL	EQ, acc1, x1, x1
   315  		LDP.P	16(b_ptr), (acc2, acc3)
   316  		CSEL	EQ, acc2, x2, x2
   317  		CSEL	EQ, acc3, x3, x3
   318  		LDP.P	16(b_ptr), (acc4, acc5)
   319  		CSEL	EQ, acc4, y0, y0
   320  		CSEL	EQ, acc5, y1, y1
   321  		LDP.P	16(b_ptr), (acc6, acc7)
   322  		CSEL	EQ, acc6, y2, y2
   323  		CSEL	EQ, acc7, y3, y3
   324  		LDP.P	16(b_ptr), (acc0, acc1)
   325  		CSEL	EQ, acc0, t0, t0
   326  		CSEL	EQ, acc1, t1, t1
   327  		LDP.P	16(b_ptr), (acc2, acc3)
   328  		CSEL	EQ, acc2, t2, t2
   329  		CSEL	EQ, acc3, t3, t3
   330  
   331  		CMP	$16, const1
   332  		BNE	loop_select
   333  
   334  	STP	(x0, x1), 0*16(res_ptr)
   335  	STP	(x2, x3), 1*16(res_ptr)
   336  	STP	(y0, y1), 2*16(res_ptr)
   337  	STP	(y2, y3), 3*16(res_ptr)
   338  	STP	(t0, t1), 4*16(res_ptr)
   339  	STP	(t2, t3), 5*16(res_ptr)
   340  	RET
   341  /* ---------------------------------------*/
   342  // Constant time point access to base point table.
   343  // func p256SelectBase(point *[12]uint64, table string, idx int)
        // Walks 32 entries of 64 bytes (8 words) each and keeps the entry whose
        // 1-based position equals idx. Every entry is loaded regardless of idx; the
        // choice is made with CSEL only. If no position matches, the output words
        // are zero.
        // Only the first 8 words of point are written; the remaining 4 words of the
        // [12]uint64 are left untouched by this routine.
   344  TEXT ·p256SelectBase(SB),NOSPLIT,$0
   345  	MOVD	idx+24(FP), t0
        	// table is a string: only its data pointer is read here.
   346  	MOVD	table_base+8(FP), t1
   347  	MOVD	point+0(FP), res_ptr
   348  
        	// Clear the 8 result words.
   349  	EOR	x0, x0, x0
   350  	EOR	x1, x1, x1
   351  	EOR	x2, x2, x2
   352  	EOR	x3, x3, x3
   353  	EOR	y0, y0, y0
   354  	EOR	y1, y1, y1
   355  	EOR	y2, y2, y2
   356  	EOR	y3, y3, y3
   357  
        	// t2 counts the entries visited so far.
   358  	MOVD	$0, t2
   359  
   360  loop_select:
   361  		ADD	$1, t2
   362  		CMP	t0, t2
   363  		LDP.P	16(t1), (acc0, acc1)
   364  		CSEL	EQ, acc0, x0, x0
   365  		CSEL	EQ, acc1, x1, x1
   366  		LDP.P	16(t1), (acc2, acc3)
   367  		CSEL	EQ, acc2, x2, x2
   368  		CSEL	EQ, acc3, x3, x3
   369  		LDP.P	16(t1), (acc4, acc5)
   370  		CSEL	EQ, acc4, y0, y0
   371  		CSEL	EQ, acc5, y1, y1
   372  		LDP.P	16(t1), (acc6, acc7)
   373  		CSEL	EQ, acc6, y2, y2
   374  		CSEL	EQ, acc7, y3, y3
   375  
   376  		CMP	$32, t2
   377  		BNE	loop_select
   378  
   379  	STP	(x0, x1), 0*16(res_ptr)
   380  	STP	(x2, x3), 1*16(res_ptr)
   381  	STP	(y0, y1), 2*16(res_ptr)
   382  	STP	(y2, y3), 3*16(res_ptr)
   383  	RET
   384  /* ---------------------------------------*/
   385  // func p256OrdSqr(res, in []uint64, n int)
        // Squares the 4-limb value at in n times with a word-wise reduction modulo
        // p256ord after each squaring, and stores the final 4 limbs at res.
        // Register use: const0-const3 = p256ord, hlp1 = p256ordK0, b_ptr = remaining
        // iterations. hlp1 is R0, so the result pointer is loaded only after the
        // loop.
        // NOTE: the counter is decremented before it is tested, so n is expected to
        // be >= 1; n == 0 would wrap around instead of skipping the loop.
   386  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   387  	MOVD	in+24(FP), a_ptr
   388  	MOVD	n+48(FP), b_ptr
   389  
   390  	MOVD	p256ordK0<>(SB), hlp1
   391  	LDP	p256ord<>+0x00(SB), (const0, const1)
   392  	LDP	p256ord<>+0x10(SB), (const2, const3)
   393  
   394  	LDP	0*16(a_ptr), (x0, x1)
   395  	LDP	1*16(a_ptr), (x2, x3)
   396  
   397  ordSqrLoop:
   398  	SUB	$1, b_ptr
   399  
        	// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1..acc6.
   400  	// x[1:] * x[0]
   401  	MUL	x0, x1, acc1
   402  	UMULH	x0, x1, acc2
   403  
   404  	MUL	x0, x2, t0
   405  	ADDS	t0, acc2, acc2
   406  	UMULH	x0, x2, acc3
   407  
   408  	MUL	x0, x3, t0
   409  	ADCS	t0, acc3, acc3
   410  	UMULH	x0, x3, acc4
   411  	ADC	$0, acc4, acc4
   412  	// x[2:] * x[1]
   413  	MUL	x1, x2, t0
   414  	ADDS	t0, acc3
   415  	UMULH	x1, x2, t1
   416  	ADCS	t1, acc4
   417  	ADC	$0, ZR, acc5
   418  
   419  	MUL	x1, x3, t0
   420  	ADDS	t0, acc4
   421  	UMULH	x1, x3, t1
   422  	ADC	t1, acc5
   423  	// x[3] * x[2]
   424  	MUL	x2, x3, t0
   425  	ADDS	t0, acc5
   426  	UMULH	x2, x3, acc6
   427  	ADC	$0, acc6
   428  
   429  	MOVD	$0, acc7
        	// Each off-diagonal product occurs twice in the square.
   430  	// *2
   431  	ADDS	acc1, acc1
   432  	ADCS	acc2, acc2
   433  	ADCS	acc3, acc3
   434  	ADCS	acc4, acc4
   435  	ADCS	acc5, acc5
   436  	ADCS	acc6, acc6
   437  	ADC	$0, acc7
        	// Add the diagonal products x[i]*x[i]; acc0..acc7 then hold the
        	// full 512-bit square.
   438  	// Missing products
   439  	MUL	x0, x0, acc0
   440  	UMULH	x0, x0, t0
   441  	ADDS	t0, acc1, acc1
   442  
   443  	MUL	x1, x1, t0
   444  	ADCS	t0, acc2, acc2
   445  	UMULH	x1, x1, t1
   446  	ADCS	t1, acc3, acc3
   447  
   448  	MUL	x2, x2, t0
   449  	ADCS	t0, acc4, acc4
   450  	UMULH	x2, x2, t1
   451  	ADCS	t1, acc5, acc5
   452  
   453  	MUL	x3, x3, t0
   454  	ADCS	t0, acc6, acc6
   455  	UMULH	x3, x3, t1
   456  	ADC	t1, acc7, acc7
   457  	// First reduction step
        	// hlp0 = acc0 * K0 is the multiple of the modulus added in this step.
   458  	MUL	acc0, hlp1, hlp0
   459  
        	// NOTE(review): this multiplies const0 by hlp1 (K0), not by hlp0.
        	// Only the carry of the following ADDS is used (acc0 is overwritten
        	// below); presumably this relies on const0*K0 being 2^64-1 so that
        	// the carry equals the one from adding the low half of const0*hlp0
        	// - confirm.
   460  	MUL	const0, hlp1, t0
   461  	ADDS	t0, acc0, acc0
   462  	UMULH	const0, hlp0, t1
   463  
   464  	MUL	const1, hlp0, t0
   465  	ADCS	t0, acc1, acc1
   466  	UMULH	const1, hlp0, y0
   467  
   468  	MUL	const2, hlp0, t0
   469  	ADCS	t0, acc2, acc2
   470  	UMULH	const2, hlp0, acc0
   471  
   472  	MUL	const3, hlp0, t0
   473  	ADCS	t0, acc3, acc3
   474  
   475  	UMULH	const3, hlp0, hlp0
   476  	ADC	$0, hlp0
   477  
        	// Add the high halves; acc0 becomes the new top limb.
   478  	ADDS	t1, acc1, acc1
   479  	ADCS	y0, acc2, acc2
   480  	ADCS	acc0, acc3, acc3
   481  	ADC	$0, hlp0, acc0
   482  	// Second reduction step
   483  	MUL	acc1, hlp1, hlp0
   484  
   485  	MUL	const0, hlp1, t0
   486  	ADDS	t0, acc1, acc1
   487  	UMULH	const0, hlp0, t1
   488  
   489  	MUL	const1, hlp0, t0
   490  	ADCS	t0, acc2, acc2
   491  	UMULH	const1, hlp0, y0
   492  
   493  	MUL	const2, hlp0, t0
   494  	ADCS	t0, acc3, acc3
   495  	UMULH	const2, hlp0, acc1
   496  
   497  	MUL	const3, hlp0, t0
   498  	ADCS	t0, acc0, acc0
   499  
   500  	UMULH	const3, hlp0, hlp0
   501  	ADC	$0, hlp0
   502  
   503  	ADDS	t1, acc2, acc2
   504  	ADCS	y0, acc3, acc3
   505  	ADCS	acc1, acc0, acc0
   506  	ADC	$0, hlp0, acc1
   507  	// Third reduction step
   508  	MUL	acc2, hlp1, hlp0
   509  
   510  	MUL	const0, hlp1, t0
   511  	ADDS	t0, acc2, acc2
   512  	UMULH	const0, hlp0, t1
   513  
   514  	MUL	const1, hlp0, t0
   515  	ADCS	t0, acc3, acc3
   516  	UMULH	const1, hlp0, y0
   517  
   518  	MUL	const2, hlp0, t0
   519  	ADCS	t0, acc0, acc0
   520  	UMULH	const2, hlp0, acc2
   521  
   522  	MUL	const3, hlp0, t0
   523  	ADCS	t0, acc1, acc1
   524  
   525  	UMULH	const3, hlp0, hlp0
   526  	ADC	$0, hlp0
   527  
   528  	ADDS	t1, acc3, acc3
   529  	ADCS	y0, acc0, acc0
   530  	ADCS	acc2, acc1, acc1
   531  	ADC	$0, hlp0, acc2
   532  
   533  	// Last reduction step
   534  	MUL	acc3, hlp1, hlp0
   535  
   536  	MUL	const0, hlp1, t0
   537  	ADDS	t0, acc3, acc3
   538  	UMULH	const0, hlp0, t1
   539  
   540  	MUL	const1, hlp0, t0
   541  	ADCS	t0, acc0, acc0
   542  	UMULH	const1, hlp0, y0
   543  
   544  	MUL	const2, hlp0, t0
   545  	ADCS	t0, acc1, acc1
   546  	UMULH	const2, hlp0, acc3
   547  
   548  	MUL	const3, hlp0, t0
   549  	ADCS	t0, acc2, acc2
   550  
   551  	UMULH	const3, hlp0, hlp0
        	// NOTE(review): unlike the three earlier steps, the carry is folded
        	// into acc7 instead of hlp0. Both values are summed into acc3 below,
        	// so the total is the same as long as acc7 does not overflow here
        	// - confirm.
   552  	ADC	$0, acc7
   553  
   554  	ADDS	t1, acc0, acc0
   555  	ADCS	y0, acc1, acc1
   556  	ADCS	acc3, acc2, acc2
   557  	ADC	$0, hlp0, acc3
   558  
        	// Add the upper half of the square; acc4 receives the final carry.
   559  	ADDS	acc4, acc0, acc0
   560  	ADCS	acc5, acc1, acc1
   561  	ADCS	acc6, acc2, acc2
   562  	ADCS	acc7, acc3, acc3
   563  	ADC	$0, ZR, acc4
   564  
        	// y = acc - ord, with the borrow propagated through the carry limb.
   565  	SUBS	const0, acc0, y0
   566  	SBCS	const1, acc1, y1
   567  	SBCS	const2, acc2, y2
   568  	SBCS	const3, acc3, y3
   569  	SBCS	$0, acc4, acc4
   570  
        	// No borrow (CS): take the reduced value. The result goes straight
        	// into x0-x3 as the input of the next iteration.
   571  	CSEL	CS, y0, acc0, x0
   572  	CSEL	CS, y1, acc1, x1
   573  	CSEL	CS, y2, acc2, x2
   574  	CSEL	CS, y3, acc3, x3
   575  
   576  	CBNZ	b_ptr, ordSqrLoop
   577  
        	// R0 is free again now that K0 (hlp1) is no longer needed.
   578  	MOVD	res+0(FP), res_ptr
   579  	STP	(x0, x1), 0*16(res_ptr)
   580  	STP	(x2, x3), 1*16(res_ptr)
   581  
   582  	RET
   583  /* ---------------------------------------*/
   584  // func p256OrdMul(res, in1, in2 []uint64)
        // Multiplies the 4-limb values in1 and in2 with a word-wise reduction modulo
        // p256ord interleaved after each partial product, and stores the 4-limb
        // result at res.
        // Register use: x0-x3 = in1, y0-y3 = in2, const0-const3 = p256ord,
        // hlp1 = p256ordK0. hlp1 is R0, so the result pointer is loaded only at the
        // end. y0 and y1 are reused as scratch once their partial products are
        // done.
   585  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   586  	MOVD	in1+24(FP), a_ptr
   587  	MOVD	in2+48(FP), b_ptr
   588  
   589  	MOVD	p256ordK0<>(SB), hlp1
   590  	LDP	p256ord<>+0x00(SB), (const0, const1)
   591  	LDP	p256ord<>+0x10(SB), (const2, const3)
   592  
   593  	LDP	0*16(a_ptr), (x0, x1)
   594  	LDP	1*16(a_ptr), (x2, x3)
   595  	LDP	0*16(b_ptr), (y0, y1)
   596  	LDP	1*16(b_ptr), (y2, y3)
   597  
   598  	// y[0] * x
   599  	MUL	y0, x0, acc0
   600  	UMULH	y0, x0, acc1
   601  
   602  	MUL	y0, x1, t0
   603  	ADDS	t0, acc1
   604  	UMULH	y0, x1, acc2
   605  
   606  	MUL	y0, x2, t0
   607  	ADCS	t0, acc2
   608  	UMULH	y0, x2, acc3
   609  
   610  	MUL	y0, x3, t0
   611  	ADCS	t0, acc3
   612  	UMULH	y0, x3, acc4
   613  	ADC	$0, acc4
   614  	// First reduction step
        	// hlp0 = acc0 * K0 is the multiple of the modulus added in this step.
   615  	MUL	acc0, hlp1, hlp0
   616  
        	// NOTE(review): this multiplies const0 by hlp1 (K0), not by hlp0.
        	// Only the carry of the following ADDS is used (acc0 is overwritten
        	// below); presumably this relies on const0*K0 being 2^64-1 so that
        	// the carry equals the one from adding the low half of const0*hlp0
        	// - confirm.
   617  	MUL	const0, hlp1, t0
   618  	ADDS	t0, acc0, acc0
   619  	UMULH	const0, hlp0, t1
   620  
   621  	MUL	const1, hlp0, t0
   622  	ADCS	t0, acc1, acc1
   623  	UMULH	const1, hlp0, y0
   624  
   625  	MUL	const2, hlp0, t0
   626  	ADCS	t0, acc2, acc2
   627  	UMULH	const2, hlp0, acc0
   628  
   629  	MUL	const3, hlp0, t0
   630  	ADCS	t0, acc3, acc3
   631  
   632  	UMULH	const3, hlp0, hlp0
        	// The carry of the low-half chain is parked in the next higher
        	// product limb (acc4 here, acc5/acc6/acc7 in the later steps).
   633  	ADC	$0, acc4
   634  
        	// Add the high halves; acc0 becomes the new top limb.
   635  	ADDS	t1, acc1, acc1
   636  	ADCS	y0, acc2, acc2
   637  	ADCS	acc0, acc3, acc3
   638  	ADC	$0, hlp0, acc0
   639  	// y[1] * x
   640  	MUL	y1, x0, t0
   641  	ADDS	t0, acc1
   642  	UMULH	y1, x0, t1
   643  
   644  	MUL	y1, x1, t0
   645  	ADCS	t0, acc2
   646  	UMULH	y1, x1, hlp0
   647  
   648  	MUL	y1, x2, t0
   649  	ADCS	t0, acc3
   650  	UMULH	y1, x2, y0
   651  
   652  	MUL	y1, x3, t0
   653  	ADCS	t0, acc4
        	// Last use of y1 as an operand; it is overwritten with the high half.
   654  	UMULH	y1, x3, y1
   655  	ADC	$0, ZR, acc5
   656  
   657  	ADDS	t1, acc2
   658  	ADCS	hlp0, acc3
   659  	ADCS	y0, acc4
   660  	ADC	y1, acc5
   661  	// Second reduction step
   662  	MUL	acc1, hlp1, hlp0
   663  
   664  	MUL	const0, hlp1, t0
   665  	ADDS	t0, acc1, acc1
   666  	UMULH	const0, hlp0, t1
   667  
   668  	MUL	const1, hlp0, t0
   669  	ADCS	t0, acc2, acc2
   670  	UMULH	const1, hlp0, y0
   671  
   672  	MUL	const2, hlp0, t0
   673  	ADCS	t0, acc3, acc3
   674  	UMULH	const2, hlp0, acc1
   675  
   676  	MUL	const3, hlp0, t0
   677  	ADCS	t0, acc0, acc0
   678  
   679  	UMULH	const3, hlp0, hlp0
   680  	ADC	$0, acc5
   681  
   682  	ADDS	t1, acc2, acc2
   683  	ADCS	y0, acc3, acc3
   684  	ADCS	acc1, acc0, acc0
   685  	ADC	$0, hlp0, acc1
   686  	// y[2] * x
   687  	MUL	y2, x0, t0
   688  	ADDS	t0, acc2
   689  	UMULH	y2, x0, t1
   690  
   691  	MUL	y2, x1, t0
   692  	ADCS	t0, acc3
   693  	UMULH	y2, x1, hlp0
   694  
   695  	MUL	y2, x2, t0
   696  	ADCS	t0, acc4
   697  	UMULH	y2, x2, y0
   698  
   699  	MUL	y2, x3, t0
   700  	ADCS	t0, acc5
   701  	UMULH	y2, x3, y1
   702  	ADC	$0, ZR, acc6
   703  
   704  	ADDS	t1, acc3
   705  	ADCS	hlp0, acc4
   706  	ADCS	y0, acc5
   707  	ADC	y1, acc6
   708  	// Third reduction step
   709  	MUL	acc2, hlp1, hlp0
   710  
   711  	MUL	const0, hlp1, t0
   712  	ADDS	t0, acc2, acc2
   713  	UMULH	const0, hlp0, t1
   714  
   715  	MUL	const1, hlp0, t0
   716  	ADCS	t0, acc3, acc3
   717  	UMULH	const1, hlp0, y0
   718  
   719  	MUL	const2, hlp0, t0
   720  	ADCS	t0, acc0, acc0
   721  	UMULH	const2, hlp0, acc2
   722  
   723  	MUL	const3, hlp0, t0
   724  	ADCS	t0, acc1, acc1
   725  
   726  	UMULH	const3, hlp0, hlp0
   727  	ADC	$0, acc6
   728  
   729  	ADDS	t1, acc3, acc3
   730  	ADCS	y0, acc0, acc0
   731  	ADCS	acc2, acc1, acc1
   732  	ADC	$0, hlp0, acc2
   733  	// y[3] * x
   734  	MUL	y3, x0, t0
   735  	ADDS	t0, acc3
   736  	UMULH	y3, x0, t1
   737  
   738  	MUL	y3, x1, t0
   739  	ADCS	t0, acc4
   740  	UMULH	y3, x1, hlp0
   741  
   742  	MUL	y3, x2, t0
   743  	ADCS	t0, acc5
   744  	UMULH	y3, x2, y0
   745  
   746  	MUL	y3, x3, t0
   747  	ADCS	t0, acc6
   748  	UMULH	y3, x3, y1
   749  	ADC	$0, ZR, acc7
   750  
   751  	ADDS	t1, acc4
   752  	ADCS	hlp0, acc5
   753  	ADCS	y0, acc6
   754  	ADC	y1, acc7
   755  	// Last reduction step
   756  	MUL	acc3, hlp1, hlp0
   757  
   758  	MUL	const0, hlp1, t0
   759  	ADDS	t0, acc3, acc3
   760  	UMULH	const0, hlp0, t1
   761  
   762  	MUL	const1, hlp0, t0
   763  	ADCS	t0, acc0, acc0
   764  	UMULH	const1, hlp0, y0
   765  
   766  	MUL	const2, hlp0, t0
   767  	ADCS	t0, acc1, acc1
   768  	UMULH	const2, hlp0, acc3
   769  
   770  	MUL	const3, hlp0, t0
   771  	ADCS	t0, acc2, acc2
   772  
   773  	UMULH	const3, hlp0, hlp0
   774  	ADC	$0, acc7
   775  
   776  	ADDS	t1, acc0, acc0
   777  	ADCS	y0, acc1, acc1
   778  	ADCS	acc3, acc2, acc2
   779  	ADC	$0, hlp0, acc3
   780  
        	// Add the upper product limbs; acc4 receives the final carry.
   781  	ADDS	acc4, acc0, acc0
   782  	ADCS	acc5, acc1, acc1
   783  	ADCS	acc6, acc2, acc2
   784  	ADCS	acc7, acc3, acc3
   785  	ADC	$0, ZR, acc4
   786  
        	// t = acc - ord. The writes to t2/t3 overwrite const2/const3 (same
        	// registers); the modulus is not needed afterwards.
   787  	SUBS	const0, acc0, t0
   788  	SBCS	const1, acc1, t1
   789  	SBCS	const2, acc2, t2
   790  	SBCS	const3, acc3, t3
   791  	SBCS	$0, acc4, acc4
   792  
        	// No borrow (CS): take the reduced value.
   793  	CSEL	CS, t0, acc0, acc0
   794  	CSEL	CS, t1, acc1, acc1
   795  	CSEL	CS, t2, acc2, acc2
   796  	CSEL	CS, t3, acc3, acc3
   797  
        	// R0 is free again now that K0 (hlp1) is no longer needed.
   798  	MOVD	res+0(FP), res_ptr
   799  	STP	(acc0, acc1), 0*16(res_ptr)
   800  	STP	(acc2, acc3), 1*16(res_ptr)
   801  
   802  	RET
   803  /* ---------------------------------------*/
        // sm2P256Subinternal: register-only helper, x = y - x with one conditional
        // add-back of the modulus.
        //   In:       x0-x3, y0-y3 = operands; const0-const3 = modulus
        //   Out:      x0-x3
        //   Clobbers: acc0-acc7, t0, flags. y0-y3 are left unchanged.
   804  TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
        	// acc0..3 = y - x; t0 = 0 when there was no borrow, all ones otherwise.
   805  	SUBS	x0, y0, acc0
   806  	SBCS	x1, y1, acc1
   807  	SBCS	x2, y2, acc2
   808  	SBCS	x3, y3, acc3
   809  	SBC	$0, ZR, t0
   810  
        	// acc4..7 = (y - x) + modulus, computed unconditionally.
   811  	ADDS	const0, acc0, acc4
   812  	ADCS	const1, acc1, acc5
   813  	ADCS	const2, acc2, acc6
   814  	ADC	const3, acc3, acc7
   815  
        	// EQ (no borrow) keeps the plain difference, otherwise the corrected one.
   816  	ANDS	$1, t0
   817  	CSEL	EQ, acc0, acc4, x0
   818  	CSEL	EQ, acc1, acc5, x1
   819  	CSEL	EQ, acc2, acc6, x2
   820  	CSEL	EQ, acc3, acc7, x3
   821  
   822  	RET
   823  /* ---------------------------------------*/
        // sm2P256SqrInternal: register-only helper, y = x*x followed by four
        // shift-based word reduction steps and one conditional subtraction of the
        // modulus.
        //   In:       x0-x3 = operand; const0-const3 = modulus (the callers in this
        //             file load p256p)
        //   Out:      y0-y3
        //   Clobbers: acc0-acc7, t0, t1, flags. x0-x3 are left unchanged.
        // The reduction steps use shifts instead of multiplications, which matches
        // the limb pattern of p256p (see p256FromMont for the same sequence).
   824  TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
        	// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1..acc6.
   825  	// x[1:] * x[0]
   826  	MUL	x0, x1, acc1
   827  	UMULH	x0, x1, acc2
   828  
   829  	MUL	x0, x2, t0
   830  	ADDS	t0, acc2, acc2
   831  	UMULH	x0, x2, acc3
   832  
   833  	MUL	x0, x3, t0
   834  	ADCS	t0, acc3, acc3
   835  	UMULH	x0, x3, acc4
   836  	ADC	$0, acc4, acc4
   837  	// x[2:] * x[1]
   838  	MUL	x1, x2, t0
   839  	ADDS	t0, acc3
   840  	UMULH	x1, x2, t1
   841  	ADCS	t1, acc4
   842  	ADC	$0, ZR, acc5
   843  
   844  	MUL	x1, x3, t0
   845  	ADDS	t0, acc4
   846  	UMULH	x1, x3, t1
   847  	ADC	t1, acc5
   848  	// x[3] * x[2]
   849  	MUL	x2, x3, t0
   850  	ADDS	t0, acc5
   851  	UMULH	x2, x3, acc6
   852  	ADC	$0, acc6
   853  
   854  	MOVD	$0, acc7
        	// Each off-diagonal product occurs twice in the square.
   855  	// *2
   856  	ADDS	acc1, acc1
   857  	ADCS	acc2, acc2
   858  	ADCS	acc3, acc3
   859  	ADCS	acc4, acc4
   860  	ADCS	acc5, acc5
   861  	ADCS	acc6, acc6
   862  	ADC	$0, acc7
        	// Add the diagonal products x[i]*x[i]; acc0..acc7 then hold the
        	// full 512-bit square.
   863  	// Missing products
   864  	MUL	x0, x0, acc0
   865  	UMULH	x0, x0, t0
   866  	ADDS	t0, acc1, acc1
   867  
   868  	MUL	x1, x1, t0
   869  	ADCS	t0, acc2, acc2
   870  	UMULH	x1, x1, t1
   871  	ADCS	t1, acc3, acc3
   872  
   873  	MUL	x2, x2, t0
   874  	ADCS	t0, acc4, acc4
   875  	UMULH	x2, x2, t1
   876  	ADCS	t1, acc5, acc5
   877  
   878  	MUL	x3, x3, t0
   879  	ADCS	t0, acc6, acc6
   880  	UMULH	x3, x3, t1
   881  	ADCS	t1, acc7, acc7
   882  	// First reduction step
        	// y1:y0 = acc0 << 32; y0/y1 are free as scratch until the final CSELs.
   883  	LSL $32, acc0, y0
   884  	LSR	$32, acc0, y1
   885  
        	// Add acc0 at the lowest and at the new top limb position.
   886  	ADDS acc0, acc1, acc1
   887  	ADCS $0, acc2, acc2
   888  	ADCS $0, acc3, acc3
   889  	ADC $0, acc0, acc0
   890  	
        	// Subtract (acc0 << 32) at limb positions 0 and 2.
   891  	SUBS y0, acc1
   892  	SBCS y1, acc2
   893  	SBCS y0, acc3
   894  	SBC y1, acc0	
   895  	// Second reduction step
   896  	LSL $32, acc1, y0
   897  	LSR	$32, acc1, y1
   898  
   899  	ADDS acc1, acc2, acc2
   900  	ADCS $0, acc3, acc3
   901  	ADCS $0, acc0, acc0
   902  	ADC $0, acc1, acc1
   903  	
   904  	SUBS y0, acc2
   905  	SBCS y1, acc3
   906  	SBCS y0, acc0
   907  	SBC y1, acc1	
   908  	// Third reduction step
   909  	LSL $32, acc2, y0
   910  	LSR	$32, acc2, y1
   911  
   912  	ADDS acc2, acc3, acc3
   913  	ADCS $0, acc0, acc0
   914  	ADCS $0, acc1, acc1
   915  	ADC $0, acc2, acc2
   916  	
   917  	SUBS y0, acc3
   918  	SBCS y1, acc0
   919  	SBCS y0, acc1
   920  	SBC y1, acc2
   921  	// Last reduction step
   922  	LSL $32, acc3, y0
   923  	LSR	$32, acc3, y1
   924  
   925  	ADDS acc3, acc0, acc0
   926  	ADCS $0, acc1, acc1
   927  	ADCS $0, acc2, acc2
   928  	ADC $0, acc3, acc3
   929  	
   930  	SUBS y0, acc0
   931  	SBCS y1, acc1
   932  	SBCS y0, acc2
   933  	SBC y1, acc3
   934  
   935  	// Add bits [511:256] of the sqr result
   936  	ADDS	acc4, acc0, acc0
   937  	ADCS	acc5, acc1, acc1
   938  	ADCS	acc6, acc2, acc2
   939  	ADCS	acc7, acc3, acc3
   940  	ADC	$0, ZR, acc4
   941  
        	// Trial subtraction of the modulus. acc5/acc6 hold the upper two
        	// limbs because t2/t3 are the same registers as const2/const3,
        	// which must survive for the caller.
   942  	SUBS	const0, acc0, t0
   943  	SBCS	const1, acc1, t1
   944  	SBCS	const2, acc2, acc5
   945  	SBCS	const3, acc3, acc6
   946  	SBCS	$0, acc4, acc4
   947  
        	// No borrow (CS): take the reduced value.
   948  	CSEL	CS, t0, acc0, y0
   949  	CSEL	CS, t1, acc1, y1
   950  	CSEL	CS, acc5, acc2, y2
   951  	CSEL	CS, acc6, acc3, y3
   952  	RET
   953  /* ---------------------------------------*/
        // sm2P256MulInternal: register-only helper, y = x*y with a shift-based word
        // reduction step interleaved after each partial product and one final
        // conditional subtraction of the modulus.
        //   In:       x0-x3, y0-y3 = operands; const0-const3 = modulus (the callers
        //             in this file load p256p)
        //   Out:      y0-y3
        //   Clobbers: acc0-acc7, t0, t1, hlp0, flags. x0-x3 are left unchanged.
        // y0 and y1 are reused as scratch once their own partial products are done.
   954  TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
   955  	// y[0] * x
   956  	MUL	y0, x0, acc0
   957  	UMULH	y0, x0, acc1
   958  
   959  	MUL	y0, x1, t0
   960  	ADDS	t0, acc1
   961  	UMULH	y0, x1, acc2
   962  
   963  	MUL	y0, x2, t0
   964  	ADCS	t0, acc2
   965  	UMULH	y0, x2, acc3
   966  
   967  	MUL	y0, x3, t0
   968  	ADCS	t0, acc3
   969  	UMULH	y0, x3, acc4
   970  	ADC	$0, acc4
   971  	// First reduction step
        	// t1:t0 = acc0 << 32, split over two registers.
   972  	LSL $32, acc0, t0
   973  	LSR	$32, acc0, t1
   974  
        	// Add acc0 at the lowest and at the new top limb position.
   975  	ADDS acc0, acc1, acc1
   976  	ADCS $0, acc2, acc2
   977  	ADCS $0, acc3, acc3
   978  	ADC $0, acc0, acc0
   979  	
        	// Subtract (acc0 << 32) at limb positions 0 and 2.
   980  	SUBS t0, acc1
   981  	SBCS t1, acc2
   982  	SBCS t0, acc3
   983  	SBC t1, acc0	
   984  
   985  	// y[1] * x
   986  	MUL	y1, x0, t0
   987  	ADDS	t0, acc1
   988  	UMULH	y1, x0, t1
   989  
   990  	MUL	y1, x1, t0
   991  	ADCS	t0, acc2
        	// y0 is no longer needed as an operand and serves as scratch from here.
   992  	UMULH	y1, x1, y0
   993  
   994  	MUL	y1, x2, t0
   995  	ADCS	t0, acc3
   996  	UMULH	y1, x2, acc6
   997  
   998  	MUL	y1, x3, t0
   999  	ADCS	t0, acc4
  1000  	UMULH	y1, x3, hlp0
  1001  	ADC	$0, ZR, acc5
  1002  
  1003  	ADDS	t1, acc2
  1004  	ADCS	y0, acc3
  1005  	ADCS	acc6, acc4
  1006  	ADC	hlp0, acc5
  1007  	// Second reduction step
  1008  	LSL $32, acc1, t0
  1009  	LSR	$32, acc1, t1
  1010  
  1011  	ADDS acc1, acc2, acc2
  1012  	ADCS $0, acc3, acc3
  1013  	ADCS $0, acc0, acc0
  1014  	ADC $0, acc1, acc1
  1015  	
  1016  	SUBS t0, acc2
  1017  	SBCS t1, acc3
  1018  	SBCS t0, acc0
  1019  	SBC t1, acc1	
  1020  
  1021  	// y[2] * x
  1022  	MUL	y2, x0, t0
  1023  	ADDS	t0, acc2
  1024  	UMULH	y2, x0, t1
  1025  
  1026  	MUL	y2, x1, t0
  1027  	ADCS	t0, acc3
  1028  	UMULH	y2, x1, y0
  1029  
  1030  	MUL	y2, x2, t0
  1031  	ADCS	t0, acc4
        	// y1 is no longer needed as an operand and serves as scratch from here.
  1032  	UMULH	y2, x2, y1
  1033  
  1034  	MUL	y2, x3, t0
  1035  	ADCS	t0, acc5
  1036  	UMULH	y2, x3, hlp0
  1037  	ADC	$0, ZR, acc6
  1038  
  1039  	ADDS	t1, acc3
  1040  	ADCS	y0, acc4
  1041  	ADCS	y1, acc5
  1042  	ADC	hlp0, acc6
  1043  	// Third reduction step
  1044  	LSL $32, acc2, t0
  1045  	LSR	$32, acc2, t1
  1046  
  1047  	ADDS acc2, acc3, acc3
  1048  	ADCS $0, acc0, acc0
  1049  	ADCS $0, acc1, acc1
  1050  	ADC $0, acc2, acc2
  1051  	
  1052  	SUBS t0, acc3
  1053  	SBCS t1, acc0
  1054  	SBCS t0, acc1
  1055  	SBC t1, acc2	
  1056  
  1057  	// y[3] * x
  1058  	MUL	y3, x0, t0
  1059  	ADDS	t0, acc3
  1060  	UMULH	y3, x0, t1
  1061  
  1062  	MUL	y3, x1, t0
  1063  	ADCS	t0, acc4
  1064  	UMULH	y3, x1, y0
  1065  
  1066  	MUL	y3, x2, t0
  1067  	ADCS	t0, acc5
  1068  	UMULH	y3, x2, y1
  1069  
  1070  	MUL	y3, x3, t0
  1071  	ADCS	t0, acc6
  1072  	UMULH	y3, x3, hlp0
  1073  	ADC	$0, ZR, acc7
  1074  
  1075  	ADDS	t1, acc4
  1076  	ADCS	y0, acc5
  1077  	ADCS	y1, acc6
  1078  	ADC	hlp0, acc7
  1079  	// Last reduction step
  1080  	LSL $32, acc3, t0
  1081  	LSR	$32, acc3, t1
  1082  
  1083  	ADDS acc3, acc0, acc0
  1084  	ADCS $0, acc1, acc1
  1085  	ADCS $0, acc2, acc2
  1086  	ADC $0, acc3, acc3
  1087  	
  1088  	SUBS t0, acc0
  1089  	SBCS t1, acc1
  1090  	SBCS t0, acc2
  1091  	SBC t1, acc3	
  1092  
  1093  	// Add bits [511:256] of the mul result
  1094  	ADDS	acc4, acc0, acc0
  1095  	ADCS	acc5, acc1, acc1
  1096  	ADCS	acc6, acc2, acc2
  1097  	ADCS	acc7, acc3, acc3
  1098  	ADC	$0, ZR, acc4
  1099  
        	// Trial subtraction of the modulus. acc5/acc6 hold the upper two
        	// limbs because t2/t3 are the same registers as const2/const3,
        	// which must survive for the caller.
  1100  	SUBS	const0, acc0, t0
  1101  	SBCS	const1, acc1, t1
  1102  	SBCS	const2, acc2, acc5
  1103  	SBCS	const3, acc3, acc6
  1104  	SBCS	$0, acc4, acc4
  1105  
        	// No borrow (CS): take the reduced value.
  1106  	CSEL	CS, t0, acc0, y0
  1107  	CSEL	CS, t1, acc1, y1
  1108  	CSEL	CS, acc5, acc2, y2
  1109  	CSEL	CS, acc6, acc3, y3
  1110  	RET
  1111  /* ---------------------------------------*/
        // p256MulBy2Inline: x = 2*y with one conditional subtraction of the modulus.
        //   In:       y0-y3 = operand; const0-const3 = modulus
        //   Out:      x0-x3
        //   Clobbers: hlp0, t0, t1, acc5, acc6, flags. y0-y3 are left unchanged.
        // hlp0 captures the bit carried out of the doubling so that it takes part
        // in the trial subtraction. CC (a borrow occurred) keeps the doubled value,
        // otherwise the subtracted one is taken. acc5/acc6 are used for the upper
        // two limbs because t2/t3 are the same registers as const2/const3.
  1112  #define p256MulBy2Inline       \
  1113  	ADDS	y0, y0, x0;    \
  1114  	ADCS	y1, y1, x1;    \
  1115  	ADCS	y2, y2, x2;    \
  1116  	ADCS	y3, y3, x3;    \
  1117  	ADC	$0, ZR, hlp0;  \
  1118  	SUBS	const0, x0, t0;   \
  1119  	SBCS	const1, x1, t1;\
  1120  	SBCS	const2, x2, acc5;    \
  1121  	SBCS	const3, x3, acc6;\
  1122  	SBCS	$0, hlp0, hlp0;\
  1123  	CSEL	CC, x0, t0, x0;\
  1124  	CSEL	CC, x1, t1, x1;\
  1125  	CSEL	CC, x2, acc5, x2;\
  1126  	CSEL	CC, x3, acc6, x3;
  1127  /* ---------------------------------------*/
        // Field accessors for points addressed through a_ptr (input 1), b_ptr
        // (input 2) and res_ptr (output). Each coordinate is 32 bytes: x at offset
        // 0, y at offset 32, z at offset 64. off is a byte offset inside the
        // coordinate. No y accessor is defined for b_ptr here; y2in is defined as a
        // stack slot further down.
  1128  #define x1in(off) (off)(a_ptr)
  1129  #define y1in(off) (off + 32)(a_ptr)
  1130  #define z1in(off) (off + 64)(a_ptr)
  1131  #define x2in(off) (off)(b_ptr)
  1132  #define z2in(off) (off + 64)(b_ptr)
  1133  #define x3out(off) (off)(res_ptr)
  1134  #define y3out(off) (off + 32)(res_ptr)
  1135  #define z3out(off) (off + 64)(res_ptr)
        // Load / store a whole 4-limb value between memory and the x or y register
        // set. The argument is the NAME of one of the accessor macros (for example
        // LDx(z1in)); it is expanded with offsets 0 and 16. For STx/STy the
        // parameter is called src but names the destination.
  1136  #define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1137  #define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1138  #define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1139  #define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1140  /* ---------------------------------------*/
        // Named 32-byte temporaries on the local stack frame, usable with
        // LDx/LDy/STx/STy. Slot k lives at 32*k + 8 bytes above RSP.
        // Slots 0-7 end at byte 264, which is exactly the frame size declared by
        // p256PointAddAffineAsm ($264).
  1141  #define y2in(off)  (32*0 + 8 + off)(RSP)
  1142  #define s2(off)    (32*1 + 8 + off)(RSP)
  1143  #define z1sqr(off) (32*2 + 8 + off)(RSP)
  1144  #define h(off)	   (32*3 + 8 + off)(RSP)
  1145  #define r(off)	   (32*4 + 8 + off)(RSP)
  1146  #define hsqr(off)  (32*5 + 8 + off)(RSP)
  1147  #define rsqr(off)  (32*6 + 8 + off)(RSP)
  1148  #define hcub(off)  (32*7 + 8 + off)(RSP)
  1149  
        // Slots 8-11 end at byte 392 and therefore need a frame of at least 392
        // bytes; they must not be used from a routine that declares only $264.
  1150  #define z2sqr(off) (32*8 + 8 + off)(RSP)
  1151  #define s1(off) (32*9 + 8 + off)(RSP)
  1152  #define u1(off) (32*10 + 8 + off)(RSP)
  1153  #define u2(off) (32*11 + 8 + off)(RSP)
  1154  
  1155  // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
        //
        // Adds the point at in1 (x, y, z) and the point at in2 (only x and y are
        // read; its z is never loaded) and writes x, y, z to res.
        //   sign != 0: in2's y coordinate is replaced by p256p - y before use.
        //   sel  == 0: the stored coordinates are those of in1.
        //   zero == 0: the stored coordinates are in2's x, the (possibly negated)
        //              y, and p256one as z. This selection is applied last, so
        //              it wins when sel == 0 as well.
        // All three choices are made with CSEL; there is no branch in this
        // function.
        //
        // Register conventions, judging by how results are consumed here (the
        // helper routines are defined outside this section - confirm there):
        //   sm2P256SqrInternal: square of x0..x3, result in y0..y3
        //   sm2P256MulInternal: x0..x3 * y0..y3, result in y0..y3
        //   sm2P256Subinternal: y0..y3 - x0..x3, result in x0..x3
        // hlp1 is an alias of res_ptr (R0), so R0 carries the selector bits and
        // res is reloaded from the frame into t0 before each store.
  1156  TEXT ·p256PointAddAffineAsm(SB),0,$264-96
  1157  	MOVD	in1+24(FP), a_ptr
  1158  	MOVD	in2+48(FP), b_ptr
  1159  	MOVD	sign+72(FP), hlp0
  1160  	MOVD	sel+80(FP), hlp1
  1161  	MOVD	zero+88(FP), t1
  1162  
        	// Reduce zero and sel to 0/1, then pack them into hlp1:
        	// bit 0 = (sel != 0), bit 1 = (zero != 0).
  1163  	MOVD	$1, t0
  1164  	CMP	$0, t1
  1165  	CSEL	EQ, ZR, t0, t1
  1166  	CMP	$0, hlp1
  1167  	CSEL	EQ, ZR, t0, hlp1
  1168  
        	// const0..const3 = the four limbs of p256p (const2/const3 alias t2/t3).
  1169  	LDP	p256p<>+0x00(SB), (const0, const1)
  1170  	LDP	p256p<>+0x10(SB), (const2, const3)
  1171  	EOR	t1<<1, hlp1
  1172  
  1173  	// Negate y2in based on sign
  1174  	LDP	2*16(b_ptr), (y0, y1)
  1175  	LDP	3*16(b_ptr), (y2, y3)
  1176  
        	// acc0..acc3 = p - y2; t0 = 0 without borrow, all ones with borrow.
  1177  	SUBS	y0, const0, acc0
  1178  	SBCS	y1, const1, acc1
  1179  	SBCS	y2, const2, acc2
  1180  	SBCS	y3, const3, acc3
  1181  	SBC	$0, ZR, t0
  1182  
        	// acc4..acc7 = (p - y2) + p, with the carry out added into t0.
  1183  	ADDS	const0, acc0, acc4
  1184  	ADCS	const1, acc1, acc5
  1185  	ADCS	const2, acc2, acc6
  1186  	ADCS	const3, acc3, acc7
  1187  	ADC	$0, t0, t0
  1188  
        	// Take the sum with p added back when t0 == 0, otherwise keep p - y2.
  1189  	CMP	$0, t0
  1190  	CSEL	EQ, acc4, acc0, acc0
  1191  	CSEL	EQ, acc5, acc1, acc1
  1192  	CSEL	EQ, acc6, acc2, acc2
  1193  	CSEL	EQ, acc7, acc3, acc3
  1194  	// If condition is 0, keep original value
  1195  	CMP	$0, hlp0
  1196  	CSEL	EQ, y0, acc0, y0
  1197  	CSEL	EQ, y1, acc1, y1
  1198  	CSEL	EQ, y2, acc2, y2
  1199  	CSEL	EQ, y3, acc3, y3
  1200  	// Store result
  1201  	STy(y2in)
  1202  	// Begin point add
  1203  	LDx(z1in)
  1204  	CALL	sm2P256SqrInternal<>(SB)    // z1^2
  1205  	STy(z1sqr)
  1206  
  1207  	LDx(x2in)
  1208  	CALL	sm2P256MulInternal<>(SB)    // x2 * z1^2
  1209  
  1210  	LDx(x1in)
  1211  	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
  1212  	STx(h)
  1213  
  1214  	LDy(z1in)
  1215  	CALL	sm2P256MulInternal<>(SB)    // z3 = h * z1
  1216  
  1217  	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
  1218  	LDP	5*16(a_ptr), (acc2, acc3)
  1219  	ANDS	$1, hlp1, ZR
  1220  	CSEL	EQ, acc0, y0, y0
  1221  	CSEL	EQ, acc1, y1, y1
  1222  	CSEL	EQ, acc2, y2, y2
  1223  	CSEL	EQ, acc3, y3, y3
  1224  	LDP	p256one<>+0x00(SB), (acc0, acc1)
  1225  	LDP	p256one<>+0x10(SB), (acc2, acc3)
  1226  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
  1227  	CSEL	EQ, acc0, y0, y0
  1228  	CSEL	EQ, acc1, y1, y1
  1229  	CSEL	EQ, acc2, y2, y2
  1230  	CSEL	EQ, acc3, y3, y3
        	// x = z1 is loaded for the z1^3 product below; z3 is stored from y.
  1231  	LDx(z1in)
  1232  	MOVD	res+0(FP), t0
  1233  	STP	(y0, y1), 4*16(t0)
  1234  	STP	(y2, y3), 5*16(t0)
  1235  
  1236  	LDy(z1sqr)
  1237  	CALL	sm2P256MulInternal<>(SB)    // z1 ^ 3
  1238  
  1239  	LDx(y2in)
  1240  	CALL	sm2P256MulInternal<>(SB)    // s2 = y2 * z1^3
  1241  	STy(s2)
  1242  
  1243  	LDx(y1in)
  1244  	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
  1245  	STx(r)
  1246  
  1247  	CALL	sm2P256SqrInternal<>(SB)    // rsqr = r^2
  1248  	STy	(rsqr)
  1249  
  1250  	LDx(h)
  1251  	CALL	sm2P256SqrInternal<>(SB)    // hsqr = h^2
  1252  	STy(hsqr)
  1253  
  1254  	CALL	sm2P256MulInternal<>(SB)    // hcub = h^3
  1255  	STy(hcub)
  1256  
  1257  	LDx(y1in)
  1258  	CALL	sm2P256MulInternal<>(SB)    // y1 * h^3
        	// The s2 slot is reused from here on for y1 * h^3.
  1259  	STy(s2)
  1260  
  1261  	LDP	hsqr(0*8), (x0, x1)
  1262  	LDP	hsqr(2*8), (x2, x3)
  1263  	LDP	0*16(a_ptr), (y0, y1)
  1264  	LDP	1*16(a_ptr), (y2, y3)
  1265  	CALL	sm2P256MulInternal<>(SB)    // u1 * h^2
        	// The h slot is reused from here on for u1 * h^2.
  1266  	STP	(y0, y1), h(0*8)
  1267  	STP	(y2, y3), h(2*8)
  1268  
  1269  	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1270  
  1271  	LDy(rsqr)
  1272  	CALL	sm2P256Subinternal<>(SB)    // r^2 - u1 * h^2 * 2
  1273  
  1274  	MOVD	x0, y0
  1275  	MOVD	x1, y1
  1276  	MOVD	x2, y2
  1277  	MOVD	x3, y3
  1278  	LDx(hcub)
  1279  	CALL	sm2P256Subinternal<>(SB)    // x3 = r^2 - 2 * u1 * h^2 - h^3
  1280  
  1281  	LDP	0*16(a_ptr), (acc0, acc1)
  1282  	LDP	1*16(a_ptr), (acc2, acc3)
  1283  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1284  	CSEL	EQ, acc0, x0, x0
  1285  	CSEL	EQ, acc1, x1, x1
  1286  	CSEL	EQ, acc2, x2, x2
  1287  	CSEL	EQ, acc3, x3, x3
  1288  	LDP	0*16(b_ptr), (acc0, acc1)
  1289  	LDP	1*16(b_ptr), (acc2, acc3)
  1290  	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1291  	CSEL	EQ, acc0, x0, x0
  1292  	CSEL	EQ, acc1, x1, x1
  1293  	CSEL	EQ, acc2, x2, x2
  1294  	CSEL	EQ, acc3, x3, x3
  1295  	MOVD	res+0(FP), t0
  1296  	STP	(x0, x1), 0*16(t0)
  1297  	STP	(x2, x3), 1*16(t0)
  1298  
        	// x still holds the x coordinate just stored; y = u1 * h^2 (h slot).
  1299  	LDP	h(0*8), (y0, y1)
  1300  	LDP	h(2*8), (y2, y3)
  1301  	CALL	sm2P256Subinternal<>(SB)    // u1 * h^2 - x3
  1302  
  1303  	LDP	r(0*8), (y0, y1)
  1304  	LDP	r(2*8), (y2, y3)
  1305  	CALL	sm2P256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1306  
  1307  	LDP	s2(0*8), (x0, x1)
  1308  	LDP	s2(2*8), (x2, x3)
  1309  	CALL	sm2P256Subinternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - y1 * h^3
  1310  	LDP	2*16(a_ptr), (acc0, acc1)
  1311  	LDP	3*16(a_ptr), (acc2, acc3)
  1312  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1313  	CSEL	EQ, acc0, x0, x0
  1314  	CSEL	EQ, acc1, x1, x1
  1315  	CSEL	EQ, acc2, x2, x2
  1316  	CSEL	EQ, acc3, x3, x3
        	// y2in here is the stack copy made above, i.e. after optional negation.
  1317  	LDP	y2in(0*8), (acc0, acc1)
  1318  	LDP	y2in(2*8), (acc2, acc3)
  1319  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
  1320  	CSEL	EQ, acc0, x0, x0
  1321  	CSEL	EQ, acc1, x1, x1
  1322  	CSEL	EQ, acc2, x2, x2
  1323  	CSEL	EQ, acc3, x3, x3
  1324  	MOVD	res+0(FP), t0
  1325  	STP	(x0, x1), 2*16(t0)
  1326  	STP	(x2, x3), 3*16(t0)
  1327  
  1328  	RET
  1329  
        // p256AddInline: x0..x3 = x0..x3 + y0..y3, conditionally reduced once.
        // The 256-bit sum is formed with its carry out in hlp0, then
        // const0..const3 is subtracted from the 257-bit value. If that
        // subtraction borrows (carry flag clear) the unreduced sum is kept,
        // otherwise the difference is kept. The choice is made with CSEL.
        // Expects const0..const3 to hold the modulus; the routines in this
        // section load them from p256p<>.
        // Clobbers t0, t1, acc5, acc6, hlp0 and the condition flags.
  1330  #define p256AddInline          \
  1331  	ADDS	y0, x0, x0;    \
  1332  	ADCS	y1, x1, x1;    \
  1333  	ADCS	y2, x2, x2;    \
  1334  	ADCS	y3, x3, x3;    \
  1335  	ADC	$0, ZR, hlp0;  \
  1336  	SUBS	const0, x0, t0;   \
  1337  	SBCS	const1, x1, t1;\
  1338  	SBCS	const2, x2, acc5;    \
  1339  	SBCS	const3, x3, acc6;\
  1340  	SBCS	$0, hlp0, hlp0;\
  1341  	CSEL	CC, x0, t0, x0;\
  1342  	CSEL	CC, x1, t1, x1;\
  1343  	CSEL	CC, x2, acc5, x2;\
  1344  	CSEL	CC, x3, acc6, x3;
  1345  
        // 32-byte scratch slots used by p256PointDoubleAsm, at 8 + 32*k off RSP.
        // Slots 0..3 end at byte 136, the frame size that function declares.
        // These are the same stack offsets as the first four slots defined
        // earlier in this file for the point-add routines.
  1346  #define s(off)	(32*0 + 8 + off)(RSP)
  1347  #define m(off)	(32*1 + 8 + off)(RSP)
  1348  #define zsqr(off) (32*2 + 8 + off)(RSP)
  1349  #define tmp(off)  (32*3 + 8 + off)(RSP)
  1350  
  1351  //func p256PointDoubleAsm(res, in []uint64)
        //
        // Doubles the point at in (x, y, z at +0, +32, +64) and writes x3, y3, z3
        // to res. With all arithmetic modulo p256p, the sequence computes:
        //   m  = 3 * (x - z^2) * (x + z^2)
        //   s  = x * (2*y)^2
        //   z3 = 2 * y * z
        //   x3 = m^2 - 2*s
        //   y3 = m * (s - x3) - ((2*y)^4) / 2
        // Helper conventions, judging by how results are consumed here (the
        // helpers are defined outside this section - confirm there):
        // sm2P256SqrInternal and sm2P256MulInternal leave their result in
        // y0..y3, sm2P256Subinternal leaves y - x in x0..x3, and
        // p256MulBy2Inline leaves 2*y in x0..x3.
  1352  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
  1353  	MOVD	res+0(FP), res_ptr
  1354  	MOVD	in+24(FP), a_ptr
  1355  
  1356  	LDP	p256p<>+0x00(SB), (const0, const1)
  1357  	LDP	p256p<>+0x10(SB), (const2, const3)
  1358  
  1359  	// Begin point double
  1360  	LDP	4*16(a_ptr), (x0, x1)
  1361  	LDP	5*16(a_ptr), (x2, x3)
  1362  	CALL	sm2P256SqrInternal<>(SB)    // z^2
  1363  	STP	(y0, y1), zsqr(0*8)
  1364  	STP	(y2, y3), zsqr(2*8)
  1365  
  1366  	LDP	0*16(a_ptr), (x0, x1)
  1367  	LDP	1*16(a_ptr), (x2, x3)
  1368  	p256AddInline                  // x + z^2
  1369  	STx(m)
  1370  
  1371  	LDx(z1in)
  1372  	LDy(y1in)
  1373  	CALL	sm2P256MulInternal<>(SB)    // y * z
  1374  	p256MulBy2Inline               // z3 = 2 * y * z
  1375  	STx(z3out)
  1376  
  1377  	LDy(x1in)
  1378  	LDx(zsqr)
  1379  	CALL	sm2P256Subinternal<>(SB)    // x - z^2
  1380  	LDy(m)
  1381  	CALL	sm2P256MulInternal<>(SB)    // (x - z^2) * (x + z^2)
  1382  
  1383  	// Multiply by 3
  1384  	p256MulBy2Inline
  1385  	p256AddInline
  1386  	STx(m)
  1387  
  1388  	LDy(y1in)
  1389  	p256MulBy2Inline               // 2 * y
  1390  	CALL	sm2P256SqrInternal<>(SB)    // (2*y)^2
  1391  	STy(s)
  1392  	MOVD	y0, x0
  1393  	MOVD	y1, x1
  1394  	MOVD	y2, x2
  1395  	MOVD	y3, x3
  1396  	CALL	sm2P256SqrInternal<>(SB)    // (2*y)^4
  1397  
  1398  	// Divide by 2
        	// t0, t1, acc5, acc6 = y + p, with the carry out in hlp0.
  1399  	ADDS	const0, y0, t0
  1400  	ADCS	const1, y1, t1
  1401  	ADCS	const2, y2, acc5
  1402  	ADCS	const3, y3, acc6
  1403  	ADC	$0, ZR, hlp0
  1404  
        	// If y is even, use y itself instead of y + p. Bit 0 of hlp0 survives
        	// the AND only when y is odd, so the carry is dropped for even y.
  1405  	ANDS	$1, y0, ZR
  1406  	CSEL	EQ, y0, t0, t0
  1407  	CSEL	EQ, y1, t1, t1
  1408  	CSEL	EQ, y2, acc5, acc5
  1409  	CSEL	EQ, y3, acc6, acc6
  1410  	AND	y0, hlp0, hlp0
  1411  
        	// Shift the 257-bit value hlp0:acc6:acc5:t1:t0 right by one bit.
  1412  	EXTR	$1, t0, t1, y0
  1413  	EXTR	$1, t1, acc5, y1
  1414  	EXTR	$1, acc5, acc6, y2
  1415  	EXTR	$1, acc6, hlp0, y3
        	// The y slot of res temporarily holds (2*y)^4 / 2 until the final
        	// subtraction at the end of the function.
  1416  	STy(y3out)
  1417  
  1418  	LDx(x1in)
  1419  	LDy(s)
  1420  	CALL	sm2P256MulInternal<>(SB)    // s = x * (2*y)^2
  1421  	STy(s)
  1422  	p256MulBy2Inline               // 2 * s
  1423  	STx(tmp)
  1424  
  1425  	LDx(m)
  1426  	CALL	sm2P256SqrInternal<>(SB)    // m^2
  1427  	LDx(tmp)
  1428  	CALL	sm2P256Subinternal<>(SB)    // x3 = m^2 - 2*s
  1429  
  1430  	STx(x3out)
  1431  
  1432  	LDy(s)
  1433  	CALL	sm2P256Subinternal<>(SB)    // s - x3
  1434  
  1435  	LDy(m)
  1436  	CALL	sm2P256MulInternal<>(SB)    // m * (s - x3)
  1437  
  1438  	LDx(y3out)
  1439  	CALL	sm2P256Subinternal<>(SB)    // y3 = m * (s - x3) - (2*y)^4 / 2
  1440  	STx(y3out)
  1441  	RET
  1442  /* ---------------------------------------*/
        // Re-target four macros for p256PointAddAsm:
        //   y2in  - now the y coordinate of the second input (b_ptr + 32) rather
        //           than a stack slot.
        //   *out  - now addressed through b_ptr rather than res_ptr.
        // p256PointAddAsm uses hlp1, an alias of res_ptr, for its return flag,
        // and loads res into b_ptr before its first store to the result.
  1443  #undef y2in
  1444  #undef x3out
  1445  #undef y3out
  1446  #undef z3out
  1447  #define y2in(off) (off + 32)(b_ptr)
  1448  #define x3out(off) (off)(b_ptr)
  1449  #define y3out(off) (off + 32)(b_ptr)
  1450  #define z3out(off) (off + 64)(b_ptr)
  1451  //func p256PointAddAsm(res, in1, in2 []uint64) int
        //
        // Adds the points at in1 and in2 (x, y, z at +0, +32, +64 each) and writes
        // x3, y3, z3 to res, all modulo p256p:
        //   u1 = x1*z2^2, u2 = x2*z1^2, s1 = y1*z2^3, s2 = y2*z1^3
        //   h = u2 - u1, r = s2 - s1
        //   x3 = r^2 - 2*u1*h^2 - h^3
        //   y3 = r*(u1*h^2 - x3) - s1*h^3
        //   z3 = z1*z2*h
        // The return value is 1 when both r and h are zero modulo p (each is
        // compared against 0 and against p), otherwise 0. The coordinates are
        // computed and stored either way.
        // NOTE(review): the flag presumably tells the caller that both inputs are
        // the same point, where this formula does not apply - confirm at the
        // call site.
        // Helper conventions, judging by how results are consumed here (the
        // helpers are defined outside this section - confirm there):
        // sm2P256SqrInternal and sm2P256MulInternal leave their result in
        // y0..y3, sm2P256Subinternal leaves y - x in x0..x3. The back-to-back
        // calls also rely on x0..x3 surviving Sqr/Mul.
  1452  TEXT ·p256PointAddAsm(SB),0,$392-80
  1453  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1454  	// Load the input pointers; coordinates are read from memory as needed
  1455  	MOVD	in1+24(FP), a_ptr
  1456  	MOVD	in2+48(FP), b_ptr
  1457  
  1458  	LDP	p256p<>+0x00(SB), (const0, const1)
  1459  	LDP	p256p<>+0x10(SB), (const2, const3)
  1460  
  1461  	// Begin point add
  1462  	LDx(z2in)
  1463  	CALL	sm2P256SqrInternal<>(SB)    // z2^2
  1464  	STy(z2sqr)
  1465  
  1466  	CALL	sm2P256MulInternal<>(SB)    // z2^3
  1467  
  1468  	LDx(y1in)
  1469  	CALL	sm2P256MulInternal<>(SB)    // s1 = z2^3*y1
  1470  	STy(s1)
  1471  
  1472  	LDx(z1in)
  1473  	CALL	sm2P256SqrInternal<>(SB)    // z1^2
  1474  	STy(z1sqr)
  1475  
  1476  	CALL	sm2P256MulInternal<>(SB)    // z1^3
  1477  
  1478  	LDx(y2in)
  1479  	CALL	sm2P256MulInternal<>(SB)    // s2 = z1^3*y2
  1480  
  1481  	LDx(s1)
  1482  	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
  1483  	STx(r)
  1484  
        	// hlp1 = 1 if r == 0, else 0.
  1485  	MOVD	$1, acc1
  1486  	ORR	x0, x1, acc2             // Check if zero mod p256
  1487  	ORR	x2, x3, acc3
  1488  	ORR	acc3, acc2, acc2
  1489  	CMP	$0, acc2
  1490  	CSEL	EQ, acc1, ZR, hlp1
  1491  
        	// XOR each limb with the modulus; every result is zero only if r == p.
  1492  	EOR	const0, x0, acc2
  1493  	EOR	const1, x1, acc3
  1494  	EOR	const2, x2, acc4
  1495  	EOR	const3, x3, acc5
  1496  
        	// hlp1 = 1 if r == p, otherwise keep the result of the r == 0 test.
  1497  	ORR	acc2, acc3, acc2
  1498  	ORR	acc4, acc5, acc3
  1499  	ORR	acc3, acc2, acc2
  1500  	CMP	$0, acc2
  1501  	CSEL	EQ, acc1, hlp1, hlp1
  1502  
  1503  	LDx(z2sqr)
  1504  	LDy(x1in)
  1505  	CALL	sm2P256MulInternal<>(SB)    // u1 = x1 * z2^2
  1506  	STy(u1)
  1507  
  1508  	LDx(z1sqr)
  1509  	LDy(x2in)
  1510  	CALL	sm2P256MulInternal<>(SB)    // u2 = x2 * z1^2
  1511  	STy(u2)
  1512  
  1513  	LDx(u1)
  1514  	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
  1515  	STx(h)
  1516  
        	// Same two tests for h: hlp0 = 1 if h == 0 or h == p, else 0.
  1517  	MOVD	$1, acc1
  1518  	ORR	x0, x1, acc2             // Check if zero mod p256
  1519  	ORR	x2, x3, acc3
  1520  	ORR	acc3, acc2, acc2
  1521  	CMP	$0, acc2
  1522  	CSEL	EQ, acc1, ZR, hlp0
  1523  
  1524  	EOR	const0, x0, acc2
  1525  	EOR	const1, x1, acc3
  1526  	EOR	const2, x2, acc4
  1527  	EOR	const3, x3, acc5
  1528  
  1529  	ORR	acc2, acc3, acc2
  1530  	ORR	acc4, acc5, acc3
  1531  	ORR	acc3, acc2, acc2
  1532  	CMP	$0, acc2
  1533  	CSEL	EQ, acc1, hlp0, hlp0
  1534  
        	// Return flag: 1 only when both r and h are zero mod p.
  1535  	AND	hlp0, hlp1, hlp1
  1536  
  1537  	LDx(r)
  1538  	CALL	sm2P256SqrInternal<>(SB)    // rsqr = r^2
  1539  	STy(rsqr)
  1540  
  1541  	LDx(h)
  1542  	CALL	sm2P256SqrInternal<>(SB)    // hsqr = h^2
  1543  	STy(hsqr)
  1544  
  1545  	LDx(h)
  1546  	CALL	sm2P256MulInternal<>(SB)    // hcub = h^3
  1547  	STy(hcub)
  1548  
  1549  	LDx(s1)
  1550  	CALL	sm2P256MulInternal<>(SB)    // s1 * h^3
        	// The s2 slot is used here for s1 * h^3.
  1551  	STy(s2)
  1552  
  1553  	LDx(z1in)
  1554  	LDy(z2in)
  1555  	CALL	sm2P256MulInternal<>(SB)    // z1 * z2
  1556  	LDx(h)
  1557  	CALL	sm2P256MulInternal<>(SB)    // z1 * z2 * h
        	// in2 is not read after this point, so b_ptr is reused as the result
        	// pointer for the z3out/x3out/y3out stores.
  1558  	MOVD	res+0(FP), b_ptr
  1559  	STy(z3out)
  1560  
  1561  	LDx(hsqr)
  1562  	LDy(u1)
  1563  	CALL	sm2P256MulInternal<>(SB)    // h^2 * u1
        	// The u2 slot is reused from here on for u1 * h^2.
  1564  	STy(u2)
  1565  
  1566  	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1567  	LDy(rsqr)
  1568  	CALL	sm2P256Subinternal<>(SB)    // r^2 - u1 * h^2 * 2
  1569  
  1570  	MOVD	x0, y0
  1571  	MOVD	x1, y1
  1572  	MOVD	x2, y2
  1573  	MOVD	x3, y3
  1574  	LDx(hcub)
  1575  	CALL	sm2P256Subinternal<>(SB)    // x3 = r^2 - 2 * u1 * h^2 - h^3
  1576  	STx(x3out)
  1577  
  1578  	LDy(u2)
  1579  	CALL	sm2P256Subinternal<>(SB)    // u1 * h^2 - x3
  1580  
  1581  	LDy(r)
  1582  	CALL	sm2P256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1583  
  1584  	LDx(s2)
  1585  	CALL	sm2P256Subinternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  1586  	STx(y3out)
  1587  
        	// hlp1 is already R0 (hlp1 aliases res_ptr), so the first MOVD copies
        	// R0 to itself.
  1588  	MOVD	hlp1, R0
  1589  	MOVD	R0, ret+72(FP)
  1590  
  1591  	RET
  1592