github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_arm64.s

github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_arm64.s (about)

     1  // This file contains a constant-time, 64-bit assembly implementation of
     2  // SM2 P-256 arithmetic. The optimizations performed here are described in detail in:
     3  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     4  //                          256-bit primes"
     5  // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
     6  // https://eprint.iacr.org/2013/816.pdf
     7  //go:build !purego
     8  
     9  #include "textflag.h"
    10  
        // Register roles used throughout this file.
        // res_ptr, a_ptr and b_ptr receive the arguments loaded from the Go frame.
    11  #define res_ptr R0
    12  #define a_ptr R1
    13  #define b_ptr R2
    14  
        // acc0..acc3: low half of the accumulator, the part that is reduced limb by limb.
    15  #define acc0 R3
    16  #define acc1 R4
    17  #define acc2 R5
    18  #define acc3 R6
    19  
        // acc4..acc7: high half of the 512-bit product.
        // t0..t3: scratch. const0/const1: the two low limbs of the modulus in use.
    20  #define acc4 R7
    21  #define acc5 R8
    22  #define acc6 R9
    23  #define acc7 R10
    24  #define t0 R11
    25  #define t1 R12
    26  #define t2 R13
    27  #define t3 R14
    28  #define const0 R15
    29  #define const1 R16
    30  
        // hlp1 shares R0 with res_ptr: routines that use hlp1 load res only at the end.
    31  #define hlp0 R17
    32  #define hlp1 res_ptr
    33  
        // x0..x3 and y0..y3: 256-bit operands of the internal routines,
        // least significant 64-bit limb first.
    34  #define x0 R19
    35  #define x1 R20
    36  #define x2 R21
    37  #define x3 R22
    38  #define y0 R23
    39  #define y1 R24
    40  #define y2 R25
    41  #define y3 R26
    42  
        // const2/const3 (the two high limbs of the modulus) share registers with t2/t3,
        // so t2/t3 are not free scratch while the modulus is still needed.
    43  #define const2 t2
    44  #define const3 t3
    45  
        // p256p: modulus used by the field routines (p256Mul, p256Sqr, p256FromMont, ...),
        // stored as four 64-bit limbs, least significant limb first.
    46  DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
    47  DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
    48  DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
    49  DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
        // p256ordK0: factor multiplied with the low accumulator limb in every reduction
        // step of p256OrdMul/p256OrdSqr (presumably -1/p256ord mod 2^64 — TODO confirm).
    50  DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
        // p256ord: modulus used by the p256Ord* routines, least significant limb first.
    51  DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
    52  DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
    53  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    54  DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
        // p256one: equals 2^256 - p256p (the two constants sum to exactly 2^256).
    55  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    56  DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
    57  DATA p256one<>+0x10(SB)/8, $0x0000000000000000
    58  DATA p256one<>+0x18(SB)/8, $0x0000000100000000
        // All four symbols are file-local (<>) and placed in read-only data.
    59  GLOBL p256p<>(SB), RODATA, $32
    60  GLOBL p256ordK0<>(SB), RODATA, $8
    61  GLOBL p256ord<>(SB), RODATA, $32
    62  GLOBL p256one<>(SB), RODATA, $32
    63  
    64  /* ---------------------------------------*/
    65  // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
        // Jumps straight to p256BigToLittle, which reads the same two frame slots
        // (res at +0, in at +8) and reverses all 32 bytes, so one routine serves
        // both conversion directions.
    66  TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
    67  	JMP	·p256BigToLittle(SB)
    68  /* ---------------------------------------*/
    69  // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
        // Jumps straight to p256BigToLittle, which reads the same two frame slots
        // (res at +0, in at +8) and performs the 32-byte reversal.
    70  TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
    71  	JMP	·p256BigToLittle(SB)
    72  /* ---------------------------------------*/
    73  // func p256LittleToBig(res *[32]byte, in *p256Element)
        // Jumps straight to p256BigToLittle: reversing all 32 bytes is its own
        // inverse, so the same code converts in both directions.
    74  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    75  	JMP	·p256BigToLittle(SB)
    76  /* ---------------------------------------*/
    77  // func p256BigToLittle(res *p256Element, in *[32]byte)
        // Writes to res the 32 bytes of in in reversed byte order.
        // Also the jump target of p256LittleToBig, p256OrdLittleToBig and p256OrdBigToLittle.
    78  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    79  	MOVD	res+0(FP), res_ptr
    80  	MOVD	in+8(FP), a_ptr
    81  
        	// V0 = input bytes 0..15, V1 = input bytes 16..31.
    82  	VLD1 (a_ptr), [V0.B16, V1.B16]
    83  
        	// VEXT by 8 with the same register as both sources swaps the two 64-bit halves;
        	// VREV64 then reverses the bytes inside each half: a full 16-byte reversal.
    84  	VEXT	$8, V0.B16, V0.B16, V3.B16
    85  	VEXT	$8, V1.B16, V1.B16, V2.B16
    86  	VREV64 V2.B16, V2.B16
    87  	VREV64 V3.B16, V3.B16
    88  
        	// Store the reversed upper block (V2) first, then the reversed lower block (V3).
    89  	VST1 [V2.B16, V3.B16], (res_ptr)
    90  
    91  	RET
    92  /* ---------------------------------------*/
    93  // func p256MovCond(res, a, b *SM2P256Point, cond int)
    94  // If cond == 0 res=b, else res=a
        // Copies 96 bytes (two groups of 48) without branching on cond.
        // Only the low 32 bits of cond take part in the comparison (VDUP into S lanes).
    95  TEXT ·p256MovCond(SB),NOSPLIT,$0
    96  	MOVD	res+0(FP), res_ptr
    97  	MOVD	a+8(FP), a_ptr
    98  	MOVD	b+16(FP), b_ptr
    99  	MOVD	cond+24(FP), R3
   100  
        	// V2 = all-ones in every lane when cond == 0, all-zeros otherwise.
   101  	VEOR V0.B16, V0.B16, V0.B16
   102  	VDUP R3, V1.S4
   103  	VCMEQ V0.S4, V1.S4, V2.S4
   104  
        	// First 48 bytes: start from a (V3..V5) and insert b's bits where the mask is set.
        	// The post-increment forms advance all three pointers by 48.
   105  	VLD1.P (48)(a_ptr), [V3.B16, V4.B16, V5.B16]
   106  	VLD1.P (48)(b_ptr), [V6.B16, V7.B16, V8.B16]
   107  	VBIT V2.B16, V6.B16, V3.B16
   108  	VBIT V2.B16, V7.B16, V4.B16
   109  	VBIT V2.B16, V8.B16, V5.B16
   110  	VST1.P [V3.B16, V4.B16, V5.B16], (48)(res_ptr)
   111  
        	// Remaining 48 bytes, same selection.
   112  	VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16]
   113  	VLD1 (b_ptr), [V6.B16, V7.B16, V8.B16]
   114  	VBIT V2.B16, V6.B16, V3.B16
   115  	VBIT V2.B16, V7.B16, V4.B16
   116  	VBIT V2.B16, V8.B16, V5.B16
   117  	VST1 [V3.B16, V4.B16, V5.B16], (res_ptr)
   118  
   119  	RET
   120  /* ---------------------------------------*/
   121  // func p256NegCond(val *p256Element, cond int)
        // In place: if cond != 0, val is replaced by p256p - val; if cond == 0 it is
        // rewritten unchanged. Both candidates are always computed and chosen with CSEL.
   122  TEXT ·p256NegCond(SB),NOSPLIT,$0
   123  	MOVD	val+0(FP), a_ptr
   124  	MOVD	cond+8(FP), hlp0
        	// The result is stored back over the input.
   125  	MOVD	a_ptr, res_ptr
   126  	// acc = poly
   127  	LDP	p256p<>+0x00(SB), (acc0, acc1)
   128  	LDP	p256p<>+0x10(SB), (acc2, acc3)
   129  
   130  	// Load the original value
   131  	LDP	0*16(a_ptr), (t0, t1)
   132  	LDP	1*16(a_ptr), (t2, t3)
   133  	// Speculatively subtract
        	// acc = p256p - val. The borrow out of the top limb is discarded (SBC, not SBCS);
        	// presumably val <= p256p — confirm with callers.
   134  	SUBS	t0, acc0
   135  	SBCS	t1, acc1
   136  	SBCS	t2, acc2
   137  	SBC	t3, acc3
   138  	// If condition is 0, keep original value
   139  	CMP	$0, hlp0
   140  	CSEL	EQ, t0, acc0, acc0
   141  	CSEL	EQ, t1, acc1, acc1
   142  	CSEL	EQ, t2, acc2, acc2
   143  	CSEL	EQ, t3, acc3, acc3
   144  	// Store result
   145  	STP	(acc0, acc1), 0*16(res_ptr)
   146  	STP	(acc2, acc3), 1*16(res_ptr)
   147  
   148  	RET
   149  /* ---------------------------------------*/
   150  // func p256Sqr(res, in *p256Element, n int)
        // Applies sm2P256SqrInternal n times to in and stores the final value in res.
        // The counter is decremented before it is tested, so n must be at least 1
        // (n == 0 would wrap around instead of skipping the loop).
   151  TEXT ·p256Sqr(SB),NOSPLIT,$0
   152  	MOVD	res+0(FP), res_ptr
   153  	MOVD	in+8(FP), a_ptr
        	// b_ptr is reused as the iteration counter.
   154  	MOVD	n+16(FP), b_ptr
   155  
        	// const0..const3 = p256p, read by sm2P256SqrInternal for its final subtraction.
   156  	LDP	p256p<>+0x00(SB), (const0, const1)
   157  	LDP	p256p<>+0x10(SB), (const2, const3)
   158  
   159  	LDP	0*16(a_ptr), (x0, x1)
   160  	LDP	1*16(a_ptr), (x2, x3)
   161  
   162  sqrLoop:
   163  	SUB	$1, b_ptr
   164  	CALL	sm2P256SqrInternal<>(SB)
        	// The square comes back in y0..y3; feed it into x0..x3 for the next round.
   165  	MOVD	y0, x0
   166  	MOVD	y1, x1
   167  	MOVD	y2, x2
   168  	MOVD	y3, x3
   169  	CBNZ	b_ptr, sqrLoop
   170  
   171  	STP	(y0, y1), 0*16(res_ptr)
   172  	STP	(y2, y3), 1*16(res_ptr)
   173  	RET
   174  /* ---------------------------------------*/
   175  // func p256Mul(res, in1, in2 *p256Element)
        // Loads in1 into x0..x3 and in2 into y0..y3, calls sm2P256MulInternal and
        // stores the product it leaves in y0..y3 to res.
   176  TEXT ·p256Mul(SB),NOSPLIT,$0
   177  	MOVD	res+0(FP), res_ptr
   178  	MOVD	in1+8(FP), a_ptr
   179  	MOVD	in2+16(FP), b_ptr
   180  
        	// const0..const3 = p256p, read by sm2P256MulInternal for its final subtraction.
   181  	LDP	p256p<>+0x00(SB), (const0, const1)
   182  	LDP	p256p<>+0x10(SB), (const2, const3)
   183  
   184  	LDP	0*16(a_ptr), (x0, x1)
   185  	LDP	1*16(a_ptr), (x2, x3)
   186  
   187  	LDP	0*16(b_ptr), (y0, y1)
   188  	LDP	1*16(b_ptr), (y2, y3)
   189  
   190  	CALL	sm2P256MulInternal<>(SB)
   191  
   192  	STP	(y0, y1), 0*16(res_ptr)
   193  	STP	(y2, y3), 1*16(res_ptr)
   194  	RET                        
   195  /* ---------------------------------------*/
   196  // func p256FromMont(res, in *p256Element)
        // Runs the four reduction steps of the field multiplication on in without any
        // preceding multiplication, then subtracts p256p once if the result is >= p256p.
        // Each step adds (lowest limb) * p256p, which clears that limb, and continues
        // with the accumulator shifted down by one limb. Thanks to the shape of p256p
        // only the limb itself and its 32-bit shifts (y0 = limb << 32, y1 = limb >> 32)
        // are needed.
   197  TEXT ·p256FromMont(SB),NOSPLIT,$0
   198  	MOVD	res+0(FP), res_ptr
   199  	MOVD	in+8(FP), a_ptr
   200  	LDP	p256p<>+0x00(SB), (const0, const1)
   201  	LDP	p256p<>+0x10(SB), (const2, const3)
   202  
   203  	LDP	0*16(a_ptr), (acc0, acc1)
   204  	LDP	1*16(a_ptr), (acc2, acc3)
   205  	// Only reduce, no multiplications are needed
   206  	// First reduction step
        	// Eliminates acc0; afterwards the value is (acc1, acc2, acc3, acc0), low limb first.
   207  	LSL $32, acc0, y0
   208  	LSR	$32, acc0, y1
   209  
   210  	SUBS y0, acc1
   211  	SBCS y1, acc2
   212  	SBCS y0, acc3
   213  	SBC y1, acc0, y0	
   214  
   215  	ADDS acc0, acc1, acc1
   216  	ADCS $0, acc2, acc2
   217  	ADCS $0, acc3, acc3
   218  	ADC $0, y0, acc0
   219  
   220  	// Second reduction step
        	// Eliminates acc1; afterwards the value is (acc2, acc3, acc0, acc1).
   221  	LSL $32, acc1, y0
   222  	LSR	$32, acc1, y1
   223  
   224  	SUBS y0, acc2
   225  	SBCS y1, acc3
   226  	SBCS y0, acc0
   227  	SBC y1, acc1, y0
   228  
   229  	ADDS acc1, acc2, acc2
   230  	ADCS $0, acc3, acc3
   231  	ADCS $0, acc0, acc0
   232  	ADC $0, y0, acc1
   233  
   234  	// Third reduction step
        	// Eliminates acc2; afterwards the value is (acc3, acc0, acc1, acc2).
   235  	LSL $32, acc2, y0
   236  	LSR	$32, acc2, y1
   237  
   238  	SUBS y0, acc3
   239  	SBCS y1, acc0
   240  	SBCS y0, acc1
   241  	SBC y1, acc2, y0
   242  
   243  	ADDS acc2, acc3, acc3
   244  	ADCS $0, acc0, acc0
   245  	ADCS $0, acc1, acc1
   246  	ADC $0, y0, acc2
   247  
   248  	// Last reduction step
        	// Eliminates acc3; the limbs are back in natural order (acc0, acc1, acc2, acc3).
   249  	LSL $32, acc3, y0
   250  	LSR	$32, acc3, y1
   251  
   252  	SUBS y0, acc0
   253  	SBCS y1, acc1
   254  	SBCS y0, acc2
   255  	SBC y1, acc3, y0
   256  
   257  	ADDS acc3, acc0, acc0
   258  	ADCS $0, acc1, acc1
   259  	ADCS $0, acc2, acc2
   260  	ADC $0, y0, acc3
   261  
        	// t = acc - p256p. const2/const3 live in t2/t3 and are overwritten here,
        	// each only after it has been read.
   262  	SUBS	const0, acc0, t0
   263  	SBCS	const1, acc1, t1
   264  	SBCS	const2, acc2, t2
   265  	SBCS	const3, acc3, t3
   266  
        	// Carry set means no borrow (acc >= p256p): keep the subtracted value.
   267  	CSEL	CS, t0, acc0, acc0
   268  	CSEL	CS, t1, acc1, acc1
   269  	CSEL	CS, t2, acc2, acc2
   270  	CSEL	CS, t3, acc3, acc3
   271  
   272  	STP	(acc0, acc1), 0*16(res_ptr)
   273  	STP	(acc2, acc3), 1*16(res_ptr)
   274  
   275  	RET
   276  /* ---------------------------------------*/
   277  // func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
        // Copies to res the 96-byte table entry whose 1-based position equals idx.
        // All limit entries are read whatever idx is; the choice is made with a vector mask.
        // res is all zero when no position matches (for example idx == 0).
        // Only the low 32 bits of idx and of the running position are compared.
        // The loop body runs before limit is tested, so limit must be at least 1.
   278  TEXT ·p256Select(SB),NOSPLIT,$0
   279  	MOVD	limit+24(FP), a_ptr
   280  	MOVD	idx+16(FP), const0
   281  	MOVD	table+8(FP), b_ptr
   282  	MOVD	res+0(FP), res_ptr
   283  
   284  	VDUP const0, V0.S4
   285  
        	// V2..V7 accumulate the selected entry; start from zero.
   286  	VEOR V2.B16, V2.B16, V2.B16
   287  	VEOR V3.B16, V3.B16, V3.B16
   288  	VEOR V4.B16, V4.B16, V4.B16
   289  	VEOR V5.B16, V5.B16, V5.B16
   290  	VEOR V6.B16, V6.B16, V6.B16
   291  	VEOR V7.B16, V7.B16, V7.B16
   292  
        	// const1 = position of the current entry, incremented before each comparison.
   293  	MOVD	$0, const1
   294  
   295  loop_select:
   296  		ADD	$1, const1
   297  		VDUP const1, V1.S4
        		// V14 = all-ones when the current position equals idx.
   298  		VCMEQ V0.S4, V1.S4, V14.S4
        		// Load the whole 96-byte entry (2 x 48 bytes) and advance the table pointer.
   299  		VLD1.P (48)(b_ptr), [V8.B16, V9.B16, V10.B16]
   300  		VLD1.P (48)(b_ptr), [V11.B16, V12.B16, V13.B16]
   301  		VBIT V14.B16, V8.B16, V2.B16
   302  		VBIT V14.B16, V9.B16, V3.B16
   303  		VBIT V14.B16, V10.B16, V4.B16
   304  		VBIT V14.B16, V11.B16, V5.B16
   305  		VBIT V14.B16, V12.B16, V6.B16
   306  		VBIT V14.B16, V13.B16, V7.B16
   307  
   308  		CMP	a_ptr, const1
   309  		BNE	loop_select
   310  	VST1.P [V2.B16, V3.B16, V4.B16], (48)(res_ptr)
   311  	VST1 [V5.B16, V6.B16, V7.B16], (res_ptr)
   312  	RET
   313  /* ---------------------------------------*/
   314  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
        // Copies to res the 64-byte table entry whose 1-based position equals idx.
        // Always reads 32 entries; the choice is made with a vector mask.
        // res is all zero when no position matches (for example idx == 0).
        // Only the low 32 bits of idx and of the running position are compared.
   315  TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   316  	MOVD	idx+16(FP), t0
   317  	MOVD	table+8(FP), t1
   318  	MOVD	res+0(FP), res_ptr
   319  
   320  	VDUP t0, V0.S4
   321  
        	// V2..V5 accumulate the selected entry; start from zero.
   322  	VEOR V2.B16, V2.B16, V2.B16
   323  	VEOR V3.B16, V3.B16, V3.B16
   324  	VEOR V4.B16, V4.B16, V4.B16
   325  	VEOR V5.B16, V5.B16, V5.B16
   326  
        	// t2 = position of the current entry, incremented before each comparison.
   327  	MOVD	$0, t2
   328  
   329  loop_select:
   330  		ADD	$1, t2
   331  		VDUP t2, V1.S4
        		// V10 = all-ones when the current position equals idx.
   332  		VCMEQ V0.S4, V1.S4, V10.S4
   333  		VLD1.P (64)(t1), [V6.B16, V7.B16, V8.B16, V9.B16]
   334  		VBIT V10.B16, V6.B16, V2.B16
   335  		VBIT V10.B16, V7.B16, V3.B16
   336  		VBIT V10.B16, V8.B16, V4.B16
   337  		VBIT V10.B16, V9.B16, V5.B16
   338  
        		// Fixed table length of 32 entries.
   339  		CMP	$32, t2
   340  		BNE	loop_select
   341  
   342  	VST1 [V2.B16, V3.B16, V4.B16, V5.B16], (res_ptr)
   343  	RET
   344  
   345  /* ---------------------------------------*/
   346  //func p256OrdReduce(s *p256OrdElement)
        // In place: replaces s by s - p256ord when s >= p256ord, otherwise rewrites s unchanged.
        // A single subtraction is performed; the choice is made with CSEL, not a branch.
   347  TEXT ·p256OrdReduce(SB),NOSPLIT,$0
   348  	MOVD	s+0(FP), res_ptr
   349  
   350  	LDP	p256ord<>+0x00(SB), (const0, const1)
   351  	LDP	p256ord<>+0x10(SB), (const2, const3)
   352  
   353  	LDP	0*16(res_ptr), (acc0, acc1)
   354  	LDP	1*16(res_ptr), (acc2, acc3)
        	// acc4 = 0 acts as a fifth limb so the borrow is propagated one step further.
   355  	EOR acc4, acc4, acc4
   356  
        	// y = s - p256ord
   357  	SUBS	const0, acc0, y0
   358  	SBCS	const1, acc1, y1
   359  	SBCS	const2, acc2, y2
   360  	SBCS	const3, acc3, y3
   361  	SBCS	$0, acc4, acc4
   362  
        	// Carry set means no borrow (s >= p256ord): take the difference.
   363  	CSEL	CS, y0, acc0, x0
   364  	CSEL	CS, y1, acc1, x1
   365  	CSEL	CS, y2, acc2, x2
   366  	CSEL	CS, y3, acc3, x3
   367  
   368  	STP	(x0, x1), 0*16(res_ptr)
   369  	STP	(x2, x3), 1*16(res_ptr)
   370  
   371  	RET
   372  
   373  /* ---------------------------------------*/
   374  // func p256OrdSqr(res, in *p256OrdElement, n int)
        // Squares in n times modulo p256ord, with a reduction after every squaring, and
        // stores the final value in res. Each round: 512-bit square in acc0..acc7, four
        // reduction steps that each clear the lowest limb by adding hlp0 * p256ord
        // (hlp0 = low limb * p256ordK0), addition of the high half, and one conditional
        // subtraction of p256ord.
        // The counter is decremented before it is tested, so n must be at least 1.
   375  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   376  	MOVD	in+8(FP), a_ptr
        	// b_ptr is reused as the iteration counter.
   377  	MOVD	n+16(FP), b_ptr
   378  
        	// hlp1 shares R0 with res_ptr, so res is loaded only after the loop.
   379  	MOVD	p256ordK0<>(SB), hlp1
   380  
   381  	LDP	p256ord<>+0x00(SB), (const0, const1)
   382  	LDP	p256ord<>+0x10(SB), (const2, const3)
   383  
   384  	LDP	0*16(a_ptr), (x0, x1)
   385  	LDP	1*16(a_ptr), (x2, x3)
   386  
   387  ordSqrLoop:
   388  	SUB	$1, b_ptr
   389  
   390  	// x[1:] * x[0]
   391  	MUL	x0, x1, acc1
   392  	UMULH	x0, x1, acc2
   393  
   394  	MUL	x0, x2, t0
   395  	ADDS	t0, acc2, acc2
   396  	UMULH	x0, x2, acc3
   397  
   398  	MUL	x0, x3, t0
   399  	ADCS	t0, acc3, acc3
   400  	UMULH	x0, x3, acc4
   401  	ADC	$0, acc4, acc4
   402  	// x[2:] * x[1]
   403  	MUL	x1, x2, t0
   404  	ADDS	t0, acc3
   405  	UMULH	x1, x2, t1
   406  	ADCS	t1, acc4
   407  	ADC	$0, ZR, acc5
   408  
   409  	MUL	x1, x3, t0
   410  	ADDS	t0, acc4
   411  	UMULH	x1, x3, t1
   412  	ADC	t1, acc5
   413  	// x[3] * x[2]
   414  	MUL	x2, x3, t0
   415  	ADDS	t0, acc5
   416  	UMULH	x2, x3, acc6
   417  	ADC	$0, acc6
   418  
   419  	MOVD	$0, acc7
   420  	// *2
        	// Doubles the sum of the cross products x[i]*x[j], i < j; acc7 takes the carry out.
   421  	ADDS	acc1, acc1
   422  	ADCS	acc2, acc2
   423  	ADCS	acc3, acc3
   424  	ADCS	acc4, acc4
   425  	ADCS	acc5, acc5
   426  	ADCS	acc6, acc6
   427  	ADC	$0, acc7
   428  	// Missing products
        	// Adds the diagonal terms x[i]*x[i].
   429  	MUL	x0, x0, acc0
   430  	UMULH	x0, x0, t0
   431  	ADDS	t0, acc1, acc1
   432  
   433  	MUL	x1, x1, t0
   434  	ADCS	t0, acc2, acc2
   435  	UMULH	x1, x1, t1
   436  	ADCS	t1, acc3, acc3
   437  
   438  	MUL	x2, x2, t0
   439  	ADCS	t0, acc4, acc4
   440  	UMULH	x2, x2, t1
   441  	ADCS	t1, acc5, acc5
   442  
   443  	MUL	x3, x3, t0
   444  	ADCS	t0, acc6, acc6
   445  	UMULH	x3, x3, t1
   446  	ADC	t1, acc7, acc7
   447  	// First reduction step
        	// hlp0 = low 64 bits of acc0 * p256ordK0. The low words of hlp0 * ord are added
        	// first, the high words (t1, y0, acc0, hlp0) in a second pass. y0 is scratch here.
        	// Afterwards the value is (acc1, acc2, acc3, acc0), low limb first.
   448  	MUL	acc0, hlp1, hlp0
   449  
   450  	MUL	const0, hlp0, t0
   451  	ADDS	t0, acc0, acc0
   452  	UMULH	const0, hlp0, t1
   453  
   454  	MUL	const1, hlp0, t0
   455  	ADCS	t0, acc1, acc1
   456  	UMULH	const1, hlp0, y0
   457  
   458  	MUL	const2, hlp0, t0
   459  	ADCS	t0, acc2, acc2
   460  	UMULH	const2, hlp0, acc0
   461  
   462  	MUL	const3, hlp0, t0
   463  	ADCS	t0, acc3, acc3
   464  
   465  	UMULH	const3, hlp0, hlp0
   466  	ADC	$0, hlp0
   467  
   468  	ADDS	t1, acc1, acc1
   469  	ADCS	y0, acc2, acc2
   470  	ADCS	acc0, acc3, acc3
   471  	ADC	$0, hlp0, acc0
   472  	// Second reduction step
        	// Clears acc1; afterwards the value is (acc2, acc3, acc0, acc1).
   473  	MUL	acc1, hlp1, hlp0
   474  
   475  	MUL	const0, hlp0, t0
   476  	ADDS	t0, acc1, acc1
   477  	UMULH	const0, hlp0, t1
   478  
   479  	MUL	const1, hlp0, t0
   480  	ADCS	t0, acc2, acc2
   481  	UMULH	const1, hlp0, y0
   482  
   483  	MUL	const2, hlp0, t0
   484  	ADCS	t0, acc3, acc3
   485  	UMULH	const2, hlp0, acc1
   486  
   487  	MUL	const3, hlp0, t0
   488  	ADCS	t0, acc0, acc0
   489  
   490  	UMULH	const3, hlp0, hlp0
   491  	ADC	$0, hlp0
   492  
   493  	ADDS	t1, acc2, acc2
   494  	ADCS	y0, acc3, acc3
   495  	ADCS	acc1, acc0, acc0
   496  	ADC	$0, hlp0, acc1
   497  	// Third reduction step
        	// Clears acc2; afterwards the value is (acc3, acc0, acc1, acc2).
   498  	MUL	acc2, hlp1, hlp0
   499  
   500  	MUL	const0, hlp0, t0
   501  	ADDS	t0, acc2, acc2
   502  	UMULH	const0, hlp0, t1
   503  
   504  	MUL	const1, hlp0, t0
   505  	ADCS	t0, acc3, acc3
   506  	UMULH	const1, hlp0, y0
   507  
   508  	MUL	const2, hlp0, t0
   509  	ADCS	t0, acc0, acc0
   510  	UMULH	const2, hlp0, acc2
   511  
   512  	MUL	const3, hlp0, t0
   513  	ADCS	t0, acc1, acc1
   514  
   515  	UMULH	const3, hlp0, hlp0
   516  	ADC	$0, hlp0
   517  
   518  	ADDS	t1, acc3, acc3
   519  	ADCS	y0, acc0, acc0
   520  	ADCS	acc2, acc1, acc1
   521  	ADC	$0, hlp0, acc2
   522  
   523  	// Last reduction step
        	// Clears acc3; the limbs are back in natural order (acc0, acc1, acc2, acc3).
   524  	MUL	acc3, hlp1, hlp0
   525  
   526  	MUL	const0, hlp0, t0
   527  	ADDS	t0, acc3, acc3
   528  	UMULH	const0, hlp0, t1
   529  
   530  	MUL	const1, hlp0, t0
   531  	ADCS	t0, acc0, acc0
   532  	UMULH	const1, hlp0, y0
   533  
   534  	MUL	const2, hlp0, t0
   535  	ADCS	t0, acc1, acc1
   536  	UMULH	const2, hlp0, acc3
   537  
   538  	MUL	const3, hlp0, t0
   539  	ADCS	t0, acc2, acc2
   540  
   541  	UMULH	const3, hlp0, hlp0
        	// Unlike the earlier steps the carry goes into acc7 instead of hlp0; both end up
        	// in the same limb because acc7 is added to acc3 below.
   542  	ADC	$0, acc7
   543  
   544  	ADDS	t1, acc0, acc0
   545  	ADCS	y0, acc1, acc1
   546  	ADCS	acc3, acc2, acc2
   547  	ADC	$0, hlp0, acc3
   548  
        	// Add the high half of the square; acc4 receives the carry out as a fifth limb.
   549  	ADDS	acc4, acc0, acc0
   550  	ADCS	acc5, acc1, acc1
   551  	ADCS	acc6, acc2, acc2
   552  	ADCS	acc7, acc3, acc3
   553  	ADC	$0, ZR, acc4
   554  
        	// y = acc - p256ord over five limbs.
   555  	SUBS	const0, acc0, y0
   556  	SBCS	const1, acc1, y1
   557  	SBCS	const2, acc2, y2
   558  	SBCS	const3, acc3, y3
   559  	SBCS	$0, acc4, acc4
   560  
        	// Carry set means no borrow: keep the difference. The result lands directly in
        	// x0..x3, ready for the next round.
   561  	CSEL	CS, y0, acc0, x0
   562  	CSEL	CS, y1, acc1, x1
   563  	CSEL	CS, y2, acc2, x2
   564  	CSEL	CS, y3, acc3, x3
   565  
   566  	CBNZ	b_ptr, ordSqrLoop
   567  
        	// R0 held p256ordK0 (hlp1) until now; load the real destination pointer.
   568  	MOVD	res+0(FP), res_ptr
   569  	STP	(x0, x1), 0*16(res_ptr)
   570  	STP	(x2, x3), 1*16(res_ptr)
   571  
   572  	RET
   573  /* ---------------------------------------*/
   574  // func p256OrdMul(res, in1, in2 *p256OrdElement)
        // Multiplies in1 by in2 modulo p256ord with interleaved reduction and stores the
        // result in res. After each partial product y[i] * x one reduction step clears
        // the lowest live limb by adding hlp0 * p256ord (hlp0 = low limb * p256ordK0).
        // The reduced part rotates through acc0..acc3 while the product part grows in
        // acc4..acc7; the two are summed at the end, followed by one conditional
        // subtraction of p256ord.
   575  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   576  	MOVD	in1+8(FP), a_ptr
   577  	MOVD	in2+16(FP), b_ptr
   578  
        	// hlp1 shares R0 with res_ptr, so res is loaded only just before the store.
   579  	MOVD	p256ordK0<>(SB), hlp1
   580  	LDP	p256ord<>+0x00(SB), (const0, const1)
   581  	LDP	p256ord<>+0x10(SB), (const2, const3)
   582  
   583  	LDP	0*16(a_ptr), (x0, x1)
   584  	LDP	1*16(a_ptr), (x2, x3)
   585  	LDP	0*16(b_ptr), (y0, y1)
   586  	LDP	1*16(b_ptr), (y2, y3)
   587  
   588  	// y[0] * x
   589  	MUL	y0, x0, acc0
   590  	UMULH	y0, x0, acc1
   591  
   592  	MUL	y0, x1, t0
   593  	ADDS	t0, acc1
   594  	UMULH	y0, x1, acc2
   595  
   596  	MUL	y0, x2, t0
   597  	ADCS	t0, acc2
   598  	UMULH	y0, x2, acc3
   599  
   600  	MUL	y0, x3, t0
   601  	ADCS	t0, acc3
   602  	UMULH	y0, x3, acc4
   603  	ADC	$0, acc4
   604  	// First reduction step
        	// y0 is no longer needed as an operand and serves as scratch from here on.
        	// Afterwards the reduced part is (acc1, acc2, acc3, acc0), low limb first.
   605  	MUL	acc0, hlp1, hlp0
   606  
   607  	MUL	const0, hlp0, t0
   608  	ADDS	t0, acc0, acc0
   609  	UMULH	const0, hlp0, t1
   610  
   611  	MUL	const1, hlp0, t0
   612  	ADCS	t0, acc1, acc1
   613  	UMULH	const1, hlp0, y0
   614  
   615  	MUL	const2, hlp0, t0
   616  	ADCS	t0, acc2, acc2
   617  	UMULH	const2, hlp0, acc0
   618  
   619  	MUL	const3, hlp0, t0
   620  	ADCS	t0, acc3, acc3
   621  
   622  	UMULH	const3, hlp0, hlp0
        	// The carry of the low-word pass goes into acc4, the product limb of the same
        	// weight as the new acc0.
   623  	ADC	$0, acc4
   624  
   625  	ADDS	t1, acc1, acc1
   626  	ADCS	y0, acc2, acc2
   627  	ADCS	acc0, acc3, acc3
   628  	ADC	$0, hlp0, acc0
   629  	// y[1] * x
        	// Low words are added first; the high words (t1, hlp0, y0, y1) in a second pass.
   630  	MUL	y1, x0, t0
   631  	ADDS	t0, acc1
   632  	UMULH	y1, x0, t1
   633  
   634  	MUL	y1, x1, t0
   635  	ADCS	t0, acc2
   636  	UMULH	y1, x1, hlp0
   637  
   638  	MUL	y1, x2, t0
   639  	ADCS	t0, acc3
   640  	UMULH	y1, x2, y0
   641  
   642  	MUL	y1, x3, t0
   643  	ADCS	t0, acc4
        	// Last use of y1 as an operand; it is overwritten with the high word.
   644  	UMULH	y1, x3, y1
   645  	ADC	$0, ZR, acc5
   646  
   647  	ADDS	t1, acc2
   648  	ADCS	hlp0, acc3
   649  	ADCS	y0, acc4
   650  	ADC	y1, acc5
   651  	// Second reduction step
        	// Clears acc1; afterwards the reduced part is (acc2, acc3, acc0, acc1).
   652  	MUL	acc1, hlp1, hlp0
   653  
   654  	MUL	const0, hlp0, t0
   655  	ADDS	t0, acc1, acc1
   656  	UMULH	const0, hlp0, t1
   657  
   658  	MUL	const1, hlp0, t0
   659  	ADCS	t0, acc2, acc2
   660  	UMULH	const1, hlp0, y0
   661  
   662  	MUL	const2, hlp0, t0
   663  	ADCS	t0, acc3, acc3
   664  	UMULH	const2, hlp0, acc1
   665  
   666  	MUL	const3, hlp0, t0
   667  	ADCS	t0, acc0, acc0
   668  
   669  	UMULH	const3, hlp0, hlp0
   670  	ADC	$0, acc5
   671  
   672  	ADDS	t1, acc2, acc2
   673  	ADCS	y0, acc3, acc3
   674  	ADCS	acc1, acc0, acc0
   675  	ADC	$0, hlp0, acc1
   676  	// y[2] * x
        	// y0 and y1 are scratch for the high words here.
   677  	MUL	y2, x0, t0
   678  	ADDS	t0, acc2
   679  	UMULH	y2, x0, t1
   680  
   681  	MUL	y2, x1, t0
   682  	ADCS	t0, acc3
   683  	UMULH	y2, x1, hlp0
   684  
   685  	MUL	y2, x2, t0
   686  	ADCS	t0, acc4
   687  	UMULH	y2, x2, y0
   688  
   689  	MUL	y2, x3, t0
   690  	ADCS	t0, acc5
   691  	UMULH	y2, x3, y1
   692  	ADC	$0, ZR, acc6
   693  
   694  	ADDS	t1, acc3
   695  	ADCS	hlp0, acc4
   696  	ADCS	y0, acc5
   697  	ADC	y1, acc6
   698  	// Third reduction step
        	// Clears acc2; afterwards the reduced part is (acc3, acc0, acc1, acc2).
   699  	MUL	acc2, hlp1, hlp0
   700  
   701  	MUL	const0, hlp0, t0
   702  	ADDS	t0, acc2, acc2
   703  	UMULH	const0, hlp0, t1
   704  
   705  	MUL	const1, hlp0, t0
   706  	ADCS	t0, acc3, acc3
   707  	UMULH	const1, hlp0, y0
   708  
   709  	MUL	const2, hlp0, t0
   710  	ADCS	t0, acc0, acc0
   711  	UMULH	const2, hlp0, acc2
   712  
   713  	MUL	const3, hlp0, t0
   714  	ADCS	t0, acc1, acc1
   715  
   716  	UMULH	const3, hlp0, hlp0
   717  	ADC	$0, acc6
   718  
   719  	ADDS	t1, acc3, acc3
   720  	ADCS	y0, acc0, acc0
   721  	ADCS	acc2, acc1, acc1
   722  	ADC	$0, hlp0, acc2
   723  	// y[3] * x
   724  	MUL	y3, x0, t0
   725  	ADDS	t0, acc3
   726  	UMULH	y3, x0, t1
   727  
   728  	MUL	y3, x1, t0
   729  	ADCS	t0, acc4
   730  	UMULH	y3, x1, hlp0
   731  
   732  	MUL	y3, x2, t0
   733  	ADCS	t0, acc5
   734  	UMULH	y3, x2, y0
   735  
   736  	MUL	y3, x3, t0
   737  	ADCS	t0, acc6
   738  	UMULH	y3, x3, y1
   739  	ADC	$0, ZR, acc7
   740  
   741  	ADDS	t1, acc4
   742  	ADCS	hlp0, acc5
   743  	ADCS	y0, acc6
   744  	ADC	y1, acc7
   745  	// Last reduction step
        	// Clears acc3; the reduced part is back in natural order (acc0, acc1, acc2, acc3).
   746  	MUL	acc3, hlp1, hlp0
   747  
   748  	MUL	const0, hlp0, t0
   749  	ADDS	t0, acc3, acc3
   750  	UMULH	const0, hlp0, t1
   751  
   752  	MUL	const1, hlp0, t0
   753  	ADCS	t0, acc0, acc0
   754  	UMULH	const1, hlp0, y0
   755  
   756  	MUL	const2, hlp0, t0
   757  	ADCS	t0, acc1, acc1
   758  	UMULH	const2, hlp0, acc3
   759  
   760  	MUL	const3, hlp0, t0
   761  	ADCS	t0, acc2, acc2
   762  
   763  	UMULH	const3, hlp0, hlp0
   764  	ADC	$0, acc7
   765  
   766  	ADDS	t1, acc0, acc0
   767  	ADCS	y0, acc1, acc1
   768  	ADCS	acc3, acc2, acc2
   769  	ADC	$0, hlp0, acc3
   770  
        	// Sum the reduced part and the product part; acc4 receives the carry out.
   771  	ADDS	acc4, acc0, acc0
   772  	ADCS	acc5, acc1, acc1
   773  	ADCS	acc6, acc2, acc2
   774  	ADCS	acc7, acc3, acc3
   775  	ADC	$0, ZR, acc4
   776  
        	// t = acc - p256ord over five limbs. const2/const3 live in t2/t3 and are
        	// overwritten here, each only after it has been read.
   777  	SUBS	const0, acc0, t0
   778  	SBCS	const1, acc1, t1
   779  	SBCS	const2, acc2, t2
   780  	SBCS	const3, acc3, t3
   781  	SBCS	$0, acc4, acc4
   782  
        	// Carry set means no borrow: keep the difference.
   783  	CSEL	CS, t0, acc0, acc0
   784  	CSEL	CS, t1, acc1, acc1
   785  	CSEL	CS, t2, acc2, acc2
   786  	CSEL	CS, t3, acc3, acc3
   787  
        	// R0 held p256ordK0 (hlp1) until now; load the real destination pointer.
   788  	MOVD	res+0(FP), res_ptr
   789  	STP	(acc0, acc1), 0*16(res_ptr)
   790  	STP	(acc2, acc3), 1*16(res_ptr)
   791  
   792  	RET
   793  /* ---------------------------------------*/
   794  // (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)	
        // Register-based helper: the difference is taken modulo the value in
        // const0..const3, which the caller must have loaded.
        // Overwrites x0..x3, acc0..acc7, t0 and the flags; y0..y3 are only read.
   795  TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
        	// acc0..acc3 = y - x; t0 = 0 without borrow, all-ones with borrow.
   796  	SUBS	x0, y0, acc0
   797  	SBCS	x1, y1, acc1
   798  	SBCS	x2, y2, acc2
   799  	SBCS	x3, y3, acc3
   800  	SBC	$0, ZR, t0
   801  
        	// acc4..acc7 = (y - x) + modulus, the candidate used when the subtraction borrowed.
   802  	ADDS	const0, acc0, acc4
   803  	ADCS	const1, acc1, acc5
   804  	ADCS	const2, acc2, acc6
   805  	ADC	const3, acc3, acc7
   806  
        	// EQ (t0 & 1 == 0) means no borrow: take the plain difference.
   807  	ANDS	$1, t0
   808  	CSEL	EQ, acc0, acc4, x0
   809  	CSEL	EQ, acc1, acc5, x1
   810  	CSEL	EQ, acc2, acc6, x2
   811  	CSEL	EQ, acc3, acc7, x3
   812  
   813  	RET
   814  
   815  /* ---------------------------------------*/
   816  // (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
        // Register-based helper: 512-bit square in acc0..acc7, four reduction steps on the
        // low half, addition of the high half, one conditional subtraction of the modulus.
        // The reduction steps hard-code the shape of p256p (limb and its 32-bit shifts);
        // the caller must have loaded p256p into const0..const3.
        // Overwrites y0..y3, acc0..acc7, t0, t1 and the flags; x0..x3 are only read.
   817  TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
   818  	// x[1:] * x[0]
   819  	MUL	x0, x1, acc1
   820  	UMULH	x0, x1, acc2
   821  
   822  	MUL	x0, x2, t0
   823  	ADDS	t0, acc2, acc2
   824  	UMULH	x0, x2, acc3
   825  
   826  	MUL	x0, x3, t0
   827  	ADCS	t0, acc3, acc3
   828  	UMULH	x0, x3, acc4
   829  	ADC	$0, acc4, acc4
   830  	// x[2:] * x[1]
   831  	MUL	x1, x2, t0
   832  	ADDS	t0, acc3
   833  	UMULH	x1, x2, t1
   834  	ADCS	t1, acc4
   835  	ADC	$0, ZR, acc5
   836  
   837  	MUL	x1, x3, t0
   838  	ADDS	t0, acc4
   839  	UMULH	x1, x3, t1
   840  	ADC	t1, acc5
   841  	// x[3] * x[2]
   842  	MUL	x2, x3, t0
   843  	ADDS	t0, acc5
   844  	UMULH	x2, x3, acc6
   845  	ADC	$0, acc6
   846  
   847  	MOVD	$0, acc7
   848  	// *2
        	// Doubles the sum of the cross products x[i]*x[j], i < j; acc7 takes the carry out.
   849  	ADDS	acc1, acc1
   850  	ADCS	acc2, acc2
   851  	ADCS	acc3, acc3
   852  	ADCS	acc4, acc4
   853  	ADCS	acc5, acc5
   854  	ADCS	acc6, acc6
   855  	ADC	$0, acc7
   856  	// Missing products
        	// Adds the diagonal terms x[i]*x[i].
   857  	MUL	x0, x0, acc0
   858  	UMULH	x0, x0, t0
   859  	ADDS	t0, acc1, acc1
   860  
   861  	MUL	x1, x1, t0
   862  	ADCS	t0, acc2, acc2
   863  	UMULH	x1, x1, t1
   864  	ADCS	t1, acc3, acc3
   865  
   866  	MUL	x2, x2, t0
   867  	ADCS	t0, acc4, acc4
   868  	UMULH	x2, x2, t1
   869  	ADCS	t1, acc5, acc5
   870  
   871  	MUL	x3, x3, t0
   872  	ADCS	t0, acc6, acc6
   873  	UMULH	x3, x3, t1
   874  	ADCS	t1, acc7, acc7
   875  
   876  	// First reduction step
        	// Adds acc0 * modulus, which clears acc0, using y0 = acc0 << 32 and y1 = acc0 >> 32.
        	// Afterwards the low half is (acc1, acc2, acc3, acc0), low limb first.
   877  	LSL $32, acc0, y0
   878  	LSR	$32, acc0, y1
   879  
   880  	SUBS y0, acc1
   881  	SBCS y1, acc2
   882  	SBCS y0, acc3
   883  	SBC y1, acc0, y0
   884  
   885  	ADDS acc0, acc1, acc1
   886  	ADCS $0, acc2, acc2
   887  	ADCS $0, acc3, acc3
   888  	ADC $0, y0, acc0
   889  
   890  	// Second reduction step
        	// Clears acc1; afterwards the low half is (acc2, acc3, acc0, acc1).
   891  	LSL $32, acc1, y0
   892  	LSR	$32, acc1, y1
   893  
   894  	SUBS y0, acc2
   895  	SBCS y1, acc3
   896  	SBCS y0, acc0
   897  	SBC y1, acc1, y0
   898  
   899  	ADDS acc1, acc2, acc2
   900  	ADCS $0, acc3, acc3
   901  	ADCS $0, acc0, acc0
   902  	ADC $0, y0, acc1
   903  
   904  	// Third reduction step
        	// Clears acc2; afterwards the low half is (acc3, acc0, acc1, acc2).
   905  	LSL $32, acc2, y0
   906  	LSR	$32, acc2, y1
   907  
   908  	SUBS y0, acc3
   909  	SBCS y1, acc0
   910  	SBCS y0, acc1
   911  	SBC y1, acc2, y0
   912  
   913  	ADDS acc2, acc3, acc3
   914  	ADCS $0, acc0, acc0
   915  	ADCS $0, acc1, acc1
   916  	ADC $0, y0, acc2
   917  
   918  	// Last reduction step
        	// Clears acc3; the limbs are back in natural order (acc0, acc1, acc2, acc3).
   919  	LSL $32, acc3, y0
   920  	LSR	$32, acc3, y1
   921  
   922  	SUBS y0, acc0
   923  	SBCS y1, acc1
   924  	SBCS y0, acc2
   925  	SBC y1, acc3, y0
   926  
   927  	ADDS acc3, acc0, acc0
   928  	ADCS $0, acc1, acc1
   929  	ADCS $0, acc2, acc2
   930  	ADC $0, y0, acc3
   931  
   932  	// Add bits [511:256] of the sqr result
   933  	ADDS	acc4, acc0, acc0
   934  	ADCS	acc5, acc1, acc1
   935  	ADCS	acc6, acc2, acc2
   936  	ADCS	acc7, acc3, acc3
   937  	ADC	$0, ZR, acc4
   938  
        	// Trial subtraction of the modulus over five limbs. acc5/acc6 hold the upper
        	// two result limbs because t2/t3 are the registers of const2/const3.
   939  	SUBS	const0, acc0, t0
   940  	SBCS	const1, acc1, t1
   941  	SBCS	const2, acc2, acc5
   942  	SBCS	const3, acc3, acc6
   943  	SBCS	$0, acc4, acc4
   944  
        	// Carry set means no borrow: keep the difference.
   945  	CSEL	CS, t0, acc0, y0
   946  	CSEL	CS, t1, acc1, y1
   947  	CSEL	CS, acc5, acc2, y2
   948  	CSEL	CS, acc6, acc3, y3
   949  	RET
   950  /* ---------------------------------------*/
   951  // (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
        // Register-based helper: each partial product y[i] * x is followed by one
        // reduction step that clears the lowest live limb. The reduced part rotates
        // through acc0..acc3, the product part grows in acc4..acc7; both are summed at
        // the end, followed by one conditional subtraction of the modulus.
        // The reduction steps hard-code the shape of p256p (limb and its 32-bit shifts);
        // the caller must have loaded p256p into const0..const3.
        // Overwrites y0..y3, acc0..acc7, t0, t1, hlp0 and the flags; x0..x3 are only read.
   952  TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
   953  	// y[0] * x
   954  	MUL	y0, x0, acc0
   955  	UMULH	y0, x0, acc1
   956  
   957  	MUL	y0, x1, t0
   958  	ADDS	t0, acc1
   959  	UMULH	y0, x1, acc2
   960  
   961  	MUL	y0, x2, t0
   962  	ADCS	t0, acc2
   963  	UMULH	y0, x2, acc3
   964  
   965  	MUL	y0, x3, t0
   966  	ADCS	t0, acc3
   967  	UMULH	y0, x3, acc4
   968  	ADC	$0, acc4
   969  	// First reduction step
        	// Adds acc0 * modulus, which clears acc0, using t0 = acc0 << 32 and t1 = acc0 >> 32.
        	// Afterwards the reduced part is (acc1, acc2, acc3, acc0), low limb first.
   970  	LSL $32, acc0, t0
   971  	LSR	$32, acc0, t1
   972  
   973  	SUBS t0, acc1
   974  	SBCS t1, acc2
   975  	SBCS t0, acc3
   976  	SBC t1, acc0, t0
   977  
   978  	ADDS acc0, acc1, acc1
   979  	ADCS $0, acc2, acc2
   980  	ADCS $0, acc3, acc3
   981  	ADC $0, t0, acc0
   982  
   983  	// y[1] * x
        	// Low words are added first; the high words (t1, y0, acc6, hlp0) in a second pass.
        	// y0 is no longer needed as an operand and serves as scratch.
   984  	MUL	y1, x0, t0
   985  	ADDS	t0, acc1
   986  	UMULH	y1, x0, t1
   987  
   988  	MUL	y1, x1, t0
   989  	ADCS	t0, acc2
   990  	UMULH	y1, x1, y0
   991  
   992  	MUL	y1, x2, t0
   993  	ADCS	t0, acc3
   994  	UMULH	y1, x2, acc6
   995  
   996  	MUL	y1, x3, t0
   997  	ADCS	t0, acc4
   998  	UMULH	y1, x3, hlp0
   999  	ADC	$0, ZR, acc5
  1000  
  1001  	ADDS	t1, acc2
  1002  	ADCS	y0, acc3
  1003  	ADCS	acc6, acc4
  1004  	ADC	hlp0, acc5
  1005  	// Second reduction step
        	// Clears acc1; afterwards the reduced part is (acc2, acc3, acc0, acc1).
  1006  	LSL $32, acc1, t0
  1007  	LSR	$32, acc1, t1
  1008  
  1009  	SUBS t0, acc2
  1010  	SBCS t1, acc3
  1011  	SBCS t0, acc0
  1012  	SBC t1, acc1, t0
  1013  
  1014  	ADDS acc1, acc2, acc2
  1015  	ADCS $0, acc3, acc3
  1016  	ADCS $0, acc0, acc0
  1017  	ADC $0, t0, acc1
  1018  
  1019  	// y[2] * x
        	// y0 and y1 are scratch for the high words here.
  1020  	MUL	y2, x0, t0
  1021  	ADDS	t0, acc2
  1022  	UMULH	y2, x0, t1
  1023  
  1024  	MUL	y2, x1, t0
  1025  	ADCS	t0, acc3
  1026  	UMULH	y2, x1, y0
  1027  
  1028  	MUL	y2, x2, t0
  1029  	ADCS	t0, acc4
  1030  	UMULH	y2, x2, y1
  1031  
  1032  	MUL	y2, x3, t0
  1033  	ADCS	t0, acc5
  1034  	UMULH	y2, x3, hlp0
  1035  	ADC	$0, ZR, acc6
  1036  
  1037  	ADDS	t1, acc3
  1038  	ADCS	y0, acc4
  1039  	ADCS	y1, acc5
  1040  	ADC	hlp0, acc6
  1041  	// Third reduction step
        	// Clears acc2; afterwards the reduced part is (acc3, acc0, acc1, acc2).
  1042  	LSL $32, acc2, t0
  1043  	LSR	$32, acc2, t1
  1044  
  1045  	SUBS t0, acc3
  1046  	SBCS t1, acc0
  1047  	SBCS t0, acc1
  1048  	SBC t1, acc2, t0
  1049  
  1050  	ADDS acc2, acc3, acc3
  1051  	ADCS $0, acc0, acc0
  1052  	ADCS $0, acc1, acc1
  1053  	ADC $0, t0, acc2
  1054  
  1055  	// y[3] * x
  1056  	MUL	y3, x0, t0
  1057  	ADDS	t0, acc3
  1058  	UMULH	y3, x0, t1
  1059  
  1060  	MUL	y3, x1, t0
  1061  	ADCS	t0, acc4
  1062  	UMULH	y3, x1, y0
  1063  
  1064  	MUL	y3, x2, t0
  1065  	ADCS	t0, acc5
  1066  	UMULH	y3, x2, y1
  1067  
  1068  	MUL	y3, x3, t0
  1069  	ADCS	t0, acc6
  1070  	UMULH	y3, x3, hlp0
  1071  	ADC	$0, ZR, acc7
  1072  
  1073  	ADDS	t1, acc4
  1074  	ADCS	y0, acc5
  1075  	ADCS	y1, acc6
  1076  	ADC	hlp0, acc7
  1077  	// Last reduction step
        	// Clears acc3; the reduced part is back in natural order (acc0, acc1, acc2, acc3).
  1078  	LSL $32, acc3, t0
  1079  	LSR	$32, acc3, t1
  1080  
  1081  	SUBS t0, acc0
  1082  	SBCS t1, acc1
  1083  	SBCS t0, acc2
  1084  	SBC t1, acc3, t0
  1085  
  1086  	ADDS acc3, acc0, acc0
  1087  	ADCS $0, acc1, acc1
  1088  	ADCS $0, acc2, acc2
  1089  	ADC $0, t0, acc3
  1090  
  1091  	// Add bits [511:256] of the mul result
  1092  	ADDS	acc4, acc0, acc0
  1093  	ADCS	acc5, acc1, acc1
  1094  	ADCS	acc6, acc2, acc2
  1095  	ADCS	acc7, acc3, acc3
  1096  	ADC	$0, ZR, acc4
  1097  
        	// Trial subtraction of the modulus over five limbs. acc5/acc6 hold the upper
        	// two result limbs because t2/t3 are the registers of const2/const3.
  1098  	SUBS	const0, acc0, t0
  1099  	SBCS	const1, acc1, t1
  1100  	SBCS	const2, acc2, acc5
  1101  	SBCS	const3, acc3, acc6
  1102  	SBCS	$0, acc4, acc4
  1103  
        	// Carry set means no borrow: keep the difference.
  1104  	CSEL	CS, t0, acc0, y0
  1105  	CSEL	CS, t1, acc1, y1
  1106  	CSEL	CS, acc5, acc2, y2
  1107  	CSEL	CS, acc6, acc3, y3
  1108  	RET
  1109  /* ---------------------------------------*/
  1110  // (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
        // Inline macro: doubles y into x and subtracts the modulus in const0..const3 once
        // when the doubled value (including its carry-out bit in hlp0) is not smaller
        // than the modulus. CSEL CC keeps the plain sum when the trial subtraction borrowed.
        // Overwrites x0..x3, hlp0, t0, t1, acc5, acc6 and the flags; y0..y3 are only read.
        // acc5/acc6 are used instead of t2/t3, the registers of const2/const3.
  1111  #define p256MulBy2Inline       \
  1112  	ADDS	y0, y0, x0;    \
  1113  	ADCS	y1, y1, x1;    \
  1114  	ADCS	y2, y2, x2;    \
  1115  	ADCS	y3, y3, x3;    \
  1116  	ADC	$0, ZR, hlp0;  \
  1117  	SUBS	const0, x0, t0;   \
  1118  	SBCS	const1, x1, t1;\
  1119  	SBCS	const2, x2, acc5;    \
  1120  	SBCS	const3, x3, acc6;\
  1121  	SBCS	$0, hlp0, hlp0;\
  1122  	CSEL	CC, x0, t0, x0;\
  1123  	CSEL	CC, x1, t1, x1;\
  1124  	CSEL	CC, x2, acc5, x2;\
  1125  	CSEL	CC, x3, acc6, x3;
  1126  /* ---------------------------------------*/
        // Addressing helpers for the point routines.
        // Input point at a_ptr: x at offset 0, y at offset 32, z at offset 64.
        // Input point at b_ptr: x at offset 0, z at offset 64.
        // Output point at res_ptr: x, y, z at offsets 0, 32, 64.
  1127  #define x1in(off) (off)(a_ptr)
  1128  #define y1in(off) (off + 32)(a_ptr)
  1129  #define z1in(off) (off + 64)(a_ptr)
  1130  #define x2in(off) (off)(b_ptr)
  1131  #define z2in(off) (off + 64)(b_ptr)
  1132  #define x3out(off) (off)(res_ptr)
  1133  #define y3out(off) (off + 32)(res_ptr)
  1134  #define z3out(off) (off + 64)(res_ptr)
        // LDx/LDy/STx/STy take the name of one of the addressing macros and move the
        // 32-byte value behind it to or from x0..x3 / y0..y3 with two LDP/STP.
  1135  #define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1136  #define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1137  #define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1138  #define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1139  /* ---------------------------------------*/
        // Stack temporaries: consecutive 32-byte slots starting at 8(RSP).
        // Note that y2in is a stack slot, not an offset into the point at b_ptr.
  1140  #define y2in(off)  (32*0 + 8 + off)(RSP)
  1141  #define s2(off)    (32*1 + 8 + off)(RSP)
  1142  #define z1sqr(off) (32*2 + 8 + off)(RSP)
  1143  #define h(off)	   (32*3 + 8 + off)(RSP)
  1144  #define r(off)	   (32*4 + 8 + off)(RSP)
  1145  #define hsqr(off)  (32*5 + 8 + off)(RSP)
  1146  #define rsqr(off)  (32*6 + 8 + off)(RSP)
  1147  #define hcub(off)  (32*7 + 8 + off)(RSP)
  1148  
        // Slots 8..11 lie beyond a 264-byte frame (8 + 8*32), so they are presumably
        // meant for a routine that declares a larger frame — confirm against the users.
  1149  #define z2sqr(off) (32*8 + 8 + off)(RSP)
  1150  #define s1(off) (32*9 + 8 + off)(RSP)
  1151  #define u1(off) (32*10 + 8 + off)(RSP)
  1152  #define u2(off) (32*11 + 8 + off)(RSP)
  1153  
  1154  // func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
        //
        // Adds the affine point in2 to the point in1 and writes 96 bytes (x, y, z) to res.
        //   sign != 0: the y coordinate of in2 is first replaced by a value congruent to -y modulo p256p.
        //   zero == 0: the result is replaced by (in2.x, the possibly negated in2.y, p256one).
        //   sel  == 0: the result is replaced by in1 (applied last, so it overrides the zero case).
        // Both replacements are done with NEON masks and VBIT instead of branches.
        // Only the low 32 bits of sel and zero are tested (they are VDUP'ed into .S4 lanes).
        // NOTE(review): the helper conventions used in the annotations below
        // (Sqr: y = x^2, Mul: y = x*y, Sub: x = y - x) are taken from the existing
        // comments in this function; the helpers themselves are defined outside this section.
  1155  TEXT ·p256PointAddAffineAsm(SB),0,$264-48
  1156  	MOVD	in1+8(FP), a_ptr
  1157  	MOVD	in2+16(FP), b_ptr
  1158  	MOVD	sign+24(FP), hlp0
  1159  	MOVD	sel+32(FP), hlp1 // hlp1 aliases res_ptr (R0); res is reloaded before the final store
  1160  	MOVD	zero+40(FP), t1
  1161  
  1162  	VEOR V12.B16, V12.B16, V12.B16 // V12 = 0
  1163  	VDUP hlp1, V13.S4
  1164  	VCMEQ V12.S4, V13.S4, V13.S4 // V13 = all ones if sel == 0
  1165  	VDUP t1, V14.S4
  1166  	VCMEQ V12.S4, V14.S4, V14.S4	// V14 = all ones if zero == 0
  1167  
  1168  	LDP	p256p<>+0x00(SB), (const0, const1)
  1169  	LDP	p256p<>+0x10(SB), (const2, const3)
  1170  
  1171  	// Negate y2in based on sign
  1172  	LDP	2*16(b_ptr), (y0, y1)
  1173  	LDP	3*16(b_ptr), (y2, y3)
  1174  
  1175  	SUBS	y0, const0, acc0 // (acc3..acc0) = p - y2
  1176  	SBCS	y1, const1, acc1
  1177  	SBCS	y2, const2, acc2
  1178  	SBCS	y3, const3, acc3
  1179  	SBC	$0, ZR, t0 // t0 = 0 - borrow
  1180  
  1181  	ADDS	const0, acc0, acc4 // (acc7..acc4) = (p - y2) + p
  1182  	ADCS	const1, acc1, acc5
  1183  	ADCS	const2, acc2, acc6
  1184  	ADCS	const3, acc3, acc7
  1185  	ADC	$0, t0, t0 // t0 += carry
  1186  
  1187  	CMP	$0, t0 // t0 == 0: take (p - y2) + p, otherwise keep p - y2
  1188  	CSEL	EQ, acc4, acc0, acc0
  1189  	CSEL	EQ, acc5, acc1, acc1
  1190  	CSEL	EQ, acc6, acc2, acc2
  1191  	CSEL	EQ, acc7, acc3, acc3
  1192  	// If sign is 0, keep the original y2
  1193  	CMP	$0, hlp0
  1194  	CSEL	EQ, y0, acc0, y0
  1195  	CSEL	EQ, y1, acc1, y1
  1196  	CSEL	EQ, y2, acc2, y2
  1197  	CSEL	EQ, y3, acc3, y3
  1198  	// Store the (possibly negated) y2 in the y2in stack slot
  1199  	STy(y2in)
  1200  
  1201  	// Begin point add
  1202  	LDx(z1in)
  1203  	CALL	sm2P256SqrInternal<>(SB)    // z1^2
  1204  	STy(z1sqr)
  1205  
  1206  	LDx(x2in)
  1207  	CALL	sm2P256MulInternal<>(SB)    // u2 = x2 * z1^2
  1208  
  1209  	LDx(x1in)
  1210  	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
  1211  	STx(h)
  1212  
  1213  	MOVD	x0, y0
  1214  	MOVD	x1, y1
  1215  	MOVD	x2, y2
  1216  	MOVD	x3, y3
  1217  	LDx(z1in)
  1218  	CALL	sm2P256MulInternal<>(SB)    // z3 = h * z1
  1219  	VMOV y0, V4.D[0]            // save z3 in V4:V5
  1220  	VMOV y1, V4.D[1]
  1221  	VMOV y2, V5.D[0]
  1222  	VMOV y3, V5.D[1]
  1223  
  1224  	LDy(z1sqr)
  1225  	CALL	sm2P256MulInternal<>(SB)    // z1 ^ 3
  1226  
  1227  	LDx(y2in)
  1228  	CALL	sm2P256MulInternal<>(SB)    // s2 = y2 * z1^3
  1229  	STy(s2)
  1230  
  1231  	LDx(y1in)
  1232  	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
  1233  	STx(r)
  1234  
  1235  	CALL	sm2P256SqrInternal<>(SB)    // rsqr = r^2
  1236  	STy	(rsqr)
  1237  
  1238  	LDx(h)
  1239  	CALL	sm2P256SqrInternal<>(SB)    // hsqr = h^2
  1240  	STy(hsqr)
  1241  
  1242  	CALL	sm2P256MulInternal<>(SB)    // hcub = h^3
  1243  	STy(hcub)
  1244  
  1245  	LDx(y1in)
  1246  	CALL	sm2P256MulInternal<>(SB)    // y1 * h^3
  1247  	STy(s2) // the s2 slot is reused for y1 * h^3
  1248  
  1249  	LDP	hsqr(0*8), (x0, x1)
  1250  	LDP	hsqr(2*8), (x2, x3)
  1251  	LDP	0*16(a_ptr), (y0, y1) // y = first 32 bytes of in1 (x1)
  1252  	LDP	1*16(a_ptr), (y2, y3)
  1253  	CALL	sm2P256MulInternal<>(SB)    // u1 * h^2
  1254  	STP	(y0, y1), h(0*8) // the h slot is reused for u1 * h^2
  1255  	STP	(y2, y3), h(2*8)
  1256  
  1257  	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1258  
  1259  	LDy(rsqr)
  1260  	CALL	sm2P256Subinternal<>(SB)    // r^2 - u1 * h^2 * 2
  1261  
  1262  	MOVD	x0, y0
  1263  	MOVD	x1, y1
  1264  	MOVD	x2, y2
  1265  	MOVD	x3, y3
  1266  	LDx(hcub)
  1267  	CALL	sm2P256Subinternal<>(SB) // x3 = r^2 - u1 * h^2 * 2 - h^3
  1268  	VMOV x0, V0.D[0]      // save x3 in V0:V1
  1269  	VMOV x1, V0.D[1]
  1270  	VMOV x2, V1.D[0]
  1271  	VMOV x3, V1.D[1]
  1272  
  1273  	LDP	h(0*8), (y0, y1)
  1274  	LDP	h(2*8), (y2, y3)
  1275  	CALL	sm2P256Subinternal<>(SB) // u1 * h^2 - x3
  1276  
  1277  	LDP	r(0*8), (y0, y1)
  1278  	LDP	r(2*8), (y2, y3)
  1279  	CALL	sm2P256MulInternal<>(SB) // r * (u1 * h^2 - x3)
  1280  
  1281  	LDP	s2(0*8), (x0, x1)
  1282  	LDP	s2(2*8), (x2, x3)
  1283  	CALL	sm2P256Subinternal<>(SB) // y3 = r * (u1 * h^2 - x3) - y1 * h^3
  1284  	VMOV x0, V2.D[0]      // save y3 in V2:V3
  1285  	VMOV x1, V2.D[1]
  1286  	VMOV x2, V3.D[0]
  1287  	VMOV x3, V3.D[1]
  1288  
  1289  	// If zero is 0, sets res = in2
  1290  	VLD1 (b_ptr), [V6.B16, V7.B16] // in2.x
  1291  	ADD $8, RSP, hlp1 // address of the y2in stack slot
  1292  	VLD1 (hlp1), [V8.B16, V9.B16] // the possibly negated in2.y stored earlier
  1293  	MOVD $p256one<>(SB), hlp1
  1294  	VLD1 (hlp1), [V10.B16, V11.B16] // z = p256one
  1295  	VBIT V14.B16, V6.B16, V0.B16
  1296  	VBIT V14.B16, V7.B16, V1.B16
  1297  	VBIT V14.B16, V8.B16, V2.B16
  1298  	VBIT V14.B16, V9.B16, V3.B16
  1299  	VBIT V14.B16, V10.B16, V4.B16
  1300  	VBIT V14.B16, V11.B16, V5.B16
  1301  
  1302  	// If sel is 0, sets res = in1.
  1303  	VLD1.P (48)(a_ptr), [V6.B16, V7.B16, V8.B16] // first 48 bytes of in1; a_ptr is advanced by 48
  1304  	VLD1 (a_ptr), [V9.B16, V10.B16, V11.B16] // remaining 48 bytes of in1
  1305  	VBIT V13.B16, V6.B16, V0.B16
  1306  	VBIT V13.B16, V7.B16, V1.B16
  1307  	VBIT V13.B16, V8.B16, V2.B16
  1308  	VBIT V13.B16, V9.B16, V3.B16
  1309  	VBIT V13.B16, V10.B16, V4.B16
  1310  	VBIT V13.B16, V11.B16, V5.B16
  1311  
  1312  	MOVD	res+0(FP), t0 // res is loaded only now: R0 (res_ptr/hlp1) was used as scratch above
  1313  	VST1.P [V0.B16, V1.B16, V2.B16], (48)(t0)
  1314  	VST1 [V3.B16, V4.B16, V5.B16], (t0)
  1315  	RET
  1316  
  1317  // (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
        // Adds y into x limb by limb (carry-out in hlp0), then subtracts
        // (const3, const2, const1, const0) into (acc6, acc5, t1, t0).
        // If that subtraction, including the carry limb, borrows, the unsubtracted
        // sum is kept; otherwise the difference is selected.
        // The callers in this file load p256p into const0..const3 before expanding it.
        // Clobbers t0, t1, acc5, acc6, hlp0 and the condition flags; y is not modified.
  1318  #define p256AddInline          \
  1319  	ADDS	y0, x0, x0;    \
  1320  	ADCS	y1, x1, x1;    \
  1321  	ADCS	y2, x2, x2;    \
  1322  	ADCS	y3, x3, x3;    \
  1323  	ADC	$0, ZR, hlp0;  \ // hlp0 = carry out of the 256-bit addition
  1324  	SUBS	const0, x0, t0;   \ // (acc6, acc5, t1, t0) = x - const
  1325  	SBCS	const1, x1, t1;\
  1326  	SBCS	const2, x2, acc5;    \
  1327  	SBCS	const3, x3, acc6;\
  1328  	SBCS	$0, hlp0, hlp0;\ // propagate the borrow through the carry limb
  1329  	CSEL	CC, x0, t0, x0;\ // CC (borrow): keep x, else take x - const
  1330  	CSEL	CC, x1, t1, x1;\
  1331  	CSEL	CC, x2, acc5, x2;\
  1332  	CSEL	CC, x3, acc6, x3;
  1333  
  1334  #define s(off)	(32*0 + 8 + off)(RSP) // stack slots of the point-doubling routines ($136 frame)
  1335  #define m(off)	(32*1 + 8 + off)(RSP) // same frame offsets as y2in/s2/z1sqr/h, which the add routines use
  1336  #define zsqr(off) (32*2 + 8 + off)(RSP)
  1337  #define tmp(off)  (32*3 + 8 + off)(RSP)
  1338  
  1339  //func p256PointDoubleAsm(res, in *SM2P256Point)
        //
        // Doubles the point at in (x, y, z stored as three consecutive 32-byte values)
        // and writes the result to res. With M = 3*(x - z^2)*(x + z^2) it stores
        //   z3 = 2*y*z
        //   x3 = M^2 - 8*x*y^2
        //   y3 = M*(4*x*y^2 - x3) - 8*y^4
        // NOTE(review): the annotations assume Sqr: y = x^2, Mul: y = x*y, Sub: x = y - x
        // for the internal helpers, which are defined outside this section.
  1340  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
  1341  	MOVD	res+0(FP), res_ptr
  1342  	MOVD	in+8(FP), a_ptr
  1343  
  1344  	LDP	p256p<>+0x00(SB), (const0, const1)
  1345  	LDP	p256p<>+0x10(SB), (const2, const3)
  1346  
  1347  	// Begin point double
  1348  	LDP	4*16(a_ptr), (x0, x1)        // load z
  1349  	LDP	5*16(a_ptr), (x2, x3)
  1350  	CALL	sm2P256SqrInternal<>(SB)
  1351  	STP	(y0, y1), zsqr(0*8)          // store z^2
  1352  	STP	(y2, y3), zsqr(2*8)
  1353  
  1354  	LDP	0*16(a_ptr), (x0, x1)        // load x
  1355  	LDP	1*16(a_ptr), (x2, x3)
  1356  	p256AddInline // x + z^2
  1357  	STx(m)
  1358  
  1359  	LDx(z1in)
  1360  	LDy(y1in)
  1361  	CALL	sm2P256MulInternal<>(SB) // y * z
  1362  	p256MulBy2Inline // z3 = 2 * y * z
  1363  	STx(z3out)
  1364  
  1365  	LDy(x1in)
  1366  	LDx(zsqr)
  1367  	CALL	sm2P256Subinternal<>(SB) // x - z^2
  1368  	LDy(m)
  1369  	CALL	sm2P256MulInternal<>(SB) // (x - z^2) * (x + z^2)
  1370  
  1371  	// Multiply by 3
  1372  	p256MulBy2Inline
  1373  	p256AddInline
  1374  	STx(m) // m = 3 * (x - z^2) * (x + z^2)
  1375  
  1376  	LDy(y1in)
  1377  	p256MulBy2Inline // 2 * y
  1378  	CALL	sm2P256SqrInternal<>(SB) // 4 * y^2
  1379  	STy(s)
  1380  	MOVD	y0, x0
  1381  	MOVD	y1, x1
  1382  	MOVD	y2, x2
  1383  	MOVD	y3, x3
  1384  	CALL	sm2P256SqrInternal<>(SB) // 16 * y^4
  1385  
  1386  	// Divide by 2
  1387  	ADDS	const0, y0, t0 // (acc6, acc5, t1, t0) = value + p, carry kept in hlp0
  1388  	ADCS	const1, y1, t1
  1389  	ADCS	const2, y2, acc5
  1390  	ADCS	const3, y3, acc6
  1391  	ADC	$0, ZR, hlp0
  1392  
  1393  	ANDS	$1, y0, ZR // EQ if the value is even
  1394  	CSEL	EQ, y0, t0, t0 // even: shift the value itself; odd: shift value + p
  1395  	CSEL	EQ, y1, t1, t1
  1396  	CSEL	EQ, y2, acc5, acc5
  1397  	CSEL	EQ, y3, acc6, acc6
  1398  	AND	y0, hlp0, hlp0 // keep the carry only when the value was odd
  1399  
  1400  	EXTR	$1, t0, t1, y0 // shift the 257-bit quantity right by one bit
  1401  	EXTR	$1, t1, acc5, y1
  1402  	EXTR	$1, acc5, acc6, y2
  1403  	EXTR	$1, acc6, hlp0, y3
  1404  	STy(y3out) // y3out temporarily holds 8 * y^4
  1405  
  1406  	LDx(x1in)
  1407  	LDy(s)
  1408  	CALL	sm2P256MulInternal<>(SB) // 4 * x * y^2
  1409  	STy(s)
  1410  	p256MulBy2Inline // 8 * x * y^2
  1411  	STx(tmp)
  1412  
  1413  	LDx(m)
  1414  	CALL	sm2P256SqrInternal<>(SB) // m^2
  1415  	LDx(tmp)
  1416  	CALL	sm2P256Subinternal<>(SB) // x3 = m^2 - 8 * x * y^2
  1417  
  1418  	STx(x3out)
  1419  
  1420  	LDy(s)
  1421  	CALL	sm2P256Subinternal<>(SB) // 4 * x * y^2 - x3
  1422  
  1423  	LDy(m)
  1424  	CALL	sm2P256MulInternal<>(SB) // m * (4 * x * y^2 - x3)
  1425  
  1426  	LDx(y3out)
  1427  	CALL	sm2P256Subinternal<>(SB) // y3 = m * (4 * x * y^2 - x3) - 8 * y^4
  1428  	STx(y3out)
  1429  	RET
  1430  
  1431  #define p256PointDoubleRound() \ // one doubling round, in place on the point at res_ptr; uses the s/m/zsqr/tmp stack slots
  1432  	LDx(z3out)                       \ // load z
  1433  	CALL	sm2P256SqrInternal<>(SB) \
  1434  	STP	(y0, y1), zsqr(0*8)          \ // store z^2
  1435  	STP	(y2, y3), zsqr(2*8)          \
  1436  	\
  1437  	LDx(x3out)                       \// load x
  1438  	p256AddInline                    \ // x + z^2
  1439  	STx(m)                           \
  1440  	\
  1441  	LDx(z3out)                       \ // load z
  1442  	LDy(y3out)                       \ // load y
  1443  	CALL	sm2P256MulInternal<>(SB) \
  1444  	p256MulBy2Inline                 \ // 2 * y * z
  1445  	STx(z3out)                       \ // store result z
  1446  	\
  1447  	LDy(x3out)                       \ // load x
  1448  	LDx(zsqr)                        \
  1449  	CALL	sm2P256Subinternal<>(SB) \ // x - z^2
  1450  	LDy(m)                           \
  1451  	CALL	sm2P256MulInternal<>(SB) \ // (x - z^2) * (x + z^2)
  1452  	\
  1453  	\// Multiply by 3
  1454  	p256MulBy2Inline                 \
  1455  	p256AddInline                    \
  1456  	STx(m)                           \ // m = 3 * (x - z^2) * (x + z^2)
  1457  	\
  1458  	LDy(y3out)                       \  // load y
  1459  	p256MulBy2Inline                 \
  1460  	CALL	sm2P256SqrInternal<>(SB) \ // 4 * y^2
  1461  	STy(s)                           \
  1462  	MOVD	y0, x0                   \
  1463  	MOVD	y1, x1                   \
  1464  	MOVD	y2, x2                   \
  1465  	MOVD	y3, x3                   \
  1466  	CALL	sm2P256SqrInternal<>(SB) \ // 16 * y^4
  1467  	\
  1468  	\// Divide by 2
  1469  	ADDS	const0, y0, t0           \ // value + p, carry kept in hlp0
  1470  	ADCS	const1, y1, t1           \
  1471  	ADCS	const2, y2, acc5         \
  1472  	ADCS	const3, y3, acc6         \
  1473  	ADC	$0, ZR, hlp0                 \
  1474  	\
  1475  	ANDS	$1, y0, ZR               \ // EQ if the value is even
  1476  	CSEL	EQ, y0, t0, t0           \ // even: shift the value itself; odd: shift value + p
  1477  	CSEL	EQ, y1, t1, t1           \
  1478  	CSEL	EQ, y2, acc5, acc5       \
  1479  	CSEL	EQ, y3, acc6, acc6       \
  1480  	AND	y0, hlp0, hlp0               \ // keep the carry only when the value was odd
  1481  	\
  1482  	EXTR	$1, t0, t1, y0           \ // shift right by one bit across the limbs
  1483  	EXTR	$1, t1, acc5, y1         \
  1484  	EXTR	$1, acc5, acc6, y2       \
  1485  	EXTR	$1, acc6, hlp0, y3       \
  1486  	STy(y3out)                       \ // y3out temporarily holds 8 * y^4
  1487  	\
  1488  	LDx(x3out)                       \  // load x
  1489  	LDy(s)                           \
  1490  	CALL	sm2P256MulInternal<>(SB) \ // 4 * x * y^2
  1491  	STy(s)                           \
  1492  	p256MulBy2Inline                 \ // 8 * x * y^2
  1493  	STx(tmp)                         \
  1494  	\
  1495  	LDx(m)                           \
  1496  	CALL	sm2P256SqrInternal<>(SB) \ // m^2
  1497  	LDx(tmp)                         \
  1498  	CALL	sm2P256Subinternal<>(SB) \ // x3 = m^2 - 8 * x * y^2
  1499  	\
  1500  	STx(x3out)                       \
  1501  	\
  1502  	LDy(s)                           \
  1503  	CALL	sm2P256Subinternal<>(SB) \ // 4 * x * y^2 - x3
  1504  	\
  1505  	LDy(m)                           \
  1506  	CALL	sm2P256MulInternal<>(SB) \ // m * (4 * x * y^2 - x3)
  1507  	\
  1508  	LDx(y3out)                       \
  1509  	CALL	sm2P256Subinternal<>(SB) \ // y3 = m * (4 * x * y^2 - x3) - 8 * y^4
  1510  	STx(y3out)                       \
  1511  
  1512  //func p256PointDouble6TimesAsm(res, in *SM2P256Point)
        //
        // Doubles the point at in six times in a row and leaves the result in res.
        // Round 1 reads from in (a_ptr) and writes to res (res_ptr); rounds 2-6 expand
        // p256PointDoubleRound, which reads and writes res in place.
        // NOTE(review): the annotations assume Sqr: y = x^2, Mul: y = x*y, Sub: x = y - x
        // for the internal helpers, which are defined outside this section.
  1513  TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$136-16
  1514  	MOVD	res+0(FP), res_ptr
  1515  	MOVD	in+8(FP), a_ptr
  1516  
  1517  	LDP	p256p<>+0x00(SB), (const0, const1)
  1518  	LDP	p256p<>+0x10(SB), (const2, const3)
  1519  
  1520  	// Begin point double round 1
  1521  	LDP	4*16(a_ptr), (x0, x1)        // load z
  1522  	LDP	5*16(a_ptr), (x2, x3)
  1523  	CALL	sm2P256SqrInternal<>(SB)
  1524  	STP	(y0, y1), zsqr(0*8)          // store z^2
  1525  	STP	(y2, y3), zsqr(2*8)
  1526  
  1527  	LDP	0*16(a_ptr), (x0, x1)        // load x
  1528  	LDP	1*16(a_ptr), (x2, x3)
  1529  	p256AddInline // x + z^2
  1530  	STx(m)
  1531  
  1532  	LDx(z1in)                        // load z
  1533  	LDy(y1in)                        // load y
  1534  	CALL	sm2P256MulInternal<>(SB)
  1535  	p256MulBy2Inline // 2 * y * z
  1536  	STx(z3out)                        // store result z
  1537  
  1538  	LDy(x1in)                        // load x
  1539  	LDx(zsqr)
  1540  	CALL	sm2P256Subinternal<>(SB) // x - z^2
  1541  	LDy(m)
  1542  	CALL	sm2P256MulInternal<>(SB) // (x - z^2) * (x + z^2)
  1543  
  1544  	// Multiply by 3
  1545  	p256MulBy2Inline
  1546  	p256AddInline
  1547  	STx(m) // m = 3 * (x - z^2) * (x + z^2)
  1548  
  1549  	LDy(y1in)                         // load y
  1550  	p256MulBy2Inline
  1551  	CALL	sm2P256SqrInternal<>(SB) // 4 * y^2
  1552  	STy(s)
  1553  	MOVD	y0, x0
  1554  	MOVD	y1, x1
  1555  	MOVD	y2, x2
  1556  	MOVD	y3, x3
  1557  	CALL	sm2P256SqrInternal<>(SB) // 16 * y^4
  1558  
  1559  	// Divide by 2
  1560  	ADDS	const0, y0, t0 // value + p, carry kept in hlp0
  1561  	ADCS	const1, y1, t1
  1562  	ADCS	const2, y2, acc5
  1563  	ADCS	const3, y3, acc6
  1564  	ADC	$0, ZR, hlp0
  1565  
  1566  	ANDS	$1, y0, ZR // EQ if the value is even
  1567  	CSEL	EQ, y0, t0, t0 // even: shift the value itself; odd: shift value + p
  1568  	CSEL	EQ, y1, t1, t1
  1569  	CSEL	EQ, y2, acc5, acc5
  1570  	CSEL	EQ, y3, acc6, acc6
  1571  	AND	y0, hlp0, hlp0 // keep the carry only when the value was odd
  1572  
  1573  	EXTR	$1, t0, t1, y0 // shift right by one bit across the limbs
  1574  	EXTR	$1, t1, acc5, y1
  1575  	EXTR	$1, acc5, acc6, y2
  1576  	EXTR	$1, acc6, hlp0, y3
  1577  	STy(y3out)                       // y3out temporarily holds 8 * y^4
  1578  
  1579  	LDx(x1in)                         // load x
  1580  	LDy(s)
  1581  	CALL	sm2P256MulInternal<>(SB) // 4 * x * y^2
  1582  	STy(s)
  1583  	p256MulBy2Inline // 8 * x * y^2
  1584  	STx(tmp)
  1585  
  1586  	LDx(m)
  1587  	CALL	sm2P256SqrInternal<>(SB) // m^2
  1588  	LDx(tmp)
  1589  	CALL	sm2P256Subinternal<>(SB) // x3 = m^2 - 8 * x * y^2
  1590  
  1591  	STx(x3out)
  1592  
  1593  	LDy(s)
  1594  	CALL	sm2P256Subinternal<>(SB) // 4 * x * y^2 - x3
  1595  
  1596  	LDy(m)
  1597  	CALL	sm2P256MulInternal<>(SB) // m * (4 * x * y^2 - x3)
  1598  
  1599  	LDx(y3out)
  1600  	CALL	sm2P256Subinternal<>(SB) // y3 = m * (4 * x * y^2 - x3) - 8 * y^4
  1601  	STx(y3out)
  1602  
  1603  	// Begin point double rounds 2 - 6
  1604  	p256PointDoubleRound()
  1605  	p256PointDoubleRound()
  1606  	p256PointDoubleRound()
  1607  	p256PointDoubleRound()
  1608  	p256PointDoubleRound()
  1609  	
  1610  	RET
  1611  
  1612  /* ---------------------------------------*/
  1613  #undef y2in // rebind these four macros to b_ptr for p256PointAddAsm below
  1614  #undef x3out
  1615  #undef y3out
  1616  #undef z3out
  1617  #define y2in(off) (off + 32)(b_ptr) // bytes 32..63 of the point at b_ptr, read directly instead of from a stack slot
  1618  #define x3out(off) (off)(b_ptr) // p256PointAddAsm loads res into b_ptr before using the *out macros
  1619  #define y3out(off) (off + 32)(b_ptr)
  1620  #define z3out(off) (off + 64)(b_ptr)
  1621  // func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
        //
        // Adds the points at in1 and in2 and writes (x3, y3, z3) to res.
        // Returns 1 when both h = u2 - u1 and r = s2 - s1 are all-zero or equal to
        // p256p, and 0 otherwise; presumably the caller uses this to detect
        // in1 == in2 and fall back to doubling (confirm against the Go caller).
        // NOTE(review): the annotations assume Sqr: y = x^2, Mul: y = x*y, Sub: x = y - x
        // for the internal helpers, which are defined outside this section.
  1622  TEXT ·p256PointAddAsm(SB),0,$392-32
  1623  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1624  	// Load the input pointers; all intermediates are kept in the stack frame
  1625  	MOVD	in1+8(FP), a_ptr
  1626  	MOVD	in2+16(FP), b_ptr
  1627  
  1628  	LDP	p256p<>+0x00(SB), (const0, const1)
  1629  	LDP	p256p<>+0x10(SB), (const2, const3)
  1630  
  1631  	// Begin point add
  1632  	LDx(z2in)
  1633  	CALL	sm2P256SqrInternal<>(SB)    // z2^2
  1634  	STy(z2sqr)
  1635  
  1636  	CALL	sm2P256MulInternal<>(SB)    // z2^3
  1637  
  1638  	LDx(y1in)
  1639  	CALL	sm2P256MulInternal<>(SB)    // s1 = z2^3*y1
  1640  	STy(s1)
  1641  
  1642  	LDx(z1in)
  1643  	CALL	sm2P256SqrInternal<>(SB)    // z1^2
  1644  	STy(z1sqr)
  1645  
  1646  	CALL	sm2P256MulInternal<>(SB)    // z1^3
  1647  
  1648  	LDx(y2in)
  1649  	CALL	sm2P256MulInternal<>(SB)    // s2 = z1^3*y2
  1650  
  1651  	LDx(s1)
  1652  	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
  1653  	STx(r)
  1654  
  1655  	MOVD	$1, acc1
  1656  	ORR	x0, x1, acc2             // Check if zero mod p256
  1657  	ORR	x2, x3, acc3
  1658  	ORR	acc3, acc2, acc2
  1659  	CMP	$0, acc2
  1660  	CSEL	EQ, acc1, ZR, hlp1 // hlp1 = 1 if r is all-zero
  1661  
  1662  	EOR	const0, x0, acc2 // compare r with the modulus limb by limb
  1663  	EOR	const1, x1, acc3
  1664  	EOR	const2, x2, acc4
  1665  	EOR	const3, x3, acc5
  1666  
  1667  	ORR	acc2, acc3, acc2
  1668  	ORR	acc4, acc5, acc3
  1669  	ORR	acc3, acc2, acc2
  1670  	CMP	$0, acc2
  1671  	CSEL	EQ, acc1, hlp1, hlp1 // hlp1 = 1 also when r equals the modulus
  1672  
  1673  	LDx(z2sqr)
  1674  	LDy(x1in)
  1675  	CALL	sm2P256MulInternal<>(SB)    // u1 = x1 * z2^2
  1676  	STy(u1)
  1677  
  1678  	LDx(z1sqr)
  1679  	LDy(x2in)
  1680  	CALL	sm2P256MulInternal<>(SB)    // u2 = x2 * z1^2
  1681  	STy(u2)
  1682  
  1683  	LDx(u1)
  1684  	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
  1685  	STx(h)
  1686  
  1687  	MOVD	$1, acc1
  1688  	ORR	x0, x1, acc2             // Check if zero mod p256
  1689  	ORR	x2, x3, acc3
  1690  	ORR	acc3, acc2, acc2
  1691  	CMP	$0, acc2
  1692  	CSEL	EQ, acc1, ZR, hlp0 // hlp0 = 1 if h is all-zero
  1693  
  1694  	EOR	const0, x0, acc2 // compare h with the modulus limb by limb
  1695  	EOR	const1, x1, acc3
  1696  	EOR	const2, x2, acc4
  1697  	EOR	const3, x3, acc5
  1698  
  1699  	ORR	acc2, acc3, acc2
  1700  	ORR	acc4, acc5, acc3
  1701  	ORR	acc3, acc2, acc2
  1702  	CMP	$0, acc2
  1703  	CSEL	EQ, acc1, hlp0, hlp0 // hlp0 = 1 also when h equals the modulus
  1704  
  1705  	AND	hlp0, hlp1, hlp1 // hlp1 = 1 only if both r and h passed the check; this is the return value
  1706  
  1707  	LDx(r)
  1708  	CALL	sm2P256SqrInternal<>(SB)    // rsqr = r^2
  1709  	STy(rsqr)
  1710  
  1711  	LDx(h)
  1712  	CALL	sm2P256SqrInternal<>(SB)    // hsqr = h^2
  1713  	STy(hsqr)
  1714  
  1715  	LDx(h)
  1716  	CALL	sm2P256MulInternal<>(SB)    // hcub = h^3
  1717  	STy(hcub)
  1718  
  1719  	LDx(s1)
  1720  	CALL	sm2P256MulInternal<>(SB) // s1 * h^3
  1721  	STy(s2) // the s2 slot holds s1 * h^3 from here on
  1722  
  1723  	LDx(z1in)
  1724  	LDy(z2in)
  1725  	CALL	sm2P256MulInternal<>(SB)    // z1 * z2
  1726  	LDx(h)
  1727  	CALL	sm2P256MulInternal<>(SB)    // z1 * z2 * h
  1728  	MOVD	res+0(FP), b_ptr // in2 is not read after this point; b_ptr now addresses res
  1729  	STy(z3out)
  1730  
  1731  	LDx(hsqr)
  1732  	LDy(u1)
  1733  	CALL	sm2P256MulInternal<>(SB)    // h^2 * u1
  1734  	STy(u2) // the u2 slot is reused for u1 * h^2
  1735  
  1736  	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1737  	LDy(rsqr)
  1738  	CALL	sm2P256Subinternal<>(SB)    // r^2 - u1 * h^2 * 2
  1739  
  1740  	MOVD	x0, y0
  1741  	MOVD	x1, y1
  1742  	MOVD	x2, y2
  1743  	MOVD	x3, y3
  1744  	LDx(hcub)
  1745  	CALL	sm2P256Subinternal<>(SB) // x3 = r^2 - u1 * h^2 * 2 - h^3
  1746  	STx(x3out)
  1747  
  1748  	LDy(u2)
  1749  	CALL	sm2P256Subinternal<>(SB) // u1 * h^2 - x3
  1750  
  1751  	LDy(r)
  1752  	CALL	sm2P256MulInternal<>(SB) // r * (u1 * h^2 - x3)
  1753  
  1754  	LDx(s2)
  1755  	CALL	sm2P256Subinternal<>(SB) // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  1756  	STx(y3out)
  1757  
  1758  	MOVD	hlp1, R0 // hlp1 is already R0 (see the #define at the top of the file)
  1759  	MOVD	R0, ret+24(FP)
  1760  
  1761  	RET