github.com/cloudflare/circl@v1.5.0/ecc/p384/arith_arm64.s (about)

     1  // +build arm64,!purego
     2  
     3  #include "textflag.h"
     4  
     5  TEXT ·fp384Cmov(SB), NOSPLIT, $0
            // fp384Cmov(x, y, b): branch-free conditional copy of six 64-bit words.
            //   x at +0(FP)  - pointer to the destination words (read and written)
            //   y at +8(FP)  - pointer to the source words (read only)
            //   b at +16(FP) - selector
            // When b != 0 every word of x is replaced by the matching word of y;
            // when b == 0 each word of x is stored back unchanged. The same loads,
            // selects and stores are executed in both cases.
            // NOTE(review): b is fetched with a 32-bit MOVW, so only its low 32 bits
            // take part in the test - confirm this matches the Go declaration.
     6      MOVD x+0(FP), R0
     7      MOVD y+8(FP), R1
     8      MOVW b+16(FP), R2
     9      CMP $0, R2 // sets NE when b != 0; the LDP/CSEL/STP below do not modify the flags
    10      LDP   0(R0), (R3, R5) // words 0-1 of x
    11      LDP   0(R1), (R4, R6) // words 0-1 of y
    12      CSEL NE,R4,R3,R7 // R7 = (b != 0) ? y word : x word
    13      CSEL NE,R6,R5,R8
    14      STP  (R7, R8),  0(R0)
    15      LDP  16(R0), (R3, R5) // words 2-3
    16      LDP  16(R1), (R4, R6)
    17      CSEL NE,R4,R3,R7
    18      CSEL NE,R6,R5,R8
    19      STP  (R7, R8), 16(R0)
    20      LDP  32(R0), (R3, R5) // words 4-5
    21      LDP  32(R1), (R4, R6)
    22      CSEL NE,R4,R3,R7
    23      CSEL NE,R6,R5,R8
    24      STP  (R7, R8), 32(R0)
    25      RET
    26  
    27  // fp384Neg computes c = -a mod p.
        //   c at +0(FP) - pointer to the six 64-bit result words
        //   a at +8(FP) - pointer to the six 64-bit input words
        // Words are processed from offset 0 upward with the borrow propagating in
        // the same direction, i.e. the least significant word is stored first.
        // The code is straight-line: the final choice is made with CSEL, not a branch.
    28  TEXT ·fp384Neg(SB), NOSPLIT, $0-16
    29  	MOVD	c+0(FP), R0
    30  	MOVD	a+8(FP), R1
    31  
    32  	// Load p in R2-R7, a in R8-R13
    33  	// Compute p-a in R8-R13
    34  	LDP	·p+0(SB), (R2, R3)
    35  	LDP	0(R1), (R8, R9)
    36  	SUBS	R8, R2, R8	// R8 = p[0] - a[0], starts the borrow chain
    37  	SBCS	R9, R3, R9
    38  	LDP	·p+16(SB), (R4, R5)
    39  	LDP	16(R1), (R10, R11)
    40  	SBCS	R10, R4, R10
    41  	SBCS	R11, R5, R11
    42  	LDP	·p+32(SB), (R6, R7)
    43  	LDP	32(R1), (R12, R13)
    44  	SBCS	R12, R6, R12
    45  	SBC	R13, R7, R13	// top word: the borrow out is not recorded (presumably a <= p - TODO confirm input range)
    46  
    47  	// Compute (p-a)-p in R2-R7
    48  	SUBS	R2,  R8, R2
    49  	SBCS	R3,  R9, R3
    50  	SBCS	R4, R10, R4
    51  	SBCS	R5, R11, R5
    52  	SBCS	R6, R12, R6
    53  	SBCS	R7, R13, R7	// carry clear (CC) here means the subtraction borrowed
    54  
    55  	// If (p-a)-p < 0 (nearly always), return p-a
    56  	// Only return (p-a)-p for a = 0
    57  	// Store result in c
    58  	CSEL	CC, R8, R2, R2	// borrow -> keep p-a, otherwise (p-a)-p
    59  	CSEL	CC, R9, R3, R3
    60  	STP	(R2, R3), 0(R0)
    61  	CSEL	CC, R10, R4, R4
    62  	CSEL	CC, R11, R5, R5
    63  	STP	(R4, R5), 16(R0)
    64  	CSEL	CC, R12, R6, R6
    65  	CSEL	CC, R13, R7, R7
    66  	STP	(R6, R7), 32(R0)
    67  
    68  	RET
    69  
    70  // fp384Add computes c = a+b mod p.
        //   c at +0(FP)  - pointer to the six 64-bit result words
        //   a at +8(FP)  - pointer to the first operand
        //   b at +16(FP) - pointer to the second operand
        // Both operands are fully loaded before the first store to c. The sum is
        // formed with its carry-out, p is subtracted once, and CSEL picks the
        // reduced or unreduced value without branching.
        // NOTE(review): a single subtraction of p only suffices if a+b < 2p -
        // confirm callers pass reduced operands.
    71  TEXT ·fp384Add(SB), NOSPLIT, $0-24
    72  	MOVD	c+0(FP), R0
    73  	MOVD	a+8(FP), R1
    74  	MOVD	b+16(FP), R2
    75  
    76  	// Load a in R3-R8, b in R9-R14
    77  	// Compute a+b in R3-R9
    78  	LDP	0(R1), (R3, R4)
    79  	LDP	0(R2), (R9, R10)
    80  	ADDS	R9, R3
    81  	ADCS	R10, R4
    82  	LDP	16(R1), (R5, R6)
    83  	LDP	16(R2), (R11, R12)
    84  	ADCS	R11, R5
    85  	ADCS	R12, R6
    86  	LDP	32(R1), (R7, R8)
    87  	LDP	32(R2), (R13, R14)
    88  	ADCS	R13, R7
    89  	ADCS	R14, R8
    90  	ADC	ZR, ZR, R9	// R9 = carry out of the 384-bit addition (0 or 1)
    91  
    92  	// Load p in R10-R15
    93  	LDP	·p+ 0(SB), (R10, R11)
    94  	LDP	·p+16(SB), (R12, R13)
    95  	LDP	·p+32(SB), (R14, R15)
    96  
    97  	// Compute a+b-p in R10-R16
    98  	SUBS	R10, R3, R10
    99  	SBCS	R11, R4, R11
   100  	SBCS	R12, R5, R12
   101  	SBCS	R13, R6, R13
   102  	SBCS	R14, R7, R14
   103  	SBCS	R15, R8, R15
   104  	SBCS	 ZR, R9, R16	// seventh word; only the resulting flags are used below
   105  
   106  	// If a+b-p is negative, return a+b
   107  	// Store result in c
   108  	CSEL	CC, R3, R10, R3	// CC = borrow: keep a+b, otherwise take a+b-p
   109  	CSEL	CC, R4, R11, R4
   110  	STP	(R3, R4), 0(R0)
   111  	CSEL	CC, R5, R12, R5
   112  	CSEL	CC, R6, R13, R6
   113  	STP	(R5, R6), 16(R0)
   114  	CSEL	CC, R7, R14, R7
   115  	CSEL	CC, R8, R15, R8
   116  	STP	(R7, R8), 32(R0)
   117  
   118  	RET
   119  
   120  // fp384Sub computes c = a-b mod p.
        //   c at +0(FP)  - pointer to the six 64-bit result words
        //   a at +8(FP)  - pointer to the minuend
        //   b at +16(FP) - pointer to the subtrahend
        // The borrow of a-b is turned into an all-zero / all-one mask, p is ANDed
        // with that mask and the masked value is added back, so p is added only
        // when the subtraction went negative. No branches are used.
   121  TEXT ·fp384Sub(SB), NOSPLIT, $0-24
   122  	MOVD	c+0(FP), R0
   123  	MOVD	a+8(FP), R1
   124  	MOVD	b+16(FP), R2
   125  
   126  	// Load a in R3-R8, b in R9-R14
   127  	// Compute a-b in R3-R9
   128  	LDP	0(R1), (R3, R4)
   129  	LDP	0(R2), (R9, R10)
   130  	SUBS	R9, R3
   131  	SBCS	R10, R4
   132  	LDP	16(R1), (R5, R6)
   133  	LDP	16(R2), (R11, R12)
   134  	SBCS	R11, R5
   135  	SBCS	R12, R6
   136  	LDP	32(R1), (R7, R8)
   137  	LDP	32(R2), (R13, R14)
   138  	SBCS	R13, R7
   139  	SBCS	R14, R8
   140  	SBC	ZR, ZR, R9	// R9 = 0 if no borrow, all ones if a-b borrowed
   141  
   142  	// Load p in R10-R15
   143  	// If a-b < 0, (a-b)+p to R3-R8
   144  	// Store result in c
   145  	LDP	·p+ 0(SB), (R10, R11)
   146  	AND	R9, R10	// each word of p is kept or zeroed by the mask
   147  	LDP	·p+16(SB), (R12, R13)
   148  	AND	R9, R11
   149  	AND	R9, R12
   150  	LDP	·p+32(SB), (R14, R15)
   151  	AND	R9, R13
   152  	AND	R9, R14
   153  	AND	R9, R15
   154  
   155  	ADDS	R10, R3
   156  	ADCS	R11, R4
   157  	STP	(R3, R4), 0(R0)
   158  	ADCS	R12, R5
   159  	ADCS	R13, R6
   160  	STP	(R5, R6), 16(R0)
   161  	ADCS	R14, R7
   162  	ADC	R15, R8	// carry out of the top word is discarded
   163  	STP	(R7, R8), 32(R0)
   164  
   165  	RET
   166  
   167  // mul192x192comba: 3-word by 3-word multiplication, accumulated column by column.
   168  // Expects that A0*B0 is already in C0(low),C3(high) and A0*B1 in C1(low),C2(high);
        // the macro computes the remaining seven partial products itself.
        // C0 is not actually touched
   169  // Result of (A0-A2) * (B0-B2) will be in C0-C5 (C0 least significant)
   170  // Inputs remain intact. S0-S3 are scratch and are overwritten, as are the
        // condition flags; C4 and C5 need no initial value (both are first written
        // by a three-operand ADC from ZR).
   171  #define mul192x192comba(A0,A1,A2, B0,B1,B2, C0,C1,C2,C3,C4,C5, S0,S1,S2,S3) \
   172  	MUL	A1, B0, S2	\
   173  	UMULH	A1, B0, S3	\
   174  	/* C1:C2 += hi(A0*B0); C3 is reused as the carry word */ \
   175  	ADDS	C3, C1		\
   176  	ADCS	ZR, C2		\
   177  	ADC	ZR, ZR, C3	\
   178  				\
   179  	MUL	A0, B2, S0	\
   180  	UMULH	A0, B2, S1	\
   181  	/* C1:C2:C3 += A1*B0 */	\
   182  	ADDS	S2, C1		\
   183  	ADCS	S3, C2		\
   184  	ADC	ZR, C3		\
   185  				\
   186  	MUL	A1, B1, S2	\
   187  	UMULH	A1, B1, S3	\
   188  	/* C2:C3 += A0*B2; C4 starts as the carry */ \
   189  	ADDS	S0, C2		\
   190  	ADCS	S1, C3		\
   191  	ADC	ZR, ZR, C4	\
   192  				\
   193  	MUL	A2, B0, S0	\
   194  	UMULH	A2, B0, S1	\
   195  	/* C2:C3:C4 += A1*B1 */	\
   196  	ADDS	S2, C2		\
   197  	ADCS	S3, C3		\
   198  	ADC	ZR, C4		\
   199  				\
   200  	MUL	A1, B2, S2	\
   201  	UMULH	A1, B2, S3	\
   202  	/* C2:C3:C4 += A2*B0 */	\
   203  	ADDS	S0, C2		\
   204  	ADCS	S1, C3		\
   205  	ADC	ZR, C4		\
   206  				\
   207  	MUL	A2, B1, S0	\
   208  	UMULH	A2, B1, S1	\
   209  	/* C3:C4 += A1*B2; C5 starts as the carry */ \
   210  	ADDS	S2, C3		\
   211  	ADCS	S3, C4		\
   212  	ADC	ZR, ZR, C5	\
   213  				\
   214  	MUL	A2, B2, S2	\
   215  	UMULH	A2, B2, S3	\
   216  	/* C3:C4:C5 += A2*B1 */	\
   217  	ADDS	S0, C3		\
   218  	ADCS	S1, C4		\
   219  	ADC	ZR, C5		\
   220  	/* C4:C5 += A2*B2 */	\
   221  	ADDS	S2, C4		\
   222  	ADC	S3, C5
   223  
   224  
   225  // mul384x384karatsuba multiplies the six 64-bit words at X by the six words
   226  // at Y with three 192x192 products: aL*bL, aH*bH and |aL-aH|*|bH-bL|.
   227  // X and Y get overwritten, Z0 will be in Y: the twelve-word product is left
        // in Y,Z1-Z11 (least significant first). All Z and T registers and the
        // condition flags are clobbered; X ends up holding a 0/1 sign value.
        //
        // Stack use: 96 bytes of scratch at 16(RSP)..111(RSP). The enclosing TEXT
        // must therefore declare a frame of at least 104 bytes and must not keep
        // live data in that range across the macro. The scratch is deliberately
        // placed at non-negative RSP offsets: Go's arm64 ABI stores the saved
        // frame pointer at -8(RSP), so stores below the stack pointer (the former
        // -16(RSP)..-96(RSP) layout) overwrote that slot and used memory outside
        // the function's frame.
   228  #define mul384x384karatsuba(X,Y, Z1,Z2,Z3,Z4,Z5,Z6,Z7,Z8,Z9,Z10,Z11, T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12) \
   229  	/* Load a in Z1-Z6, b in T12,Z7-Z11; the four MUL/UMULH seed mul192x192comba with A0*B0 and A0*B1 */ \
   230  	LDP	 0(X), ( Z1,  Z2)	\
   231  	LDP	 0(Y), (T12,  Z7)	\
   232  	MUL	Z1,  Z7, T1		\
   233  	UMULH	Z1, T12, T3		\
   234  	LDP	16(X), ( Z3,  Z4)	\
   235  	LDP	16(Y), ( Z8,  Z9)	\
   236  	MUL	Z1, T12, T0		\
   237  	UMULH	Z1,  Z7, T2		\
   238  	LDP	32(X), ( Z5,  Z6)	\
   239  	LDP	32(Y), (Z10, Z11)	\
   240  					\
   241  	/* Compute aL*bL in T0-T5 */	\
   242  	mul192x192comba(Z1,Z2,Z3, T12,Z7,Z8, T0,T1,T2,T3,T4,T5, T6,T7,T8,T9) \
   243  					\
   244  	/* Compute aH*bH in T6-T11, destroys aL and bL */ \
   245  	MUL	Z4, Z10, T7		\
   246  	MUL	Z4,  Z9, T6		\
   247  	UMULH	Z4,  Z9, T9		\
   248  	UMULH	Z4, Z10, T8		\
   249  	mul192x192comba(Z4,Z5,Z6, Z9,Z10,Z11, T6,T7,T8,T9,T10,T11, Z1,Z2,T12,Z7) \
   250  					\
   251  	/* Compute aL*bL + aH*bH in Z1-Z6,T12, destroys aH */ \
   252  	ADDS	T0,  T6,  Z1		\
   253  	ADCS	T1,  T7,  Z2		\
   254  	ADCS	T2,  T8,  Z3		\
   255  	ADCS	T3,  T9,  Z4		\
   256  	ADCS	T4, T10,  Z5		\
   257  	ADCS	T5, T11,  Z6		\
   258  	ADC	ZR,  ZR, T12		\
   259  					\
   260  	/* Add to T0-T11 starting at word 3 and spill to the frame: words 0-1 at 96(RSP) down to words 10-11 at 16(RSP); T12 keeps the carry for word 9 */ \
   261  	STP	( T0,  T1),  96(RSP)	\
   262  	ADDS	Z1, T3			\
   263  	STP	( T2,  T3),  80(RSP)	\
   264  	ADCS	Z2, T4			\
   265  	ADCS	Z3, T5			\
   266  	STP	( T4,  T5),  64(RSP)	\
   267  	ADCS	Z4, T6			\
   268  	ADCS	Z5, T7			\
   269  	STP	( T6,  T7),  48(RSP)	\
   270  	ADCS	Z6, T8			\
   271  	ADC	ZR, T12			\
   272  	STP	( T8,  T9),  32(RSP)	\
   273  	STP	(T10, T11),  16(RSP)	\
   274  					\
   275  	/* Load a to Z1-Z6 */		\
   276  	LDP	 0(X), (Z1, Z2)		\
   277  	LDP	16(X), (Z3, Z4)		\
   278  	LDP	32(X), (Z5, Z6)		\
   279  					\
   280  	/* Compute |aL-aH| to Z1-Z3, keep borrow in X: after ADDS $1 X is 0 (and EQ is set) when aL-aH borrowed, 1 otherwise */ \
   281  	SUBS	Z4, Z1			\
   282  	SBCS	Z5, Z2			\
   283  	SBCS	Z6, Z3			\
   284  	SBC	ZR, ZR, X		\
   285  	NEGS	Z1, Z4			\
   286  	NGCS	Z2, Z5			\
   287  	NGC	Z3, Z6			\
   288  	ADDS	$1, X			\
   289  					\
   290  	/* Load b to Z7-Z11,T0 (loads leave the flags from ADDS intact) */	\
   291  	LDP	 0(Y), ( Z7,  Z8)	\
   292  	LDP	16(Y), ( Z9, Z10)	\
   293  	LDP	32(Y), (Z11,  T0)	\
   294  					\
   295  	CSEL	EQ, Z4, Z1, Z1		\
   296  	CSEL	EQ, Z5, Z2 ,Z2		\
   297  	CSEL	EQ, Z6, Z3, Z3		\
   298  					\
   299  	/* Compute |bH-bL| to Z7-Z9, keep borrow in Y (same 0/1 encoding as X) */ \
   300  	SUBS	Z7, Z10			\
   301  	SBCS	Z8, Z11			\
   302  	SBCS	Z9, T0			\
   303  	SBC	ZR, ZR, Y		\
   304  	NEGS	Z10, Z7			\
   305  	NGCS	Z11, Z8			\
   306  	NGC	T0, Z9			\
   307  	ADDS	$1, Y			\
   308  	CSEL	EQ, Z7, Z10, Z7		\
   309  	CSEL	EQ, Z8, Z11, Z8		\
   310  	CSEL	EQ, Z9,  T0, Z9		\
   311  					\
   312  	/* Combine borrows: X = 1 when exactly one difference was negative */	\
   313  	EOR	Y, X			\
   314  					\
   315  	/* Compute |aL-aH|*|bH-bL| to Z10,Z11,T0-T3 */ \
   316  	MUL	Z1, Z8, Z11		\
   317  	MUL	Z1, Z7, Z10		\
   318  	UMULH	Z1, Z8,  T0		\
   319  	UMULH	Z1, Z7,  T1		\
   320  	mul192x192comba(Z1,Z2,Z3, Z7,Z8,Z9, Z10,Z11,T0,T1,T2,T3, T4,T5,T6,T7) \
   321  					\
   322  	/* The result has to be negated if exactly one of the operands was negative; the negated copy goes to Y,Z1-Z5 and T4 becomes all ones only if the product was non-zero */ \
   323  	NEGS	Z10,  Y			\
   324  	NGCS	Z11, Z1			\
   325  	NGCS	 T0, Z2			\
   326  	NGCS	 T1, Z3			\
   327  	NGCS	 T2, Z4			\
   328  	NGCS	 T3, Z5			\
   329  	NGC	 ZR, T4			\
   330  					\
   331  	AND	T4, X			\
   332  	CMP	$1, X			\
   333  	CSEL	EQ,  Y, Z10, Z10	\
   334  	CSEL	EQ, Z1, Z11, Z11	\
   335  	CSEL	EQ, Z2,  T0,  T0	\
   336  	CSEL	EQ, Z3,  T1,  T1	\
   337  	CSEL	EQ, Z4,  T2,  T2	\
   338  	CSEL	EQ, Z5,  T3,  T3	\
   339  					\
   340  	/* Add that to the middle part: reload the spilled words into Y,Z1-Z11, add at word 3, then subtract X at word 9 to undo the two's-complement wrap of a negated product */ \
   341  	LDP	 96(RSP), (  Y,  Z1)	\
   342  	LDP	 80(RSP), ( Z2,  Z3)	\
   343  	LDP	 64(RSP), ( Z4,  Z5)	\
   344  	ADDS	Z10, Z3			\
   345  	ADCS	Z11, Z4			\
   346  	LDP	 48(RSP), ( Z6,  Z7)	\
   347  	ADCS	T0, Z5			\
   348  	ADCS	T1, Z6			\
   349  	LDP	 32(RSP), ( Z8,  Z9)	\
   350  	ADCS	T2, Z7			\
   351  	ADCS	T3, Z8			\
   352  	LDP	 16(RSP), (Z10, Z11)	\
   353  	ADCS	T12, Z9			\
   354  	ADCS	ZR, Z10			\
   355  	ADC	ZR, Z11			\
   356  	SUBS	X, Z9			\
   357  	SBCS	ZR, Z10			\
   358  	SBC	ZR, Z11
   359  
   360  // fp384Mul computes c = a*b*R^-1 mod p (Montgomery multiplication).
        //   c at +0(FP)  - pointer to the six 64-bit result words
        //   a at +8(FP)  - pointer to the first operand
        //   b at +16(FP) - pointer to the second operand
        // Steps: (1) a*b as twelve words, (2) m = a*b*pp mod 2^384, kept in c as
        // temporary storage, (3) m*p, (4) a*b + m*p, whose upper six words plus
        // carry are reduced by one conditional subtraction of p.
        // ·p and ·pp are defined outside this function; ·pp is presumably the
        // Montgomery constant for p - confirm against its definition.
        //
        // Stack use: the twelve words of a*b are spilled to 112(RSP)..207(RSP),
        // the top 96 bytes of the 200-byte frame declared below (0(RSP) holds the
        // saved link register). The spill used to live at -112(RSP)..-192(RSP),
        // i.e. below the hardware stack pointer and outside the declared frame.
        // mul384x384karatsuba uses its own, separate scratch range.
   361  TEXT ·fp384Mul(SB), NOSPLIT, $200-24
   362  	MOVD	c+0(FP), R0
   363  	MOVD	a+8(FP), R1
   364  	MOVD	b+16(FP), R2
   365  
   366  	// Compute a*b in R2-R13
   367  	mul384x384karatsuba(R1, R2, R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,R13, R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26,R27)
   368  
   369  	// Store a*b in the frame: words 0-1 at 192(RSP) down to words 10-11 at 112(RSP).
        	// R8-R13 are reused as temporaries below, so the product must be saved.
   370  	STP	( R2,  R3), 192(RSP)
   371  	STP	( R4,  R5), 176(RSP)
   372  	STP	( R6,  R7), 160(RSP)
   373  	STP	( R8,  R9), 144(RSP)
   374  	STP	(R10, R11), 128(RSP)
   375  	STP	(R12, R13), 112(RSP)
   376  
   377  	// Compute m = a*b*pp mod 2^384 in R19-R24
   378  	// Store it temporarily in c
        	// Only the low six words of a*b (R2-R7) matter; every partial product
        	// that would land at word 6 or above is dropped.
        	// Row 0: pp[0] * (R2..R7)
   379  	MOVD	·pp+0(SB), R14
   380  	MUL	R14, R2, R19
   381  	UMULH	R14, R2, R20
   382  
   383  	MUL	R14, R3, R16
   384  	UMULH	R14, R3, R21
   385  	ADDS	R16, R20
   386  	ADC	 ZR, R21
   387  
   388  	MUL	R14, R4, R16
   389  	UMULH	R14, R4, R22
   390  	ADDS	R16, R21
   391  	ADC	 ZR, R22
   392  
   393  	MUL	R14, R5, R16
   394  	UMULH	R14, R5, R23
   395  	ADDS	R16, R22
   396  	ADC	 ZR, R23
   397  
   398  	MUL	R14, R6, R16
   399  	UMULH	R14, R6, R24
   400  	ADDS	R16, R23
   401  	ADC	 ZR, R24
   402  
   403  	MADD	R14, R24, R7, R24	// R24 += low(pp[0]*R7); the high half would be word 6
   404  
   405  	// ·pp+8(SB) = 1, so we can just add
        	// Row 1: (R2..R6) added starting at word 1
   406  	ADDS	R2, R20
   407  	STP	(R19, R20), 0(R0)	// words 0-1 of m are final
   408  	ADCS	R3, R21
   409  	ADCS	R4, R22
   410  	ADCS	R5, R23
   411  	ADC	R6, R24
   412  
        	// Row 2: pp[2] * (R2..R5), added starting at word 2
   413  	LDP	·pp+16(SB), (R14, R15)
   414  	MUL	R14, R2, R8
   415  	UMULH	R14, R2, R9
   416  
   417  	MUL	R14, R3, R16
   418  	UMULH	R14, R3, R10
   419  	ADDS	R16, R9
   420  	ADC	 ZR, R10
   421  
   422  	MUL	R14, R4, R16
   423  	UMULH	R14, R4, R11
   424  	ADDS	R16, R10
   425  	ADC	 ZR, R11
   426  
   427  	MUL	R14, R5, R16
   428  	ADD	R16, R11
   429  
   430  	ADDS	 R8, R21
   431  	ADCS	 R9, R22
   432  	ADCS	R10, R23
   433  	ADC	R11, R24
   434  
        	// Row 3: pp[3] * (R2..R4), added starting at word 3
   435  	MUL	R15, R2, R8
   436  	UMULH	R15, R2, R9
   437  
   438  	MUL	R15, R3, R16
   439  	UMULH	R15, R3, R10
   440  	ADDS	R16, R9
   441  	ADC	 ZR, R10
   442  
   443  	MADD	R15, R10, R4, R10
   444  
   445  	ADDS	R8, R22
   446  	STP	(R21, R22), 16(R0)	// words 2-3 of m are final
   447  	ADCS	R9, R23
   448  	ADC	R10, R24
   449  
        	// Row 4: pp[4] * (R2..R3) at word 4; row 5: pp[5] * R2 at word 5
   450  	LDP	·pp+32(SB), (R14, R15)
   451  	MUL	R14, R2, R8
   452  	UMULH	R14, R2, R9
   453  
   454  	MADD	R14, R9, R3, R9
   455  
   456  	ADDS	R8, R23
   457  	ADC	R9, R24
   458  
   459  	MADD	R15, R24, R2, R24
   460  	STP	(R23, R24), 32(R0)	// words 4-5 of m
   461  
   462  	// Compute m*p in R1-R12 (m is read back from c; the macro overwrites R0 and R1)
   463  	MOVD	$·p(SB), R1
   464  	mul384x384karatsuba(R0, R1, R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, R13,R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26)
   465  
   466  	// Add a*b to m*p in R1-R12,R26 (R26 receives the carry out of the top word)
   467  	LDP	192(RSP), (R13, R14)
   468  	ADDS	R13, R1
   469  	LDP	176(RSP), (R15, R16)
   470  	ADCS	R14, R2
   471  	ADCS	R15, R3
   472  	LDP	160(RSP), (R17, R19)
   473  	ADCS	R16, R4
   474  	ADCS	R17, R5
   475  	LDP	144(RSP), (R20, R21)
   476  	ADCS	R19, R6
   477  	ADCS	R20, R7
   478  	LDP	128(RSP), (R22, R23)
   479  	ADCS	R21, R8
   480  	ADCS	R22, R9
   481  	LDP	112(RSP), (R24, R25)
   482  	ADCS	R23, R10
   483  	ADCS	R24, R11
   484  	ADCS	R25, R12
   485  	ADC	ZR, ZR, R26
   486  
   487  	// Reduce the top half mod p: R13-R19 = (R7..R12) - p, R26 absorbs the final borrow
   488  	LDP	·p+ 0(SB), (R13, R14)
   489  	SUBS	R13, R7, R13
   490  	LDP	·p+16(SB), (R15, R16)
   491  	SBCS	R14, R8, R14
   492  	SBCS	R15, R9, R15
   493  	LDP	·p+32(SB), (R17, R19)
   494  	SBCS	R16, R10, R16
   495  	SBCS	R17, R11, R17
   496  	SBCS	R19, R12, R19
   497  	SBCS	ZR, R26
   498  
   499  	// Store result in c (the pointer is reloaded because R0 was overwritten above)
   500  	MOVD	c+0(FP), R0
   501  	CSEL	CC, R7, R13, R7	// CC = borrow: the unreduced value was already below p
   502  	CSEL	CC, R8, R14, R8
   503  	STP	( R7,  R8),  0(R0)
   504  	CSEL	CC, R9, R15, R9
   505  	CSEL	CC, R10, R16, R10
   506  	STP	( R9, R10), 16(R0)
   507  	CSEL	CC, R11, R17, R11
   508  	CSEL	CC, R12, R19, R12
   509  	STP	(R11, R12), 32(R0)
   510  
   511  	RET