github.com/cloudflare/circl@v1.5.0/dh/sidh/internal/p503/arith_arm64.s (about)

     1  // +build arm64,!purego
     2  
     3  #include "textflag.h"
     4  
     5  
     6  TEXT ·cmovP503(SB), NOSPLIT, $0-17
     7  	MOVD	x+0(FP), R0
     8  	MOVD	y+8(FP), R1
     9  	MOVB	choice+16(FP), R2
    10  
    11  	// Set flags
    12  	// If choice is not 0 or 1, this implementation will swap completely
    13  	CMP	$0, R2
    14  
    15  	LDP	0(R0), (R3, R4)
    16  	LDP	0(R1), (R5, R6)
    17  	CSEL	EQ, R3, R5, R7
    18  	CSEL	EQ, R4, R6, R8
    19  	STP	(R7, R8), 0(R0)
    20  
    21  	LDP	16(R0), (R3, R4)
    22  	LDP	16(R1), (R5, R6)
    23  	CSEL	EQ, R3, R5, R7
    24  	CSEL	EQ, R4, R6, R8
    25  	STP	(R7, R8), 16(R0)
    26  
    27  	LDP	32(R0), (R3, R4)
    28  	LDP	32(R1), (R5, R6)
    29  	CSEL	EQ, R3, R5, R7
    30  	CSEL	EQ, R4, R6, R8
    31  	STP	(R7, R8), 32(R0)
    32  
    33  	LDP	48(R0), (R3, R4)
    34  	LDP	48(R1), (R5, R6)
    35  	CSEL	EQ, R3, R5, R7
    36  	CSEL	EQ, R4, R6, R8
    37  	STP	(R7, R8), 48(R0)
    38  
    39  	RET
    40  
    41  TEXT ·cswapP503(SB), NOSPLIT, $0-17
    42  	MOVD	x+0(FP), R0
    43  	MOVD	y+8(FP), R1
    44  	MOVB	choice+16(FP), R2
    45  
    46  	// Set flags
    47  	// If choice is not 0 or 1, this implementation will swap completely
    48  	CMP	$0, R2
    49  
    50  	LDP	0(R0), (R3, R4)
    51  	LDP	0(R1), (R5, R6)
    52  	CSEL	EQ, R3, R5, R7
    53  	CSEL	EQ, R4, R6, R8
    54  	STP	(R7, R8), 0(R0)
    55  	CSEL	NE, R3, R5, R9
    56  	CSEL	NE, R4, R6, R10
    57  	STP	(R9, R10), 0(R1)
    58  
    59  	LDP	16(R0), (R3, R4)
    60  	LDP	16(R1), (R5, R6)
    61  	CSEL	EQ, R3, R5, R7
    62  	CSEL	EQ, R4, R6, R8
    63  	STP	(R7, R8), 16(R0)
    64  	CSEL	NE, R3, R5, R9
    65  	CSEL	NE, R4, R6, R10
    66  	STP	(R9, R10), 16(R1)
    67  
    68  	LDP	32(R0), (R3, R4)
    69  	LDP	32(R1), (R5, R6)
    70  	CSEL	EQ, R3, R5, R7
    71  	CSEL	EQ, R4, R6, R8
    72  	STP	(R7, R8), 32(R0)
    73  	CSEL	NE, R3, R5, R9
    74  	CSEL	NE, R4, R6, R10
    75  	STP	(R9, R10), 32(R1)
    76  
    77  	LDP	48(R0), (R3, R4)
    78  	LDP	48(R1), (R5, R6)
    79  	CSEL	EQ, R3, R5, R7
    80  	CSEL	EQ, R4, R6, R8
    81  	STP	(R7, R8), 48(R0)
    82  	CSEL	NE, R3, R5, R9
    83  	CSEL	NE, R4, R6, R10
    84  	STP	(R9, R10), 48(R1)
    85  
    86  	RET
    87  
    88  TEXT ·addP503(SB), NOSPLIT, $0-24
    89  	MOVD	z+0(FP), R2
    90  	MOVD	x+8(FP), R0
    91  	MOVD	y+16(FP), R1
    92  
    93  	// Load first summand into R3-R10
    94  	// Add first summand and second summand and store result in R3-R10
    95  	LDP	0(R0), (R3, R4)
    96  	LDP	0(R1), (R11, R12)
    97  	LDP	16(R0), (R5, R6)
    98  	LDP	16(R1), (R13, R14)
    99  	ADDS	R11, R3
   100  	ADCS	R12, R4
   101  	ADCS	R13, R5
   102  	ADCS	R14, R6
   103  
   104  	LDP	32(R0), (R7, R8)
   105  	LDP	32(R1), (R11, R12)
   106  	LDP	48(R0), (R9, R10)
   107  	LDP	48(R1), (R13, R14)
   108  	ADCS	R11, R7
   109  	ADCS	R12, R8
   110  	ADCS	R13, R9
   111  	ADC	R14, R10
   112  
   113  	// Subtract 2 * p503 in R11-R17 from the result in R3-R10
   114  	LDP	·P503x2+0(SB), (R11, R12)
   115  	LDP	·P503x2+24(SB), (R13, R14)
   116  	SUBS	R11, R3
   117  	SBCS	R12, R4
   118  	LDP	·P503x2+40(SB), (R15, R16)
   119  	SBCS	R12, R5
   120  	SBCS	R13, R6
   121  	MOVD	·P503x2+56(SB), R17
   122  	SBCS	R14, R7
   123  	SBCS	R15, R8
   124  	SBCS	R16, R9
   125  	SBCS	R17, R10
   126  	SBC	ZR, ZR, R19
   127  
   128  	// If x + y - 2 * p503 < 0, R19 is 1 and 2 * p503 should be added
   129  	AND	R19, R11
   130  	AND	R19, R12
   131  	AND	R19, R13
   132  	AND	R19, R14
   133  	AND	R19, R15
   134  	AND	R19, R16
   135  	AND	R19, R17
   136  
   137  	ADDS	R11, R3
   138  	ADCS	R12, R4
   139  	STP	(R3, R4), 0(R2)
   140  	ADCS	R12, R5
   141  	ADCS	R13, R6
   142  	STP	(R5, R6), 16(R2)
   143  	ADCS	R14, R7
   144  	ADCS	R15, R8
   145  	STP	(R7, R8), 32(R2)
   146  	ADCS	R16, R9
   147  	ADC	R17, R10
   148  	STP	(R9, R10), 48(R2)
   149  
   150  	RET
   151  
   152  TEXT ·subP503(SB), NOSPLIT, $0-24
   153  	MOVD	z+0(FP), R2
   154  	MOVD	x+8(FP), R0
   155  	MOVD	y+16(FP), R1
   156  
   157  	// Load x into R3-R10
   158  	// Subtract y from x and store result in R3-R10
   159  	LDP	0(R0), (R3, R4)
   160  	LDP	0(R1), (R11, R12)
   161  	LDP	16(R0), (R5, R6)
   162  	LDP	16(R1), (R13, R14)
   163  	SUBS	R11, R3
   164  	SBCS	R12, R4
   165  	SBCS	R13, R5
   166  	SBCS	R14, R6
   167  
   168  	LDP	32(R0), (R7, R8)
   169  	LDP	32(R1), (R11, R12)
   170  	LDP	48(R0), (R9, R10)
   171  	LDP	48(R1), (R13, R14)
   172  	SBCS	R11, R7
   173  	SBCS	R12, R8
   174  	SBCS	R13, R9
   175  	SBCS	R14, R10
   176  	SBC	ZR, ZR, R19
   177  
   178  	// If x - y < 0, R19 is 1 and 2 * p503 should be added
   179  	LDP	·P503x2+0(SB), (R11, R12)
   180  	LDP	·P503x2+24(SB), (R13, R14)
   181  	AND	R19, R11
   182  	AND	R19, R12
   183  	LDP	·P503x2+40(SB), (R15, R16)
   184  	AND	R19, R13
   185  	AND	R19, R14
   186  	MOVD	·P503x2+56(SB), R17
   187  	AND	R19, R15
   188  	AND	R19, R16
   189  	AND	R19, R17
   190  
   191  	ADDS	R11, R3
   192  	ADCS	R12, R4
   193  	STP	(R3, R4), 0(R2)
   194  	ADCS	R12, R5
   195  	ADCS	R13, R6
   196  	STP	(R5, R6), 16(R2)
   197  	ADCS	R14, R7
   198  	ADCS	R15, R8
   199  	STP	(R7, R8), 32(R2)
   200  	ADCS	R16, R9
   201  	ADC	R17, R10
   202  	STP	(R9, R10), 48(R2)
   203  
   204  	RET
   205  
   206  TEXT ·adlP503(SB), NOSPLIT, $0-24
   207  	MOVD	z+0(FP), R2
   208  	MOVD	x+8(FP), R0
   209  	MOVD	y+16(FP), R1
   210  
   211  	LDP	0(R0), (R3, R4)
   212  	LDP	0(R1), (R11, R12)
   213  	LDP	16(R0), (R5, R6)
   214  	LDP	16(R1), (R13, R14)
   215  	ADDS	R11, R3
   216  	ADCS	R12, R4
   217  	STP	(R3, R4), 0(R2)
   218  	ADCS	R13, R5
   219  	ADCS	R14, R6
   220  	STP	(R5, R6), 16(R2)
   221  
   222  	LDP	32(R0), (R7, R8)
   223  	LDP	32(R1), (R11, R12)
   224  	LDP	48(R0), (R9, R10)
   225  	LDP	48(R1), (R13, R14)
   226  	ADCS	R11, R7
   227  	ADCS	R12, R8
   228  	STP	(R7, R8), 32(R2)
   229  	ADCS	R13, R9
   230  	ADCS	R14, R10
   231  	STP	(R9, R10), 48(R2)
   232  
   233  	LDP	64(R0), (R3, R4)
   234  	LDP	64(R1), (R11, R12)
   235  	LDP	80(R0), (R5, R6)
   236  	LDP	80(R1), (R13, R14)
   237  	ADCS	R11, R3
   238  	ADCS	R12, R4
   239  	STP	(R3, R4), 64(R2)
   240  	ADCS	R13, R5
   241  	ADCS	R14, R6
   242  	STP	(R5, R6), 80(R2)
   243  
   244  	LDP	96(R0), (R7, R8)
   245  	LDP	96(R1), (R11, R12)
   246  	LDP	112(R0), (R9, R10)
   247  	LDP	112(R1), (R13, R14)
   248  	ADCS	R11, R7
   249  	ADCS	R12, R8
   250  	STP	(R7, R8), 96(R2)
   251  	ADCS	R13, R9
   252  	ADC	R14, R10
   253  	STP	(R9, R10), 112(R2)
   254  
   255  	RET
   256  
   257  TEXT ·sulP503(SB), NOSPLIT, $0-24
   258  	MOVD	z+0(FP), R2
   259  	MOVD	x+8(FP), R0
   260  	MOVD	y+16(FP), R1
   261  
   262  	LDP	0(R0), (R3, R4)
   263  	LDP	0(R1), (R11, R12)
   264  	LDP	16(R0), (R5, R6)
   265  	LDP	16(R1), (R13, R14)
   266  	SUBS	R11, R3
   267  	SBCS	R12, R4
   268  	STP	(R3, R4), 0(R2)
   269  	SBCS	R13, R5
   270  	SBCS	R14, R6
   271  	STP	(R5, R6), 16(R2)
   272  
   273  	LDP	32(R0), (R7, R8)
   274  	LDP	32(R1), (R11, R12)
   275  	LDP	48(R0), (R9, R10)
   276  	LDP	48(R1), (R13, R14)
   277  	SBCS	R11, R7
   278  	SBCS	R12, R8
   279  	STP	(R7, R8), 32(R2)
   280  	SBCS	R13, R9
   281  	SBCS	R14, R10
   282  	STP	(R9, R10), 48(R2)
   283  
   284  	LDP	64(R0), (R3, R4)
   285  	LDP	64(R1), (R11, R12)
   286  	LDP	80(R0), (R5, R6)
   287  	LDP	80(R1), (R13, R14)
   288  	SBCS	R11, R3
   289  	SBCS	R12, R4
   290  	SBCS	R13, R5
   291  	SBCS	R14, R6
   292  
   293  	LDP	96(R0), (R7, R8)
   294  	LDP	96(R1), (R11, R12)
   295  	LDP	112(R0), (R9, R10)
   296  	LDP	112(R1), (R13, R14)
   297  	SBCS	R11, R7
   298  	SBCS	R12, R8
   299  	SBCS	R13, R9
   300  	SBCS	R14, R10
   301  	SBC	ZR, ZR, R15
   302  
   303  	// If x - y < 0, R15 is 1 and p503 should be added
   304  	LDP	·P503+16(SB), (R16, R17)
   305  	LDP	·P503+32(SB), (R19, R20)
   306  	AND	R15, R16
   307  	AND	R15, R17
   308  	LDP	·P503+48(SB), (R21, R22)
   309  	AND	R15, R19
   310  	AND	R15, R20
   311  	AND	R15, R21
   312  	AND	R15, R22
   313  
   314  	ADDS	R16, R3
   315  	ADCS	R16, R4
   316  	STP	(R3, R4), 64(R2)
   317  	ADCS	R16, R5
   318  	ADCS	R17, R6
   319  	STP	(R5, R6), 80(R2)
   320  	ADCS	R19, R7
   321  	ADCS	R20, R8
   322  	STP	(R7, R8), 96(R2)
   323  	ADCS	R21, R9
   324  	ADC	R22, R10
   325  	STP	(R9, R10), 112(R2)
   326  
   327  	RET
   328  
   329  // Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
   330  // Z0 is not actually touched
   331  // Result of (X0-X1) * (Y0-Y1) will be in Z0-Z3
   332  // Inputs get overwritten, except for X1
   333  #define mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, Z2, Z3, T0) \
   334  	MUL	X1, Y0, X0	\
   335  	UMULH	X1, Y0, Y0	\
   336  	ADDS	Z3, Z1		\
   337  	ADC	ZR, Z2		\
   338  				\
   339  	MUL	Y1, X1, T0	\
   340  	UMULH	Y1, X1, Y1	\
   341  	ADDS	X0, Z1		\
   342  	ADCS	Y0, Z2		\
   343  	ADC	ZR, ZR, Z3	\
   344  				\
   345  	ADDS	T0, Z2		\
   346  	ADC	Y1, Z3
   347  
   348  // Expects that X points to (X0-X1)
   349  // Result of (X0-X3) * (Y0-Y3) will be in Z0-Z7
   350  // Inputs get overwritten, except X2-X3 and Y2-Y3
   351  #define mul256x256karatsuba(X, X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1)\
   352  	ADDS	X2, X0		\	// xH + xL, destroys xL
   353  	ADCS	X3, X1		\
   354  	ADCS	ZR, ZR, T0	\
   355  				\
   356  	ADDS	Y2, Y0, Z6	\	// yH + yL
   357  	ADCS	Y3, Y1, T1	\
   358  	ADC	ZR, ZR, Z7	\
   359  				\
   360  	SUB	T0, ZR, Z2	\
   361  	SUB	Z7, ZR, Z3	\
   362  	AND	Z7, T0		\	// combined carry
   363  				\
   364  	AND	Z2, Z6, Z0	\	// masked(yH + yL)
   365  	AND	Z2, T1, Z1	\
   366  				\
   367  	AND	Z3, X0, Z4	\	// masked(xH + xL)
   368  	AND	Z3, X1, Z5	\
   369  				\
   370  	MUL	Z6, X0, Z2	\
   371  	MUL	T1, X0, Z3	\
   372  				\
   373  	ADDS	Z4, Z0		\
   374  	UMULH	T1, X0, Z4	\
   375  	ADCS	Z5, Z1		\
   376  	UMULH	Z6, X0, Z5	\
   377  	ADC	ZR, T0		\
   378  				\	// (xH + xL) * (yH + yL)
   379  	mul128x128comba(X0, X1, Z6, T1, Z2, Z3, Z4, Z5, Z7)\
   380  				\
   381  	LDP 0+X, (X0, X1)	\
   382  				\
   383  	ADDS	Z0, Z4		\
   384  	UMULH	Y0, X0, Z7	\
   385  	UMULH	Y1, X0, T1	\
   386  	ADCS	Z1, Z5		\
   387  	MUL	Y0, X0, Z0	\
   388  	MUL	Y1, X0, Z1	\
   389  	ADC	ZR, T0		\
   390  				\	// xL * yL
   391  	mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, T1, Z7, Z6)\
   392  				\
   393  	MUL	Y2, X2, X0	\
   394  	UMULH	Y2, X2, Y0	\
   395  	SUBS	Z0, Z2		\	// (xH + xL) * (yH + yL) - xL * yL
   396  	SBCS	Z1, Z3		\
   397  	SBCS	T1, Z4		\
   398  	MUL	Y3, X2, X1	\
   399  	UMULH	Y3, X2, Z6	\
   400  	SBCS	Z7, Z5		\
   401  	SBCS	ZR, T0		\
   402  				\	// xH * yH
   403  	mul128x128comba(X2, X3, Y2, Y3, X0, X1, Z6, Y0, Y1)\
   404  				\
   405  	SUBS	X0, Z2		\	// (xH + xL) * (yH + yL) - xL * yL - xH * yH
   406  	SBCS	X1, Z3		\
   407  	SBCS	Z6, Z4		\
   408  	SBCS	Y0, Z5		\
   409  	SBCS	ZR, T0		\
   410  				\
   411  	ADDS	T1, Z2		\	// (xH * yH) * 2^256 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^128 + xL * yL
   412  	ADCS	Z7, Z3		\
   413  	ADCS	X0, Z4		\
   414  	ADCS	X1, Z5		\
   415  	ADCS	T0, Z6		\
   416  	ADC	Y0, ZR, Z7
   417  
   418  
   419  // This implements two-level Karatsuba with a 128x128 Comba multiplier
   420  // at the bottom
   421  TEXT ·mulP503(SB), NOSPLIT, $0-24
   422  	MOVD	z+0(FP), R2
   423  	MOVD	x+8(FP), R0
   424  	MOVD	y+16(FP), R1
   425  
   426  	// Load xL in R3-R6, xH in R7-R10
   427  	// (xH + xL) in R25-R29
   428  	LDP	0(R0), (R3, R4)
   429  	LDP	32(R0), (R7, R8)
   430  	ADDS	R3, R7, R25
   431  	ADCS	R4, R8, R26
   432  	LDP	16(R0), (R5, R6)
   433  	LDP	48(R0), (R9, R10)
   434  	ADCS	R5, R9, R27
   435  	ADCS	R6, R10, R29
   436  	ADC	ZR, ZR, R7
   437  
   438  	// Load yL in R11-R14, yH in R15-19
   439  	// (yH + yL) in R11-R14, destroys yL
   440  	LDP	0(R1), (R11, R12)
   441  	LDP	32(R1), (R15, R16)
   442  	ADDS	R15, R11
   443  	ADCS	R16, R12
   444  	LDP	16(R1), (R13, R14)
   445  	LDP	48(R1), (R17, R19)
   446  	ADCS	R17, R13
   447  	ADCS	R19, R14
   448  	ADC	ZR, ZR, R8
   449  
   450  	// Compute masks and combined carry
   451  	SUB	R7, ZR, R9
   452  	SUB	R8, ZR, R10
   453  	AND	R8, R7
   454  
   455  	// masked(yH + yL)
   456  	AND	R9, R11, R15
   457  	AND	R9, R12, R16
   458  	AND	R9, R13, R17
   459  	AND	R9, R14, R19
   460  
   461  	// masked(xH + xL)
   462  	AND	R10, R25, R20
   463  	AND	R10, R26, R21
   464  	AND	R10, R27, R22
   465  	AND	R10, R29, R23
   466  
   467  	// masked(xH + xL) + masked(yH + yL) in R15-R19
   468  	ADDS	R20, R15
   469  	ADCS	R21, R16
   470  	ADCS	R22, R17
   471  	ADCS	R23, R19
   472  	ADC	ZR, R7
   473  
   474  	// Use z as temporary storage
   475  	STP	(R25, R26), 0(R2)
   476  
   477  	// (xH + xL) * (yH + yL)
   478  	mul256x256karatsuba(0(R2), R25, R26, R27, R29, R11, R12, R13, R14, R8, R9, R10, R20, R21, R22, R23, R24, R0, R1)
   479  
   480  	MOVD	x+8(FP), R0
   481  	MOVD	y+16(FP), R1
   482  
   483  	ADDS	R21, R15
   484  	ADCS	R22, R16
   485  	ADCS	R23, R17
   486  	ADCS	R24, R19
   487  	ADC	ZR, R7
   488  
   489  	// Load yL in R11-R14
   490  	LDP	0(R1), (R11, R12)
   491  	LDP	16(R1), (R13, R14)
   492  
   493  	// xL * yL
   494  	mul256x256karatsuba(0(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R1, R2)
   495  
   496  	MOVD	z+0(FP), R2
   497  	MOVD	y+16(FP), R1
   498  
   499  	// (xH + xL) * (yH + yL) - xL * yL
   500  	SUBS	R21, R8
   501  	SBCS	R22, R9
   502  	STP	(R21, R22), 0(R2)
   503  	SBCS	R23, R10
   504  	SBCS	R24, R20
   505  	STP	(R23, R24), 16(R2)
   506  	SBCS	R25, R15
   507  	SBCS	R26, R16
   508  	SBCS	R27, R17
   509  	SBCS	R29, R19
   510  	SBC	ZR, R7
   511  
   512  	// Load xH in R3-R6, yH in R11-R14
   513  	LDP	32(R0), (R3, R4)
   514  	LDP	48(R0), (R5, R6)
   515  	LDP	32(R1), (R11, R12)
   516  	LDP	48(R1), (R13, R14)
   517  
   518  	ADDS	R25, R8
   519  	ADCS	R26, R9
   520  	ADCS	R27, R10
   521  	ADCS	R29, R20
   522  	ADC	ZR, ZR, R1
   523  
   524  	MOVD	R20, 32(R2)
   525  
   526  	// xH * yH
   527  	mul256x256karatsuba(32(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R2, R20)
   528  	NEG	R1, R1
   529  
   530  	MOVD	z+0(FP), R2
   531  	MOVD	32(R2), R20
   532  
   533  	// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R8-R10,R20,R15-R19
   534  	// Store lower half in z, that's done
   535  	SUBS	R21, R8
   536  	SBCS	R22, R9
   537  	STP	(R8, R9), 32(R2)
   538  	SBCS	R23, R10
   539  	SBCS	R24, R20
   540  	STP	(R10, R20), 48(R2)
   541  	SBCS	R25, R15
   542  	SBCS	R26, R16
   543  	SBCS	R27, R17
   544  	SBCS	R29, R19
   545  	SBC	ZR, R7
   546  
   547  	// (xH * yH) * 2^512 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^256 + xL * yL
   548  	// Store remaining limbs in z
   549  	ADDS	$1, R1
   550  	ADCS	R21, R15
   551  	ADCS	R22, R16
   552  	STP	(R15, R16), 64(R2)
   553  	ADCS	R23, R17
   554  	ADCS	R24, R19
   555  	STP	(R17, R19), 80(R2)
   556  	ADCS	R7, R25
   557  	ADCS	ZR, R26
   558  	STP	(R25, R26), 96(R2)
   559  	ADCS	ZR, R27
   560  	ADC	ZR, R29
   561  	STP	(R27, R29), 112(R2)
   562  
   563  	RET
   564  
   565  // Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
   566  // Z0 is not actually touched
   567  // Result of (X0-X1) * (Y0-Y3) will be in Z0-Z5
   568  // Inputs remain intact
   569  #define mul128x256comba(X0, X1, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3)\
   570  	MUL	X1, Y0, T0	\
   571  	UMULH	X1, Y0, T1	\
   572  	ADDS	Z3, Z1		\
   573  	ADC	ZR, Z2		\
   574  				\
   575  	MUL	X0, Y2, T2	\
   576  	UMULH	X0, Y2, T3	\
   577  	ADDS	T0, Z1		\
   578  	ADCS	T1, Z2		\
   579  	ADC	ZR, ZR, Z3	\
   580  				\
   581  	MUL	X1, Y1, T0	\
   582  	UMULH	X1, Y1, T1	\
   583  	ADDS	T2, Z2		\
   584  	ADCS	T3, Z3		\
   585  	ADC	ZR, ZR, Z4	\
   586  				\
   587  	MUL	X0, Y3, T2	\
   588  	UMULH	X0, Y3, T3	\
   589  	ADDS	T0, Z2		\
   590  	ADCS	T1, Z3		\
   591  	ADC	ZR, Z4		\
   592  				\
   593  	MUL	X1, Y2, T0	\
   594  	UMULH	X1, Y2, T1	\
   595  	ADDS	T2, Z3		\
   596  	ADCS	T3, Z4		\
   597  	ADC	ZR, ZR, Z5	\
   598  				\
   599  	MUL	X1, Y3, T2	\
   600  	UMULH	X1, Y3, T3	\
   601  	ADDS	T0, Z3		\
   602  	ADCS	T1, Z4		\
   603  	ADC	ZR, Z5		\
   604  	ADDS	T2, Z4		\
   605  	ADC	T3, Z5
   606  
   607  // This implements the shifted 2^(B*w) Montgomery reduction from
   608  // https://eprint.iacr.org/2016/986.pdf with B = 4, w = 64
   609  TEXT ·rdcP503(SB), NOSPLIT, $0-16
   610  	MOVD	x+8(FP), R0
   611  
   612  	// Load x0-x1
   613  	LDP	0(R0), (R2, R3)
   614  
   615  	// Load the prime constant in R25-R29
   616  	LDP	·P503p1s8+32(SB), (R25, R26)
   617  	LDP	·P503p1s8+48(SB), (R27, R29)
   618  
   619  	// [x0,x1] * p503p1s8 to R4-R9
   620  	MUL	R2, R25, R4		// x0 * p503p1s8[0]
   621  	UMULH	R2, R25, R7
   622  	MUL	R2, R26, R5		// x0 * p503p1s8[1]
   623  	UMULH	R2, R26, R6
   624  
   625  	mul128x256comba(R2, R3, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13)
   626  
   627  	LDP	16(R0), (R3, R11)	// x2
   628  	LDP	32(R0), (R12, R13)
   629  	LDP	48(R0), (R14, R15)
   630  
   631  	// Left-shift result in R4-R9 by 56 to R4-R10
   632  	ORR	R9>>8, ZR, R10
   633  	LSL	$56, R9
   634  	ORR	R8>>8, R9
   635  	LSL	$56, R8
   636  	ORR	R7>>8, R8
   637  	LSL	$56, R7
   638  	ORR	R6>>8, R7
   639  	LSL	$56, R6
   640  	ORR	R5>>8, R6
   641  	LSL	$56, R5
   642  	ORR	R4>>8, R5
   643  	LSL	$56, R4
   644  
   645  	ADDS	R4, R11			// x3
   646  	ADCS	R5, R12			// x4
   647  	ADCS	R6, R13
   648  	ADCS	R7, R14
   649  	ADCS	R8, R15
   650  	LDP	64(R0), (R16, R17)
   651  	LDP	80(R0), (R19, R20)
   652  	MUL	R3, R25, R4		// x2 * p503p1s8[0]
   653  	UMULH	R3, R25, R7
   654  	ADCS	R9, R16
   655  	ADCS	R10, R17
   656  	ADCS	ZR, R19
   657  	ADCS	ZR, R20
   658  	LDP	96(R0), (R21, R22)
   659  	LDP	112(R0), (R23, R24)
   660  	MUL	R3, R26, R5		// x2 * p503p1s8[1]
   661  	UMULH	R3, R26, R6
   662  	ADCS	ZR, R21
   663  	ADCS	ZR, R22
   664  	ADCS	ZR, R23
   665  	ADC	ZR, R24
   666  
   667  	// [x2,x3] * p503p1s8 to R4-R9
   668  	mul128x256comba(R3, R11, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2)
   669  
   670  	ORR	R9>>8, ZR, R10
   671  	LSL	$56, R9
   672  	ORR	R8>>8, R9
   673  	LSL	$56, R8
   674  	ORR	R7>>8, R8
   675  	LSL	$56, R7
   676  	ORR	R6>>8, R7
   677  	LSL	$56, R6
   678  	ORR	R5>>8, R6
   679  	LSL	$56, R5
   680  	ORR	R4>>8, R5
   681  	LSL	$56, R4
   682  
   683  	ADDS	R4, R13			// x5
   684  	ADCS	R5, R14			// x6
   685  	ADCS	R6, R15
   686  	ADCS	R7, R16
   687  	MUL	R12, R25, R4		// x4 * p503p1s8[0]
   688  	UMULH	R12, R25, R7
   689  	ADCS	R8, R17
   690  	ADCS	R9, R19
   691  	ADCS	R10, R20
   692  	ADCS	ZR, R21
   693  	MUL	R12, R26, R5		// x4 * p503p1s8[1]
   694  	UMULH	R12, R26, R6
   695  	ADCS	ZR, R22
   696  	ADCS	ZR, R23
   697  	ADC	ZR, R24
   698  
   699  	// [x4,x5] * p503p1s8 to R4-R9
   700  	mul128x256comba(R12, R13, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2)
   701  
   702  	ORR	R9>>8, ZR, R10
   703  	LSL	$56, R9
   704  	ORR	R8>>8, R9
   705  	LSL	$56, R8
   706  	ORR	R7>>8, R8
   707  	LSL	$56, R7
   708  	ORR	R6>>8, R7
   709  	LSL	$56, R6
   710  	ORR	R5>>8, R6
   711  	LSL	$56, R5
   712  	ORR	R4>>8, R5
   713  	LSL	$56, R4
   714  
   715  	ADDS	R4, R15			// x7
   716  	ADCS	R5, R16			// x8
   717  	ADCS	R6, R17
   718  	ADCS	R7, R19
   719  	MUL	R14, R25, R4		// x6 * p503p1s8[0]
   720  	UMULH	R14, R25, R7
   721  	ADCS	R8, R20
   722  	ADCS	R9, R21
   723  	ADCS	R10, R22
   724  	MUL	R14, R26, R5		// x6 * p503p1s8[1]
   725  	UMULH	R14, R26, R6
   726  	ADCS	ZR, R23
   727  	ADC	ZR, R24
   728  
   729  	// [x6,x7] * p503p1s8 to R4-R9
   730  	mul128x256comba(R14, R15, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2)
   731  
   732  	ORR	R9>>8, ZR, R10
   733  	LSL	$56, R9
   734  	ORR	R8>>8, R9
   735  	LSL	$56, R8
   736  	ORR	R7>>8, R8
   737  	LSL	$56, R7
   738  	ORR	R6>>8, R7
   739  	LSL	$56, R6
   740  	ORR	R5>>8, R6
   741  	LSL	$56, R5
   742  	ORR	R4>>8, R5
   743  	LSL	$56, R4
   744  
   745  	MOVD	z+0(FP), R0
   746  	ADDS	R4, R17
   747  	ADCS	R5, R19
   748  	STP	(R16, R17),  0(R0)	// Store final result to z
   749  	ADCS	R6, R20
   750  	ADCS	R7, R21
   751  	STP	(R19, R20), 16(R0)
   752  	ADCS	R8, R22
   753  	ADCS	R9, R23
   754  	STP	(R21, R22), 32(R0)
   755  	ADC	R10, R24
   756  	STP	(R23, R24), 48(R0)
   757  
   758  	RET
   759  
   760  TEXT ·modP503(SB), NOSPLIT, $0-8
   761  	MOVD	x+0(FP), R0
   762  
   763  	// Keep x in R1-R8, p503 in R9-R14, subtract to R1-R8
   764  	LDP	·P503+16(SB), (R9, R10)
   765  	LDP	0(R0), (R1, R2)
   766  	LDP	16(R0), (R3, R4)
   767  	SUBS	R9, R1
   768  	SBCS	R9, R2
   769  
   770  	LDP	32(R0), (R5, R6)
   771  	LDP	·P503+32(SB), (R11, R12)
   772  	SBCS	R9, R3
   773  	SBCS	R10, R4
   774  
   775  	LDP	48(R0), (R7, R8)
   776  	LDP	·P503+48(SB), (R13, R14)
   777  	SBCS	R11, R5
   778  	SBCS	R12, R6
   779  
   780  	SBCS	R13, R7
   781  	SBCS	R14, R8
   782  	SBC	ZR, ZR, R15
   783  
   784  	// Mask with the borrow and add p503
   785  	AND	R15, R9
   786  	AND	R15, R10
   787  	AND	R15, R11
   788  	AND	R15, R12
   789  	AND	R15, R13
   790  	AND	R15, R14
   791  
   792  	ADDS	R9, R1
   793  	ADCS	R9, R2
   794  	STP	(R1, R2), 0(R0)
   795  	ADCS	R9, R3
   796  	ADCS	R10, R4
   797  	STP	(R3, R4), 16(R0)
   798  	ADCS	R11, R5
   799  	ADCS	R12, R6
   800  	STP	(R5, R6), 32(R0)
   801  	ADCS	R13, R7
   802  	ADCS	R14, R8
   803  	STP	(R7, R8), 48(R0)
   804  
   805  	RET