github.com/cloudflare/circl@v1.5.0/dh/sidh/internal/p751/arith_arm64.s (about)

     1  // +build arm64,!purego
     2  
     3  #include "textflag.h"
     4  
     5  TEXT ·cmovP751(SB), NOSPLIT, $0-17
        	// Branch-free conditional move over 96 bytes, handled as six pairs of
        	// 64-bit words: with choice == 0 the words of x are written back
        	// unchanged, with any other choice they are overwritten by the words of y.
        	// y is only read.
     6  	MOVD	x+0(FP), R0
     7  	MOVD	y+8(FP), R1
     8  	MOVB	choice+16(FP), R2
     9  
    10  	// Test choice once; none of the LDP/CSEL/STP below touch the flags.
    11  	// NE (choice != 0) picks the word of y, otherwise the word of x is kept.
    12  	CMP	$0, R2
    13  
    14  	LDP	0(R1), (R5, R6)
    15  	LDP	0(R0), (R3, R4)
    16  	CSEL	NE, R5, R3, R3
    17  	CSEL	NE, R6, R4, R4
    18  	STP	(R3, R4), 0(R0)
    19  
    20  	LDP	16(R1), (R5, R6)
    21  	LDP	16(R0), (R3, R4)
    22  	CSEL	NE, R5, R3, R3
    23  	CSEL	NE, R6, R4, R4
    24  	STP	(R3, R4), 16(R0)
    25  
    26  	LDP	32(R1), (R5, R6)
    27  	LDP	32(R0), (R3, R4)
    28  	CSEL	NE, R5, R3, R3
    29  	CSEL	NE, R6, R4, R4
    30  	STP	(R3, R4), 32(R0)
    31  
    32  	LDP	48(R1), (R5, R6)
    33  	LDP	48(R0), (R3, R4)
    34  	CSEL	NE, R5, R3, R3
    35  	CSEL	NE, R6, R4, R4
    36  	STP	(R3, R4), 48(R0)
    37  
    38  	LDP	64(R1), (R5, R6)
    39  	LDP	64(R0), (R3, R4)
    40  	CSEL	NE, R5, R3, R3
    41  	CSEL	NE, R6, R4, R4
    42  	STP	(R3, R4), 64(R0)
    43  
    44  	LDP	80(R1), (R5, R6)
    45  	LDP	80(R0), (R3, R4)
    46  	CSEL	NE, R5, R3, R3
    47  	CSEL	NE, R6, R4, R4
    48  	STP	(R3, R4), 80(R0)
    49  
    50  	RET
    51  
    52  TEXT ·cswapP751(SB), NOSPLIT, $0-17
        	// Branch-free conditional swap of the 96 bytes at x and the 96 bytes at y,
        	// processed as six pairs of 64-bit words. With choice == 0 both operands
        	// are written back unchanged; with any other choice they are exchanged.
    53  	MOVD	x+0(FP), R0
    54  	MOVD	y+8(FP), R1
    55  	MOVB	choice+16(FP), R2
    56  
    57  	// Test choice once; none of the LDP/CSEL/STP below touch the flags.
    58  	// Every nonzero choice (not only 1) results in a complete swap.
    59  	CMP	$0, R2
    60  
    61  	LDP	0(R0), (R3, R4)
    62  	LDP	0(R1), (R5, R6)
    63  	CSEL	NE, R5, R3, R7
    64  	CSEL	NE, R6, R4, R8
    65  	CSEL	EQ, R5, R3, R9
    66  	CSEL	EQ, R6, R4, R10
    67  	STP	(R7, R8), 0(R0)
    68  	STP	(R9, R10), 0(R1)
    69  
    70  	LDP	16(R0), (R3, R4)
    71  	LDP	16(R1), (R5, R6)
    72  	CSEL	NE, R5, R3, R7
    73  	CSEL	NE, R6, R4, R8
    74  	CSEL	EQ, R5, R3, R9
    75  	CSEL	EQ, R6, R4, R10
    76  	STP	(R7, R8), 16(R0)
    77  	STP	(R9, R10), 16(R1)
    78  
    79  	LDP	32(R0), (R3, R4)
    80  	LDP	32(R1), (R5, R6)
    81  	CSEL	NE, R5, R3, R7
    82  	CSEL	NE, R6, R4, R8
    83  	CSEL	EQ, R5, R3, R9
    84  	CSEL	EQ, R6, R4, R10
    85  	STP	(R7, R8), 32(R0)
    86  	STP	(R9, R10), 32(R1)
    87  
    88  	LDP	48(R0), (R3, R4)
    89  	LDP	48(R1), (R5, R6)
    90  	CSEL	NE, R5, R3, R7
    91  	CSEL	NE, R6, R4, R8
    92  	CSEL	EQ, R5, R3, R9
    93  	CSEL	EQ, R6, R4, R10
    94  	STP	(R7, R8), 48(R0)
    95  	STP	(R9, R10), 48(R1)
    96  
    97  	LDP	64(R0), (R3, R4)
    98  	LDP	64(R1), (R5, R6)
    99  	CSEL	NE, R5, R3, R7
   100  	CSEL	NE, R6, R4, R8
   101  	CSEL	EQ, R5, R3, R9
   102  	CSEL	EQ, R6, R4, R10
   103  	STP	(R7, R8), 64(R0)
   104  	STP	(R9, R10), 64(R1)
   105  
   106  	LDP	80(R0), (R3, R4)
   107  	LDP	80(R1), (R5, R6)
   108  	CSEL	NE, R5, R3, R7
   109  	CSEL	NE, R6, R4, R8
   110  	CSEL	EQ, R5, R3, R9
   111  	CSEL	EQ, R6, R4, R10
   112  	STP	(R7, R8), 80(R0)
   113  	STP	(R9, R10), 80(R1)
   114  
   115  	RET
   116  
   117  TEXT ·addP751(SB), NOSPLIT, $0-24
        	// z = x + y over twelve 64-bit limbs, followed by a branch-free
        	// correction: ·P751x2 is subtracted and, if that subtraction borrows,
        	// added back through an all-ones mask.
        	// R0 = x, R1 = y, R2 = z; R3-R14 hold the running result.
   118  	MOVD	z+0(FP), R2
   119  	MOVD	x+8(FP), R0
   120  	MOVD	y+16(FP), R1
   121  
   122  	// Load first summand into R3-R14
   123  	// Add first summand and second summand and store result in R3-R14
   124  	LDP	0(R0), (R3, R4)
   125  	LDP	0(R1), (R15, R16)
   126  	LDP	16(R0), (R5, R6)
   127  	LDP	16(R1), (R17, R19)
   128  	ADDS	R15, R3
   129  	ADCS	R16, R4
   130  	ADCS	R17, R5
   131  	ADCS	R19, R6
   132  
        	// The loads below do not touch the flags, so the carry chain continues
   133  	LDP	32(R0), (R7, R8)
   134  	LDP	32(R1), (R15, R16)
   135  	LDP	48(R0), (R9, R10)
   136  	LDP	48(R1), (R17, R19)
   137  	ADCS	R15, R7
   138  	ADCS	R16, R8
   139  	ADCS	R17, R9
   140  	ADCS	R19, R10
   141  
   142  	LDP	64(R0), (R11, R12)
   143  	LDP	64(R1), (R15, R16)
   144  	LDP	80(R0), (R13, R14)
   145  	LDP	80(R1), (R17, R19)
   146  	ADCS	R15, R11
   147  	ADCS	R16, R12
   148  	ADCS	R17, R13
        	// Plain ADC: the carry out of the top limb is not recorded
        	// (presumably always zero for valid inputs - confirm with callers)
   149  	ADC	R19, R14
   150  
   151  	// Subtract 2 * p751 in R15-R24 from the result in R3-R14
        	// R16 (word at offset 8) is also used for limbs 2-4: the words at
        	// offsets 16-32 of ·P751x2 are never loaded, so the code relies on
        	// them being equal to the word at offset 8
   152  	LDP	·P751x2+0(SB), (R15, R16)
   153  	SUBS	R15, R3
   154  	SBCS	R16, R4
   155  	LDP	·P751x2+40(SB), (R17, R19)
   156  	SBCS	R16, R5
   157  	SBCS	R16, R6
   158  	SBCS	R16, R7
   159  	LDP	·P751x2+56(SB), (R20, R21)
   160  	SBCS	R17, R8
   161  	SBCS	R19, R9
   162  	LDP	·P751x2+72(SB), (R22, R23)
   163  	SBCS	R20, R10
   164  	SBCS	R21, R11
   165  	MOVD	·P751x2+88(SB), R24
   166  	SBCS	R22, R12
   167  	SBCS	R23, R13
   168  	SBCS	R24, R14
        	// R25 = 0 - 0 - borrow: all ones if the subtraction borrowed, else 0
   169  	SBC	ZR, ZR, R25
   170  
   171  	// If x + y - 2 * p751 < 0, R25 is all ones (otherwise 0) and 2 * p751 should be added
        	// Masking the constant with R25 makes the add-back below a no-op
        	// when no borrow occurred, without branching
   172  	AND	R25, R15
   173  	AND	R25, R16
   174  	AND 	R25, R17
   175  	AND	R25, R19
   176  	AND	R25, R20
   177  	AND	R25, R21
   178  	AND 	R25, R22
   179  	AND	R25, R23
   180  	AND	R25, R24
   181  
        	// Add the masked constant and store each limb pair to z as soon as it is final
   182  	ADDS	R15, R3
   183  	ADCS	R16, R4
   184  	STP	(R3, R4), 0(R2)
   185  	ADCS	R16, R5
   186  	ADCS	R16, R6
   187  	STP	(R5, R6), 16(R2)
   188  	ADCS	R16, R7
   189  	ADCS	R17, R8
   190  	STP	(R7, R8), 32(R2)
   191  	ADCS	R19, R9
   192  	ADCS	R20, R10
   193  	STP	(R9, R10), 48(R2)
   194  	ADCS	R21, R11
   195  	ADCS	R22, R12
   196  	STP	(R11, R12), 64(R2)
   197  	ADCS	R23, R13
   198  	ADC	R24, R14
   199  	STP	(R13, R14), 80(R2)
   200  
   201  	RET
   202  
   203  TEXT ·subP751(SB), NOSPLIT, $0-24
        	// z = x - y over twelve 64-bit limbs; if the subtraction borrows,
        	// ·P751x2 is added back through an all-ones mask (no branches).
        	// R0 = x, R1 = y, R2 = z; R3-R14 hold the running result.
   204  	MOVD	z+0(FP), R2
   205  	MOVD	x+8(FP), R0
   206  	MOVD	y+16(FP), R1
   207  
   208  	// Load x into R3-R14
   209  	// Subtract y from x and store result in R3-R14
   210  	LDP	0(R0), (R3, R4)
   211  	LDP	0(R1), (R15, R16)
   212  	LDP	16(R0), (R5, R6)
   213  	LDP	16(R1), (R17, R19)
   214  	SUBS	R15, R3
   215  	SBCS	R16, R4
   216  	SBCS	R17, R5
   217  	SBCS	R19, R6
   218  
        	// The loads below do not touch the flags, so the borrow chain continues
   219  	LDP	32(R0), (R7, R8)
   220  	LDP	32(R1), (R15, R16)
   221  	LDP	48(R0), (R9, R10)
   222  	LDP	48(R1), (R17, R19)
   223  	SBCS	R15, R7
   224  	SBCS	R16, R8
   225  	SBCS	R17, R9
   226  	SBCS	R19, R10
   227  
   228  	LDP	64(R0), (R11, R12)
   229  	LDP	64(R1), (R15, R16)
   230  	LDP	80(R0), (R13, R14)
   231  	LDP	80(R1), (R17, R19)
   232  	SBCS	R15, R11
   233  	SBCS	R16, R12
   234  	SBCS	R17, R13
   235  	SBCS	R19, R14
        	// R15 = 0 - 0 - borrow: all ones if x - y borrowed, else 0
   236  	SBC	ZR, ZR, R15
   237  
   238  	// If x - y < 0, R15 is all ones (otherwise 0) and 2 * p751 should be added
        	// The constant is loaded and masked in pieces, interleaved with the
        	// additions; LDP/AND/STP leave the carry flag untouched.
        	// R17 (word at offset 8) is also used for limbs 2-4: the words at
        	// offsets 16-32 of ·P751x2 are never loaded, so the code relies on
        	// them being equal to the word at offset 8
   239  	LDP	·P751x2+0(SB), (R16, R17)
   240  	AND	R15, R16
   241  	AND	R15, R17
   242  	LDP	·P751x2+40(SB), (R19, R20)
   243  	AND	R15, R19
   244  	AND	R15, R20
   245  
   246  	ADDS	R16, R3
   247  	ADCS	R17, R4
   248  	STP	(R3, R4), 0(R2)
   249  	ADCS	R17, R5
   250  	ADCS	R17, R6
   251  	STP	(R5, R6), 16(R2)
   252  	ADCS	R17, R7
   253  	ADCS	R19, R8
   254  	STP	(R7, R8), 32(R2)
   255  	ADCS	R20, R9
   256  
        	// R16, R17, R19, R20 are recycled for the words at offsets 56-80
   257  	LDP	·P751x2+56(SB), (R16, R17)
   258  	AND	R15, R16
   259  	AND	R15, R17
   260  	LDP	·P751x2+72(SB), (R19, R20)
   261  	AND	R15, R19
   262  	AND	R15, R20
   263  
   264  	ADCS	R16, R10
   265  	STP	(R9, R10), 48(R2)
   266  	ADCS	R17, R11
   267  	ADCS	R19, R12
   268  	STP	(R11, R12), 64(R2)
   269  	ADCS	R20, R13
   270  
   271  	MOVD	·P751x2+88(SB), R16
   272  	AND	R15, R16
   273  	ADC	R16, R14
   274  	STP	(R13, R14), 80(R2)
   275  
   276  	RET
   277  
   278  TEXT ·adlP751(SB), NOSPLIT, $0-24
        	// z = x + y over twenty-four 64-bit limbs (192 bytes each), as one
        	// uninterrupted carry chain. No reduction is performed and the carry
        	// out of the last limb is not recorded.
        	// R0 = x, R1 = y, R2 = z. Each pair of limbs is stored to z as soon
        	// as it is final; LDP/STP do not touch the flags, so the carry
        	// survives between the groups below.
   279  	MOVD	z+0(FP), R2
   280  	MOVD	x+8(FP), R0
   281  	MOVD	y+16(FP), R1
   282  
        	// Limbs 0-3
   283  	LDP	0(R0), (R3, R4)
   284  	LDP	0(R1), (R15, R16)
   285  	LDP	16(R0), (R5, R6)
   286  	LDP	16(R1), (R17, R19)
   287  	ADDS	R15, R3
   288  	ADCS	R16, R4
   289  	STP	(R3, R4), 0(R2)
   290  	ADCS	R17, R5
   291  	ADCS	R19, R6
   292  	STP	(R5, R6), 16(R2)
   293  
        	// Limbs 4-7
   294  	LDP	32(R0), (R7, R8)
   295  	LDP	32(R1), (R15, R16)
   296  	LDP	48(R0), (R9, R10)
   297  	LDP	48(R1), (R17, R19)
   298  	ADCS	R15, R7
   299  	ADCS	R16, R8
   300  	STP	(R7, R8), 32(R2)
   301  	ADCS	R17, R9
   302  	ADCS	R19, R10
   303  	STP	(R9, R10), 48(R2)
   304  
        	// Limbs 8-11
   305  	LDP	64(R0), (R11, R12)
   306  	LDP	64(R1), (R15, R16)
   307  	LDP	80(R0), (R13, R14)
   308  	LDP	80(R1), (R17, R19)
   309  	ADCS	R15, R11
   310  	ADCS	R16, R12
   311  	STP	(R11, R12), 64(R2)
   312  	ADCS	R17, R13
   313  	ADCS	R19, R14
   314  	STP	(R13, R14), 80(R2)
   315  
        	// Limbs 12-15 (registers R3-R6 are reused)
   316  	LDP	96(R0), (R3, R4)
   317  	LDP	96(R1), (R15, R16)
   318  	LDP	112(R0), (R5, R6)
   319  	LDP	112(R1), (R17, R19)
   320  	ADCS	R15, R3
   321  	ADCS	R16, R4
   322  	STP	(R3, R4), 96(R2)
   323  	ADCS	R17, R5
   324  	ADCS	R19, R6
   325  	STP	(R5, R6), 112(R2)
   326  
        	// Limbs 16-19
   327  	LDP	128(R0), (R7, R8)
   328  	LDP	128(R1), (R15, R16)
   329  	LDP	144(R0), (R9, R10)
   330  	LDP	144(R1), (R17, R19)
   331  	ADCS	R15, R7
   332  	ADCS	R16, R8
   333  	STP	(R7, R8), 128(R2)
   334  	ADCS	R17, R9
   335  	ADCS	R19, R10
   336  	STP	(R9, R10), 144(R2)
   337  
        	// Limbs 20-23; the final ADC leaves the flags alone, so the last
        	// carry out is dropped
   338  	LDP	160(R0), (R11, R12)
   339  	LDP	160(R1), (R15, R16)
   340  	LDP	176(R0), (R13, R14)
   341  	LDP	176(R1), (R17, R19)
   342  	ADCS	R15, R11
   343  	ADCS	R16, R12
   344  	STP	(R11, R12), 160(R2)
   345  	ADCS	R17, R13
   346  	ADC	R19, R14
   347  	STP	(R13, R14), 176(R2)
   348  
   349  	RET
   350  
   351  TEXT ·sulP751(SB), NOSPLIT, $0-24
        	// z = x - y over twenty-four 64-bit limbs (192 bytes each). The low
        	// twelve limbs of the difference are stored as computed; if the whole
        	// subtraction borrows, ·P751 is added to the upper twelve limbs
        	// (offsets 96-184) through an all-ones mask, without branching.
        	// R0 = x, R1 = y, R2 = z.
   352  	MOVD	z+0(FP), R2
   353  	MOVD	x+8(FP), R0
   354  	MOVD	y+16(FP), R1
   355  
        	// Limbs 0-11: subtract and store immediately; LDP/STP keep the
        	// borrow flag intact between groups
   356  	LDP	0(R0), (R3, R4)
   357  	LDP	0(R1), (R15, R16)
   358  	LDP	16(R0), (R5, R6)
   359  	LDP	16(R1), (R17, R19)
   360  	SUBS	R15, R3
   361  	SBCS	R16, R4
   362  	STP	(R3, R4), 0(R2)
   363  	SBCS	R17, R5
   364  	SBCS	R19, R6
   365  	STP	(R5, R6), 16(R2)
   366  
   367  	LDP	32(R0), (R7, R8)
   368  	LDP	32(R1), (R15, R16)
   369  	LDP	48(R0), (R9, R10)
   370  	LDP	48(R1), (R17, R19)
   371  	SBCS	R15, R7
   372  	SBCS	R16, R8
   373  	STP	(R7, R8), 32(R2)
   374  	SBCS	R17, R9
   375  	SBCS	R19, R10
   376  	STP	(R9, R10), 48(R2)
   377  
   378  	LDP	64(R0), (R11, R12)
   379  	LDP	64(R1), (R15, R16)
   380  	LDP	80(R0), (R13, R14)
   381  	LDP	80(R1), (R17, R19)
   382  	SBCS	R15, R11
   383  	SBCS	R16, R12
   384  	STP	(R11, R12), 64(R2)
   385  	SBCS	R17, R13
   386  	SBCS	R19, R14
   387  	STP	(R13, R14), 80(R2)
   388  
        	// Limbs 12-23: kept in R3-R14 (not stored yet) because they may
        	// still need the correction below
   389  	LDP	96(R0), (R3, R4)
   390  	LDP	96(R1), (R15, R16)
   391  	LDP	112(R0), (R5, R6)
   392  	LDP	112(R1), (R17, R19)
   393  	SBCS	R15, R3
   394  	SBCS	R16, R4
   395  	SBCS	R17, R5
   396  	SBCS	R19, R6
   397  
   398  	LDP	128(R0), (R7, R8)
   399  	LDP	128(R1), (R15, R16)
   400  	LDP	144(R0), (R9, R10)
   401  	LDP	144(R1), (R17, R19)
   402  	SBCS	R15, R7
   403  	SBCS	R16, R8
   404  	SBCS	R17, R9
   405  	SBCS	R19, R10
   406  
   407  	LDP	160(R0), (R11, R12)
   408  	LDP	160(R1), (R15, R16)
   409  	LDP	176(R0), (R13, R14)
   410  	LDP	176(R1), (R17, R19)
   411  	SBCS	R15, R11
   412  	SBCS	R16, R12
   413  	SBCS	R17, R13
   414  	SBCS	R19, R14
        	// R15 = 0 - 0 - borrow: all ones if x - y borrowed, else 0
   415  	SBC	ZR, ZR, R15
   416  
   417  	// If x - y < 0, R15 is all ones (otherwise 0) and p751 should be added
        	// R20 (word at offset 0) is used for limbs 0-4 of the constant: the
        	// words at offsets 8-32 of ·P751 are never loaded, so the code relies
        	// on them being equal to the word at offset 0.
        	// Constant loads and masking are interleaved with the carry chain;
        	// MOVD/LDP/AND/STP do not modify the flags.
   418  	MOVD	·P751+0(SB), R20
   419  	AND	R15, R20
   420  	LDP	·P751+40(SB), (R16, R17)
   421  	ADDS	R20, R3
   422  	ADCS	R20, R4
   423  	STP	(R3, R4), 96(R2)
   424  	ADCS	R20, R5
   425  	ADCS	R20, R6
   426  	STP	(R5, R6), 112(R2)
   427  	ADCS	R20, R7
   428  
        	// R20 is free again from here and is reloaded with the word at offset 64
   429  	LDP	·P751+56(SB), (R19, R20)
   430  	AND	R15, R16
   431  	AND 	R15, R17
   432  	ADCS	R16, R8
   433  	STP	(R7, R8), 128(R2)
   434  	ADCS	R17, R9
   435  
   436  	LDP	·P751+72(SB), (R16, R17)
   437  	AND	R15, R19
   438  	AND	R15, R20
   439  	ADCS	R19, R10
   440  	STP	(R9, R10), 144(R2)
   441  	ADCS	R20, R11
   442  
   443  	MOVD	·P751+88(SB), R19
   444  	AND 	R15, R16
   445  	AND 	R15, R17
   446  	ADCS	R16, R12
   447  	STP	(R11, R12), 160(R2)
   448  	ADCS	R17, R13
   449  
   450  	AND	R15, R19
   451  	ADC	R19, R14
   452  	STP	(R13, R14), 176(R2)
   453  
   454  	RET
   455  
   456  // Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
   457  // Z0 is not actually touched
   458  // Result of (X0-X2) * (Y0-Y2) will be in Z0-Z5
   459  // Inputs remain intact
        // Here "(X0-X2) * (Y0-Y2)" denotes the product of the three limbs
        // X0, X1, X2 with the three limbs Y0, Y1, Y2 (a register range, not a
        // subtraction). The remaining seven partial products are computed with
        // MUL/UMULH into the scratch registers T0-T3 and accumulated into Z1-Z5;
        // Z4 and Z5 are initialised by the macro itself. T0-T3 and the condition
        // flags are overwritten.
   460  #define mul192x192comba(X0, X1, X2, Y0, Y1, Y2, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3) \
   461  	MUL	X1, Y0, T2	\
   462  	UMULH	X1, Y0, T3	\
   463  				\
   464  	ADDS	Z3, Z1		\
   465  	ADCS	ZR, Z2		\
   466  	ADC	ZR, ZR, Z3	\
   467  				\
   468  	MUL	X0, Y2, T0	\
   469  	UMULH	X0, Y2, T1	\
   470  				\
   471  	ADDS	T2, Z1		\
   472  	ADCS	T3, Z2		\
   473  	ADC	ZR, Z3		\
   474  				\
   475  	MUL	X1, Y1, T2	\
   476  	UMULH	X1, Y1, T3	\
   477  				\
   478  	ADDS	T0, Z2		\
   479  	ADCS	T1, Z3		\
   480  	ADC	ZR, ZR, Z4	\
   481  				\
   482  	MUL	X2, Y0, T0	\
   483  	UMULH	X2, Y0, T1	\
   484  				\
   485  	ADDS	T2, Z2		\
   486  	ADCS	T3, Z3		\
   487  	ADC	ZR, Z4		\
   488  				\
   489  	MUL	X1, Y2, T2	\
   490  	UMULH	X1, Y2, T3	\
   491  				\
   492  	ADDS	T0, Z2		\
   493  	ADCS	T1, Z3		\
   494  	ADC	ZR, Z4		\
   495  				\
   496  	MUL	X2, Y1, T0	\
   497  	UMULH	X2, Y1, T1	\
   498  				\
   499  	ADDS	T2, Z3		\
   500  	ADCS	T3, Z4		\
   501  	ADC	ZR, ZR, Z5	\
   502  				\
   503  	MUL	X2, Y2, T2	\
   504  	UMULH	X2, Y2, T3	\
   505  				\
   506  	ADDS	T0, Z3		\
   507  	ADCS	T1, Z4		\
   508  	ADC	ZR, Z5		\
   509  				\
   510  	ADDS	T2, Z4		\
   511  	ADC	T3, Z5
   512  
   513  // Expects that X points to (X4-X6), Y to (Y4-Y6)
   514  // Result of (X0-X5) * (Y0-Y5) will be in (0(Z), 8(Z), 16(Z), T0-T8)
   515  // Inputs get overwritten
        // "(X0-X5) * (Y0-Y5)" denotes the product of the six limbs X0..X5 with the
        // six limbs Y0..Y5, computed with three mul192x192comba invocations
        // (sum of halves, low halves, high halves).
        // X, Y and Z are memory operands, used as 0+X, 16+X, 0+Y, 16+Y, 0+Z, 16+Z.
        // After X3-X5 / Y3-Y5 have been overwritten, the macro reloads them from
        // X and Y, so X and Y must address copies of the original X3-X5 and Y3-Y5.
        // NOTE(review): the "X4-X6" / "Y4-Y6" naming above looks 1-based; the
        // loads actually target the registers passed as X3-X5 / Y3-Y5.
        // Z is first used as temporary storage for the masked sum and finally
        // receives the low three limbs of the product.
        // T6-T9 serve as scratch for mul192x192comba (T6-T8 receive the top three
        // result limbs at the end); T10 collects the carry word of the middle
        // term. The condition flags are overwritten.
   516  #define mul384x384karatsuba(X, Y, Z, X0, X1, X2, X3, X4, X5, Y0, Y1, Y2, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10)\
   517  	ADDS	X0, X3		\	// xH + xL, destroys xH
   518  	ADCS	X1, X4		\
   519  	ADCS	X2, X5		\
   520  	ADC	ZR, ZR, T10	\
   521  				\
   522  	ADDS	Y0, Y3		\	// yH + yL, destroys yH
   523  	ADCS	Y1, Y4		\
   524  	ADCS	Y2, Y5		\
   525  	ADC	ZR, ZR, T6	\
   526  				\
   527  	SUB	T10, ZR, T7	\
   528  	SUB	T6, ZR, T8	\
   529  	AND	T6, T10		\	// combined carry
   530  				\
   531  	AND	T7, Y3, T0	\	// masked(yH + yL)
   532  	AND	T7, Y4, T1	\
   533  	AND	T7, Y5, T2	\
   534  				\
   535  	AND	T8, X3, T3	\	// masked(xH + xL)
   536  	AND	T8, X4, T4	\
   537  	AND	T8, X5, T5	\
   538  				\
   539  	ADDS	T3, T0		\
   540  	ADCS	T4, T1		\
   541  	STP	(T0, T1), 0+Z	\
   542  				\
   543  	MUL	X3, Y3, T0	\
   544  	MUL	X3, Y4, T1	\
   545  				\
   546  	ADCS	T5, T2		\
   547  	MOVD	T2, 16+Z	\
   548  				\
   549  	UMULH	X3, Y4, T2	\
   550  	UMULH	X3, Y3, T3	\
   551  				\
   552  	ADC	ZR, T10		\
   553  				\	// (xH + xL) * (yH + yL)
   554  	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
   555  				\
   556  	MUL	X0, Y0, X3	\
   557  	LDP	0+Z, (T6, T7)	\
   558  	MOVD	16+Z, T8	\
   559  				\
   560  	UMULH	X0, Y0, Y3	\
   561  	ADDS	T6, T3		\
   562  	ADCS	T7, T4		\
   563  	MUL	X0, Y1, X4	\
   564  	ADCS	T8, T5		\
   565  	ADC	ZR, T10		\
   566  	UMULH	X0, Y1, X5	\
   567  				\	// xL * yL
   568  	mul192x192comba(X0, X1, X2, Y0, Y1, Y2, X3, X4, X5, Y3, Y4, Y5, T6, T7, T8, T9)\
   569  				\
   570  	STP	(X3, X4), 0+Z	\
   571  	MOVD	X5, 16+Z	\
   572  				\
   573  	SUBS	X3, T0		\	// (xH + xL) * (yH + yL) - xL * yL
   574  	SBCS	X4, T1		\
   575  	LDP	0+X, (X3, X4)	\
   576  	SBCS	X5, T2		\
   577  	MOVD	16+X, X5	\
   578  	SBCS	Y3, T3		\
   579  	SBCS	Y4, T4		\
   580  	SBCS	Y5, T5		\
   581  	SBC	ZR, T10		\
   582  				\
   583  	ADDS	Y3, T0		\	// ((xH + xL) * (yH + yL) - xL * yL) * 2^192 + xL * yL
   584  	ADCS	Y4, T1		\
   585  	LDP	0+Y, (Y3, Y4)	\
   586  	MUL	X3, Y3, X0	\
   587  	ADCS	Y5, T2		\
   588  	UMULH	X3, Y3, Y0	\
   589  	MOVD	16+Y, Y5	\
   590  	MUL	X3, Y4, X1	\
   591  	ADCS	ZR, T3		\
   592  	UMULH	X3, Y4, X2	\
   593  	ADCS	ZR, T4		\
   594  	ADCS	ZR, T5		\
   595  	ADC	ZR, T10		\
   596  				\	// xH * yH, overwrite xLow, yLow
   597  	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, X0, X1, X2, Y0, Y1, Y2, T6, T7, T8, T9)\
   598  				\
   599  	SUBS	X0, T0		\	// ((xH + xL) * (yH + yL) - xL * yL - xH * yH)
   600  	SBCS	X1, T1		\
   601  	SBCS	X2, T2		\
   602  	SBCS	Y0, T3		\
   603  	SBCS	Y1, T4		\
   604  	SBCS	Y2, T5		\
   605  	SBC	ZR, T10		\
   606  				\
   607  	ADDS	X0, T3		\
   608  	ADCS	X1, T4		\
   609  	ADCS	X2, T5		\
   610  	ADCS	T10, Y0, T6	\
   611  	ADCS	ZR, Y1, T7	\
   612  	ADC	ZR, Y2, T8
   613  
   614  
   615  TEXT ·mulP751(SB), NOSPLIT, $0-24
        	// z = x * y: x and y are read as twelve 64-bit limbs each (offsets
        	// 0-88), z receives twenty-four limbs (offsets 0-184). No reduction
        	// is performed here.
        	// One Karatsuba step splits the operands into six-limb halves
        	// (xL, xH, yL, yH); the three half-size products are computed by
        	// mul384x384karatsuba. z doubles as scratch space while the product is
        	// assembled, and R29 carries the top carry/borrow word of the middle
        	// term across the macro invocations.
        	// R0 = x, R1 = y, R2 = z.
        	// NOTE(review): R29 is the frame-pointer register in the Go arm64
        	// ABI; it is overwritten below and not restored before RET in this
        	// frameless function - confirm this is acceptable for frame-pointer
        	// based unwinding.
   616  	MOVD	z+0(FP), R2
   617  	MOVD	x+8(FP), R0
   618  	MOVD	y+16(FP), R1
   619  
   620  	// Load xL in R3-R8, xH in R9-R14
   621  	// (xH + xL) in R3-R8, destroys xH
   622  	LDP	0(R0), (R3, R4)
   623  	LDP	48(R0), (R9, R10)
   624  	ADDS	R9, R3
   625  	ADCS	R10, R4
   626  	LDP	16(R0), (R5, R6)
   627  	LDP	64(R0), (R11, R12)
   628  	ADCS	R11, R5
   629  	ADCS	R12, R6
   630  	LDP	32(R0), (R7, R8)
   631  	LDP	80(R0), (R13, R14)
   632  	ADCS	R13, R7
   633  	ADCS	R14, R8
        	// R22 = carry out of xH + xL (0 or 1)
   634  	ADC	ZR, ZR, R22
   635  
   636  	// Load yL in R9-R14, yH in R15-R21
   637  	// (yH + yL) in R9-R14, destroys yH
   638  	LDP	0(R1), (R9, R10)
   639  	LDP	48(R1), (R15, R16)
   640  	ADDS	R15, R9
   641  	ADCS	R16, R10
   642  	LDP	16(R1), (R11, R12)
   643  	LDP	64(R1), (R17, R19)
   644  	ADCS	R17, R11
   645  	ADCS	R19, R12
   646  	LDP	32(R1), (R13, R14)
   647  	LDP	80(R1), (R20, R21)
   648  	ADCS	R20, R13
   649  	ADCS	R21, R14
        	// R23 = carry out of yH + yL (0 or 1)
   650  	ADC	ZR, ZR, R23
   651  
   652  	// Compute masks and combined carry
        	// R24 = 0 - R22 and R25 = 0 - R23 are all-ones masks when the
        	// respective sum carried; R22 becomes the AND of both carries
   653  	SUB	R22, ZR, R24
   654  	SUB	R23, ZR, R25
   655  	AND	R23, R22
   656  
   657  	// Store xH, yH in z so mul384x384karatsuba can retrieve them from memory
   658  	// It doesn't have enough registers
   659  	// Meanwhile computed masked(xH + xL) in R15-R21
        	// "xH, yH" here are the upper three limbs of the two 384-bit sums
        	// (R6-R8 and R12-R14), i.e. the high halves of the operands handed
        	// to the macro; they land at z+0..z+16 and z+24..z+40
   660  	STP	(R6, R7), 0(R2)
   661  	AND	R25, R3, R15
   662  	AND	R25, R4, R16
   663  	STP	(R8, R12), 16(R2)
   664  	AND	R25, R5, R17
   665  	AND	R25, R6, R19
   666  	STP	(R13, R14), 32(R2)
   667  	AND	R25, R7, R20
   668  	AND	R25, R8, R21
   669  
   670  	// Masked(xH + xL) + masked(yH + yL) in R15-R21
   671  	// Store intermediate values in z
        	// R25/R26 are reused as temporaries for the masked limbs of the y sum
   672  	AND	R24, R9, R25
   673  	AND	R24, R10, R26
   674  	ADDS	R25, R15
   675  	ADCS	R26, R16
   676  	STP	(R15, R16), 96(R2)
   677  	AND	R24, R11, R25
   678  	AND	R24, R12, R26
   679  	ADCS	R25, R17
   680  	ADCS	R26, R19
   681  	STP	(R17, R19), 112(R2)
   682  	AND	R24, R13, R25
   683  	AND	R24, R14, R26
   684  	ADCS	R25, R20
   685  	ADCS	R26, R21
   686  	STP	(R20, R21), 128(R2)
   687  	// Store carry in R29 so it can remain there
   688  	ADC	ZR, R22, R29
   689  
   690  	// (xH + xL) * (yH + yL)
        	// Macro operands: X = 0(R2) and Y = 24(R2) are the copies saved
        	// above; the low three result limbs go to z+48..z+64 and the
        	// remaining nine stay in R15-R17, R19-R24
   691  	mul384x384karatsuba(0(R2), 24(R2), 48(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)
   692  
   693  	// Load masked(xH + xL) + masked(yH + yL) and add that to its top half
   694  	// Store the result back in z
        	// After this block z+48..z+136 holds the twelve limbs of the
        	// product of the sums, with its carry word in R29
   695  	STP	(R15, R16), 72(R2)
   696  	LDP	96(R2), (R3, R4)
   697  	ADDS	R3, R19
   698  	STP	(R17, R19), 88(R2)
   699  	ADCS	R4, R20
   700  	LDP	112(R2), (R5, R6)
   701  	ADCS	R5, R21
   702  	STP	(R20, R21), 104(R2)
   703  	ADCS	R6, R22
   704  	LDP	128(R2), (R7, R8)
   705  	ADCS	R7, R23
   706  	STP	(R22, R23), 120(R2)
   707  	ADCS	R8, R24
   708  	MOVD	R24, 136(R2)
   709  	ADC	ZR, R29
   710  
   711  	// Load xL, yL
   712  	LDP	0(R0), (R3, R4)
   713  	LDP	16(R0), (R5, R6)
   714  	LDP	32(R0), (R7, R8)
   715  	LDP	0(R1), (R9, R10)
   716  	LDP	16(R1), (R11, R12)
   717  	LDP	32(R1), (R13, R14)
   718  
   719  	// xL * yL
        	// X = 24(R0) and Y = 24(R1) address the upper three limbs of xL and
        	// yL directly in the inputs; the low three result limbs go to
        	// z+0..z+16, the remaining nine stay in R15-R17, R19-R24
   720  	mul384x384karatsuba(24(R0), 24(R1), 0(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)
   721  
   722  	// (xH + xL) * (yH + yL) - xL * yL in R3-R14
        	// Limbs 0-2 of xL * yL are read back from z+0..z+16, limbs 3-11 are
        	// still in R15-R17, R19-R24; the final borrow is taken from R29
   723  	LDP	0(R2), (R12, R13)
   724  	LDP	48(R2), (R3, R4)
   725  	SUBS	R12, R3
   726  	LDP	64(R2), (R5, R6)
   727  	MOVD	16(R2), R14
   728  	SBCS	R13, R4
   729  	SBCS	R14, R5
   730  	LDP	80(R2), (R7, R8)
   731  	SBCS	R15, R6
   732  	SBCS	R16, R7
   733  	LDP	96(R2), (R9, R10)
   734  	SBCS	R17, R8
   735  	SBCS	R19, R9
   736  	LDP	112(R2), (R11, R12)
   737  	SBCS	R20, R10
   738  	SBCS	R21, R11
   739  	LDP	128(R2), (R13, R14)
   740  	SBCS	R22, R12
   741  	SBCS	R23, R13
   742  	SBCS	R24, R14
   743  	SBC	ZR, R29
   744  
        	// Limbs 3-5 of xL * yL are final: store them to z+24..z+40
   745  	STP	(R15, R16), 24(R2)
   746  	MOVD	R17, 40(R2)
   747  
   748  	// ((xH + xL) * (yH + yL) - xL * yL) * 2^384 + xL * yL and store back in z
        	// Only limbs 6-11 of xL * yL (R19-R24) overlap the middle term; the
        	// carry is then rippled through the rest with ZR
   749  	ADDS	R19, R3
   750  	ADCS	R20, R4
   751  	STP	(R3, R4), 48(R2)
   752  	ADCS	R21, R5
   753  	ADCS	R22, R6
   754  	STP	(R5, R6), 64(R2)
   755  	ADCS	R23, R7
   756  	ADCS	R24, R8
   757  	STP	(R7, R8), 80(R2)
   758  	ADCS	ZR, R9
   759  	ADCS	ZR, R10
   760  	STP	(R9, R10), 96(R2)
   761  	ADCS	ZR, R11
   762  	ADCS	ZR, R12
   763  	STP	(R11, R12), 112(R2)
   764  	ADCS	ZR, R13
   765  	ADCS	ZR, R14
   766  	STP	(R13, R14), 128(R2)
   767  	ADC	ZR, R29
   768  
   769  	// Load xH, yH
   770  	LDP	48(R0), (R3, R4)
   771  	LDP	64(R0), (R5, R6)
   772  	LDP	80(R0), (R7, R8)
   773  	LDP	48(R1), (R9, R10)
   774  	LDP	64(R1), (R11, R12)
   775  	LDP	80(R1), (R13, R14)
   776  
   777  	// xH * yH
        	// X = 72(R0) and Y = 72(R1) address the upper three limbs of xH and
        	// yH; the low three result limbs go to z+144..z+160, the remaining
        	// nine stay in R15-R17, R19-R24
   778  	mul384x384karatsuba(72(R0), 72(R1), 144(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)
   779  
        	// Limbs 0-2 of xH * yH, read back from z
   780  	LDP	144(R2), (R12, R13)
   781  	MOVD	160(R2), R14
   782  
   783  	// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R3-R14
   784  	// Store lower half in z, that's done
        	// The stores of R3-R8 are interleaved with the tail of the borrow
        	// chain; STP does not modify the flags
   785  	LDP	48(R2), (R3, R4)
   786  	SUBS	R12, R3
   787  	LDP	64(R2), (R5, R6)
   788  	SBCS	R13, R4
   789  	SBCS	R14, R5
   790  	LDP	80(R2), (R7, R8)
   791  	SBCS	R15, R6
   792  	SBCS	R16, R7
   793  	LDP	96(R2), (R9, R10)
   794  	SBCS	R17, R8
   795  	SBCS	R19, R9
   796  	LDP	112(R2), (R11, R12)
   797  	SBCS	R20, R10
   798  	SBCS	R21, R11
   799  	LDP	128(R2), (R13, R14)
   800  	SBCS	R22, R12
   801  	SBCS	R23, R13
   802  	STP	(R3, R4), 48(R2)
   803  	SBCS	R24, R14
   804  	STP	(R5, R6), 64(R2)
   805  	SBC	ZR, R29
   806  	STP	(R7, R8), 80(R2)
   807  
   808  	// (xH * yH) * 2^768 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^384 + xL * yL
   809  	// Store remaining limbs in z
        	// Limbs 0-2 of xH * yH are reloaded from z+144..z+160 before those
        	// slots are overwritten with final result limbs below
   810  	LDP	144(R2), (R3, R4)
   811  	MOVD	160(R2), R5
   812  
   813  	ADDS	R3, R9
   814  	ADCS	R4, R10
   815  	STP	(R9, R10), 96(R2)
   816  	ADCS	R5, R11
   817  	ADCS	R15, R12
   818  	STP	(R11, R12), 112(R2)
   819  	ADCS	R16, R13
   820  	ADCS	R17, R14
   821  	STP	(R13, R14), 128(R2)
   822  
        	// The carry word of the middle term (R29) enters at limb 18 and is
        	// rippled through the top limbs of xH * yH
   823  	ADCS	R29, R19
   824  	ADCS	ZR, R20
   825  	STP	(R19, R20), 144(R2)
   826  	ADCS	ZR, R21
   827  	ADCS	ZR, R22
   828  	STP	(R21, R22), 160(R2)
   829  	ADCS	ZR, R23
   830  	ADC	ZR, R24
   831  	STP	(R23, R24), 176(R2)
   832  
   833  	RET
   834  
   835  TEXT ·rdcP751(SB), NOSPLIT, $0-16
   836  	MOVD	z+0(FP), R0
   837  	MOVD	x+8(FP), R1
   838  
   839  	// Load p751+1 in R14-R17, R29, R19-R20, spread over arithmetic
   840  	LDP	·P751p1+40(SB), (R14, R15)
   841  	// z0-z11 will be R2-R13
   842  	// Load x0-x4 to z0-z4 and x5, spread over arithmetic
   843  	LDP	0(R1), (R2, R3)
   844  
   845  	// x5 iteration
   846  	MUL	R2, R14, R22
   847  	LDP	32(R1), (R6, R21)
   848  	UMULH	R2, R14, R23
   849  	ADDS	R21, R22, R7	// Set z5
   850  	ADC	ZR, R23, R25
   851  
   852  	// x6 iteration
   853  	MUL	R2, R15, R22
   854  	MOVD	48(R1), R21
   855  	UMULH	R2, R15, R23
   856  	ADDS	R22, R25
   857  	ADC	R23, ZR, R26
   858  
   859  	MUL	R3, R14, R22
   860  	LDP	·P751p1+56(SB), (R16, R17)
   861  	UMULH	R3, R14, R23
   862  	ADDS	R22, R25
   863  	ADCS	R23, R26
   864  	ADC	ZR, ZR, R24
   865  
   866  	ADDS	R21, R25, R8	// Set z6
   867  	ADCS	ZR, R26
   868  	ADC	ZR, R24
   869  
   870  	// x7 iteration
   871  	MUL	R2, R16, R22
   872  	MOVD	56(R1), R21
   873  	UMULH	R2, R16, R23
   874  	ADDS	R22, R26
   875  	ADCS	R23, R24
   876  	ADC	ZR, ZR, R25
   877  
   878  	MUL	R3, R15, R22
   879  	LDP	16(R1), (R4, R5)
   880  	UMULH	R3, R15, R23
   881  	ADDS	R22, R26
   882  	ADCS	R23, R24
   883  	ADC	ZR, R25
   884  
   885  	MUL	R4, R14, R22
   886  	LDP	·P751p1+72(SB), (R29, R19)
   887  	UMULH	R4, R14, R23
   888  	ADDS	R22, R26
   889  	ADCS	R23, R24
   890  	ADC	ZR, R25
   891  
   892  	ADDS	R21, R26, R9	// Set z7
   893  	ADCS	ZR, R24
   894  	ADC	ZR, R25
   895  
   896  	// x8 iteration
   897  	MUL	R2, R17, R22
   898  	MOVD	64(R1), R21
   899  	UMULH	R2, R17, R23
   900  	ADDS	R22, R24
   901  	ADCS	R23, R25
   902  	ADC	ZR, ZR, R26
   903  
   904  	MUL	R3, R16, R22
   905  	MOVD	·P751p1+88(SB), R20
   906  	UMULH	R3, R16, R23
   907  	ADDS	R22, R24
   908  	ADCS	R23, R25
   909  	ADC	ZR, R26
   910  
   911  	MUL	R4, R15, R22
   912  	UMULH	R4, R15, R23
   913  	ADDS	R22, R24
   914  	ADCS	R23, R25
   915  	ADC	ZR, R26
   916  
   917  	MUL	R5, R14, R22
   918  	UMULH	R5, R14, R23
   919  	ADDS	R22, R24
   920  	ADCS	R23, R25
   921  	ADC	ZR, R26
   922  
   923  	ADDS	R24, R21, R10	// Set z8
   924  	ADCS	ZR, R25
   925  	ADC	ZR, R26
   926  
   927  	// x9 iteration
   928  	MUL	R2, R29, R22
   929  	MOVD	72(R1), R21
   930  	UMULH	R2, R29, R23
   931  	ADDS	R22, R25
   932  	ADCS	R23, R26
   933  	ADC	ZR, ZR, R24
   934  
   935  	MUL	R3, R17, R22
   936  	UMULH	R3, R17, R23
   937  	ADDS	R22, R25
   938  	ADCS	R23, R26
   939  	ADC	ZR, R24
   940  
   941  	MUL	R4, R16, R22
   942  	UMULH	R4, R16, R23
   943  	ADDS	R22, R25
   944  	ADCS	R23, R26
   945  	ADC	ZR, R24
   946  
   947  	MUL	R5, R15, R22
   948  	UMULH	R5, R15, R23
   949  	ADDS	R22, R25
   950  	ADCS	R23, R26
   951  	ADC	ZR, R24
   952  
   953  	MUL	R6, R14, R22
   954  	UMULH	R6, R14, R23
   955  	ADDS	R22, R25
   956  	ADCS	R23, R26
   957  	ADC	ZR, R24
   958  
   959  	ADDS	R21, R25, R11	// Set z9
   960  	ADCS	ZR, R26
   961  	ADC	ZR, R24
   962  
   963  	// x10 iteration
   964  	MUL	R2, R19, R22
   965  	MOVD	80(R1), R21
   966  	UMULH	R2, R19, R23
   967  	ADDS	R22, R26
   968  	ADCS	R23, R24
   969  	ADC	ZR, ZR, R25
   970  
   971  	MUL	R3, R29, R22
   972  	UMULH	R3, R29, R23
   973  	ADDS	R22, R26
   974  	ADCS	R23, R24
   975  	ADC	ZR, R25
   976  
   977  	MUL	R4, R17, R22
   978  	UMULH	R4, R17, R23
   979  	ADDS	R22, R26
   980  	ADCS	R23, R24
   981  	ADC	ZR, R25
   982  
   983  	MUL	R5, R16, R22
   984  	UMULH	R5, R16, R23
   985  	ADDS	R22, R26
   986  	ADCS	R23, R24
   987  	ADC	ZR, R25
   988  
   989  	MUL	R6, R15, R22
   990  	UMULH	R6, R15, R23
   991  	ADDS	R22, R26
   992  	ADCS	R23, R24
   993  	ADC	ZR, R25
   994  
   995  	MUL	R7, R14, R22
   996  	UMULH	R7, R14, R23
   997  	ADDS	R22, R26
   998  	ADCS	R23, R24
   999  	ADC	ZR, R25
  1000  
  1001  	ADDS	R21, R26, R12	// Set z10
  1002  	ADCS	ZR, R24
  1003  	ADC	ZR, R25
  1004  
  1005  	// x11 iteration
  1006  	MUL	R2, R20, R22
  1007  	MOVD	88(R1), R21
  1008  	UMULH	R2, R20, R23
  1009  	ADDS	R22, R24
  1010  	ADCS	R23, R25
  1011  	ADC	ZR, ZR, R26
  1012  
  1013  	MUL	R3, R19, R22
  1014  	UMULH	R3, R19, R23
  1015  	ADDS	R22, R24
  1016  	ADCS	R23, R25
  1017  	ADC	ZR, R26
  1018  
  1019  	MUL	R4, R29, R22
  1020  	UMULH	R4, R29, R23
  1021  	ADDS	R22, R24
  1022  	ADCS	R23, R25
  1023  	ADC	ZR, R26
  1024  
  1025  	MUL	R5, R17, R22
  1026  	UMULH	R5, R17, R23
  1027  	ADDS	R22, R24
  1028  	ADCS	R23, R25
  1029  	ADC	ZR, R26
  1030  
  1031  	MUL	R6, R16, R22
  1032  	UMULH	R6, R16, R23
  1033  	ADDS	R22, R24
  1034  	ADCS	R23, R25
  1035  	ADC	ZR, R26
  1036  
  1037  	MUL	R7, R15, R22
  1038  	UMULH	R7, R15, R23
  1039  	ADDS	R22, R24
  1040  	ADCS	R23, R25
  1041  	ADC	ZR, R26
  1042  
  1043  	MUL	R8, R14, R22
  1044  	UMULH	R8, R14, R23
  1045  	ADDS	R22, R24
  1046  	ADCS	R23, R25
  1047  	ADC	ZR, R26
  1048  
  1049  	ADDS	R21, R24, R13	// Set z11
  1050  	ADCS	ZR, R25
  1051  	ADC	ZR, R26
  1052  
  1053  	// x12 iteration
  1054  	MUL	R3, R20, R22
  1055  	MOVD	96(R1), R21
  1056  	UMULH	R3, R20, R23
  1057  	ADDS	R22, R25
  1058  	ADCS	R23, R26
  1059  	ADC	ZR, ZR, R24
  1060  
  1061  	MUL	R4, R19, R22
  1062  	UMULH	R4, R19, R23
  1063  	ADDS	R22, R25
  1064  	ADCS	R23, R26
  1065  	ADC	ZR, R24
  1066  
  1067  	MUL	R5, R29, R22
  1068  	UMULH	R5, R29, R23
  1069  	ADDS	R22, R25
  1070  	ADCS	R23, R26
  1071  	ADC	ZR, R24
  1072  
  1073  	MUL	R6, R17, R22
  1074  	UMULH	R6, R17, R23
  1075  	ADDS	R22, R25
  1076  	ADCS	R23, R26
  1077  	ADC	ZR, R24
  1078  
  1079  	MUL	R7, R16, R22
  1080  	UMULH	R7, R16, R23
  1081  	ADDS	R22, R25
  1082  	ADCS	R23, R26
  1083  	ADC	ZR, R24
  1084  
  1085  	MUL	R8, R15, R22
  1086  	UMULH	R8, R15, R23
  1087  	ADDS	R22, R25
  1088  	ADCS	R23, R26
  1089  	ADC	ZR, R24
  1090  
  1091  	MUL	R9, R14, R22
  1092  	UMULH	R9, R14, R23
  1093  	ADDS	R22, R25
  1094  	ADCS	R23, R26
  1095  	ADC	ZR, R24
  1096  
  1097  	ADDS	R21, R25, R2	// Set z0
  1098  	ADCS	ZR, R26
  1099  	ADC	ZR, R24
  1100  
  1101  	// x13 iteration
  1102  	MUL	R4, R20, R22
  1103  	MOVD	104(R1), R21
  1104  	UMULH	R4, R20, R23
  1105  	ADDS	R22, R26
  1106  	ADCS	R23, R24
  1107  	ADC	ZR, ZR, R25
  1108  
  1109  	MUL	R5, R19, R22
  1110  	UMULH	R5, R19, R23
  1111  	ADDS	R22, R26
  1112  	ADCS	R23, R24
  1113  	ADC	ZR, R25
  1114  
  1115  	MUL	R6, R29, R22
  1116  	UMULH	R6, R29, R23
  1117  	ADDS	R22, R26
  1118  	ADCS	R23, R24
  1119  	ADC	ZR, R25
  1120  
  1121  	MUL	R7, R17, R22
  1122  	UMULH	R7, R17, R23
  1123  	ADDS	R22, R26
  1124  	ADCS	R23, R24
  1125  	ADC	ZR, R25
  1126  
  1127  	MUL	R8, R16, R22
  1128  	UMULH	R8, R16, R23
  1129  	ADDS	R22, R26
  1130  	ADCS	R23, R24
  1131  	ADC	ZR, R25
  1132  
  1133  	MUL	R9, R15, R22
  1134  	UMULH	R9, R15, R23
  1135  	ADDS	R22, R26
  1136  	ADCS	R23, R24
  1137  	ADC	ZR, R25
  1138  
  1139  	MUL	R10, R14, R22
  1140  	UMULH	R10, R14, R23
  1141  	ADDS	R22, R26
  1142  	ADCS	R23, R24
  1143  	ADC	ZR, R25
  1144  
  1145  	ADDS	R21, R26, R3	// Set z1
  1146  	STP	(R2, R3), 0(R0)
  1147  	ADCS	ZR, R24
  1148  	ADC	ZR, R25
  1149  
  1150  	// x14 iteration
  1151  	MUL	R5, R20, R22
  1152  	MOVD	112(R1), R21
  1153  	UMULH	R5, R20, R23
  1154  	ADDS	R22, R24
  1155  	ADCS	R23, R25
  1156  	ADC	ZR, ZR, R26
  1157  
  1158  	MUL	R6, R19, R22
  1159  	UMULH	R6, R19, R23
  1160  	ADDS	R22, R24
  1161  	ADCS	R23, R25
  1162  	ADC	ZR, R26
  1163  
  1164  	MUL	R7, R29, R22
  1165  	UMULH	R7, R29, R23
  1166  	ADDS	R22, R24
  1167  	ADCS	R23, R25
  1168  	ADC	ZR, R26
  1169  
  1170  	MUL	R8, R17, R22
  1171  	UMULH	R8, R17, R23
  1172  	ADDS	R22, R24
  1173  	ADCS	R23, R25
  1174  	ADC	ZR, R26
  1175  
  1176  	MUL	R9, R16, R22
  1177  	UMULH	R9, R16, R23
  1178  	ADDS	R22, R24
  1179  	ADCS	R23, R25
  1180  	ADC	ZR, R26
  1181  
  1182  	MUL	R10, R15, R22
  1183  	UMULH	R10, R15, R23
  1184  	ADDS	R22, R24
  1185  	ADCS	R23, R25
  1186  	ADC	ZR, R26
  1187  
  1188  	MUL	R11, R14, R22
  1189  	UMULH	R11, R14, R23
  1190  	ADDS	R22, R24
  1191  	ADCS	R23, R25
  1192  	ADC	ZR, R26
  1193  
  1194  	ADDS	R21, R24, R4	// Set z2
  1195  	ADCS	ZR, R25
  1196  	ADC	ZR, R26
  1197  
  1198  	// x15 iteration
  1199  	MUL	R6, R20, R22
  1200  	MOVD	120(R1), R21
  1201  	UMULH	R6, R20, R23
  1202  	ADDS	R22, R25
  1203  	ADCS	R23, R26
  1204  	ADC	ZR, ZR, R24
  1205  
  1206  	MUL	R7, R19, R22
  1207  	UMULH	R7, R19, R23
  1208  	ADDS	R22, R25
  1209  	ADCS	R23, R26
  1210  	ADC	ZR, R24
  1211  
  1212  	MUL	R8, R29, R22
  1213  	UMULH	R8, R29, R23
  1214  	ADDS	R22, R25
  1215  	ADCS	R23, R26
  1216  	ADC	ZR, R24
  1217  
  1218  	MUL	R9, R17, R22
  1219  	UMULH	R9, R17, R23
  1220  	ADDS	R22, R25
  1221  	ADCS	R23, R26
  1222  	ADC	ZR, R24
  1223  
  1224  	MUL	R10, R16, R22
  1225  	UMULH	R10, R16, R23
  1226  	ADDS	R22, R25
  1227  	ADCS	R23, R26
  1228  	ADC	ZR, R24
  1229  
  1230  	MUL	R11, R15, R22
  1231  	UMULH	R11, R15, R23
  1232  	ADDS	R22, R25
  1233  	ADCS	R23, R26
  1234  	ADC	ZR, R24
  1235  
  1236  	MUL	R12, R14, R22
  1237  	UMULH	R12, R14, R23
  1238  	ADDS	R22, R25
  1239  	ADCS	R23, R26
  1240  	ADC	ZR, R24
  1241  
  1242  	ADDS	R21, R25, R5	// Set z3
  1243  	STP	(R4, R5), 16(R0)
  1244  	ADCS	ZR, R26
  1245  	ADC	ZR, R24
  1246  
  1247  	// x16 iteration
  1248  	MUL	R7, R20, R22
  1249  	MOVD	128(R1), R21
  1250  	UMULH	R7, R20, R23
  1251  	ADDS	R22, R26
  1252  	ADCS	R23, R24
  1253  	ADC	ZR, ZR, R25
  1254  
  1255  	MUL	R8, R19, R22
  1256  	UMULH	R8, R19, R23
  1257  	ADDS	R22, R26
  1258  	ADCS	R23, R24
  1259  	ADC	ZR, R25
  1260  
  1261  	MUL	R9, R29, R22
  1262  	UMULH	R9, R29, R23
  1263  	ADDS	R22, R26
  1264  	ADCS	R23, R24
  1265  	ADC	ZR, R25
  1266  
  1267  	MUL	R10, R17, R22
  1268  	UMULH	R10, R17, R23
  1269  	ADDS	R22, R26
  1270  	ADCS	R23, R24
  1271  	ADC	ZR, R25
  1272  
  1273  	MUL	R11, R16, R22
  1274  	UMULH	R11, R16, R23
  1275  	ADDS	R22, R26
  1276  	ADCS	R23, R24
  1277  	ADC	ZR, R25
  1278  
  1279  	MUL	R12, R15, R22
  1280  	UMULH	R12, R15, R23
  1281  	ADDS	R22, R26
  1282  	ADCS	R23, R24
  1283  	ADC	ZR, R25
  1284  
  1285  	MUL	R13, R14, R22
  1286  	UMULH	R13, R14, R23
  1287  	ADDS	R22, R26
  1288  	ADCS	R23, R24
  1289  	ADC	ZR, R25
  1290  
  1291  	ADDS	R21, R26, R6	// Set z4
  1292  	ADCS	ZR, R24
  1293  	ADC	ZR, R25
  1294  
  1295  	// x17 iteration
  1296  	MUL	R8, R20, R22
  1297  	MOVD	136(R1), R21
  1298  	UMULH	R8, R20, R23
  1299  	ADDS	R22, R24
  1300  	ADCS	R23, R25
  1301  	ADC	ZR, ZR, R26
  1302  
  1303  	MUL	R9, R19, R22
  1304  	UMULH	R9, R19, R23
  1305  	ADDS	R22, R24
  1306  	ADCS	R23, R25
  1307  	ADC	ZR, R26
  1308  
  1309  	MUL	R10, R29, R22
  1310  	UMULH	R10, R29, R23
  1311  	ADDS	R22, R24
  1312  	ADCS	R23, R25
  1313  	ADC	ZR, R26
  1314  
  1315  	MUL	R11, R17, R22
  1316  	UMULH	R11, R17, R23
  1317  	ADDS	R22, R24
  1318  	ADCS	R23, R25
  1319  	ADC	ZR, R26
  1320  
  1321  	MUL	R12, R16, R22
  1322  	UMULH	R12, R16, R23
  1323  	ADDS	R22, R24
  1324  	ADCS	R23, R25
  1325  	ADC	ZR, R26
  1326  
  1327  	MUL	R13, R15, R22
  1328  	UMULH	R13, R15, R23
  1329  	ADDS	R22, R24
  1330  	ADCS	R23, R25
  1331  	ADC	ZR, R26
  1332  
  1333  	ADDS	R21, R24, R7	// Set z5
  1334  	STP	(R6, R7), 32(R0)
  1335  	ADCS	ZR, R25
  1336  	ADC	ZR, R26
  1337  
  1338  	// x18 iteration
  1339  	MUL	R9, R20, R22
  1340  	MOVD	144(R1), R21
  1341  	UMULH	R9, R20, R23
  1342  	ADDS	R22, R25
  1343  	ADCS	R23, R26
  1344  	ADC	ZR, ZR, R24
  1345  
  1346  	MUL	R10, R19, R22
  1347  	UMULH	R10, R19, R23
  1348  	ADDS	R22, R25
  1349  	ADCS	R23, R26
  1350  	ADC	ZR, R24
  1351  
  1352  	MUL	R11, R29, R22
  1353  	UMULH	R11, R29, R23
  1354  	ADDS	R22, R25
  1355  	ADCS	R23, R26
  1356  	ADC	ZR, R24
  1357  
  1358  	MUL	R12, R17, R22
  1359  	UMULH	R12, R17, R23
  1360  	ADDS	R22, R25
  1361  	ADCS	R23, R26
  1362  	ADC	ZR, R24
  1363  
  1364  	MUL	R13, R16, R22
  1365  	UMULH	R13, R16, R23
  1366  	ADDS	R22, R25
  1367  	ADCS	R23, R26
  1368  	ADC	ZR, R24
  1369  
  1370  	ADDS	R21, R25, R8	// Set z6
  1371  	ADCS	ZR, R26
  1372  	ADC	ZR, R24
  1373  
  1374  	// x19 iteration
  1375  	MUL	R10, R20, R22
  1376  	MOVD	152(R1), R21
  1377  	UMULH	R10, R20, R23
  1378  	ADDS	R22, R26
  1379  	ADCS	R23, R24
  1380  	ADC	ZR, ZR, R25
  1381  
  1382  	MUL	R11, R19, R22
  1383  	UMULH	R11, R19, R23
  1384  	ADDS	R22, R26
  1385  	ADCS	R23, R24
  1386  	ADC	ZR, R25
  1387  
  1388  	MUL	R12, R29, R22
  1389  	UMULH	R12, R29, R23
  1390  	ADDS	R22, R26
  1391  	ADCS	R23, R24
  1392  	ADC	ZR, R25
  1393  
  1394  	MUL	R13, R17, R22
  1395  	UMULH	R13, R17, R23
  1396  	ADDS	R22, R26
  1397  	ADCS	R23, R24
  1398  	ADC	ZR, R25
  1399  
  1400  	ADDS	R21, R26, R9	// Set z7
  1401  	STP	(R8, R9), 48(R0)
  1402  	ADCS	ZR, R24
  1403  	ADC	ZR, R25
  1404  
  1405  	// x20 iteration
  1406  	MUL	R11, R20, R22
  1407  	MOVD	160(R1), R21
  1408  	UMULH	R11, R20, R23
  1409  	ADDS	R22, R24
  1410  	ADCS	R23, R25
  1411  	ADC	ZR, ZR, R26
  1412  
  1413  	MUL	R12, R19, R22
  1414  	UMULH	R12, R19, R23
  1415  	ADDS	R22, R24
  1416  	ADCS	R23, R25
  1417  	ADC	ZR, R26
  1418  
  1419  	MUL	R13, R29, R22
  1420  	UMULH	R13, R29, R23
  1421  	ADDS	R22, R24
  1422  	ADCS	R23, R25
  1423  	ADC	ZR, R26
  1424  
  1425  	ADDS	R21, R24, R10	// Set z8
  1426  	ADCS	ZR, R25
  1427  	ADC	ZR, R26
  1428  
  1429  	// x21 iteration
  1430  	MUL	R12, R20, R22
  1431  	MOVD	168(R1), R21
  1432  	UMULH	R12, R20, R23
  1433  	ADDS	R22, R25
  1434  	ADCS	R23, R26
  1435  	ADC	ZR, ZR, R24
  1436  
  1437  	MUL	R13, R19, R22
  1438  	UMULH	R13, R19, R23
  1439  	ADDS	R22, R25
  1440  	ADCS	R23, R26
  1441  	ADC	ZR, R24
  1442  
  1443  	ADDS	R21, R25, R11	// Set z9
  1444  	STP	(R10, R11), 64(R0)
  1445  	ADCS	ZR, R26
  1446  	ADC	ZR, R24
  1447  
  1448  	// x22 iteration
  1449  	MUL	R13, R20, R22
  1450  	MOVD	176(R1), R21
  1451  	UMULH	R13, R20, R23
  1452  	ADDS	R22, R26
  1453  	ADC	R23, R24
  1454  	ADDS	R21, R26, R12	// Set z10
  1455  
  1456  	MOVD	184(R1), R21
  1457  	ADC	R21, R24, R13	// Set z11
  1458  	STP	(R12, R13), 80(R0)
  1459  
  1460  	RET
  1461  
  1462  TEXT ·modP751(SB), NOSPLIT, $0-8
        	// modP751 performs one branch-free conditional subtraction of the
        	// 12-word constant ·P751 from the 12-word value that x points to and
        	// writes the result back through x:
        	//   t = x - P751; if that subtraction borrowed, t += P751; x = t
        	// Words are 64 bits wide; the carry chains run from offset 0 (least
        	// significant word) up to offset 88. The frame is $0-8: no locals and
        	// a single 8-byte pointer argument.
        	// NOTE(review): the result is only fully reduced when the input is
        	// below 2*P751 -- presumably guaranteed by the callers; confirm.
  1463  	MOVD	x+0(FP), R0
  1464  
  1465  	// Register use: x in R1-R12, P751 in R13-R17 and R19-R21 (R18 is
        	// skipped); the difference x - P751 overwrites R1-R12.
        	// Only word 0 of P751 is loaded (into R13) and it is reused for words
        	// 0-4, so this code assumes the five low words of P751 are identical
        	// (P751 is defined elsewhere -- verify there). Words 5-11 are loaded
        	// from offsets 40-88. The loads are interleaved with the subtraction;
        	// they do not modify the flags, so the SUBS/SBCS borrow chain below
        	// is unbroken.
  1466  	MOVD	·P751+0(SB), R13
  1467  	LDP	0(R0), (R1, R2)
  1468  	LDP	16(R0), (R3, R4)
  1469  	SUBS	R13, R1		// word 0: starts the borrow chain
  1470  	SBCS	R13, R2
  1471  
  1472  	LDP	32(R0), (R5, R6)
  1473  	LDP	·P751+40(SB), (R14, R15)	// P751 words 5, 6
  1474  	SBCS	R13, R3
  1475  	SBCS	R13, R4
  1476  
  1477  	LDP	48(R0), (R7, R8)
  1478  	LDP	·P751+56(SB), (R16, R17)	// P751 words 7, 8
  1479  	SBCS	R13, R5		// last word that uses the shared R13
  1480  	SBCS	R14, R6
  1481  
  1482  	LDP	64(R0), (R9, R10)
  1483  	LDP	·P751+72(SB), (R19, R20)	// P751 words 9, 10
  1484  	SBCS	R15, R7
  1485  	SBCS	R16, R8
  1486  
  1487  	LDP	80(R0), (R11, R12)
  1488  	MOVD	·P751+88(SB), R21		// P751 word 11
  1489  	SBCS	R17, R9
  1490  	SBCS	R19, R10
  1491  
  1492  	SBCS	R20, R11
  1493  	SBCS	R21, R12
  1494  	SBC	ZR, ZR, R22	// R22 = 0 - 0 - borrow: all ones if x < P751, else zero
  1495  
  1496  	// Mask P751 with the borrow and add it back: each register keeps its
        	// P751 word when the subtraction borrowed and becomes zero otherwise,
        	// so the addition below either undoes the subtraction or adds zero.
        	// This selects the result without a data-dependent branch.
  1497  	AND	R22, R13
  1498  	AND	R22, R14
  1499  	AND	R22, R15
  1500  	AND	R22, R16
  1501  	AND	R22, R17
  1502  	AND	R22, R19
  1503  	AND	R22, R20
  1504  	AND	R22, R21
  1505  
        	// Carry chain for the add-back, again using R13 for words 0-4. The
        	// stores between the additions do not modify the flags, so each pair
        	// of finished words is written to x as soon as it is ready.
  1506  	ADDS	R13, R1
  1507  	ADCS	R13, R2
  1508  	STP 	(R1, R2), 0(R0)
  1509  	ADCS	R13, R3
  1510  	ADCS	R13, R4
  1511  	STP 	(R3, R4), 16(R0)
  1512  	ADCS	R13, R5
  1513  	ADCS	R14, R6
  1514  	STP 	(R5, R6), 32(R0)
  1515  	ADCS	R15, R7
  1516  	ADCS	R16, R8
  1517  	STP 	(R7, R8), 48(R0)
  1518  	ADCS	R17, R9
  1519  	ADCS	R19, R10
  1520  	STP 	(R9, R10), 64(R0)
  1521  	ADCS	R20, R11
  1522  	ADC	R21, R12	// top word: carry out is not needed and is discarded
  1523  	STP 	(R11, R12), 80(R0)
  1524  
  1525  	RET