github.com/cloudflare/circl@v1.5.0/dh/sidh/internal/p751/arith_amd64.s (about)

     1  // +build amd64,!purego
     2  
     3  #include "textflag.h"
     4  
     5  // p751 + 1, given as 64-bit limbs where limb n lives at byte offset 8*n.
        // Only limbs 5..11 are named: limbs 0..4 of p751 are all ones (see
        // P751_0 below), so adding one turns them into zero and carries into
        // limb 5. Limbs 6..11 are therefore identical to those of p751.
     6  #define P751P1_5   $0xEEB0000000000000
     7  #define P751P1_6   $0xE3EC968549F878A8
     8  #define P751P1_7   $0xDA959B1A13F7CC76
     9  #define P751P1_8   $0x084E9867D6EBE876
    10  #define P751P1_9   $0x8562B5045CB25748
    11  #define P751P1_10  $0x0E12909F97BADC66
    12  #define P751P1_11  $0x00006FE5D541F71C
    13  
        // p751 itself. Limbs 1..4 have the same value as limb 0, so the code
        // reuses P751_0 for them and no separate names are defined.
    14  #define P751_0     $0xFFFFFFFFFFFFFFFF
    15  #define P751_5     $0xEEAFFFFFFFFFFFFF
    16  #define P751_6     $0xE3EC968549F878A8
    17  #define P751_7     $0xDA959B1A13F7CC76
    18  #define P751_8     $0x084E9867D6EBE876
    19  #define P751_9     $0x8562B5045CB25748
    20  #define P751_10    $0x0E12909F97BADC66
    21  #define P751_11    $0x00006FE5D541F71C
    22  
        // 2 * p751. Limbs 2..4 have the same value as limb 1, so the code
        // reuses P751X2_1 for them and no separate names are defined.
    23  #define P751X2_0   $0xFFFFFFFFFFFFFFFE
    24  #define P751X2_1   $0xFFFFFFFFFFFFFFFF
    25  #define P751X2_5   $0xDD5FFFFFFFFFFFFF
    26  #define P751X2_6   $0xC7D92D0A93F0F151
    27  #define P751X2_7   $0xB52B363427EF98ED
    28  #define P751X2_8   $0x109D30CFADD7D0ED
    29  #define P751X2_9   $0x0AC56A08B964AE90
    30  #define P751X2_10  $0x1C25213F2F75B8CD
    31  #define P751X2_11  $0x0000DFCBAA83EE38
    32  
    33  // The MSR code uses these registers for parameter passing.  Keep using
    34  // them to avoid significant code changes.  This means that when the Go
    35  // assembler does something strange, we can diff the machine code
    36  // against a different assembler to find out what Go did.
    37  
        // In the routines of this file REG_P1 receives the x argument, REG_P2
        // the y argument and REG_P3 the z (destination) argument. mulP751 keeps
        // its destination in CX instead, because MULQ writes DX.
    38  #define REG_P1 DI
    39  #define REG_P2 SI
    40  #define REG_P3 DX
    41  
    42  TEXT ·modP751(SB), NOSPLIT, $0-8
    43  	MOVQ	x+0(FP), REG_P1
    44  
    45  	// Zero AX for later use:
    46  	XORQ	AX, AX
    47  
    48  	// Load p into registers:
    49  	MOVQ	P751_0, R8
    50  	// P751_{1,2,3,4} = P751_0, so reuse R8
    51  	MOVQ	P751_5, R9
    52  	MOVQ	P751_6, R10
    53  	MOVQ	P751_7, R11
    54  	MOVQ	P751_8, R12
    55  	MOVQ	P751_9, R13
    56  	MOVQ	P751_10, R14
    57  	MOVQ	P751_11, R15
    58  
    59  	// Set x <- x - p
    60  	SUBQ	R8, (REG_P1)
    61  	SBBQ	R8, (8)(REG_P1)
    62  	SBBQ	R8, (16)(REG_P1)
    63  	SBBQ	R8, (24)(REG_P1)
    64  	SBBQ	R8, (32)(REG_P1)
    65  	SBBQ	R9, (40)(REG_P1)
    66  	SBBQ	R10, (48)(REG_P1)
    67  	SBBQ	R11, (56)(REG_P1)
    68  	SBBQ	R12, (64)(REG_P1)
    69  	SBBQ	R13, (72)(REG_P1)
    70  	SBBQ	R14, (80)(REG_P1)
    71  	SBBQ    R15, (88)(REG_P1)
    72  
    73  	// Save carry flag indicating x-p < 0 as a mask in AX
    74  	SBBQ	$0, AX
    75  
    76  	// Conditionally add p to x if x-p < 0
    77  	ANDQ	AX, R8
    78  	ANDQ	AX, R9
    79  	ANDQ	AX, R10
    80  	ANDQ	AX, R11
    81  	ANDQ	AX, R12
    82  	ANDQ	AX, R13
    83  	ANDQ	AX, R14
    84  	ANDQ	AX, R15
    85  
    86  	ADDQ	R8, (REG_P1)
    87  	ADCQ	R8, (8)(REG_P1)
    88  	ADCQ	R8, (16)(REG_P1)
    89  	ADCQ	R8, (24)(REG_P1)
    90  	ADCQ	R8, (32)(REG_P1)
    91  	ADCQ	R9, (40)(REG_P1)
    92  	ADCQ	R10, (48)(REG_P1)
    93  	ADCQ	R11, (56)(REG_P1)
    94  	ADCQ	R12, (64)(REG_P1)
    95  	ADCQ	R13, (72)(REG_P1)
    96  	ADCQ	R14, (80)(REG_P1)
    97  	ADCQ    R15, (88)(REG_P1)
    98  
    99  	RET
   100  
   101  TEXT ·cswapP751(SB), NOSPLIT, $0-17
   102  
   103  	MOVQ	x+0(FP), REG_P1
   104  	MOVQ	y+8(FP), REG_P2
   105  	MOVB	choice+16(FP), AL	// AL = 0 or 1
   106  	MOVBLZX	AL, AX			// AX = 0 or 1
   107  	NEGQ	AX			// RAX = 0x00..00 or 0xff..ff
   108  
   109  	MOVQ	(0*8)(REG_P1), BX	// BX = x[0]
   110  	MOVQ 	(0*8)(REG_P2), CX	// CX = y[0]
   111  	MOVQ	CX, DX			// DX = y[0]
   112  	XORQ	BX, DX			// DX = y[0] ^ x[0]
   113  	ANDQ	AX, DX			// DX = (y[0] ^ x[0]) & mask
   114  	XORQ	DX, BX			// BX = (y[0] ^ x[0]) & mask) ^ x[0] = x[0] or y[0]
   115  	XORQ	DX, CX			// CX = (y[0] ^ x[0]) & mask) ^ y[0] = y[0] or x[0]
   116  	MOVQ	BX, (0*8)(REG_P1)
   117  	MOVQ	CX, (0*8)(REG_P2)
   118  
   119  	MOVQ	(1*8)(REG_P1), BX
   120  	MOVQ 	(1*8)(REG_P2), CX
   121  	MOVQ	CX, DX
   122  	XORQ	BX, DX
   123  	ANDQ	AX, DX
   124  	XORQ	DX, BX
   125  	XORQ	DX, CX
   126  	MOVQ	BX, (1*8)(REG_P1)
   127  	MOVQ	CX, (1*8)(REG_P2)
   128  
   129  	MOVQ	(2*8)(REG_P1), BX
   130  	MOVQ 	(2*8)(REG_P2), CX
   131  	MOVQ	CX, DX
   132  	XORQ	BX, DX
   133  	ANDQ	AX, DX
   134  	XORQ	DX, BX
   135  	XORQ	DX, CX
   136  	MOVQ	BX, (2*8)(REG_P1)
   137  	MOVQ	CX, (2*8)(REG_P2)
   138  
   139  	MOVQ	(3*8)(REG_P1), BX
   140  	MOVQ 	(3*8)(REG_P2), CX
   141  	MOVQ	CX, DX
   142  	XORQ	BX, DX
   143  	ANDQ	AX, DX
   144  	XORQ	DX, BX
   145  	XORQ	DX, CX
   146  	MOVQ	BX, (3*8)(REG_P1)
   147  	MOVQ	CX, (3*8)(REG_P2)
   148  
   149  	MOVQ	(4*8)(REG_P1), BX
   150  	MOVQ 	(4*8)(REG_P2), CX
   151  	MOVQ	CX, DX
   152  	XORQ	BX, DX
   153  	ANDQ	AX, DX
   154  	XORQ	DX, BX
   155  	XORQ	DX, CX
   156  	MOVQ	BX, (4*8)(REG_P1)
   157  	MOVQ	CX, (4*8)(REG_P2)
   158  
   159  	MOVQ	(5*8)(REG_P1), BX
   160  	MOVQ 	(5*8)(REG_P2), CX
   161  	MOVQ	CX, DX
   162  	XORQ	BX, DX
   163  	ANDQ	AX, DX
   164  	XORQ	DX, BX
   165  	XORQ	DX, CX
   166  	MOVQ	BX, (5*8)(REG_P1)
   167  	MOVQ	CX, (5*8)(REG_P2)
   168  
   169  	MOVQ	(6*8)(REG_P1), BX
   170  	MOVQ 	(6*8)(REG_P2), CX
   171  	MOVQ	CX, DX
   172  	XORQ	BX, DX
   173  	ANDQ	AX, DX
   174  	XORQ	DX, BX
   175  	XORQ	DX, CX
   176  	MOVQ	BX, (6*8)(REG_P1)
   177  	MOVQ	CX, (6*8)(REG_P2)
   178  
   179  	MOVQ	(7*8)(REG_P1), BX
   180  	MOVQ 	(7*8)(REG_P2), CX
   181  	MOVQ	CX, DX
   182  	XORQ	BX, DX
   183  	ANDQ	AX, DX
   184  	XORQ	DX, BX
   185  	XORQ	DX, CX
   186  	MOVQ	BX, (7*8)(REG_P1)
   187  	MOVQ	CX, (7*8)(REG_P2)
   188  
   189  	MOVQ	(8*8)(REG_P1), BX
   190  	MOVQ 	(8*8)(REG_P2), CX
   191  	MOVQ	CX, DX
   192  	XORQ	BX, DX
   193  	ANDQ	AX, DX
   194  	XORQ	DX, BX
   195  	XORQ	DX, CX
   196  	MOVQ	BX, (8*8)(REG_P1)
   197  	MOVQ	CX, (8*8)(REG_P2)
   198  
   199  	MOVQ	(9*8)(REG_P1), BX
   200  	MOVQ 	(9*8)(REG_P2), CX
   201  	MOVQ	CX, DX
   202  	XORQ	BX, DX
   203  	ANDQ	AX, DX
   204  	XORQ	DX, BX
   205  	XORQ	DX, CX
   206  	MOVQ	BX, (9*8)(REG_P1)
   207  	MOVQ	CX, (9*8)(REG_P2)
   208  
   209  	MOVQ	(10*8)(REG_P1), BX
   210  	MOVQ 	(10*8)(REG_P2), CX
   211  	MOVQ	CX, DX
   212  	XORQ	BX, DX
   213  	ANDQ	AX, DX
   214  	XORQ	DX, BX
   215  	XORQ	DX, CX
   216  	MOVQ	BX, (10*8)(REG_P1)
   217  	MOVQ	CX, (10*8)(REG_P2)
   218  
   219  	MOVQ	(11*8)(REG_P1), BX
   220  	MOVQ 	(11*8)(REG_P2), CX
   221  	MOVQ	CX, DX
   222  	XORQ	BX, DX
   223  	ANDQ	AX, DX
   224  	XORQ	DX, BX
   225  	XORQ	DX, CX
   226  	MOVQ	BX, (11*8)(REG_P1)
   227  	MOVQ	CX, (11*8)(REG_P2)
   228  
   229  	RET
   230  
   231  TEXT ·cmovP751(SB),NOSPLIT,$0-17
   232  
   233      MOVQ    x+0(FP), DI
   234      MOVQ    y+8(FP), SI
   235      MOVB    choice+16(FP), AL   // AL = 0 or 1
   236      MOVBLZX AL, AX  // AX = 0 or 1
   237      NEGQ    AX          // AX = 0x00..00 or 0xff..ff
   238  #ifndef CMOV_BLOCK
   239  #define CMOV_BLOCK(idx)    \
   240      MOVQ    (idx*8)(DI), BX \ // BX = x[idx]
   241      MOVQ    (idx*8)(SI), DX \ // DX = y[idx]
   242      XORQ    BX, DX          \ // DX = y[idx] ^ x[idx]
   243      ANDQ    AX, DX          \ // DX = (y[idx] ^ x[idx]) & mask
   244      XORQ    DX, BX          \ // BX = (y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
   245      MOVQ    BX, (idx*8)(DI)
   246  #endif
   247      CMOV_BLOCK(0)
   248      CMOV_BLOCK(1)
   249      CMOV_BLOCK(2)
   250      CMOV_BLOCK(3)
   251      CMOV_BLOCK(4)
   252      CMOV_BLOCK(5)
   253      CMOV_BLOCK(6)
   254      CMOV_BLOCK(7)
   255      CMOV_BLOCK(8)
   256      CMOV_BLOCK(9)
   257      CMOV_BLOCK(10)
   258      CMOV_BLOCK(11)
   259  #ifdef CMOV_BLOCK
   260  #undef CMOV_BLOCK
   261  #endif
   262      RET
   263  
   264  TEXT ·addP751(SB), NOSPLIT, $0-24
   265  
        	// addP751 stores x + y into z, then subtracts 2*p751 from z and adds
        	// 2*p751 back if that subtraction borrowed. x, y and z each address
        	// twelve 64-bit limbs (byte offsets 0..88), lowest limb first. The
        	// same instructions run whether or not the correction is needed.
        	// NOTE(review): the carry out of the top limb of x + y is not
        	// consumed, so presumably callers keep x and y small enough (e.g.
        	// below 2*p751) that the sum fits in twelve limbs -- confirm.
   266  	MOVQ	z+0(FP), REG_P3
   267  	MOVQ	x+8(FP), REG_P1
   268  	MOVQ	y+16(FP), REG_P2
   269  
        	// Limbs 0..8 of x + y are accumulated in R8..R15 and CX.
   270  	MOVQ	(REG_P1), R8
   271  	MOVQ	(8)(REG_P1), R9
   272  	MOVQ	(16)(REG_P1), R10
   273  	MOVQ	(24)(REG_P1), R11
   274  	MOVQ	(32)(REG_P1), R12
   275  	MOVQ	(40)(REG_P1), R13
   276  	MOVQ	(48)(REG_P1), R14
   277  	MOVQ	(56)(REG_P1), R15
   278  	MOVQ	(64)(REG_P1), CX
   279  	ADDQ	(REG_P2), R8
   280  	ADCQ	(8)(REG_P2), R9
   281  	ADCQ	(16)(REG_P2), R10
   282  	ADCQ	(24)(REG_P2), R11
   283  	ADCQ	(32)(REG_P2), R12
   284  	ADCQ	(40)(REG_P2), R13
   285  	ADCQ	(48)(REG_P2), R14
   286  	ADCQ	(56)(REG_P2), R15
   287  	ADCQ	(64)(REG_P2), CX
        	// Limbs 9..11 go through AX and are parked in z right away. The
        	// MOVQs in between do not modify the flags, so the carry chain
        	// started by ADDQ above continues through these ADCQs.
   288  	MOVQ	(72)(REG_P1), AX
   289  	ADCQ	(72)(REG_P2), AX
   290  	MOVQ	AX, (72)(REG_P3)
   291  	MOVQ	(80)(REG_P1), AX
   292  	ADCQ	(80)(REG_P2), AX
   293  	MOVQ	AX, (80)(REG_P3)
   294  	MOVQ	(88)(REG_P1), AX
   295  	ADCQ	(88)(REG_P2), AX
   296  	MOVQ	AX, (88)(REG_P3)
   297  
        	// Subtract 2*p751 from limbs 0..8. Each constant is loaded into AX
        	// first; P751X2_1 is reused for limbs 1..4, which share its value.
   298  	MOVQ	P751X2_0, AX
   299  	SUBQ	AX, R8
   300  	MOVQ	P751X2_1, AX
   301  	SBBQ	AX, R9
   302  	SBBQ	AX, R10
   303  	SBBQ	AX, R11
   304  	SBBQ	AX, R12
   305  	MOVQ	P751X2_5, AX
   306  	SBBQ	AX, R13
   307  	MOVQ	P751X2_6, AX
   308  	SBBQ	AX, R14
   309  	MOVQ	P751X2_7, AX
   310  	SBBQ	AX, R15
   311  	MOVQ	P751X2_8, AX
   312  	SBBQ	AX, CX
        	// Store limbs 0..8 of the difference to free R8..R10, then reload
        	// limbs 9..11 of the sum from z. The borrow from the SBBQ above is
        	// still live in the carry flag across these MOVQs.
   313  	MOVQ	R8, (REG_P3)
   314  	MOVQ	R9, (8)(REG_P3)
   315  	MOVQ	R10, (16)(REG_P3)
   316  	MOVQ	R11, (24)(REG_P3)
   317  	MOVQ	R12, (32)(REG_P3)
   318  	MOVQ	R13, (40)(REG_P3)
   319  	MOVQ	R14, (48)(REG_P3)
   320  	MOVQ	R15, (56)(REG_P3)
   321  	MOVQ	CX, (64)(REG_P3)
   322  	MOVQ	(72)(REG_P3), R8
   323  	MOVQ	(80)(REG_P3), R9
   324  	MOVQ	(88)(REG_P3), R10
   325  	MOVQ	P751X2_9, AX
   326  	SBBQ	AX, R8
   327  	MOVQ	P751X2_10, AX
   328  	SBBQ	AX, R9
   329  	MOVQ	P751X2_11, AX
   330  	SBBQ	AX, R10
   331  	MOVQ	R8, (72)(REG_P3)
   332  	MOVQ	R9, (80)(REG_P3)
   333  	MOVQ	R10, (88)(REG_P3)
        	// AX <- 0 - 0 - borrow: all ones if (x + y) - 2*p751 borrowed,
        	// zero otherwise. MOVQ (not XORQ) is used to zero AX because the
        	// borrow in the carry flag must survive until the SBBQ.
   334  	MOVQ	$0, AX
   335  	SBBQ	$0, AX
   336  
        	// Build (2*p751 & mask). SI is REG_P2; the y pointer is no longer
        	// needed, so it is reused for limb 0. R8 covers limbs 1..4.
   337  	MOVQ	P751X2_0, SI
   338  	ANDQ	AX, SI
   339  	MOVQ	P751X2_1, R8
   340  	ANDQ	AX, R8
   341  	MOVQ	P751X2_5, R9
   342  	ANDQ	AX, R9
   343  	MOVQ	P751X2_6, R10
   344  	ANDQ	AX, R10
   345  	MOVQ	P751X2_7, R11
   346  	ANDQ	AX, R11
   347  	MOVQ	P751X2_8, R12
   348  	ANDQ	AX, R12
   349  	MOVQ	P751X2_9, R13
   350  	ANDQ	AX, R13
   351  	MOVQ	P751X2_10, R14
   352  	ANDQ	AX, R14
   353  	MOVQ	P751X2_11, R15
   354  	ANDQ	AX, R15
   355  
        	// z <- z + (2*p751 & mask), one limb at a time through AX. The
        	// loads and stores between the ADCQs leave the carry flag intact.
   356  	MOVQ	(REG_P3), AX
   357  	ADDQ	SI, AX
   358  	MOVQ	AX, (REG_P3)
   359  	MOVQ	(8)(REG_P3), AX
   360  	ADCQ	R8, AX
   361  	MOVQ	AX, (8)(REG_P3)
   362  	MOVQ	(16)(REG_P3), AX
   363  	ADCQ	R8, AX
   364  	MOVQ	AX, (16)(REG_P3)
   365  	MOVQ	(24)(REG_P3), AX
   366  	ADCQ	R8, AX
   367  	MOVQ	AX, (24)(REG_P3)
   368  	MOVQ	(32)(REG_P3), AX
   369  	ADCQ	R8, AX
   370  	MOVQ	AX, (32)(REG_P3)
   371  	MOVQ	(40)(REG_P3), AX
   372  	ADCQ	R9, AX
   373  	MOVQ	AX, (40)(REG_P3)
   374  	MOVQ	(48)(REG_P3), AX
   375  	ADCQ	R10, AX
   376  	MOVQ	AX, (48)(REG_P3)
   377  	MOVQ	(56)(REG_P3), AX
   378  	ADCQ	R11, AX
   379  	MOVQ	AX, (56)(REG_P3)
   380  	MOVQ	(64)(REG_P3), AX
   381  	ADCQ	R12, AX
   382  	MOVQ	AX, (64)(REG_P3)
   383  	MOVQ	(72)(REG_P3), AX
   384  	ADCQ	R13, AX
   385  	MOVQ	AX, (72)(REG_P3)
   386  	MOVQ	(80)(REG_P3), AX
   387  	ADCQ	R14, AX
   388  	MOVQ	AX, (80)(REG_P3)
   389  	MOVQ	(88)(REG_P3), AX
   390  	ADCQ	R15, AX
   391  	MOVQ	AX, (88)(REG_P3)
   392  
   393  	RET
   394  
   395  TEXT ·subP751(SB), NOSPLIT, $0-24
   396  
        	// subP751 stores x - y into z and then adds 2*p751 to z if the
        	// subtraction borrowed. x, y and z each address twelve 64-bit limbs
        	// (byte offsets 0..88), lowest limb first. The same instructions
        	// run whether or not the correction is needed.
   397  	MOVQ	z+0(FP),  REG_P3
   398  	MOVQ	x+8(FP),  REG_P1
   399  	MOVQ	y+16(FP),  REG_P2
   400  
        	// Limbs 0..8 of x - y are computed in R8..R15 and CX.
   401  	MOVQ	(REG_P1), R8
   402  	MOVQ	(8)(REG_P1), R9
   403  	MOVQ	(16)(REG_P1), R10
   404  	MOVQ	(24)(REG_P1), R11
   405  	MOVQ	(32)(REG_P1), R12
   406  	MOVQ	(40)(REG_P1), R13
   407  	MOVQ	(48)(REG_P1), R14
   408  	MOVQ	(56)(REG_P1), R15
   409  	MOVQ	(64)(REG_P1), CX
   410  	SUBQ	(REG_P2), R8
   411  	SBBQ	(8)(REG_P2), R9
   412  	SBBQ	(16)(REG_P2), R10
   413  	SBBQ	(24)(REG_P2), R11
   414  	SBBQ	(32)(REG_P2), R12
   415  	SBBQ	(40)(REG_P2), R13
   416  	SBBQ	(48)(REG_P2), R14
   417  	SBBQ	(56)(REG_P2), R15
   418  	SBBQ	(64)(REG_P2), CX
        	// Store limbs 0..8. MOVQ does not modify the flags, so the borrow
        	// from the last SBBQ is still pending for limbs 9..11 below.
   419  	MOVQ	R8, (REG_P3)
   420  	MOVQ	R9, (8)(REG_P3)
   421  	MOVQ	R10, (16)(REG_P3)
   422  	MOVQ	R11, (24)(REG_P3)
   423  	MOVQ	R12, (32)(REG_P3)
   424  	MOVQ	R13, (40)(REG_P3)
   425  	MOVQ	R14, (48)(REG_P3)
   426  	MOVQ	R15, (56)(REG_P3)
   427  	MOVQ	CX, (64)(REG_P3)
        	// Limbs 9..11 go through AX and are written to z immediately.
   428  	MOVQ	(72)(REG_P1), AX
   429  	SBBQ	(72)(REG_P2), AX
   430  	MOVQ	AX, (72)(REG_P3)
   431  	MOVQ	(80)(REG_P1), AX
   432  	SBBQ	(80)(REG_P2), AX
   433  	MOVQ	AX, (80)(REG_P3)
   434  	MOVQ	(88)(REG_P1), AX
   435  	SBBQ	(88)(REG_P2), AX
   436  	MOVQ	AX, (88)(REG_P3)
        	// AX <- 0 - 0 - borrow: all ones if x - y borrowed, zero otherwise.
        	// MOVQ (not XORQ) is used to zero AX because the borrow in the
        	// carry flag must survive until the SBBQ.
   437  	MOVQ	$0, AX
   438  	SBBQ	$0, AX
   439  
        	// Build (2*p751 & mask). SI is REG_P2; the y pointer is no longer
        	// needed, so it is reused for limb 0. R8 covers limbs 1..4, which
        	// all share the value P751X2_1.
   440  	MOVQ	P751X2_0, SI
   441  	ANDQ	AX, SI
   442  	MOVQ	P751X2_1, R8
   443  	ANDQ	AX, R8
   444  	MOVQ	P751X2_5, R9
   445  	ANDQ	AX, R9
   446  	MOVQ	P751X2_6, R10
   447  	ANDQ	AX, R10
   448  	MOVQ	P751X2_7, R11
   449  	ANDQ	AX, R11
   450  	MOVQ	P751X2_8, R12
   451  	ANDQ	AX, R12
   452  	MOVQ	P751X2_9, R13
   453  	ANDQ	AX, R13
   454  	MOVQ	P751X2_10, R14
   455  	ANDQ	AX, R14
   456  	MOVQ	P751X2_11, R15
   457  	ANDQ	AX, R15
   458  
        	// z <- z + (2*p751 & mask), one limb at a time through AX. The
        	// loads and stores between the ADCQs leave the carry flag intact.
   459  	MOVQ	(REG_P3), AX
   460  	ADDQ	SI, AX
   461  	MOVQ	AX, (REG_P3)
   462  	MOVQ	(8)(REG_P3), AX
   463  	ADCQ	R8, AX
   464  	MOVQ	AX, (8)(REG_P3)
   465  	MOVQ	(16)(REG_P3), AX
   466  	ADCQ	R8, AX
   467  	MOVQ	AX, (16)(REG_P3)
   468  	MOVQ	(24)(REG_P3), AX
   469  	ADCQ	R8, AX
   470  	MOVQ	AX, (24)(REG_P3)
   471  	MOVQ	(32)(REG_P3), AX
   472  	ADCQ	R8, AX
   473  	MOVQ	AX, (32)(REG_P3)
   474  	MOVQ	(40)(REG_P3), AX
   475  	ADCQ	R9, AX
   476  	MOVQ	AX, (40)(REG_P3)
   477  	MOVQ	(48)(REG_P3), AX
   478  	ADCQ	R10, AX
   479  	MOVQ	AX, (48)(REG_P3)
   480  	MOVQ	(56)(REG_P3), AX
   481  	ADCQ	R11, AX
   482  	MOVQ	AX, (56)(REG_P3)
   483  	MOVQ	(64)(REG_P3), AX
   484  	ADCQ	R12, AX
   485  	MOVQ	AX, (64)(REG_P3)
   486  	MOVQ	(72)(REG_P3), AX
   487  	ADCQ	R13, AX
   488  	MOVQ	AX, (72)(REG_P3)
   489  	MOVQ	(80)(REG_P3), AX
   490  	ADCQ	R14, AX
   491  	MOVQ	AX, (80)(REG_P3)
   492  	MOVQ	(88)(REG_P3), AX
   493  	ADCQ	R15, AX
   494  	MOVQ	AX, (88)(REG_P3)
   495  
   496  	RET
   497  
   498  TEXT ·mulP751(SB), $96-24
   499  
   500  	// Here we store the destination in CX instead of in REG_P3 because the
   501  	// multiplication instructions use DX as an implicit destination
   502  	// operand: MULQ $REG sets DX:AX <-- AX * $REG.
   503  
   504  	MOVQ	z+0(FP), CX
   505  	MOVQ	x+8(FP), REG_P1
   506  	MOVQ	y+16(FP), REG_P2
   507  
   508  	XORQ	AX, AX
   509  	MOVQ	(48)(REG_P1), R8
   510  	MOVQ	(56)(REG_P1), R9
   511  	MOVQ	(64)(REG_P1), R10
   512  	MOVQ	(72)(REG_P1), R11
   513  	MOVQ	(80)(REG_P1), R12
   514  	MOVQ	(88)(REG_P1), R13
   515  	ADDQ	(REG_P1), R8
   516  	ADCQ	(8)(REG_P1), R9
   517  	ADCQ	(16)(REG_P1), R10
   518  	ADCQ	(24)(REG_P1), R11
   519  	ADCQ	(32)(REG_P1), R12
   520  	ADCQ	(40)(REG_P1), R13
   521  	MOVQ	R8, (CX)
   522  	MOVQ	R9, (8)(CX)
   523  	MOVQ	R10, (16)(CX)
   524  	MOVQ	R11, (24)(CX)
   525  	MOVQ	R12, (32)(CX)
   526  	MOVQ	R13, (40)(CX)
   527  	SBBQ	$0, AX
   528  
   529  	XORQ	DX, DX
   530  	MOVQ	(48)(REG_P2), R8
   531  	MOVQ	(56)(REG_P2), R9
   532  	MOVQ	(64)(REG_P2), R10
   533  	MOVQ	(72)(REG_P2), R11
   534  	MOVQ	(80)(REG_P2), R12
   535  	MOVQ	(88)(REG_P2), R13
   536  	ADDQ	(REG_P2), R8
   537  	ADCQ	(8)(REG_P2), R9
   538  	ADCQ	(16)(REG_P2), R10
   539  	ADCQ	(24)(REG_P2), R11
   540  	ADCQ	(32)(REG_P2), R12
   541  	ADCQ	(40)(REG_P2), R13
   542  	MOVQ	R8, (48)(CX)
   543  	MOVQ	R9, (56)(CX)
   544  	MOVQ	R10, (64)(CX)
   545  	MOVQ	R11, (72)(CX)
   546  	MOVQ	R12, (80)(CX)
   547  	MOVQ	R13, (88)(CX)
   548  	SBBQ	$0, DX
   549  	MOVQ	AX, (80)(SP)
   550  	MOVQ	DX, (88)(SP)
   551  
   552  	// (SP[0-8],R10,R8,R9) <- (AH+AL)*(BH+BL)
   553  
   554  	MOVQ	(CX), R11
   555  	MOVQ	R8, AX
   556  	MULQ	R11
   557  	MOVQ	AX, (SP)		// c0
   558  	MOVQ	DX, R14
   559  
   560  	XORQ	R15, R15
   561  	MOVQ	R9, AX
   562  	MULQ	R11
   563  	XORQ	R9, R9
   564  	ADDQ	AX, R14
   565  	ADCQ	DX, R9
   566  
   567  	MOVQ	(8)(CX), R12
   568  	MOVQ	R8, AX
   569  	MULQ	R12
   570  	ADDQ	AX, R14
   571  	MOVQ	R14, (8)(SP)		// c1
   572  	ADCQ	DX, R9
   573  	ADCQ	$0, R15
   574  
   575  	XORQ	R8, R8
   576  	MOVQ	R10, AX
   577  	MULQ	R11
   578  	ADDQ	AX, R9
   579  	MOVQ	(48)(CX), R13
   580  	ADCQ	DX, R15
   581  	ADCQ	$0, R8
   582  
   583  	MOVQ	(16)(CX), AX
   584  	MULQ	R13
   585  	ADDQ	AX, R9
   586  	ADCQ	DX, R15
   587  	MOVQ	(56)(CX), AX
   588  	ADCQ	$0, R8
   589  
   590  	MULQ	R12
   591  	ADDQ	AX, R9
   592  	MOVQ	R9, (16)(SP)		// c2
   593  	ADCQ	DX, R15
   594  	ADCQ	$0, R8
   595  
   596  	XORQ	R9, R9
   597  	MOVQ	(72)(CX), AX
   598  	MULQ	R11
   599  	ADDQ	AX, R15
   600  	ADCQ	DX, R8
   601  	ADCQ	$0, R9
   602  
   603  	MOVQ	(24)(CX), AX
   604  	MULQ	R13
   605  	ADDQ	AX, R15
   606  	ADCQ	DX, R8
   607  	ADCQ	$0, R9
   608  
   609  	MOVQ	R10, AX
   610  	MULQ	R12
   611  	ADDQ	AX, R15
   612  	ADCQ	DX, R8
   613  	ADCQ	$0, R9
   614  
   615  	MOVQ	(16)(CX), R14
   616  	MOVQ	(56)(CX), AX
   617  	MULQ	R14
   618  	ADDQ	AX, R15
   619  	MOVQ	R15, (24)(SP)		// c3
   620  	ADCQ	DX, R8
   621  	ADCQ	$0, R9
   622  
   623  	XORQ	R10, R10
   624  	MOVQ	(80)(CX), AX
   625  	MULQ	R11
   626  	ADDQ	AX, R8
   627  	ADCQ	DX, R9
   628  	ADCQ	$0, R10
   629  
   630  	MOVQ	(64)(CX), AX
   631  	MULQ	R14
   632  	ADDQ	AX, R8
   633  	ADCQ	DX, R9
   634  	ADCQ	$0, R10
   635  
   636  	MOVQ	(48)(CX), R15
   637  	MOVQ	(32)(CX), AX
   638  	MULQ	R15
   639  	ADDQ	AX, R8
   640  	ADCQ	DX, R9
   641  	ADCQ	$0, R10
   642  
   643  	MOVQ	(72)(CX), AX
   644  	MULQ	R12
   645  	ADDQ	AX, R8
   646  	ADCQ	DX, R9
   647  	ADCQ	$0, R10
   648  
   649  	MOVQ	(24)(CX), R13
   650  	MOVQ	(56)(CX), AX
   651  	MULQ	R13
   652  	ADDQ	AX, R8
   653  	MOVQ	R8, (32)(SP)		// c4
   654  	ADCQ	DX, R9
   655  	ADCQ	$0, R10
   656  
   657  	XORQ	R8, R8
   658  	MOVQ	(88)(CX), AX
   659  	MULQ	R11
   660  	ADDQ	AX, R9
   661  	ADCQ	DX, R10
   662  	ADCQ	$0, R8
   663  
   664  	MOVQ	(64)(CX), AX
   665  	MULQ	R13
   666  	ADDQ	AX, R9
   667  	ADCQ	DX, R10
   668  	ADCQ	$0, R8
   669  
   670  	MOVQ	(72)(CX), AX
   671  	MULQ	R14
   672  	ADDQ	AX, R9
   673  	ADCQ	DX, R10
   674  	ADCQ	$0, R8
   675  
   676  	MOVQ	(40)(CX), AX
   677  	MULQ	R15
   678  	ADDQ	AX, R9
   679  	ADCQ	DX, R10
   680  	ADCQ	$0, R8
   681  
   682  	MOVQ	(80)(CX), AX
   683  	MULQ	R12
   684  	ADDQ	AX, R9
   685  	ADCQ	DX, R10
   686  	ADCQ	$0, R8
   687  
   688  	MOVQ	(32)(CX), R15
   689  	MOVQ	(56)(CX), AX
   690  	MULQ	R15
   691  	ADDQ	AX, R9
   692  	MOVQ	R9, (40)(SP)		// c5
   693  	ADCQ	DX, R10
   694  	ADCQ	$0, R8
   695  
   696  	XORQ	R9, R9
   697  	MOVQ	(64)(CX), AX
   698  	MULQ	R15
   699  	ADDQ	AX, R10
   700  	ADCQ	DX, R8
   701  	ADCQ	$0, R9
   702  
   703  	MOVQ	(88)(CX), AX
   704  	MULQ	R12
   705  	ADDQ	AX, R10
   706  	ADCQ	DX, R8
   707  	ADCQ	$0, R9
   708  
   709  	MOVQ	(80)(CX), AX
   710  	MULQ	R14
   711  	ADDQ	AX, R10
   712  	ADCQ	DX, R8
   713  	ADCQ	$0, R9
   714  
   715  	MOVQ	(40)(CX), R11
   716  	MOVQ	(56)(CX), AX
   717  	MULQ	R11
   718  	ADDQ	AX, R10
   719  	ADCQ	DX, R8
   720  	ADCQ	$0, R9
   721  
   722  	MOVQ	(72)(CX), AX
   723  	MULQ	R13
   724  	ADDQ	AX, R10
   725  	MOVQ	R10, (48)(SP)		// c6
   726  	ADCQ	DX, R8
   727  	ADCQ	$0, R9
   728  
   729  	XORQ	R10, R10
   730  	MOVQ	(88)(CX), AX
   731  	MULQ	R14
   732  	ADDQ	AX, R8
   733  	ADCQ	DX, R9
   734  	ADCQ	$0, R10
   735  
   736  	MOVQ	(64)(CX), AX
   737  	MULQ	R11
   738  	ADDQ	AX, R8
   739  	ADCQ	DX, R9
   740  	ADCQ	$0, R10
   741  
   742  	MOVQ	(80)(CX), AX
   743  	MULQ	R13
   744  	ADDQ	AX, R8
   745  	ADCQ	DX, R9
   746  	ADCQ	$0, R10
   747  
   748  	MOVQ	(72)(CX), AX
   749  	MULQ	R15
   750  	ADDQ	AX, R8
   751  	MOVQ	R8, (56)(SP)		// c7
   752  	ADCQ	DX, R9
   753  	ADCQ	$0, R10
   754  
   755  	XORQ	R8, R8
   756  	MOVQ	(72)(CX), AX
   757  	MULQ	R11
   758  	ADDQ	AX, R9
   759  	ADCQ	DX, R10
   760  	ADCQ	$0, R8
   761  
   762  	MOVQ	(80)(CX), AX
   763  	MULQ	R15
   764  	ADDQ	AX, R9
   765  	ADCQ	DX, R10
   766  	ADCQ	$0, R8
   767  
   768  	MOVQ	(88)(CX), AX
   769  	MULQ	R13
   770  	ADDQ	AX, R9
   771  	MOVQ	R9, (64)(SP)		// c8
   772  	ADCQ	DX, R10
   773  	ADCQ	$0, R8
   774  
   775  	XORQ	R9, R9
   776  	MOVQ	(88)(CX), AX
   777  	MULQ	R15
   778  	ADDQ	AX, R10
   779  	ADCQ	DX, R8
   780  	ADCQ	$0, R9
   781  
   782  	MOVQ	(80)(CX), AX
   783  	MULQ	R11
   784  	ADDQ	AX, R10			// c9
   785  	ADCQ	DX, R8
   786  	ADCQ	$0, R9
   787  
   788  	MOVQ	(88)(CX), AX
   789  	MULQ	R11
   790  	ADDQ	AX, R8			// c10
   791  	ADCQ	DX, R9			// c11
   792  
   793  	MOVQ	(88)(SP), AX
   794  	MOVQ	(CX), DX
   795  	ANDQ	AX, R12
   796  	ANDQ	AX, R14
   797  	ANDQ	AX, DX
   798  	ANDQ	AX, R13
   799  	ANDQ	AX, R15
   800  	ANDQ	AX, R11
   801  	MOVQ	(48)(SP), AX
   802  	ADDQ	AX, DX
   803  	MOVQ	(56)(SP), AX
   804  	ADCQ	AX, R12
   805  	MOVQ	(64)(SP), AX
   806  	ADCQ	AX, R14
   807  	ADCQ	R10, R13
   808  	ADCQ	R8, R15
   809  	ADCQ	R9, R11
   810  	MOVQ	(80)(SP), AX
   811  	MOVQ	DX, (48)(SP)
   812  	MOVQ	R12, (56)(SP)
   813  	MOVQ	R14, (64)(SP)
   814  	MOVQ	R13, (72)(SP)
   815  	MOVQ	R15, (80)(SP)
   816  	MOVQ	R11, (88)(SP)
   817  
   818  	MOVQ	(48)(CX), R8
   819  	MOVQ	(56)(CX), R9
   820  	MOVQ	(64)(CX), R10
   821  	MOVQ	(72)(CX), R11
   822  	MOVQ	(80)(CX), R12
   823  	MOVQ	(88)(CX), R13
   824  	ANDQ	AX, R8
   825  	ANDQ	AX, R9
   826  	ANDQ	AX, R10
   827  	ANDQ	AX, R11
   828  	ANDQ	AX, R12
   829  	ANDQ	AX, R13
   830  	MOVQ	(48)(SP), AX
   831  	ADDQ	AX, R8
   832  	MOVQ	(56)(SP), AX
   833  	ADCQ	AX, R9
   834  	MOVQ	(64)(SP), AX
   835  	ADCQ	AX, R10
   836  	MOVQ	(72)(SP), AX
   837  	ADCQ	AX, R11
   838  	MOVQ	(80)(SP), AX
   839  	ADCQ	AX, R12
   840  	MOVQ	(88)(SP), AX
   841  	ADCQ	AX, R13
   842  	MOVQ	R8, (48)(SP)
   843  	MOVQ	R9, (56)(SP)
   844  	MOVQ	R11, (72)(SP)
   845  
   846  	// CX[0-11] <- AL*BL
   847  	MOVQ	(REG_P1), R11
   848  	MOVQ	(REG_P2), AX
   849  	MULQ	R11
   850  	XORQ	R9, R9
   851  	MOVQ	AX, (CX)		// c0
   852  	MOVQ	R10, (64)(SP)
   853  	MOVQ	DX, R8
   854  
   855  	MOVQ	(8)(REG_P2), AX
   856  	MULQ	R11
   857  	XORQ	R10, R10
   858  	ADDQ	AX, R8
   859  	MOVQ	R12, (80)(SP)
   860  	ADCQ	DX, R9
   861  
   862  	MOVQ	(8)(REG_P1), R12
   863  	MOVQ	(REG_P2), AX
   864  	MULQ	R12
   865  	ADDQ	AX, R8
   866  	MOVQ	R8, (8)(CX)		// c1
   867  	ADCQ	DX, R9
   868  	MOVQ	R13, (88)(SP)
   869  	ADCQ	$0, R10
   870  
   871  	XORQ	R8, R8
   872  	MOVQ	(16)(REG_P2), AX
   873  	MULQ	R11
   874  	ADDQ	AX, R9
   875  	ADCQ	DX, R10
   876  	ADCQ	$0, R8
   877  
   878  	MOVQ	(REG_P2), R13
   879  	MOVQ	(16)(REG_P1), AX
   880  	MULQ	R13
   881  	ADDQ	AX, R9
   882  	ADCQ	DX, R10
   883  	ADCQ	$0, R8
   884  
   885  	MOVQ	(8)(REG_P2), AX
   886  	MULQ	R12
   887  	ADDQ	AX, R9
   888  	MOVQ	R9, (16)(CX)		// c2
   889  	ADCQ	DX, R10
   890  	ADCQ	$0, R8
   891  
   892  	XORQ	R9, R9
   893  	MOVQ	(24)(REG_P2), AX
   894  	MULQ	R11
   895  	ADDQ	AX, R10
   896  	ADCQ	DX, R8
   897  	ADCQ	$0, R9
   898  
   899  	MOVQ	(24)(REG_P1), AX
   900  	MULQ	R13
   901  	ADDQ	AX, R10
   902  	ADCQ	DX, R8
   903  	ADCQ	$0, R9
   904  
   905  	MOVQ	(16)(REG_P2), AX
   906  	MULQ	R12
   907  	ADDQ	AX, R10
   908  	ADCQ	DX, R8
   909  	ADCQ	$0, R9
   910  
   911  	MOVQ	(16)(REG_P1), R14
   912  	MOVQ	(8)(REG_P2), AX
   913  	MULQ	R14
   914  	ADDQ	AX, R10
   915  	MOVQ	R10, (24)(CX)		// c3
   916  	ADCQ	DX, R8
   917  	ADCQ	$0, R9
   918  
   919  	XORQ	R10, R10
   920  	MOVQ	(32)(REG_P2), AX
   921  	MULQ	R11
   922  	ADDQ	AX, R8
   923  	ADCQ	DX, R9
   924  	ADCQ	$0, R10
   925  
   926  	MOVQ	(16)(REG_P2), AX
   927  	MULQ	R14
   928  	ADDQ	AX, R8
   929  	ADCQ	DX, R9
   930  	ADCQ	$0, R10
   931  
   932  	MOVQ	(32)(REG_P1), AX
   933  	MULQ	R13
   934  	ADDQ	AX, R8
   935  	ADCQ	DX, R9
   936  	ADCQ	$0, R10
   937  
   938  	MOVQ	(24)(REG_P2), AX
   939  	MULQ	R12
   940  	ADDQ	AX, R8
   941  	ADCQ	DX, R9
   942  	ADCQ	$0, R10
   943  
   944  	MOVQ	(24)(REG_P1), R13
   945  	MOVQ	(8)(REG_P2), AX
   946  	MULQ	R13
   947  	ADDQ	AX, R8
   948  	MOVQ	R8, (32)(CX)		// c4
   949  	ADCQ	DX, R9
   950  	ADCQ	$0, R10
   951  
   952  	XORQ	R8, R8
   953  	MOVQ	(40)(REG_P2), AX
   954  	MULQ	R11
   955  	ADDQ	AX, R9
   956  	ADCQ	DX, R10
   957  	ADCQ	$0, R8
   958  
   959  	MOVQ	(16)(REG_P2), AX
   960  	MULQ	R13
   961  	ADDQ	AX, R9
   962  	ADCQ	DX, R10
   963  	ADCQ	$0, R8
   964  
   965  	MOVQ	(24)(REG_P2), AX
   966  	MULQ	R14
   967  	ADDQ	AX, R9
   968  	ADCQ	DX, R10
   969  	ADCQ	$0, R8
   970  
   971  	MOVQ	(40)(REG_P1), R11
   972  	MOVQ	(REG_P2), AX
   973  	MULQ	R11
   974  	ADDQ	AX, R9
   975  	ADCQ	DX, R10
   976  	ADCQ	$0, R8
   977  
   978  	MOVQ	(32)(REG_P2), AX
   979  	MULQ	R12
   980  	ADDQ	AX, R9
   981  	ADCQ	DX, R10
   982  	ADCQ	$0, R8
   983  
   984  	MOVQ	(32)(REG_P1), R15
   985  	MOVQ	(8)(REG_P2), AX
   986  	MULQ	R15
   987  	ADDQ	AX, R9
   988  	MOVQ	R9, (40)(CX)		//c5
   989  	ADCQ	DX, R10
   990  	ADCQ	$0, R8
   991  
   992  	XORQ	R9, R9
   993  	MOVQ	(16)(REG_P2), AX
   994  	MULQ	R15
   995  	ADDQ	AX, R10
   996  	ADCQ	DX, R8
   997  	ADCQ	$0, R9
   998  
   999  	MOVQ	(40)(REG_P2), AX
  1000  	MULQ	R12
  1001  	ADDQ	AX, R10
  1002  	ADCQ	DX, R8
  1003  	ADCQ	$0, R9
  1004  
  1005  	MOVQ	(32)(REG_P2), AX
  1006  	MULQ	R14
  1007  	ADDQ	AX, R10
  1008  	ADCQ	DX, R8
  1009  	ADCQ	$0, R9
  1010  
  1011  	MOVQ	(8)(REG_P2), AX
  1012  	MULQ	R11
  1013  	ADDQ	AX, R10
  1014  	ADCQ	DX, R8
  1015  	ADCQ	$0, R9
  1016  
  1017  	MOVQ	(24)(REG_P2), AX
  1018  	MULQ	R13
  1019  	ADDQ	AX, R10
  1020  	MOVQ	R10, (48)(CX)		// c6
  1021  	ADCQ	DX, R8
  1022  	ADCQ	$0, R9
  1023  
  1024  	XORQ	R10, R10
  1025  	MOVQ	(40)(REG_P2), AX
  1026  	MULQ	R14
  1027  	ADDQ	AX, R8
  1028  	ADCQ	DX, R9
  1029  	ADCQ	$0, R10
  1030  
  1031  	MOVQ	(16)(REG_P2), AX
  1032  	MULQ	R11
  1033  	ADDQ	AX, R8
  1034  	ADCQ	DX, R9
  1035  	ADCQ	$0, R10
  1036  
  1037  	MOVQ	(32)(REG_P2), AX
  1038  	MULQ	R13
  1039  	ADDQ	AX, R8
  1040  	ADCQ	DX, R9
  1041  	ADCQ	$0, R10
  1042  
  1043  	MOVQ	(24)(REG_P2), AX
  1044  	MULQ	R15
  1045  	ADDQ	AX, R8
  1046  	MOVQ	R8, (56)(CX)		// c7
  1047  	ADCQ	DX, R9
  1048  	ADCQ	$0, R10
  1049  
  1050  	XORQ	R8, R8
  1051  	MOVQ	(24)(REG_P2), AX
  1052  	MULQ	R11
  1053  	ADDQ	AX, R9
  1054  	ADCQ	DX, R10
  1055  	ADCQ	$0, R8
  1056  
  1057  	MOVQ	(32)(REG_P2), AX
  1058  	MULQ	R15
  1059  	ADDQ	AX, R9
  1060  	ADCQ	DX, R10
  1061  	ADCQ	$0, R8
  1062  
  1063  	MOVQ	(40)(REG_P2), AX
  1064  	MULQ	R13
  1065  	ADDQ	AX, R9
  1066  	MOVQ	R9, (64)(CX)		// c8
  1067  	ADCQ	DX, R10
  1068  	ADCQ	$0, R8
  1069  
  1070  	XORQ	R9, R9
  1071  	MOVQ	(40)(REG_P2), AX
  1072  	MULQ	R15
  1073  	ADDQ	AX, R10
  1074  	ADCQ	DX, R8
  1075  	ADCQ	$0, R9
  1076  
  1077  	MOVQ	(32)(REG_P2), AX
  1078  	MULQ	R11
  1079  	ADDQ	AX, R10
  1080  	MOVQ	R10, (72)(CX)		// c9
  1081  	ADCQ	DX, R8
  1082  	ADCQ	$0, R9
  1083  
  1084  	MOVQ	(40)(REG_P2), AX
  1085  	MULQ	R11
  1086  	ADDQ	AX, R8
  1087  	MOVQ	R8, (80)(CX)		// c10
  1088  	ADCQ	DX, R9
  1089  	MOVQ	R9, (88)(CX)		// c11
  1090  
  1091  	// CX[12-23] <- AH*BH
  1092  	MOVQ	(48)(REG_P1), R11
  1093  	MOVQ	(48)(REG_P2), AX
  1094  	MULQ	R11
  1095  	XORQ	R9, R9
  1096  	MOVQ	AX, (96)(CX)		// c0
  1097  	MOVQ	DX, R8
  1098  
  1099  	MOVQ	(56)(REG_P2), AX
  1100  	MULQ	R11
  1101  	XORQ	R10, R10
  1102  	ADDQ	AX, R8
  1103  	ADCQ	DX, R9
  1104  
  1105  	MOVQ	(56)(REG_P1), R12
  1106  	MOVQ	(48)(REG_P2), AX
  1107  	MULQ	R12
  1108  	ADDQ	AX, R8
  1109  	MOVQ	R8, (104)(CX)		// c1
  1110  	ADCQ	DX, R9
  1111  	ADCQ	$0, R10
  1112  
  1113  	XORQ	R8, R8
  1114  	MOVQ	(64)(REG_P2), AX
  1115  	MULQ	R11
  1116  	ADDQ	AX, R9
  1117  	ADCQ	DX, R10
  1118  	ADCQ	$0, R8
  1119  
  1120  	MOVQ	(48)(REG_P2), R13
  1121  	MOVQ	(64)(REG_P1), AX
  1122  	MULQ	R13
  1123  	ADDQ	AX, R9
  1124  	ADCQ	DX, R10
  1125  	ADCQ	$0, R8
  1126  
  1127  	MOVQ	(56)(REG_P2), AX
  1128  	MULQ	R12
  1129  	ADDQ	AX, R9
  1130  	MOVQ	R9, (112)(CX)		// c2
  1131  	ADCQ	DX, R10
  1132  	ADCQ	$0, R8
  1133  
  1134  	XORQ	R9, R9
  1135  	MOVQ	(72)(REG_P2), AX
  1136  	MULQ	R11
  1137  	ADDQ	AX, R10
  1138  	ADCQ	DX, R8
  1139  	ADCQ	$0, R9
  1140  
  1141  	MOVQ	(72)(REG_P1), AX
  1142  	MULQ	R13
  1143  	ADDQ	AX, R10
  1144  	ADCQ	DX, R8
  1145  	ADCQ	$0, R9
  1146  
  1147  	MOVQ	(64)(REG_P2), AX
  1148  	MULQ	R12
  1149  	ADDQ	AX, R10
  1150  	ADCQ	DX, R8
  1151  	ADCQ	$0, R9
  1152  
  1153  	MOVQ	(64)(REG_P1), R14
  1154  	MOVQ	(56)(REG_P2), AX
  1155  	MULQ	R14
  1156  	ADDQ	AX, R10
  1157  	MOVQ	R10, (120)(CX)		// c3
  1158  	ADCQ	DX, R8
  1159  	ADCQ	$0, R9
  1160  
  1161  	XORQ	R10, R10
  1162  	MOVQ	(80)(REG_P2), AX
  1163  	MULQ	R11
  1164  	ADDQ	AX, R8
  1165  	ADCQ	DX, R9
  1166  	ADCQ	$0, R10
  1167  
  1168  	MOVQ	(64)(REG_P2), AX
  1169  	MULQ	R14
  1170  	ADDQ	AX, R8
  1171  	ADCQ	DX, R9
  1172  	ADCQ	$0, R10
  1173  
  1174  	MOVQ	(80)(REG_P1), R15
  1175  	MOVQ	R13, AX
  1176  	MULQ	R15
  1177  	ADDQ	AX, R8
  1178  	ADCQ	DX, R9
  1179  	ADCQ	$0, R10
  1180  
  1181  	MOVQ	(72)(REG_P2), AX
  1182  	MULQ	R12
  1183  	ADDQ	AX, R8
  1184  	ADCQ	DX, R9
  1185  	ADCQ	$0, R10
  1186  
  1187  	MOVQ	(72)(REG_P1), R13
  1188  	MOVQ	(56)(REG_P2), AX
  1189  	MULQ	R13
  1190  	ADDQ	AX, R8
  1191  	MOVQ	R8, (128)(CX)		// c4
  1192  	ADCQ	DX, R9
  1193  	ADCQ	$0, R10
  1194  
  1195  	XORQ	R8, R8
  1196  	MOVQ	(88)(REG_P2), AX
  1197  	MULQ	R11
  1198  	ADDQ	AX, R9
  1199  	ADCQ	DX, R10
  1200  	ADCQ	$0, R8
  1201  
  1202  	MOVQ	(64)(REG_P2), AX
  1203  	MULQ	R13
  1204  	ADDQ	AX, R9
  1205  	ADCQ	DX, R10
  1206  	ADCQ	$0, R8
  1207  
  1208  	MOVQ	(72)(REG_P2), AX
  1209  	MULQ	R14
  1210  	ADDQ	AX, R9
  1211  	ADCQ	DX, R10
  1212  	ADCQ	$0, R8
  1213  
  1214  	MOVQ	(88)(REG_P1), R11
  1215  	MOVQ	(48)(REG_P2), AX
  1216  	MULQ	R11
  1217  	ADDQ	AX, R9
  1218  	ADCQ	DX, R10
  1219  	ADCQ	$0, R8
  1220  
  1221  	MOVQ	(80)(REG_P2), AX
  1222  	MULQ	R12
  1223  	ADDQ	AX, R9
  1224  	ADCQ	DX, R10
  1225  	ADCQ	$0, R8
  1226  
  1227  	MOVQ	(56)(REG_P2), AX
  1228  	MULQ	R15
  1229  	ADDQ	AX, R9
  1230  	MOVQ	R9, (136)(CX)		// c5
  1231  	ADCQ	DX, R10
  1232  	ADCQ	$0, R8
  1233  
  1234  	XORQ	R9, R9
  1235  	MOVQ	(64)(REG_P2), AX
  1236  	MULQ	R15
  1237  	ADDQ	AX, R10
  1238  	ADCQ	DX, R8
  1239  	ADCQ	$0, R9
  1240  
  1241  	MOVQ	(88)(REG_P2), AX
  1242  	MULQ	R12
  1243  	ADDQ	AX, R10
  1244  	ADCQ	DX, R8
  1245  	ADCQ	$0, R9
  1246  
  1247  	MOVQ	(80)(REG_P2), AX
  1248  	MULQ	R14
  1249  	ADDQ	AX, R10
  1250  	ADCQ	DX, R8
  1251  	ADCQ	$0, R9
  1252  
  1253  	MOVQ	(56)(REG_P2), AX
  1254  	MULQ	R11
  1255  	ADDQ	AX, R10
  1256  	ADCQ	DX, R8
  1257  	ADCQ	$0, R9
  1258  
  1259  	MOVQ	(72)(REG_P2), AX
  1260  	MULQ	R13
  1261  	ADDQ	AX, R10
  1262  	MOVQ	R10, (144)(CX)		// c6
  1263  	ADCQ	DX, R8
  1264  	ADCQ	$0, R9
  1265  
  1266  	XORQ	R10, R10
  1267  	MOVQ	(88)(REG_P2), AX
  1268  	MULQ	R14
  1269  	ADDQ	AX, R8
  1270  	ADCQ	DX, R9
  1271  	ADCQ	$0, R10
  1272  
  1273  	MOVQ	(64)(REG_P2), AX
  1274  	MULQ	R11
  1275  	ADDQ	AX, R8
  1276  	ADCQ	DX, R9
  1277  	ADCQ	$0, R10
  1278  
  1279  	MOVQ	(80)(REG_P2), AX
  1280  	MULQ	R13
  1281  	ADDQ	AX, R8
  1282  	ADCQ	DX, R9
  1283  	ADCQ	$0, R10
  1284  
  1285  	MOVQ	(72)(REG_P2), AX
  1286  	MULQ	R15
  1287  	ADDQ	AX, R8
  1288  	MOVQ	R8, (152)(CX)		// c7
  1289  	ADCQ	DX, R9
  1290  	ADCQ	$0, R10
  1291  
  1292  	XORQ	R8, R8
  1293  	MOVQ	(72)(REG_P2), AX
  1294  	MULQ	R11
  1295  	ADDQ	AX, R9
  1296  	ADCQ	DX, R10
  1297  	ADCQ	$0, R8
  1298  
  1299  	MOVQ	(80)(REG_P2), AX
  1300  	MULQ	R15
  1301  	ADDQ	AX, R9
  1302  	ADCQ	DX, R10
  1303  	ADCQ	$0, R8
  1304  
  1305  	MOVQ	(88)(REG_P2), AX
  1306  	MULQ	R13
  1307  	ADDQ	AX, R9
  1308  	MOVQ	R9, (160)(CX)		// c8
  1309  	ADCQ	DX, R10
  1310  	ADCQ	$0, R8
  1311  
  1312  	MOVQ	(88)(REG_P2), AX
  1313  	MULQ	R15
  1314  	ADDQ	AX, R10
  1315  	ADCQ	DX, R8
  1316  
  1317  	MOVQ	(80)(REG_P2), AX
  1318  	MULQ	R11
  1319  	ADDQ	AX, R10
  1320  	MOVQ	R10, (168)(CX)		// c9
  1321  	ADCQ	DX, R8
  1322  
  1323  	MOVQ	(88)(REG_P2), AX
  1324  	MULQ	R11
  1325  	ADDQ	AX, R8
  1326  	MOVQ	R8, (176)(CX)		// c10
  1327  	ADCQ	$0, DX
  1328  	MOVQ	DX, (184)(CX)		// c11
  1329  
  1330  	// [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL)-AL*BL
  1331  	MOVQ	(SP), R8
  1332  	SUBQ	(CX), R8
  1333  	MOVQ	(8)(SP), R9
  1334  	SBBQ	(8)(CX), R9
  1335  	MOVQ	(16)(SP), R10
  1336  	SBBQ	(16)(CX), R10
  1337  	MOVQ	(24)(SP), R11
  1338  	SBBQ	(24)(CX), R11
  1339  	MOVQ	(32)(SP), R12
  1340  	SBBQ	(32)(CX), R12
  1341  	MOVQ	(40)(SP), R13
  1342  	SBBQ	(40)(CX), R13
  1343  	MOVQ	(48)(SP), R14
  1344  	SBBQ	(48)(CX), R14
  1345  	MOVQ	(56)(SP), R15
  1346  	SBBQ	(56)(CX), R15
  1347  	MOVQ	(64)(SP), AX
  1348  	SBBQ	(64)(CX), AX
  1349  	MOVQ	(72)(SP), DX
  1350  	SBBQ	(72)(CX), DX
  1351  	MOVQ	(80)(SP), DI
  1352  	SBBQ	(80)(CX), DI
  1353  	MOVQ	(88)(SP), SI
  1354  	SBBQ	(88)(CX), SI
  1355  	MOVQ	SI, (SP)
  1356  
  1357  	// [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
  1358  	MOVQ	(96)(CX), SI
  1359  	SUBQ	SI, R8
  1360  	MOVQ	(104)(CX), SI
  1361  	SBBQ	SI, R9
  1362  	MOVQ	(112)(CX), SI
  1363  	SBBQ	SI, R10
  1364  	MOVQ	(120)(CX), SI
  1365  	SBBQ	SI, R11
  1366  	MOVQ	(128)(CX), SI
  1367  	SBBQ	SI, R12
  1368  	MOVQ	(136)(CX), SI
  1369  	SBBQ	SI, R13
  1370  	MOVQ	(144)(CX), SI
  1371  	SBBQ	SI, R14
  1372  	MOVQ	(152)(CX), SI
  1373  	SBBQ	SI, R15
  1374  	MOVQ	(160)(CX), SI
  1375  	SBBQ	SI, AX
  1376  	MOVQ	(168)(CX), SI
  1377  	SBBQ	SI, DX
  1378  	MOVQ	(176)(CX), SI
  1379  	SBBQ	SI, DI
  1380  	MOVQ	(SP), SI
  1381  	SBBQ	(184)(CX), SI
  1382  
  1383  	// FINAL RESULT
  1384  	ADDQ	(48)(CX), R8
  1385  	MOVQ	R8, (48)(CX)
  1386  	ADCQ	(56)(CX), R9
  1387  	MOVQ	R9, (56)(CX)
  1388  	ADCQ	(64)(CX), R10
  1389  	MOVQ	R10, (64)(CX)
  1390  	ADCQ	(72)(CX), R11
  1391  	MOVQ	R11, (72)(CX)
  1392  	ADCQ	(80)(CX), R12
  1393  	MOVQ	R12, (80)(CX)
  1394  	ADCQ	(88)(CX), R13
  1395  	MOVQ	R13, (88)(CX)
  1396  	ADCQ	(96)(CX), R14
  1397  	MOVQ	R14, (96)(CX)
  1398  	ADCQ	(104)(CX), R15
  1399  	MOVQ	R15, (104)(CX)
  1400  	ADCQ	(112)(CX), AX
  1401  	MOVQ	AX, (112)(CX)
  1402  	ADCQ	(120)(CX), DX
  1403  	MOVQ	DX, (120)(CX)
  1404  	ADCQ	(128)(CX), DI
  1405  	MOVQ	DI, (128)(CX)
  1406  	ADCQ	(136)(CX), SI
  1407  	MOVQ	SI, (136)(CX)
  1408  	MOVQ	(144)(CX), AX
  1409  	ADCQ	$0, AX
  1410  	MOVQ	AX, (144)(CX)
  1411  	MOVQ	(152)(CX), AX
  1412  	ADCQ	$0, AX
  1413  	MOVQ	AX, (152)(CX)
  1414  	MOVQ	(160)(CX), AX
  1415  	ADCQ	$0, AX
  1416  	MOVQ	AX, (160)(CX)
  1417  	MOVQ	(168)(CX), AX
  1418  	ADCQ	$0, AX
  1419  	MOVQ	AX, (168)(CX)
  1420  	MOVQ	(176)(CX), AX
  1421  	ADCQ	$0, AX
  1422  	MOVQ	AX, (176)(CX)
  1423  	MOVQ	(184)(CX), AX
  1424  	ADCQ	$0, AX
  1425  	MOVQ	AX, (184)(CX)
  1426  
  1427  	RET
  1428  
  1429  // This multiplies a 256-bit number pointed to by M0 with p751+1.
  1430  // It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
        // Only the seven limbs at M1+40(SB) .. M1+88(SB) (448 bits) are read.
  1431  // C points to the place to store the result and should be at least 192 bits.
        // The three lowest product limbs are written to 0+C, 8+C and 16+C. The
        // remaining eight limbs are left in registers, from least to most
        // significant: T7, T0, T1, T3, T6, T5, T2, T4.
        // AX and DX are overwritten; T8 and T9 are scratch.
        // The macro contains no flag-clearing instruction before its first ADOXQ
        // and ADCXQ. NOTE(review): this appears to require OF = CF = 0 on entry --
        // confirm that every expansion site guarantees it.
  1432  // This should only be used when the BMI2 and ADX instruction set extensions
  1433  // are available.
  1434  #define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
  1435  	MOVQ	0+M0, DX		\	// row 0: DX = first multiplier limb (implicit MULXQ operand)
  1436  	MOVQ    M1+40(SB), AX   \
  1437  	MULXQ	AX, T1, T0      \
  1438  	MOVQ    M1+48(SB), AX   \
  1439  	MULXQ	AX, T3, T2   	\
  1440  	MOVQ	T1, 0+C			\	// C0_final
  1441  	MOVQ    M1+56(SB), AX   \
  1442  	MULXQ	AX, T5, T4	    \
  1443  	ADOXQ	T3, T0			\	// OF-based carry chain starts here
  1444  	ADOXQ	T5, T2			\
  1445  	MOVQ    M1+64(SB), AX   \
  1446  	MULXQ	AX, T3, T1	    \
  1447  	ADOXQ	T3, T4			\
  1448  	MOVQ    M1+72(SB), AX   \
  1449  	MULXQ	AX, T6, T5	    \
  1450  	ADOXQ	T6, T1			\
  1451  	MOVQ    M1+80(SB), AX   \
  1452  	MULXQ	AX, T7, T3	    \
  1453  	ADOXQ	T7, T5			\
  1454  	MOVQ    M1+88(SB), AX   \
  1455  	MULXQ	AX, T8, T6	    \
  1456  	ADOXQ	T8, T3			\
  1457  	MOVL	$0, AX			\	// AX = 0 via MOVL so that the flags are left untouched
  1458  	ADOXQ	AX, T6			\
  1459  					\
  1460  	MOVQ	8+M0, DX		\	// row 1: DX = second multiplier limb
  1461  	MOVQ    M1+40(SB), AX   \
  1462  	MULXQ	AX, T7, T8	    \
  1463  	ADCXQ	T7, T0			\	// CF-based carry chain starts here
  1464  	MOVQ	T0, 8+C			\	// C1_final
  1465  	ADCXQ	T8, T2			\
  1466  	MOVQ    M1+48(SB), AX   \
  1467  	MULXQ	AX, T8, T7	    \
  1468  	ADOXQ	T8, T2			\
  1469  	ADCXQ	T7, T4			\
  1470  	MOVQ    M1+56(SB), AX   \
  1471  	MULXQ	AX, T8, T0	    \
  1472  	ADOXQ	T8, T4			\
  1473  	ADCXQ	T1, T0			\
  1474  	MOVQ    M1+64(SB), AX   \
  1475  	MULXQ	AX, T7, T1      \
  1476  	ADCXQ	T5, T1			\
  1477  	MOVQ    M1+72(SB), AX   \
  1478  	MULXQ	AX, T8, T5	    \
  1479  	ADCXQ	T5, T3			\
  1480  	MOVQ    M1+80(SB), AX   \
  1481  	MULXQ	AX, T9, T5   	\
  1482  	ADCXQ	T5, T6			\
  1483  	MOVQ    M1+88(SB), AX   \
  1484  	MULXQ	AX, DX, T5	    \	// DX (the multiplier) is overwritten with the low half
  1485  	MOVL	$0, AX			\
  1486  	ADCXQ	AX, T5			\
  1487  					\
  1488  	ADOXQ	T7, T0			\	// fold the parked low halves in through the OF chain
  1489  	ADOXQ	T8, T1			\
  1490  	ADOXQ	T9, T3			\
  1491  	ADOXQ	DX, T6			\
  1492  	ADOXQ	AX, T5			\
  1493  					\
  1494  	MOVQ	16+M0, DX		\	// row 2: DX = third multiplier limb
  1495  	MOVQ    M1+40(SB), AX   \
  1496  	MULXQ	AX, T7, T8	    \
  1497  	ADCXQ	T7, T2			\
  1498  	MOVQ	T2, 16+C		\	// C2_final
  1499  	ADCXQ	T8, T4			\
  1500  	MOVQ    M1+48(SB), AX   \
  1501  	MULXQ	AX, T7, T8	    \
  1502  	ADOXQ	T7, T4			\
  1503  	ADCXQ	T8, T0			\
  1504  	MOVQ    M1+56(SB), AX   \
  1505  	MULXQ	AX, T8, T2	    \
  1506  	ADOXQ	T8, T0			\
  1507  	ADCXQ	T2, T1			\
  1508  	MOVQ    M1+64(SB), AX   \
  1509  	MULXQ	AX, T7, T2	    \
  1510  	ADCXQ	T2, T3			\
  1511  	MOVQ    M1+72(SB), AX   \
  1512  	MULXQ	AX, T8, T2	    \
  1513  	ADCXQ	T2, T6			\
  1514  	MOVQ    M1+80(SB), AX   \
  1515  	MULXQ	AX, T9, T2	    \
  1516  	ADCXQ	T2, T5			\
  1517  	MOVQ    M1+88(SB), AX   \
  1518  	MULXQ	AX, DX, T2      \
  1519  	MOVL	$0, AX			\
  1520  	ADCXQ	AX, T2			\
  1521  					\
  1522  	ADOXQ	T7, T1			\
  1523  	ADOXQ	T8, T3			\
  1524  	ADOXQ	T9, T6			\
  1525  	ADOXQ	DX, T5			\
  1526  	ADOXQ	AX, T2			\
  1527  					\
  1528  	MOVQ	24+M0, DX		\	// row 3: DX = fourth multiplier limb
  1529  	MOVQ    M1+40(SB), AX   \
  1530  	MULXQ	AX, T7, T8	    \
  1531  	ADCXQ	T4, T7			\	// T7 = product limb 3, kept in a register instead of memory
  1532  	ADCXQ	T8, T0			\
  1533  	MOVQ    M1+48(SB), AX   \
  1534  	MULXQ	AX, T9, T8		\
  1535  	ADOXQ	T9, T0			\
  1536  	ADCXQ	T8, T1			\
  1537  	MOVQ    M1+56(SB), AX   \
  1538  	MULXQ	AX, T8, T4      \
  1539  	ADOXQ	T8, T1			\
  1540  	ADCXQ	T4, T3			\
  1541  	MOVQ    M1+64(SB), AX   \
  1542  	MULXQ	AX, AX, T4		\	// low half goes to AX because T7 is live in this row
  1543  	ADCXQ	T4, T6			\
  1544  	ADOXQ	AX, T3			\
  1545  	MOVQ    M1+72(SB), AX   \
  1546  	MULXQ	AX, T8, T4	    \
  1547  	ADCXQ	T4, T5			\
  1548  	MOVQ    M1+80(SB), AX   \
  1549  	MULXQ	AX, T9, T4	    \
  1550  	ADCXQ	T4, T2			\
  1551  	MOVQ    M1+88(SB), AX   \
  1552  	MULXQ	AX, DX, T4	    \
  1553  	MOVL	$0, AX			\
  1554  	ADCXQ	AX, T4			\
  1555  					\
  1556  	ADOXQ	T8, T6			\
  1557  	ADOXQ	T9, T5			\
  1558  	ADOXQ	DX, T2			\
  1559  	ADOXQ	AX, T4
  1560  
  1561  // This multiplies a 256-bit number pointed to by M0 with p751+1.
  1562  // It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
        // Only the seven limbs at M1+40(SB) .. M1+88(SB) (448 bits) are read.
  1563  // C points to the place to store the result and should be at least 384 bits:
        // the three lowest product limbs are written to 0+C, 8+C and 16+C, and
        // 32+C and 40+C are overwritten as scratch space (24+C is not touched).
        // The remaining eight product limbs are left in registers, from least to
        // most significant: T7, T0, T1, T3, T6, T5, T2, T4.
        // AX and DX are overwritten; T8 and T9 are scratch. The stack slot 8(SP)
        // is used to spill T7, so the enclosing function must reserve it.
  1564  // This should only be used when the BMI2 instruction set extension is
  1565  // available.
  1566  #define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
  1567  	MOVQ	0+M0, DX		\	// row 0: DX = first multiplier limb (implicit MULXQ operand)
  1568  	MOVQ    M1+40(SB), AX   \
  1569  	MULXQ	AX, T1, T0   	\
  1570  	MOVQ    M1+48(SB), AX   \
  1571  	MULXQ	AX, T3, T2	    \
  1572  	MOVQ	T1, 0+C			\	// C0_final
  1573  	MOVQ    M1+56(SB), AX   \
  1574  	MULXQ	AX, T5, T4	    \
  1575  	ADDQ	T3, T0			\	// ADDQ starts a fresh carry chain
  1576  	ADCQ	T5, T2			\
  1577  	MOVQ    M1+64(SB), AX   \
  1578  	MULXQ	AX, T3, T1	    \
  1579  	ADCQ	T3, T4			\
  1580  	MOVQ    M1+72(SB), AX   \
  1581  	MULXQ	AX, T6, T5	    \
  1582  	ADCQ	T6, T1			\
  1583  	MOVQ    M1+80(SB), AX   \
  1584  	MULXQ	AX, T7, T3	    \
  1585  	ADCQ	T7, T5			\
  1586  	MOVQ    M1+88(SB), AX   \
  1587  	MULXQ	AX, T8, T6	    \
  1588  	ADCQ	T8, T3			\
  1589  	ADCQ	$0, T6			\
  1590  					\
  1591  	MOVQ	8+M0, DX		\	// row 1: DX = second multiplier limb
  1592  	MOVQ    M1+40(SB), AX   \
  1593  	MULXQ	AX, T7, T8	    \
  1594  	ADDQ	T7, T0			\
  1595  	MOVQ	T0, 8+C			\	// C1_final
  1596  	ADCQ	T8, T2			\
  1597  	MOVQ    M1+48(SB), AX   \
  1598  	MULXQ	AX, T8, T7	    \
  1599  	MOVQ	T8, 32+C		\	// park low half in scratch slot 32+C
  1600  	ADCQ	T7, T4			\
  1601  	MOVQ    M1+56(SB), AX   \
  1602  	MULXQ	AX, T8, T0	    \
  1603  	MOVQ	T8, 40+C	 	\	// park low half in scratch slot 40+C
  1604  	ADCQ	T1, T0			\
  1605  	MOVQ    M1+64(SB), AX   \
  1606  	MULXQ	AX, T7, T1	    \
  1607  	ADCQ	T5, T1			\
  1608  	MOVQ    M1+72(SB), AX   \
  1609  	MULXQ	AX, T8, T5	    \
  1610  	ADCQ	T5, T3			\
  1611  	MOVQ    M1+80(SB), AX   \
  1612  	MULXQ	AX, T9, T5	    \
  1613  	ADCQ	T5, T6			\
  1614  	MOVQ    M1+88(SB), AX   \
  1615  	MULXQ	AX, DX, T5	    \	// DX (the multiplier) is overwritten with the low half
  1616  	ADCQ	$0, T5			\
  1617  					\
  1618  	XORQ	AX, AX			\	// AX = 0 for the final carry fold below
  1619  	ADDQ	32+C, T2		\	// second chain: add the parked low halves
  1620  	ADCQ	40+C, T4		\
  1621  	ADCQ	T7, T0			\
  1622  	ADCQ	T8, T1			\
  1623  	ADCQ	T9, T3			\
  1624  	ADCQ	DX, T6			\
  1625  	ADCQ	AX, T5			\
  1626  					\
  1627  	MOVQ	16+M0, DX		\	// row 2: DX = third multiplier limb
  1628  	MOVQ    M1+40(SB), AX   \
  1629  	MULXQ	AX, T7, T8	    \
  1630  	ADDQ	T7, T2			\
  1631  	MOVQ	T2, 16+C		\	// C2_final
  1632  	ADCQ	T8, T4			\
  1633  	MOVQ    M1+48(SB), AX   \
  1634  	MULXQ	AX, T7, T8	    \
  1635  	MOVQ	T7, 32+C		\
  1636  	ADCQ	T8, T0			\
  1637  	MOVQ    M1+56(SB), AX   \
  1638  	MULXQ	AX, T8, T2	    \
  1639  	MOVQ	T8, 40+C		\
  1640  	ADCQ	T2, T1			\
  1641  	MOVQ    M1+64(SB), AX   \
  1642  	MULXQ	AX, T7, T2	    \
  1643  	ADCQ	T2, T3			\
  1644  	MOVQ    M1+72(SB), AX   \
  1645  	MULXQ	AX, T8, T2	    \
  1646  	ADCQ	T2, T6			\
  1647  	MOVQ    M1+80(SB), AX   \
  1648  	MULXQ	AX, T9, T2	    \
  1649  	ADCQ	T2, T5			\
  1650  	MOVQ    M1+88(SB), AX   \
  1651  	MULXQ	AX, DX, T2	    \
  1652  	ADCQ	$0, T2			\
  1653  					\
  1654  	XORQ	AX, AX			\
  1655  	ADDQ	32+C, T4		\
  1656  	ADCQ	40+C, T0		\
  1657  	ADCQ	T7, T1			\
  1658  	ADCQ	T8, T3			\
  1659  	ADCQ	T9, T6			\
  1660  	ADCQ	DX, T5			\
  1661  	ADCQ	AX, T2			\
  1662  					\
  1663  	MOVQ	24+M0, DX		\	// row 3: DX = fourth multiplier limb
  1664  	MOVQ    M1+40(SB), AX   \
  1665  	MULXQ	AX, T7, T8	    \
  1666  	ADDQ	T4, T7			\	// T7 = product limb 3
  1667  	MOVQ    T7, 8(SP) /* push T7 */ \
  1668  	ADCQ	T8, T0			\
  1669  	MOVQ    M1+48(SB), AX   \
  1670  	MULXQ	AX, T9, T8  	\
  1671  	MOVQ	T9, 32+C 		\
  1672  	ADCQ	T8, T1			\
  1673  	MOVQ    M1+56(SB), AX   \
  1674  	MULXQ	AX, T8, T4	    \
  1675  	MOVQ	T8, 40+C		\
  1676  	ADCQ	T4, T3			\
  1677  	MOVQ    M1+64(SB), AX   \
  1678  	MULXQ	AX, T7, T4		\	// T7 is reused here; its earlier value was spilled to 8(SP)
  1679  	ADCQ	T4, T6			\
  1680  	MOVQ    M1+72(SB), AX   \
  1681  	MULXQ	AX, T8, T4	    \
  1682  	ADCQ	T4, T5			\
  1683  	MOVQ    M1+80(SB), AX   \
  1684  	MULXQ	AX, T9, T4	    \
  1685  	ADCQ	T4, T2			\
  1686  	MOVQ    M1+88(SB), AX   \
  1687  	MULXQ	AX, DX, T4	    \
  1688  	ADCQ	$0, T4			\
  1689  					\
  1690  	XORQ	AX, AX			\
  1691  	ADDQ	32+C, T0		\
  1692  	ADCQ	40+C, T1		\
  1693  	ADCQ	T7, T3			\
  1694  	ADCQ	T8, T6			\
  1695  	ADCQ	T9, T5			\
  1696  	ADCQ	DX, T2			\
  1697  	ADCQ	AX, T4			\
  1698  	MOVQ 8(SP), T7 /* pop T7 */
  1699  
  1700  // Template for calculating the Montgomery reduction algorithm described in
  1701  // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
  1702  // customized with schoolbook multiplication for 256 x 448-bit number.
  1703  // This macro reuses memory of IN value and *changes* it. Smashes registers
  1704  // R[8-15], AX, BX, CX, DX, BP.
  1705  // Input:
  1706  //    * M0: 1536-bit number to be reduced
  1707  //    * MULS: either mul256x448bmi2 or mul256x448bmi2adx
  1708  // Output: C 768-bit
        // MULS is expanded three times, on the limbs at 0+M0, 32+M0 and 64+M0.
        // Each time its low product limbs are written to 48+C, 56+C and 64+C,
        // which therefore act as scratch until the final stores overwrite them.
  1709  #define REDC(C, M0, MULS) 	\
  1710      \ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
  1711      MULS(M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX) \
  1712      XORQ    R15, R15        \   // R15 = 0; it receives 128+M0 plus the carry below
  1713      MOVQ    48+C, AX        \
  1714      MOVQ    56+C, DX        \
  1715      MOVQ    64+C, BX        \
  1716      ADDQ    40+M0, AX       \   // add the 11-limb product into M0 starting at offset 40
  1717      ADCQ    48+M0, DX       \
  1718      ADCQ    56+M0, BX       \
  1719      MOVQ    AX, 40+M0       \
  1720      MOVQ    DX, 48+M0       \
  1721      MOVQ    BX, 56+M0       \
  1722      ADCQ    64+M0, BP       \
  1723      ADCQ    72+M0, R8       \
  1724      ADCQ    80+M0, R9       \
  1725      ADCQ    88+M0, R10      \
  1726      ADCQ    96+M0, R11      \
  1727      ADCQ    104+M0, R12     \
  1728      ADCQ    112+M0, R13     \
  1729      ADCQ    120+M0, R14     \
  1730      ADCQ    128+M0, R15     \
  1731      MOVQ    BP, 64+M0       \
  1732      MOVQ    R8, 72+M0       \
  1733      MOVQ    R9, 80+M0       \
  1734      MOVQ    R10, 88+M0      \
  1735      MOVQ    R11, 96+M0      \
  1736      MOVQ    R12, 104+M0     \
  1737      MOVQ    R13, 112+M0     \
  1738      MOVQ    R14, 120+M0     \
  1739      MOVQ    R15, 128+M0     \
  1740      MOVQ    136+M0, R8      \   // the MOVQs leave CF intact, so the carry ripples on
  1741      MOVQ    144+M0, R9      \
  1742      MOVQ    152+M0, R10     \
  1743      MOVQ    160+M0, R11     \
  1744      MOVQ    168+M0, R12     \
  1745      MOVQ    176+M0, R13     \
  1746      MOVQ    184+M0, R14     \
  1747      ADCQ    $0, R8          \
  1748      ADCQ    $0, R9          \
  1749      ADCQ    $0, R10         \
  1750      ADCQ    $0, R11         \
  1751      ADCQ    $0, R12         \
  1752      ADCQ    $0, R13         \
  1753      ADCQ    $0, R14         \
  1754      MOVQ    R8, 136+M0      \
  1755      MOVQ    R9, 144+M0      \
  1756      MOVQ    R10, 152+M0     \
  1757      MOVQ    R11, 160+M0     \
  1758      MOVQ    R12, 168+M0     \
  1759      MOVQ    R13, 176+M0     \
  1760      MOVQ    R14, 184+M0     \
  1761      \ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
  1762      MULS(32+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX) \
  1763      XORQ    R15, R15          \
  1764      MOVQ    48+C, AX        \
  1765      MOVQ    56+C, DX        \
  1766      MOVQ    64+C, BX        \
  1767      ADDQ    72+M0, AX       \   // add the 11-limb product into M0 starting at offset 72
  1768      ADCQ    80+M0, DX       \
  1769      ADCQ    88+M0, BX       \
  1770      MOVQ    AX, 72+M0       \
  1771      MOVQ    DX, 80+M0       \
  1772      MOVQ    BX, 88+M0       \
  1773      ADCQ    96+M0, BP       \
  1774      ADCQ    104+M0, R8      \
  1775      ADCQ    112+M0, R9      \
  1776      ADCQ    120+M0, R10     \
  1777      ADCQ    128+M0, R11     \
  1778      ADCQ    136+M0, R12     \
  1779      ADCQ    144+M0, R13     \
  1780      ADCQ    152+M0, R14     \
  1781      ADCQ    160+M0, R15     \
  1782      MOVQ    BP, 0+C         \   // Final result c0
  1783      MOVQ    R8, 104+M0      \
  1784      MOVQ    R9, 112+M0      \
  1785      MOVQ    R10, 120+M0     \
  1786      MOVQ    R11, 128+M0     \
  1787      MOVQ    R12, 136+M0     \
  1788      MOVQ    R13, 144+M0     \
  1789      MOVQ    R14, 152+M0     \
  1790      MOVQ    R15, 160+M0     \
  1791      MOVQ    168+M0, R12     \
  1792      MOVQ    176+M0, R13     \
  1793      MOVQ    184+M0, R14     \
  1794      ADCQ    $0, R12         \
  1795      ADCQ    $0, R13         \
  1796      ADCQ    $0, R14         \
  1797      MOVQ    R12, 168+M0     \
  1798      MOVQ    R13, 176+M0     \
  1799      MOVQ    R14, 184+M0     \
  1800      \ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
  1801      MULS(64+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX) \
  1802      MOVQ    48+C, AX        \   // Final result c1:c11
  1803      MOVQ    56+C, DX        \
  1804      MOVQ    64+C, BX        \
  1805      ADDQ    104+M0, AX      \
  1806      ADCQ    112+M0, DX      \
  1807      ADCQ    120+M0, BX      \
  1808      MOVQ    AX, 8+C         \
  1809      MOVQ    DX, 16+C        \
  1810      MOVQ    BX, 24+C        \
  1811      ADCQ    128+M0, BP      \
  1812      ADCQ    136+M0, R8      \
  1813      ADCQ    144+M0, R9      \
  1814      ADCQ    152+M0, R10     \
  1815      ADCQ    160+M0, R11     \
  1816      ADCQ    168+M0, R12     \
  1817      ADCQ    176+M0, R13     \
  1818      ADCQ    184+M0, R14     \   // the carry out of this last limb is not used
  1819      MOVQ    BP, 32+C        \
  1820      MOVQ    R8, 40+C        \
  1821      MOVQ    R9, 48+C        \   // the scratch slots 48+C..64+C now receive final limbs
  1822      MOVQ    R10, 56+C       \
  1823      MOVQ    R11, 64+C       \
  1824      MOVQ    R12, 72+C       \
  1825      MOVQ    R13, 80+C       \
  1826      MOVQ    R14, 88+C
  1827  
  1828  TEXT ·rdcP751(SB), $16-16
        	// rdcP751 takes two pointer arguments: z at +0(FP) and x at +8(FP).
        	// Every code path reads the 24 limbs at x (offsets 0..184) and writes
        	// the 12 limbs at z (offsets 0..88). The generic path below only reads
        	// x; the two MULX paths expand REDC, which also overwrites x.
        	// Frame layout: 0(SP) holds the saved BP in the MULX paths, and 8(SP)
        	// is the spill slot used by mul256x448bmi2.
  1829  	MOVQ z+0(FP), REG_P2
  1830  	MOVQ x+8(FP), REG_P1
  1831  
  1832  	// Check whether to use optimized implementation
  1833  	CMPB    ·HasADXandBMI2(SB), $1
  1834  	JE      redc_with_mulx_adcx_adox
  1835  	CMPB    ·HasBMI2(SB), $1
  1836  	JE      redc_with_mulx
  1837  
        	// Generic path (MULQ only). Each output limb is one column of partial
        	// products against the immediates P751P1_5..P751P1_11, plus the
        	// matching limb of x. R8, R9 and R10 rotate as a three-limb accumulator.
        	// The first stores to 40..88(REG_P2) are intermediate values: they are
        	// reloaded as multipliers further down and overwritten by the second
        	// round of Z5..Z11 stores.
  1838  	MOVQ	(REG_P1), R11		// R11 = x[0]
  1839  	MOVQ	P751P1_5, AX
  1840  	MULQ	R11
  1841  	XORQ	R8, R8
  1842  	ADDQ	(40)(REG_P1), AX
  1843  	MOVQ	AX, (40)(REG_P2)		// Z5
  1844  	ADCQ	DX, R8
  1845  
  1846  	XORQ	R9, R9
  1847  	MOVQ	P751P1_6, AX
  1848  	MULQ	R11
  1849  	XORQ	R10, R10
  1850  	ADDQ	AX, R8
  1851  	ADCQ	DX, R9
  1852  
  1853  	MOVQ	(8)(REG_P1), R12		// R12 = x[1]
  1854  	MOVQ	P751P1_5, AX
  1855  	MULQ	R12
  1856  	ADDQ	AX, R8
  1857  	ADCQ	DX, R9
  1858  	ADCQ	$0, R10
  1859  	ADDQ	(48)(REG_P1), R8
  1860  	MOVQ	R8, (48)(REG_P2)		// Z6
  1861  	ADCQ	$0, R9
  1862  	ADCQ	$0, R10
  1863  
  1864  	XORQ	R8, R8
  1865  	MOVQ	P751P1_7, AX
  1866  	MULQ	R11
  1867  	ADDQ	AX, R9
  1868  	ADCQ	DX, R10
  1869  	ADCQ	$0, R8
  1870  
  1871  	MOVQ	P751P1_6, AX
  1872  	MULQ	R12
  1873  	ADDQ	AX, R9
  1874  	ADCQ	DX, R10
  1875  	ADCQ	$0, R8
  1876  
  1877  	MOVQ	(16)(REG_P1), R13		// R13 = x[2]
  1878  	MOVQ	P751P1_5, AX
  1879  	MULQ	R13
  1880  	ADDQ	AX, R9
  1881  	ADCQ	DX, R10
  1882  	ADCQ	$0, R8
  1883  	ADDQ	(56)(REG_P1), R9
  1884  	MOVQ	R9, (56)(REG_P2)		// Z7
  1885  	ADCQ	$0, R10
  1886  	ADCQ	$0, R8
  1887  
  1888  	XORQ	R9, R9
  1889  	MOVQ	P751P1_8, AX
  1890  	MULQ	R11
  1891  	ADDQ	AX, R10
  1892  	ADCQ	DX, R8
  1893  	ADCQ	$0, R9
  1894  
  1895  	MOVQ	P751P1_7, AX
  1896  	MULQ	R12
  1897  	ADDQ	AX, R10
  1898  	ADCQ	DX, R8
  1899  	ADCQ	$0, R9
  1900  
  1901  	MOVQ	P751P1_6, AX
  1902  	MULQ	R13
  1903  	ADDQ	AX, R10
  1904  	ADCQ	DX, R8
  1905  	ADCQ	$0, R9
  1906  
  1907  	MOVQ	(24)(REG_P1), R14		// R14 = x[3]
  1908  	MOVQ	P751P1_5, AX
  1909  	MULQ	R14
  1910  	ADDQ	AX, R10
  1911  	ADCQ	DX, R8
  1912  	ADCQ	$0, R9
  1913  	ADDQ	(64)(REG_P1), R10
  1914  	MOVQ	R10, (64)(REG_P2)		// Z8
  1915  	ADCQ	$0, R8
  1916  	ADCQ	$0, R9
  1917  
  1918  	XORQ	R10, R10
  1919  	MOVQ	P751P1_9, AX
  1920  	MULQ	R11
  1921  	ADDQ	AX, R8
  1922  	ADCQ	DX, R9
  1923  	ADCQ	$0, R10
  1924  
  1925  	MOVQ	P751P1_8, AX
  1926  	MULQ	R12
  1927  	ADDQ	AX, R8
  1928  	ADCQ	DX, R9
  1929  	ADCQ	$0, R10
  1930  
  1931  	MOVQ	P751P1_7, AX
  1932  	MULQ	R13
  1933  	ADDQ	AX, R8
  1934  	ADCQ	DX, R9
  1935  	ADCQ	$0, R10
  1936  
  1937  	MOVQ	P751P1_6, AX
  1938  	MULQ	R14
  1939  	ADDQ	AX, R8
  1940  	ADCQ	DX, R9
  1941  	ADCQ	$0, R10
  1942  
  1943  	MOVQ	(32)(REG_P1), R15		// R15 = x[4]
  1944  	MOVQ	P751P1_5, AX
  1945  	MULQ	R15
  1946  	ADDQ	AX, R8
  1947  	ADCQ	DX, R9
  1948  	ADCQ	$0, R10
  1949  	ADDQ	(72)(REG_P1), R8
  1950  	MOVQ	R8, (72)(REG_P2)		// Z9
  1951  	ADCQ	$0, R9
  1952  	ADCQ	$0, R10
  1953  
  1954  	XORQ	R8, R8
  1955  	MOVQ	P751P1_10, AX
  1956  	MULQ	R11
  1957  	ADDQ	AX, R9
  1958  	ADCQ	DX, R10
  1959  	ADCQ	$0, R8
  1960  
  1961  	MOVQ	P751P1_9, AX
  1962  	MULQ	R12
  1963  	ADDQ	AX, R9
  1964  	ADCQ	DX, R10
  1965  	ADCQ	$0, R8
  1966  
  1967  	MOVQ	P751P1_8, AX
  1968  	MULQ	R13
  1969  	ADDQ	AX, R9
  1970  	ADCQ	DX, R10
  1971  	ADCQ	$0, R8
  1972  
  1973  	MOVQ	P751P1_7, AX
  1974  	MULQ	R14
  1975  	ADDQ	AX, R9
  1976  	ADCQ	DX, R10
  1977  	ADCQ	$0, R8
  1978  
  1979  	MOVQ	P751P1_6, AX
  1980  	MULQ	R15
  1981  	ADDQ	AX, R9
  1982  	ADCQ	DX, R10
  1983  	ADCQ	$0, R8
  1984  
  1985  	MOVQ	(40)(REG_P2), CX		// CX = intermediate Z5 stored above
  1986  	MOVQ	P751P1_5, AX
  1987  	MULQ	CX
  1988  	ADDQ	AX, R9
  1989  	ADCQ	DX, R10
  1990  	ADCQ	$0, R8
  1991  	ADDQ	(80)(REG_P1), R9
  1992  	MOVQ	R9, (80)(REG_P2)		// Z10
  1993  	ADCQ	$0, R10
  1994  	ADCQ	$0, R8
  1995  
  1996  	XORQ	R9, R9
  1997  	MOVQ	P751P1_11, AX
  1998  	MULQ	R11
  1999  	ADDQ	AX, R10
  2000  	ADCQ	DX, R8
  2001  	ADCQ	$0, R9
  2002  
  2003  	MOVQ	P751P1_10, AX
  2004  	MULQ	R12
  2005  	ADDQ	AX, R10
  2006  	ADCQ	DX, R8
  2007  	ADCQ	$0, R9
  2008  
  2009  	MOVQ	P751P1_9, AX
  2010  	MULQ	R13
  2011  	ADDQ	AX, R10
  2012  	ADCQ	DX, R8
  2013  	ADCQ	$0, R9
  2014  
  2015  	MOVQ	P751P1_8, AX
  2016  	MULQ	R14
  2017  	ADDQ	AX, R10
  2018  	ADCQ	DX, R8
  2019  	ADCQ	$0, R9
  2020  
  2021  	MOVQ	P751P1_7, AX
  2022  	MULQ	R15
  2023  	ADDQ	AX, R10
  2024  	ADCQ	DX, R8
  2025  	ADCQ	$0, R9
  2026  
  2027  	MOVQ	P751P1_6, AX
  2028  	MULQ	CX
  2029  	ADDQ	AX, R10
  2030  	ADCQ	DX, R8
  2031  	ADCQ	$0, R9
  2032  
  2033  	MOVQ	(48)(REG_P2), R11		// R11 is reloaded with intermediate Z6
  2034  	MOVQ	P751P1_5, AX
  2035  	MULQ	R11
  2036  	ADDQ	AX, R10
  2037  	ADCQ	DX, R8
  2038  	ADCQ	$0, R9
  2039  	ADDQ	(88)(REG_P1), R10
  2040  	MOVQ	R10, (88)(REG_P2)		// Z11
  2041  	ADCQ	$0, R8
  2042  	ADCQ	$0, R9
  2043  
  2044  	XORQ	R10, R10
  2045  	MOVQ	P751P1_11, AX
  2046  	MULQ	R12
  2047  	ADDQ	AX, R8
  2048  	ADCQ	DX, R9
  2049  	ADCQ	$0, R10
  2050  
  2051  	MOVQ	P751P1_10, AX
  2052  	MULQ	R13
  2053  	ADDQ	AX, R8
  2054  	ADCQ	DX, R9
  2055  	ADCQ	$0, R10
  2056  
  2057  	MOVQ	P751P1_9, AX
  2058  	MULQ	R14
  2059  	ADDQ	AX, R8
  2060  	ADCQ	DX, R9
  2061  	ADCQ	$0, R10
  2062  
  2063  	MOVQ	P751P1_8, AX
  2064  	MULQ	R15
  2065  	ADDQ	AX, R8
  2066  	ADCQ	DX, R9
  2067  	ADCQ	$0, R10
  2068  
  2069  	MOVQ	P751P1_7, AX
  2070  	MULQ	CX
  2071  	ADDQ	AX, R8
  2072  	ADCQ	DX, R9
  2073  	ADCQ	$0, R10
  2074  
  2075  	MOVQ	P751P1_6, AX
  2076  	MULQ	R11
  2077  	ADDQ	AX, R8
  2078  	ADCQ	DX, R9
  2079  	ADCQ	$0, R10
  2080  
  2081  	MOVQ	(56)(REG_P2), R12		// R12 is reloaded with intermediate Z7
  2082  	MOVQ	P751P1_5, AX
  2083  	MULQ	R12
  2084  	ADDQ	AX, R8
  2085  	ADCQ	DX, R9
  2086  	ADCQ	$0, R10
  2087  	ADDQ	(96)(REG_P1), R8
  2088  	MOVQ	R8, (REG_P2)		// Z0
  2089  	ADCQ	$0, R9
  2090  	ADCQ	$0, R10
  2091  
  2092  	XORQ	R8, R8
  2093  	MOVQ	P751P1_11, AX
  2094  	MULQ	R13
  2095  	ADDQ	AX, R9
  2096  	ADCQ	DX, R10
  2097  	ADCQ	$0, R8
  2098  
  2099  	MOVQ	P751P1_10, AX
  2100  	MULQ	R14
  2101  	ADDQ	AX, R9
  2102  	ADCQ	DX, R10
  2103  	ADCQ	$0, R8
  2104  
  2105  	MOVQ	P751P1_9, AX
  2106  	MULQ	R15
  2107  	ADDQ	AX, R9
  2108  	ADCQ	DX, R10
  2109  	ADCQ	$0, R8
  2110  
  2111  	MOVQ	P751P1_8, AX
  2112  	MULQ	CX
  2113  	ADDQ	AX, R9
  2114  	ADCQ	DX, R10
  2115  	ADCQ	$0, R8
  2116  
  2117  	MOVQ	P751P1_7, AX
  2118  	MULQ	R11
  2119  	ADDQ	AX, R9
  2120  	ADCQ	DX, R10
  2121  	ADCQ	$0, R8
  2122  
  2123  	MOVQ	P751P1_6, AX
  2124  	MULQ	R12
  2125  	ADDQ	AX, R9
  2126  	ADCQ	DX, R10
  2127  	ADCQ	$0, R8
  2128  
  2129  	MOVQ	(64)(REG_P2), R13		// R13 is reloaded with intermediate Z8
  2130  	MOVQ	P751P1_5, AX
  2131  	MULQ	R13
  2132  	ADDQ	AX, R9
  2133  	ADCQ	DX, R10
  2134  	ADCQ	$0, R8
  2135  	ADDQ	(104)(REG_P1), R9
  2136  	MOVQ	R9, (8)(REG_P2)		// Z1
  2137  	ADCQ	$0, R10
  2138  	ADCQ	$0, R8
  2139  
  2140  	XORQ	R9, R9
  2141  	MOVQ	P751P1_11, AX
  2142  	MULQ	R14
  2143  	ADDQ	AX, R10
  2144  	ADCQ	DX, R8
  2145  	ADCQ	$0, R9
  2146  
  2147  	MOVQ	P751P1_10, AX
  2148  	MULQ	R15
  2149  	ADDQ	AX, R10
  2150  	ADCQ	DX, R8
  2151  	ADCQ	$0, R9
  2152  
  2153  	MOVQ	P751P1_9, AX
  2154  	MULQ	CX
  2155  	ADDQ	AX, R10
  2156  	ADCQ	DX, R8
  2157  	ADCQ	$0, R9
  2158  
  2159  	MOVQ	P751P1_8, AX
  2160  	MULQ	R11
  2161  	ADDQ	AX, R10
  2162  	ADCQ	DX, R8
  2163  	ADCQ	$0, R9
  2164  
  2165  	MOVQ	P751P1_7, AX
  2166  	MULQ	R12
  2167  	ADDQ	AX, R10
  2168  	ADCQ	DX, R8
  2169  	ADCQ	$0, R9
  2170  
  2171  	MOVQ	P751P1_6, AX
  2172  	MULQ	R13
  2173  	ADDQ	AX, R10
  2174  	ADCQ	DX, R8
  2175  	ADCQ	$0, R9
  2176  
  2177  	MOVQ	(72)(REG_P2), R14		// R14 is reloaded with intermediate Z9
  2178  	MOVQ	P751P1_5, AX
  2179  	MULQ	R14
  2180  	ADDQ	AX, R10
  2181  	ADCQ	DX, R8
  2182  	ADCQ	$0, R9
  2183  	ADDQ	(112)(REG_P1), R10
  2184  	MOVQ	R10, (16)(REG_P2)		// Z2
  2185  	ADCQ	$0, R8
  2186  	ADCQ	$0, R9
  2187  
  2188  	XORQ	R10, R10
  2189  	MOVQ	P751P1_11, AX
  2190  	MULQ	R15
  2191  	ADDQ	AX, R8
  2192  	ADCQ	DX, R9
  2193  	ADCQ	$0, R10
  2194  
  2195  	MOVQ	P751P1_10, AX
  2196  	MULQ	CX
  2197  	ADDQ	AX, R8
  2198  	ADCQ	DX, R9
  2199  	ADCQ	$0, R10
  2200  
  2201  	MOVQ	P751P1_9, AX
  2202  	MULQ	R11
  2203  	ADDQ	AX, R8
  2204  	ADCQ	DX, R9
  2205  	ADCQ	$0, R10
  2206  
  2207  	MOVQ	P751P1_8, AX
  2208  	MULQ	R12
  2209  	ADDQ	AX, R8
  2210  	ADCQ	DX, R9
  2211  	ADCQ	$0, R10
  2212  
  2213  	MOVQ	P751P1_7, AX
  2214  	MULQ	R13
  2215  	ADDQ	AX, R8
  2216  	ADCQ	DX, R9
  2217  	ADCQ	$0, R10
  2218  
  2219  	MOVQ	P751P1_6, AX
  2220  	MULQ	R14
  2221  	ADDQ	AX, R8
  2222  	ADCQ	DX, R9
  2223  	ADCQ	$0, R10
  2224  
  2225  	MOVQ	(80)(REG_P2), R15		// R15 is reloaded with intermediate Z10
  2226  	MOVQ	P751P1_5, AX
  2227  	MULQ	R15
  2228  	ADDQ	AX, R8
  2229  	ADCQ	DX, R9
  2230  	ADCQ	$0, R10
  2231  	ADDQ	(120)(REG_P1), R8
  2232  	MOVQ	R8, (24)(REG_P2)		// Z3
  2233  	ADCQ	$0, R9
  2234  	ADCQ	$0, R10
  2235  
  2236  	XORQ	R8, R8
  2237  	MOVQ	P751P1_11, AX
  2238  	MULQ	CX
  2239  	ADDQ	AX, R9
  2240  	ADCQ	DX, R10
  2241  	ADCQ	$0, R8
  2242  
  2243  	MOVQ	P751P1_10, AX
  2244  	MULQ	R11
  2245  	ADDQ	AX, R9
  2246  	ADCQ	DX, R10
  2247  	ADCQ	$0, R8
  2248  
  2249  	MOVQ	P751P1_9, AX
  2250  	MULQ	R12
  2251  	ADDQ	AX, R9
  2252  	ADCQ	DX, R10
  2253  	ADCQ	$0, R8
  2254  
  2255  	MOVQ	P751P1_8, AX
  2256  	MULQ	R13
  2257  	ADDQ	AX, R9
  2258  	ADCQ	DX, R10
  2259  	ADCQ	$0, R8
  2260  
  2261  	MOVQ	P751P1_7, AX
  2262  	MULQ	R14
  2263  	ADDQ	AX, R9
  2264  	ADCQ	DX, R10
  2265  	ADCQ	$0, R8
  2266  
  2267  	MOVQ	P751P1_6, AX
  2268  	MULQ	R15
  2269  	ADDQ	AX, R9
  2270  	ADCQ	DX, R10
  2271  	ADCQ	$0, R8
  2272  
  2273  	MOVQ	(88)(REG_P2), CX		// CX is reloaded with intermediate Z11
  2274  	MOVQ	P751P1_5, AX
  2275  	MULQ	CX
  2276  	ADDQ	AX, R9
  2277  	ADCQ	DX, R10
  2278  	ADCQ	$0, R8
  2279  	ADDQ	(128)(REG_P1), R9
  2280  	MOVQ	R9, (32)(REG_P2)		// Z4
  2281  	ADCQ	$0, R10
  2282  	ADCQ	$0, R8
  2283  
  2284  	XORQ	R9, R9
  2285  	MOVQ	P751P1_11, AX
  2286  	MULQ	R11
  2287  	ADDQ	AX, R10
  2288  	ADCQ	DX, R8
  2289  	ADCQ	$0, R9
  2290  
  2291  	MOVQ	P751P1_10, AX
  2292  	MULQ	R12
  2293  	ADDQ	AX, R10
  2294  	ADCQ	DX, R8
  2295  	ADCQ	$0, R9
  2296  
  2297  	MOVQ	P751P1_9, AX
  2298  	MULQ	R13
  2299  	ADDQ	AX, R10
  2300  	ADCQ	DX, R8
  2301  	ADCQ	$0, R9
  2302  
  2303  	MOVQ	P751P1_8, AX
  2304  	MULQ	R14
  2305  	ADDQ	AX, R10
  2306  	ADCQ	DX, R8
  2307  	ADCQ	$0, R9
  2308  
  2309  	MOVQ	P751P1_7, AX
  2310  	MULQ	R15
  2311  	ADDQ	AX, R10
  2312  	ADCQ	DX, R8
  2313  	ADCQ	$0, R9
  2314  
  2315  	MOVQ	P751P1_6, AX
  2316  	MULQ	CX
  2317  	ADDQ	AX, R10
  2318  	ADCQ	DX, R8
  2319  	ADCQ	$0, R9
  2320  	ADDQ	(136)(REG_P1), R10
  2321  	MOVQ	R10, (40)(REG_P2)		// Z5
  2322  	ADCQ	$0, R8
  2323  	ADCQ	$0, R9
  2324  
  2325  	XORQ	R10, R10
  2326  	MOVQ	P751P1_11, AX
  2327  	MULQ	R12
  2328  	ADDQ	AX, R8
  2329  	ADCQ	DX, R9
  2330  	ADCQ	$0, R10
  2331  
  2332  	MOVQ	P751P1_10, AX
  2333  	MULQ	R13
  2334  	ADDQ	AX, R8
  2335  	ADCQ	DX, R9
  2336  	ADCQ	$0, R10
  2337  
  2338  	MOVQ	P751P1_9, AX
  2339  	MULQ	R14
  2340  	ADDQ	AX, R8
  2341  	ADCQ	DX, R9
  2342  	ADCQ	$0, R10
  2343  
  2344  	MOVQ	P751P1_8, AX
  2345  	MULQ	R15
  2346  	ADDQ	AX, R8
  2347  	ADCQ	DX, R9
  2348  	ADCQ	$0, R10
  2349  
  2350  	MOVQ	P751P1_7, AX
  2351  	MULQ	CX
  2352  	ADDQ	AX, R8
  2353  	ADCQ	DX, R9
  2354  	ADCQ	$0, R10
  2355  	ADDQ	(144)(REG_P1), R8
  2356  	MOVQ	R8, (48)(REG_P2)		// Z6
  2357  	ADCQ	$0, R9
  2358  	ADCQ	$0, R10
  2359  
  2360  	XORQ	R8, R8
  2361  	MOVQ	P751P1_11, AX
  2362  	MULQ	R13
  2363  	ADDQ	AX, R9
  2364  	ADCQ	DX, R10
  2365  	ADCQ	$0, R8
  2366  
  2367  	MOVQ	P751P1_10, AX
  2368  	MULQ	R14
  2369  	ADDQ	AX, R9
  2370  	ADCQ	DX, R10
  2371  	ADCQ	$0, R8
  2372  
  2373  	MOVQ	P751P1_9, AX
  2374  	MULQ	R15
  2375  	ADDQ	AX, R9
  2376  	ADCQ	DX, R10
  2377  	ADCQ	$0, R8
  2378  
  2379  	MOVQ	P751P1_8, AX
  2380  	MULQ	CX
  2381  	ADDQ	AX, R9
  2382  	ADCQ	DX, R10
  2383  	ADCQ	$0, R8
  2384  	ADDQ	(152)(REG_P1), R9
  2385  	MOVQ	R9, (56)(REG_P2)		// Z7
  2386  	ADCQ	$0, R10
  2387  	ADCQ	$0, R8
  2388  
  2389  	XORQ	R9, R9
  2390  	MOVQ	P751P1_11, AX
  2391  	MULQ	R14
  2392  	ADDQ	AX, R10
  2393  	ADCQ	DX, R8
  2394  	ADCQ	$0, R9
  2395  
  2396  	MOVQ	P751P1_10, AX
  2397  	MULQ	R15
  2398  	ADDQ	AX, R10
  2399  	ADCQ	DX, R8
  2400  	ADCQ	$0, R9
  2401  
  2402  	MOVQ	P751P1_9, AX
  2403  	MULQ	CX
  2404  	ADDQ	AX, R10
  2405  	ADCQ	DX, R8
  2406  	ADCQ	$0, R9
  2407  	ADDQ	(160)(REG_P1), R10
  2408  	MOVQ	R10, (64)(REG_P2)		// Z8
  2409  	ADCQ	$0, R8
  2410  	ADCQ	$0, R9
  2411  
  2412  	XORQ	R10, R10
  2413  	MOVQ	P751P1_11, AX
  2414  	MULQ	R15
  2415  	ADDQ	AX, R8
  2416  	ADCQ	DX, R9
  2417  	ADCQ	$0, R10
  2418  
  2419  	MOVQ	P751P1_10, AX
  2420  	MULQ	CX
  2421  	ADDQ	AX, R8
  2422  	ADCQ	DX, R9
  2423  	ADCQ	$0, R10
  2424  	ADDQ	(168)(REG_P1), R8		// Z9
  2425  	MOVQ	R8, (72)(REG_P2)		// Z9
  2426  	ADCQ	$0, R9
  2427  	ADCQ	$0, R10
  2428  
  2429  	MOVQ	P751P1_11, AX
  2430  	MULQ	CX
  2431  	ADDQ	AX, R9
  2432  	ADCQ	DX, R10
  2433  	ADDQ	(176)(REG_P1), R9		// Z10
  2434  	MOVQ	R9, (80)(REG_P2)		// Z10
  2435  	ADCQ	$0, R10
  2436  	ADDQ	(184)(REG_P1), R10		// Z11
  2437  	MOVQ	R10, (88)(REG_P2)		// Z11
  2438  	RET
  2439  
  2440  redc_with_mulx_adcx_adox:
  2441  	// This implements the Montgomery reduction algorithm described in
  2442  	// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
  2443  	// This assumes that the BMI2 and ADX instruction set extensions are available.
  2444  	MOVQ BP, 0(SP) // push: BP is Callee-save.
  2445  	REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2adx)
  2446  	MOVQ 0(SP), BP // pop: BP is Callee-save.
  2447  	RET
  2448  
  2449  redc_with_mulx:
  2450  	// This implements the Montgomery reduction algorithm described in
  2451  	// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
  2452  	// This assumes that the BMI2 instruction set extension is available.
  2453  	MOVQ BP, 0(SP) // push: BP is Callee-save.
  2454  	REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2)
  2455  	MOVQ 0(SP), BP // pop: BP is Callee-save.
  2456  	RET
  2457  
  2458  TEXT ·adlP751(SB), NOSPLIT, $0-24
        	// adlP751 adds the 24 limbs (1536 bits) at x and at y and stores the
        	// 24-limb sum at z. A single carry chain runs from limb 0 to limb 23;
        	// the carry out of the top limb is not stored anywhere.
        	// Arguments: z at +0(FP), x at +8(FP), y at +16(FP).
  2459  
  2460  	MOVQ z+0(FP), REG_P3
  2461  	MOVQ x+8(FP), REG_P1
  2462  	MOVQ y+16(FP), REG_P2
  2463  
  2464  	MOVQ	(REG_P1), R8		// limbs 0..10 of x
  2465  	MOVQ	(8)(REG_P1), R9
  2466  	MOVQ	(16)(REG_P1), R10
  2467  	MOVQ	(24)(REG_P1), R11
  2468  	MOVQ	(32)(REG_P1), R12
  2469  	MOVQ	(40)(REG_P1), R13
  2470  	MOVQ	(48)(REG_P1), R14
  2471  	MOVQ	(56)(REG_P1), R15
  2472  	MOVQ	(64)(REG_P1), AX
  2473  	MOVQ	(72)(REG_P1), BX
  2474  	MOVQ	(80)(REG_P1), CX
  2475  
  2476  	ADDQ	(REG_P2), R8		// start of the carry chain
  2477  	ADCQ	(8)(REG_P2), R9
  2478  	ADCQ	(16)(REG_P2), R10
  2479  	ADCQ	(24)(REG_P2), R11
  2480  	ADCQ	(32)(REG_P2), R12
  2481  	ADCQ	(40)(REG_P2), R13
  2482  	ADCQ	(48)(REG_P2), R14
  2483  	ADCQ	(56)(REG_P2), R15
  2484  	ADCQ	(64)(REG_P2), AX
  2485  	ADCQ	(72)(REG_P2), BX
  2486  	ADCQ	(80)(REG_P2), CX
  2487  
  2488  	MOVQ	R8, (REG_P3)		// MOVQ leaves CF untouched, so the chain survives the stores
  2489  	MOVQ	R9, (8)(REG_P3)
  2490  	MOVQ	R10, (16)(REG_P3)
  2491  	MOVQ	R11, (24)(REG_P3)
  2492  	MOVQ	R12, (32)(REG_P3)
  2493  	MOVQ	R13, (40)(REG_P3)
  2494  	MOVQ	R14, (48)(REG_P3)
  2495  	MOVQ	R15, (56)(REG_P3)
  2496  	MOVQ	AX, (64)(REG_P3)
  2497  	MOVQ	BX, (72)(REG_P3)
  2498  	MOVQ	CX, (80)(REG_P3)
  2499  	MOVQ	(88)(REG_P1), AX		// limb 11, handled on its own
  2500  	ADCQ	(88)(REG_P2), AX
  2501  	MOVQ	AX, (88)(REG_P3)
  2502  
  2503  	MOVQ	(96)(REG_P1), R8		// limbs 12..23 of x
  2504  	MOVQ	(104)(REG_P1), R9
  2505  	MOVQ	(112)(REG_P1), R10
  2506  	MOVQ	(120)(REG_P1), R11
  2507  	MOVQ	(128)(REG_P1), R12
  2508  	MOVQ	(136)(REG_P1), R13
  2509  	MOVQ	(144)(REG_P1), R14
  2510  	MOVQ	(152)(REG_P1), R15
  2511  	MOVQ	(160)(REG_P1), AX
  2512  	MOVQ	(168)(REG_P1), BX
  2513  	MOVQ	(176)(REG_P1), CX
  2514  	MOVQ	(184)(REG_P1), DI		// DI is REG_P1: this is the last load through the x pointer
  2515  
  2516  	ADCQ	(96)(REG_P2), R8		// continues the carry from limb 11
  2517  	ADCQ	(104)(REG_P2), R9
  2518  	ADCQ	(112)(REG_P2), R10
  2519  	ADCQ	(120)(REG_P2), R11
  2520  	ADCQ	(128)(REG_P2), R12
  2521  	ADCQ	(136)(REG_P2), R13
  2522  	ADCQ	(144)(REG_P2), R14
  2523  	ADCQ	(152)(REG_P2), R15
  2524  	ADCQ	(160)(REG_P2), AX
  2525  	ADCQ	(168)(REG_P2), BX
  2526  	ADCQ	(176)(REG_P2), CX
  2527  	ADCQ	(184)(REG_P2), DI
  2528  
  2529  	MOVQ	R8, (96)(REG_P3)
  2530  	MOVQ	R9, (104)(REG_P3)
  2531  	MOVQ	R10, (112)(REG_P3)
  2532  	MOVQ	R11, (120)(REG_P3)
  2533  	MOVQ	R12, (128)(REG_P3)
  2534  	MOVQ	R13, (136)(REG_P3)
  2535  	MOVQ	R14, (144)(REG_P3)
  2536  	MOVQ	R15, (152)(REG_P3)
  2537  	MOVQ	AX, (160)(REG_P3)
  2538  	MOVQ	BX, (168)(REG_P3)
  2539  	MOVQ	CX, (176)(REG_P3)
  2540  	MOVQ	DI, (184)(REG_P3)
  2541  
  2542  	RET
  2543  
  2544  
  2545  TEXT ·sulP751(SB), NOSPLIT, $0-24
        	// sulP751 stores into z the limb-wise difference x - y, where each
        	// operand is read as 24 little-endian 64-bit limbs (byte offsets
        	// 0..184). If the subtraction borrows out of the top limb, the prime p
        	// (12 limbs, P751_0..P751_11) is then added to the upper 12 limbs of
        	// z, i.e. p*2^768 is added to the result.
        	//
        	// Arguments (three 8-byte words in the caller's frame):
        	//   z+0(FP)  -> REG_P3 (DX): destination
        	//   x+8(FP)  -> REG_P1 (DI): minuend
        	//   y+16(FP) -> REG_P2 (SI): subtrahend
        	// NOTE(review): the Go-side parameter types are not visible here;
        	// presumably pointers to a double-width field element -- confirm
        	// against the Go declaration.
        	//
        	// The correction is applied with an all-ones/all-zeros mask rather
        	// than a jump, so the routine contains no branches.
        	// Registers written: AX, BX, CX, DI, R8-R15.
  2546  
  2547  	MOVQ z+0(FP), REG_P3
  2548  	MOVQ x+8(FP), REG_P1
  2549  	MOVQ y+16(FP), REG_P2
  2550  
        	// Load limbs 0..10 of x.
  2551  	MOVQ	(REG_P1), R8
  2552  	MOVQ	(8)(REG_P1), R9
  2553  	MOVQ	(16)(REG_P1), R10
  2554  	MOVQ	(24)(REG_P1), R11
  2555  	MOVQ	(32)(REG_P1), R12
  2556  	MOVQ	(40)(REG_P1), R13
  2557  	MOVQ	(48)(REG_P1), R14
  2558  	MOVQ	(56)(REG_P1), R15
  2559  	MOVQ	(64)(REG_P1), AX
  2560  	MOVQ	(72)(REG_P1), BX
  2561  	MOVQ	(80)(REG_P1), CX
  2562  
        	// Subtract limbs 0..10 of y. SUBQ starts the borrow chain; every
        	// later subtraction in this routine is SBBQ.
  2563  	SUBQ	(REG_P2), R8
  2564  	SBBQ	(8)(REG_P2), R9
  2565  	SBBQ	(16)(REG_P2), R10
  2566  	SBBQ	(24)(REG_P2), R11
  2567  	SBBQ	(32)(REG_P2), R12
  2568  	SBBQ	(40)(REG_P2), R13
  2569  	SBBQ	(48)(REG_P2), R14
  2570  	SBBQ	(56)(REG_P2), R15
  2571  	SBBQ	(64)(REG_P2), AX
  2572  	SBBQ	(72)(REG_P2), BX
  2573  	SBBQ	(80)(REG_P2), CX
  2574  
        	// Store limbs 0..10 of the difference. CF still holds the borrow out
        	// of limb 10 because MOVQ leaves the flags unchanged.
  2575  	MOVQ	R8, (REG_P3)
  2576  	MOVQ	R9, (8)(REG_P3)
  2577  	MOVQ	R10, (16)(REG_P3)
  2578  	MOVQ	R11, (24)(REG_P3)
  2579  	MOVQ	R12, (32)(REG_P3)
  2580  	MOVQ	R13, (40)(REG_P3)
  2581  	MOVQ	R14, (48)(REG_P3)
  2582  	MOVQ	R15, (56)(REG_P3)
  2583  	MOVQ	AX, (64)(REG_P3)
  2584  	MOVQ	BX, (72)(REG_P3)
  2585  	MOVQ	CX, (80)(REG_P3)
        	// Limb 11 is handled on its own, reusing AX now that limb 8 is stored.
  2586  	MOVQ	(88)(REG_P1), AX
  2587  	SBBQ	(88)(REG_P2), AX
  2588  	MOVQ	AX, (88)(REG_P3)
  2589  
        	// Load limbs 12..23 of x.
  2590  	MOVQ	(96)(REG_P1), R8
  2591  	MOVQ	(104)(REG_P1), R9
  2592  	MOVQ	(112)(REG_P1), R10
  2593  	MOVQ	(120)(REG_P1), R11
  2594  	MOVQ	(128)(REG_P1), R12
  2595  	MOVQ	(136)(REG_P1), R13
  2596  	MOVQ	(144)(REG_P1), R14
  2597  	MOVQ	(152)(REG_P1), R15
  2598  	MOVQ	(160)(REG_P1), AX
  2599  	MOVQ	(168)(REG_P1), BX
  2600  	MOVQ	(176)(REG_P1), CX
  2601  	MOVQ	(184)(REG_P1), DI // REG_P1 is DI: this last read of x overwrites the x pointer.
  2602  
        	// Continue the borrow chain through limbs 12..23 of y.
  2603  	SBBQ	(96)(REG_P2), R8
  2604  	SBBQ	(104)(REG_P2), R9
  2605  	SBBQ	(112)(REG_P2), R10
  2606  	SBBQ	(120)(REG_P2), R11
  2607  	SBBQ	(128)(REG_P2), R12
  2608  	SBBQ	(136)(REG_P2), R13
  2609  	SBBQ	(144)(REG_P2), R14
  2610  	SBBQ	(152)(REG_P2), R15
  2611  	SBBQ	(160)(REG_P2), AX
  2612  	SBBQ	(168)(REG_P2), BX
  2613  	SBBQ	(176)(REG_P2), CX
  2614  	SBBQ	(184)(REG_P2), DI
  2615  
        	// Store limbs 12..23 of the difference; the final borrow stays in CF.
  2616  	MOVQ	R8, (96)(REG_P3)
  2617  	MOVQ	R9, (104)(REG_P3)
  2618  	MOVQ	R10, (112)(REG_P3)
  2619  	MOVQ	R11, (120)(REG_P3)
  2620  	MOVQ	R12, (128)(REG_P3)
  2621  	MOVQ	R13, (136)(REG_P3)
  2622  	MOVQ	R14, (144)(REG_P3)
  2623  	MOVQ	R15, (152)(REG_P3)
  2624  	MOVQ	AX, (160)(REG_P3)
  2625  	MOVQ	BX, (168)(REG_P3)
  2626  	MOVQ	CX, (176)(REG_P3)
  2627  	MOVQ	DI, (184)(REG_P3)
  2628  
  2629  	// Now the carry flag is 1 if x-y < 0.  If so, add p*2^768.
        	// Build the mask: AX = 0 - 0 - CF, i.e. all ones when the subtraction
        	// borrowed and zero otherwise. The SBBQ consumes the borrow left by
        	// the chain above, so the zeroing of AX must not disturb CF.
  2630  	MOVQ	$0, AX
  2631  	SBBQ	$0, AX
  2632  
  2633  	// Load p into registers:
  2634  	MOVQ	P751_0, R8
  2635  	// P751_{1,2,3,4} = P751_0, so reuse R8
  2636  	MOVQ	P751_5, R9
  2637  	MOVQ	P751_6, R10
  2638  	MOVQ	P751_7, R11
  2639  	MOVQ	P751_8, R12
  2640  	MOVQ	P751_9, R13
  2641  	MOVQ	P751_10, R14
  2642  	MOVQ	P751_11, R15
  2643  
        	// Apply the mask: each register keeps its limb of p when AX is all
        	// ones and becomes zero when AX is zero.
  2644  	ANDQ	AX, R8
  2645  	ANDQ	AX, R9
  2646  	ANDQ	AX, R10
  2647  	ANDQ	AX, R11
  2648  	ANDQ	AX, R12
  2649  	ANDQ	AX, R13
  2650  	ANDQ	AX, R14
  2651  	ANDQ	AX, R15
  2652  
        	// Add the masked p to limbs 12..23 of z in memory (byte offsets
        	// 96..184). R8 supplies limbs 0..4 of p. The carry out of the last
        	// addition is not stored or returned.
  2653  	ADDQ	R8,  (96   )(REG_P3)
  2654  	ADCQ	R8,  (96+ 8)(REG_P3)
  2655  	ADCQ	R8,  (96+16)(REG_P3)
  2656  	ADCQ	R8,  (96+24)(REG_P3)
  2657  	ADCQ	R8,  (96+32)(REG_P3)
  2658  	ADCQ	R9,  (96+40)(REG_P3)
  2659  	ADCQ	R10, (96+48)(REG_P3)
  2660  	ADCQ	R11, (96+56)(REG_P3)
  2661  	ADCQ	R12, (96+64)(REG_P3)
  2662  	ADCQ	R13, (96+72)(REG_P3)
  2663  	ADCQ	R14, (96+80)(REG_P3)
  2664  	ADCQ    R15, (96+88)(REG_P3)
  2665  
  2666  	RET