github.com/cloudflare/circl@v1.5.0/dh/sidh/internal/p503/arith_amd64.s (about)

     1  // +build amd64,!purego
     2  
     3  #include "textflag.h"
     4  
     5  // p503: the prime as eight 64-bit limbs, P503_0 being the least significant
     6  #define P503_0     $0xFFFFFFFFFFFFFFFF
     7  #define P503_1     $0xFFFFFFFFFFFFFFFF
     8  #define P503_2     $0xFFFFFFFFFFFFFFFF
     9  #define P503_3     $0xABFFFFFFFFFFFFFF
    10  #define P503_4     $0x13085BDA2211E7A0
    11  #define P503_5     $0x1B9BF6C87B7E7DAF
    12  #define P503_6     $0x6045C6BDDA77A4D0
    13  #define P503_7     $0x004066F541811E1E
    14  
    15  // p503+1: only limbs 3..7 are defined (limbs 0..2 of p503 are all ones, so they become zero)
    16  #define P503P1_3   $0xAC00000000000000
    17  #define P503P1_4   $0x13085BDA2211E7A0
    18  #define P503P1_5   $0x1B9BF6C87B7E7DAF
    19  #define P503P1_6   $0x6045C6BDDA77A4D0
    20  #define P503P1_7   $0x004066F541811E1E
    21  
    22  // p503x2 = 2*p503, same limb order as p503
    23  #define P503X2_0   $0xFFFFFFFFFFFFFFFE
    24  #define P503X2_1   $0xFFFFFFFFFFFFFFFF
    25  #define P503X2_2   $0xFFFFFFFFFFFFFFFF
    26  #define P503X2_3   $0x57FFFFFFFFFFFFFF
    27  #define P503X2_4   $0x2610B7B44423CF41
    28  #define P503X2_5   $0x3737ED90F6FCFB5E
    29  #define P503X2_6   $0xC08B8D7BB4EF49A0
    30  #define P503X2_7   $0x0080CDEA83023C3C
    31  // Register aliases; the routines below load their pointer arguments into these
    32  #define REG_P1 DI
    33  #define REG_P2 SI
    34  #define REG_P3 DX
    35  
    36  // Performs schoolbook multiplication of two 256-bit numbers. This optimized version
    37  // uses the MULX instruction. Macro smashes DX and the flags.
    38  // Input: I0 and I1 - memory operands, four 64-bit limbs each (offsets 0 to 24).
    39  // Output: O - memory operand receiving eight 64-bit limbs (offsets 0 to 56).
    40  // All the other arguments are registers, used for storing temporary values
    41  #define MULS256_MULX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
    42  	MOVQ    I0, DX          \
    43  	MULXQ   I1, T1, T0      \   // T0:T1 = U0*V0
    44  	MOVQ    T1, O           \   // O[0]
    45  	MULXQ   8+I1, T2, T1    \   // T1:T2 = U0*V1
    46  	ADDQ    T2, T0          \
    47  	MULXQ   16+I1, T3, T2   \   // T2:T3 = U0*V2
    48  	ADCQ    T3, T1          \
    49  	MULXQ   24+I1, T4, T3   \   // T3:T4 = U0*V3
    50  	ADCQ    T4, T2          \
    51  	\ // Column U1
    52  	MOVQ    8+I0, DX        \
    53  	ADCQ    $0, T3          \
    54  	MULXQ   0+I1, T4, T5    \   // T5:T4 = U1*V0
    55  	MULXQ   8+I1, T7, T6    \   // T6:T7 = U1*V1
    56  	ADDQ    T7, T5          \
    57  	MULXQ   16+I1, T8, T7   \   // T7:T8 = U1*V2
    58  	ADCQ    T8, T6          \
    59  	MULXQ   24+I1, T9, T8   \   // T8:T9 = U1*V3
    60  	ADCQ    T9, T7          \
    61  	ADCQ    $0, T8          \
    62  	ADDQ    T0, T4          \
    63  	MOVQ    T4, 8+O         \   // O[1]
    64  	ADCQ    T1, T5          \
    65  	ADCQ    T2, T6          \
    66  	ADCQ    T3, T7          \
    67  	\ // Column U2
    68  	MOVQ    16+I0, DX       \
    69  	ADCQ    $0, T8          \
    70  	MULXQ   0+I1, T0, T1    \   // T1:T0 = U2*V0
    71  	MULXQ   8+I1, T3, T2    \   // T2:T3 = U2*V1
    72  	ADDQ    T3, T1          \
    73  	MULXQ   16+I1, T4, T3   \   // T3:T4 = U2*V2
    74  	ADCQ    T4, T2          \
    75  	MULXQ   24+I1, T9, T4   \   // T4:T9 = U2*V3
    76  	ADCQ    T9, T3          \
    77  	\ // Column U3
    78  	MOVQ    24+I0, DX       \
    79  	ADCQ    $0, T4          \
    80  	ADDQ    T5, T0          \
    81  	MOVQ    T0, 16+O        \   // O[2]
    82  	ADCQ    T6, T1          \
    83  	ADCQ    T7, T2          \
    84  	ADCQ    T8, T3          \
    85  	ADCQ    $0, T4          \
    86  	MULXQ   0+I1, T0, T5    \   // T5:T0 = U3*V0
    87  	MULXQ   8+I1, T7, T6    \   // T6:T7 = U3*V1
    88  	ADDQ    T7, T5          \
    89  	MULXQ   16+I1, T8, T7   \   // T7:T8 = U3*V2
    90  	ADCQ    T8, T6          \
    91  	MULXQ   24+I1, T9, T8   \   // T8:T9 = U3*V3
    92  	ADCQ    T9, T7          \
    93  	ADCQ    $0, T8          \
    94  	\ // Add values in remaining columns
    95  	ADDQ    T0, T1          \
    96  	MOVQ    T1, 24+O        \   // O[3]
    97  	ADCQ    T5, T2          \
    98  	MOVQ    T2, 32+O        \   // O[4]
    99  	ADCQ    T6, T3          \
   100  	MOVQ    T3, 40+O        \   // O[5]
   101  	ADCQ    T7, T4          \
   102  	MOVQ    T4, 48+O        \   // O[6]
   103  	ADCQ    $0, T8          \   // O[7]
   104  	MOVQ    T8, 56+O
   105  
   106  // Performs schoolbook multiplication of two 256-bit numbers. This optimized version
   107  // uses ADOX, ADCX and MULX instructions. Macro smashes AX, DX and the flags.
   108  // Input: I0 and I1 - memory operands, four 64-bit limbs each (offsets 0 to 24).
   109  // Output: O - memory operand receiving eight 64-bit limbs (offsets 0 to 56).
   110  // All the other arguments are registers used for storing temporary values
   111  #define MULS256_MULX_ADCX_ADOX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
   112  							\   // Column U0
   113  	MOVQ     0+I0, DX       \   // MULX requires multiplier in DX
   114  							\   // T0:T1 = I1*DX
   115  	MULXQ    I1, T1, T0     \   // T0:T1 = U0*V0 (high:low)
   116  	MOVQ     T1, O          \   // O[0]
   117  	MULXQ     8+I1, T2, T1  \   // T1:T2 = U0*V1
   118  	XORQ     AX, AX         \   // AX = 0; also clears CF and OF before the ADOX chain
   119  	ADOXQ    T2, T0         \
   120  	MULXQ    16+I1, T3, T2  \   // T2:T3 = U0*V2
   121  	ADOXQ    T3, T1         \
   122  	MULXQ    24+I1, T4, T3  \   // T3:T4 = U0*V3
   123  	ADOXQ    T4, T2         \
   124  	\  // Column U1
   125  	MOVQ      8+I0, DX      \
   126  	MULXQ    I1, T4, T5     \   // T5:T4 = U1*V0
   127  	ADOXQ    AX, T3         \
   128  	XORQ     AX, AX         \
   129  	MULXQ     8+I1, T7, T6  \   // T6:T7 = U1*V1
   130  	ADOXQ    T0, T4         \
   131  	MOVQ     T4, 8+O        \   // O[1]
   132  	ADCXQ    T7, T5         \
   133  	MULXQ    16+I1, T8, T7  \   // T7:T8 = U1*V2
   134  	ADCXQ    T8, T6         \
   135  	ADOXQ    T1, T5 \
   136  	MULXQ    24+I1, T9, T8  \   // T8:T9 = U1*V3
   137  	ADCXQ    T9, T7         \
   138  	ADCXQ    AX, T8         \
   139  	ADOXQ    T2, T6         \
   140  	\ // Column U2
   141  	MOVQ     16+I0, DX      \
   142  	MULXQ    I1, T0, T1     \   // T1:T0 = U2*V0
   143  	ADOXQ    T3, T7         \
   144  	ADOXQ    AX, T8         \
   145  	XORQ     AX, AX         \
   146  	MULXQ    8+I1, T3, T2   \   // T2:T3 = U2*V1
   147  	ADOXQ    T5, T0         \
   148  	MOVQ     T0, 16+O       \   // O[2]
   149  	ADCXQ    T3, T1         \
   150  	MULXQ    16+I1, T4, T3  \   // T3:T4 = U2*V2
   151  	ADCXQ    T4, T2         \
   152  	ADOXQ    T6, T1         \
   153  	MULXQ    24+I1, T9, T4  \   // T4:T9 = U2*V3
   154  	ADCXQ    T9, T3         \
   155  	MOVQ     24+I0, DX      \
   156  	ADCXQ    AX, T4         \
   157  	\
   158  	ADOXQ    T7, T2         \
   159  	ADOXQ    T8, T3         \
   160  	ADOXQ    AX, T4         \
   161  	\ // Column U3
   162  	MULXQ    I1, T0, T5     \   // T5:T0 = U3*V0
   163  	XORQ     AX, AX         \
   164  	MULXQ    8+I1, T7, T6   \   // T6:T7 = U3*V1
   165  	ADCXQ    T7, T5         \
   166  	ADOXQ    T0, T1         \
   167  	MULXQ    16+I1, T8, T7  \   // T7:T8 = U3*V2
   168  	ADCXQ    T8, T6         \
   169  	ADOXQ    T5, T2         \
   170  	MULXQ    24+I1, T9, T8  \   // T8:T9 = U3*V3
   171  	ADCXQ    T9, T7         \
   172  	ADCXQ    AX, T8         \
   173  	\
   174  	ADOXQ   T6, T3          \
   175  	ADOXQ   T7, T4          \
   176  	ADOXQ   AX, T8          \
   177  	MOVQ    T1, 24+O        \   // O[3]
   178  	MOVQ    T2, 32+O        \   // O[4]
   179  	MOVQ    T3, 40+O        \   // O[5]
   180  	MOVQ    T4, 48+O        \   // O[6] and O[7] below
   181  	MOVQ    T8, 56+O
   182  
   183  // Template of a macro that performs schoolbook multiplication of 128-bit with 320-bit
   184  // number. It uses the MULX instruction. This template must be customized with instructions
   185  // performing ADD (add1, add2) and ADD-with-carry (adc1, adc2). addX/adcX may or may
   186  // not be instructions that use two independent carry chains.
   187  // Input:
   188  //   * I0 128-bit number (memory operand, two 64-bit limbs)
   189  //   * I1 320-bit number (global symbol; five limbs are read from offsets 24 to 56)
   190  //   * add1, add2: instruction performing integer addition and starting carry chain
   191  //   * adc1, adc2: instruction performing integer addition with carry
   192  // Output: T[0-6] registers. Smashes AX, DX, T[7-9] and the flags.
   193  #define MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, add1, add2, adc1, adc2) \
   194  	\ // Column 0
   195  	MOVQ    I0, DX              \
   196  	MOVQ    I1+24(SB), AX       \
   197  	MULXQ   AX, T0, T1          \
   198  	MOVQ    I1+32(SB), AX       \
   199  	MULXQ   AX, T4, T2          \
   200  	MOVQ    I1+40(SB), AX       \
   201  	MULXQ   AX, T5, T3          \
   202  	XORQ    AX, AX              \ // clears CF and OF so that add1 starts from a clean carry
   203  	add1    T4, T1              \
   204  	adc1    T5, T2              \
   205  	MOVQ    I1+48(SB), AX       \
   206  	MULXQ   AX, T7, T4          \
   207  	adc1    T7, T3              \
   208  	MOVQ    I1+56(SB), AX       \
   209  	MULXQ   AX, T6, T5          \
   210  	adc1    T6, T4              \
   211  	MOVL    $0, AX              \ // zero AX with a MOV (not XORQ) so the pending carry survives
   212  	adc1    AX, T5              \
   213  	\ // Column 1
   214  	MOVQ    8+I0, DX            \
   215  	MOVQ    I1+24(SB), AX       \
   216  	MULXQ   AX, T6, T7          \
   217  	add2    T6, T1              \
   218  	adc2    T7, T2              \
   219  	MOVQ    I1+32(SB), AX       \
   220  	MULXQ   AX, T8, T6          \
   221  	adc2    T6, T3              \
   222  	MOVQ    I1+40(SB), AX       \
   223  	MULXQ   AX, T7, T9          \
   224  	adc2    T9, T4              \
   225  	MOVQ    I1+48(SB), AX       \
   226  	MULXQ   AX, T9, T6          \
   227  	adc2    T6, T5              \
   228  	MOVQ    I1+56(SB), AX       \
   229  	MULXQ   AX, DX, T6          \
   230  	MOVL    $0, AX              \ // zero AX with a MOV (not XORQ) so the pending carry survives
   231  	adc2    AX, T6              \
   232  	\ // Output
   233  	XORQ    AX, AX              \ // AX = 0 and flags cleared before folding in the low halves of column 1
   234  	add1    T8, T2              \
   235  	adc1    T7, T3              \
   236  	adc1    T9, T4              \
   237  	adc1    DX, T5              \
   238  	adc1    AX, T6
   239  
   240  // Multiplies 128-bit with 320-bit integer using MULX; all additions go through ADDQ/ADCQ.
   241  #define MULS_128x320_MULX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
   242  	MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADDQ, ADDQ, ADCQ, ADCQ)
   243  
   244  // Multiplies 128-bit with 320-bit integer using MULX; add1/adc1 become ADOXQ and add2/adc2 become ADCXQ.
   245  #define MULS_128x320_MULX_ADCX_ADOX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
   246  	MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADOXQ, ADCXQ, ADOXQ, ADCXQ)
   247  
   248  // Template of a macro performing multiplication of two 512-bit numbers. It uses one
   249  // level of Karatsuba and one level of schoolbook multiplication. Template must be
   250  // customized with macro performing schoolbook multiplication. Uses 0-103(SP) as scratch.
   251  // Input:
   252  //  * I0, I1 - registers pointing to two 512-bit numbers
   253  //  * MULS - either MULS256_MULX or MULS256_MULX_ADCX_ADOX
   254  // Output: OUT - register pointing to the 1024-bit result. BP is saved and restored.
   255  #define MUL(OUT, I0, I1, MULS) \
   256  	\ // R[8-11]: U1+U0
   257  	XORQ    AX, AX  \
   258  	MOVQ    ( 0)(I0), R8    \
   259  	MOVQ    ( 8)(I0), R9    \
   260  	MOVQ    (16)(I0), R10   \
   261  	MOVQ    (24)(I0), R11   \
   262  	ADDQ    (32)(I0), R8    \
   263  	ADCQ    (40)(I0), R9    \
   264  	ADCQ    (48)(I0), R10   \
   265  	ADCQ    (56)(I0), R11   \
   266  	SBBQ    $0, AX          \ // store mask: AX = all ones if U1+U0 carried out, else 0
   267  	MOVQ    R8,  ( 0)(SP)   \
   268  	MOVQ    R9,  ( 8)(SP)   \
   269  	MOVQ    R10, (16)(SP)   \
   270  	MOVQ    R11, (24)(SP)   \
   271  	\
   272  	\ // R[12-15]: V1+V0
   273  	XORQ    BX, BX          \
   274  	MOVQ    ( 0)(I1), R12   \
   275  	MOVQ    ( 8)(I1), R13   \
   276  	MOVQ    (16)(I1), R14   \
   277  	MOVQ    (24)(I1), R15   \
   278  	ADDQ    (32)(I1), R12   \
   279  	ADCQ    (40)(I1), R13   \
   280  	ADCQ    (48)(I1), R14   \
   281  	ADCQ    (56)(I1), R15   \
   282  	SBBQ    $0, BX          \ // store mask: BX = all ones if V1+V0 carried out, else 0
   283  	MOVQ    R12, (32)(SP)   \
   284  	MOVQ    R13, (40)(SP)   \
   285  	MOVQ    R14, (48)(SP)   \
   286  	MOVQ    R15, (56)(SP)   \
   287  	\ // R[12-15] <- (V1+V0 mod 2^256) if U1+U0 set the carry flag, otherwise 0
   288  	ANDQ    AX, R12         \
   289  	ANDQ    AX, R13         \
   290  	ANDQ    AX, R14         \
   291  	ANDQ    AX, R15         \
   292  	\ // R[8-11] <- (U1+U0 mod 2^256) if V1+V0 set the carry flag, otherwise 0
   293  	ANDQ    BX, R8          \
   294  	ANDQ    BX, R9          \
   295  	ANDQ    BX, R10         \
   296  	ANDQ    BX, R11         \
   297  	\ // res = masked(U0+U1) + masked(V0 + V1)
   298  	ADDQ    R12, R8         \
   299  	ADCQ    R13, R9         \
   300  	ADCQ    R14, R10        \
   301  	ADCQ    R15, R11        \
   302  	\ // SP[64-95] <- res
   303  	MOVQ     R8, (64)(SP)   \
   304  	MOVQ     R9, (72)(SP)   \
   305  	MOVQ    R10, (80)(SP)   \
   306  	MOVQ    R11, (88)(SP)   \
   307  	\ // BP will be used for schoolbook multiplication below
   308  	MOVQ    BP, 96(SP)  \ // push: BP is Callee-save.
   309  	\ // (U1+U0)*(V1+V0)
   310  	MULS((64)(OUT), 0(SP), 32(SP), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP)    \
   311  	\ // U0 x V0
   312  	MULS(0(OUT), 0(I0), 0(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP)    \
   313  	\ // U1 x V1
   314  	MULS(0(SP), 32(I0), 32(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP)  \
   315  	\ // Recover BP
   316  	MOVQ    96(SP), BP  \ // pop: BP is Callee-save.
   317  	\ // R[8-11] = res + upper four limbs of the (U0+U1) x (V0+V1) product held in OUT[12-15]
   318  	MOVQ    (64)(SP), R8    \
   319  	MOVQ    (72)(SP), R9    \
   320  	MOVQ    (80)(SP), R10   \
   321  	MOVQ    (88)(SP), R11   \
   322  	MOVQ    (96)(OUT), AX   \
   323  	ADDQ    AX, R8          \
   324  	MOVQ    (104)(OUT), AX  \
   325  	ADCQ    AX, R9          \
   326  	MOVQ    (112)(OUT), AX  \
   327  	ADCQ    AX, R10         \
   328  	MOVQ    (120)(OUT), AX  \
   329  	ADCQ    AX, R11 \
   330  	\ // R[12-15, 8-11] = (U0+U1) x (V0+V1) - U0xV0
   331  	MOVQ    (64)(OUT), R12  \
   332  	MOVQ    (72)(OUT), R13  \
   333  	MOVQ    (80)(OUT), R14  \
   334  	MOVQ    (88)(OUT), R15  \
   335  	SUBQ    ( 0)(OUT), R12  \
   336  	SBBQ    ( 8)(OUT), R13  \
   337  	SBBQ    (16)(OUT), R14  \
   338  	SBBQ    (24)(OUT), R15  \
   339  	SBBQ    (32)(OUT), R8   \
   340  	SBBQ    (40)(OUT), R9   \
   341  	SBBQ    (48)(OUT), R10  \
   342  	SBBQ    (56)(OUT), R11  \
   343  	\ // r8-r15 <- (U0+U1) x (V0+V1) - U0xV0 - U1xV1
   344  	SUBQ    ( 0)(SP), R12   \
   345  	SBBQ    ( 8)(SP), R13   \
   346  	SBBQ    (16)(SP), R14   \
   347  	SBBQ    (24)(SP), R15   \
   348  	SBBQ    (32)(SP), R8    \
   349  	SBBQ    (40)(SP), R9    \
   350  	SBBQ    (48)(SP), R10   \
   351  	SBBQ    (56)(SP), R11   \
   352  	\ // Assemble: OUT[4-7] += middle low; OUT[8-11] = middle high + U1xV1 low; OUT[12-15] = U1xV1 high + carry
   353  	;                       ADDQ   (32)(OUT), R12; MOVQ    R12, ( 32)(OUT) \
   354  	;                       ADCQ   (40)(OUT), R13; MOVQ    R13, ( 40)(OUT) \
   355  	;                       ADCQ   (48)(OUT), R14; MOVQ    R14, ( 48)(OUT) \
   356  	;                       ADCQ   (56)(OUT), R15; MOVQ    R15, ( 56)(OUT) \
   357  	MOVQ    ( 0)(SP), AX;   ADCQ    AX,  R8;       MOVQ     R8, ( 64)(OUT) \
   358  	MOVQ    ( 8)(SP), AX;   ADCQ    AX,  R9;       MOVQ     R9, ( 72)(OUT) \
   359  	MOVQ    (16)(SP), AX;   ADCQ    AX, R10;       MOVQ    R10, ( 80)(OUT) \
   360  	MOVQ    (24)(SP), AX;   ADCQ    AX, R11;       MOVQ    R11, ( 88)(OUT) \
   361  	MOVQ    (32)(SP), R12;  ADCQ    $0, R12;       MOVQ    R12, ( 96)(OUT) \
   362  	MOVQ    (40)(SP), R13;  ADCQ    $0, R13;       MOVQ    R13, (104)(OUT) \
   363  	MOVQ    (48)(SP), R14;  ADCQ    $0, R14;       MOVQ    R14, (112)(OUT) \
   364  	MOVQ    (56)(SP), R15;  ADCQ    $0, R15;       MOVQ    R15, (120)(OUT)
   365  
   366  // Template for calculating the Montgomery reduction algorithm described in
   367  // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
   368  // customized with schoolbook multiplication for 128 x 320-bit number.
   369  // This macro reuses memory of IN value and *changes* it. Smashes registers
   370  // R[8-15], BX, CX and BP; the MULS_128x320 variants additionally smash AX and DX.
   371  // Input:
   372  //    * IN: 1024-bit number to be reduced
   373  //    * MULS: either MULS_128x320_MULX or MULS_128x320_MULX_ADCX_ADOX
   374  // Output: OUT 512-bit
   375  #define REDC(OUT, IN, MULS) \
   376  	MULS(0(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \
   377  	XORQ    R15, R15        \
   378  	ADDQ    (24)(IN), R8    \
   379  	ADCQ    (32)(IN), R9    \
   380  	ADCQ    (40)(IN), R10   \
   381  	ADCQ    (48)(IN), R11   \
   382  	ADCQ    (56)(IN), R12   \
   383  	ADCQ    (64)(IN), R13   \
   384  	ADCQ    (72)(IN), R14   \
   385  	ADCQ    (80)(IN), R15   \
   386  	MOVQ    R8, (24)(IN)    \
   387  	MOVQ    R9, (32)(IN)    \
   388  	MOVQ    R10, (40)(IN)   \
   389  	MOVQ    R11, (48)(IN)   \
   390  	MOVQ    R12, (56)(IN)   \
   391  	MOVQ    R13, (64)(IN)   \
   392  	MOVQ    R14, (72)(IN)   \
   393  	MOVQ    R15, (80)(IN)   \
   394  	MOVQ    (88)(IN), R8    \
   395  	MOVQ    (96)(IN), R9    \
   396  	MOVQ    (104)(IN), R10  \
   397  	MOVQ    (112)(IN), R11  \
   398  	MOVQ    (120)(IN), R12  \
   399  	ADCQ    $0, R8          \
   400  	ADCQ    $0, R9          \
   401  	ADCQ    $0, R10         \
   402  	ADCQ    $0, R11         \
   403  	ADCQ    $0, R12         \
   404  	MOVQ    R8, (88)(IN)    \
   405  	MOVQ    R9, (96)(IN)    \
   406  	MOVQ    R10, (104)(IN)  \
   407  	MOVQ    R11, (112)(IN)  \
   408  	MOVQ    R12, (120)(IN)  \
   409  	\ // Second pass: IN[2-3] times ·P503p1 is added from limb 5 up
   410  	MULS(16(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP)    \
   411  	XORQ    R15, R15        \
   412  	ADDQ    (40)(IN), R8    \
   413  	ADCQ    (48)(IN), R9    \
   414  	ADCQ    (56)(IN), R10   \
   415  	ADCQ    (64)(IN), R11   \
   416  	ADCQ    (72)(IN), R12   \
   417  	ADCQ    (80)(IN), R13   \
   418  	ADCQ    (88)(IN), R14   \
   419  	ADCQ    (96)(IN), R15   \
   420  	MOVQ    R8, (40)(IN)    \
   421  	MOVQ    R9, (48)(IN)    \
   422  	MOVQ    R10, (56)(IN)   \
   423  	MOVQ    R11, (64)(IN)   \
   424  	MOVQ    R12, (72)(IN)   \
   425  	MOVQ    R13, (80)(IN)   \
   426  	MOVQ    R14, (88)(IN)   \
   427  	MOVQ    R15, (96)(IN)   \
   428  	MOVQ    (104)(IN), R8   \
   429  	MOVQ    (112)(IN), R9   \
   430  	MOVQ    (120)(IN), R10  \
   431  	ADCQ    $0, R8          \
   432  	ADCQ    $0, R9          \
   433  	ADCQ    $0, R10         \
   434  	MOVQ    R8, (104)(IN)   \
   435  	MOVQ    R9, (112)(IN)   \
   436  	MOVQ    R10, (120)(IN)  \
   437  	\ // Third pass: IN[4-5] times ·P503p1 is added from limb 7 up; limb 8 (R9) goes straight to OUT[0]
   438  	MULS(32(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP)    \
   439  	XORQ    R15, R15        \
   440  	XORQ    BX, BX          \
   441  	ADDQ    ( 56)(IN), R8   \
   442  	ADCQ    ( 64)(IN), R9   \
   443  	ADCQ    ( 72)(IN), R10  \
   444  	ADCQ    ( 80)(IN), R11  \
   445  	ADCQ    ( 88)(IN), R12  \
   446  	ADCQ    ( 96)(IN), R13  \
   447  	ADCQ    (104)(IN), R14  \
   448  	ADCQ    (112)(IN), R15  \
   449  	ADCQ    (120)(IN), BX   \
   450  	MOVQ    R8,  ( 56)(IN)  \
   451  	MOVQ    R10, ( 72)(IN)  \
   452  	MOVQ    R11, ( 80)(IN)  \
   453  	MOVQ    R12, ( 88)(IN)  \
   454  	MOVQ    R13, ( 96)(IN)  \
   455  	MOVQ    R14, (104)(IN)  \
   456  	MOVQ    R15, (112)(IN)  \
   457  	MOVQ    BX,  (120)(IN)  \
   458  	MOVQ    R9,  (  0)(OUT) \ // Result: OUT[0]
   459  	\ // Fourth pass: IN[6-7] times ·P503p1 is added from limb 9 up, giving OUT[1-7]
   460  	MULS(48(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP)    \
   461  	ADDQ    ( 72)(IN), R8   \
   462  	ADCQ    ( 80)(IN), R9   \
   463  	ADCQ    ( 88)(IN), R10  \
   464  	ADCQ    ( 96)(IN), R11  \
   465  	ADCQ    (104)(IN), R12  \
   466  	ADCQ    (112)(IN), R13  \
   467  	ADCQ    (120)(IN), R14  \
   468  	MOVQ    R8,  ( 8)(OUT)  \ // Result: OUT[1]
   469  	MOVQ    R9,  (16)(OUT)  \ // Result: OUT[2]
   470  	MOVQ    R10, (24)(OUT)  \ // Result: OUT[3]
   471  	MOVQ    R11, (32)(OUT)  \ // Result: OUT[4]
   472  	MOVQ    R12, (40)(OUT)  \ // Result: OUT[5]
   473  	MOVQ    R13, (48)(OUT)  \ // Result: OUT[6] and OUT[7]
   474  	MOVQ    R14, (56)(OUT)
   475  
   476  TEXT ·modP503(SB), NOSPLIT, $0-8
   477  	MOVQ	x+0(FP), REG_P1
   478  	// Reduces the eight limbs at x in place: x <- x - p, then p is added back (masked, no branch) if that borrowed.
   479  	// Zero AX for later use:
   480  	XORQ	AX, AX
   481  
   482  	// Load p into registers:
   483  	MOVQ	P503_0, R8
   484  	// P503_{1,2} = P503_0, so reuse R8
   485  	MOVQ	P503_3, R9
   486  	MOVQ	P503_4, R10
   487  	MOVQ	P503_5, R11
   488  	MOVQ	P503_6, R12
   489  	MOVQ	P503_7, R13
   490  
   491  	// Set x <- x - p
   492  	SUBQ	R8,  ( 0)(REG_P1)
   493  	SBBQ	R8,  ( 8)(REG_P1)
   494  	SBBQ	R8,  (16)(REG_P1)
   495  	SBBQ	R9,  (24)(REG_P1)
   496  	SBBQ	R10, (32)(REG_P1)
   497  	SBBQ	R11, (40)(REG_P1)
   498  	SBBQ	R12, (48)(REG_P1)
   499  	SBBQ	R13, (56)(REG_P1)
   500  
   501  	// Save carry flag indicating x-p < 0 as a mask (AX = all ones on borrow, else 0)
   502  	SBBQ	$0, AX
   503  
   504  	// Conditionally add p to x if x-p < 0: the limbs of p are kept or zeroed by the mask
   505  	ANDQ	AX, R8
   506  	ANDQ	AX, R9
   507  	ANDQ	AX, R10
   508  	ANDQ	AX, R11
   509  	ANDQ	AX, R12
   510  	ANDQ	AX, R13
   511  
   512  	ADDQ	R8, ( 0)(REG_P1)
   513  	ADCQ	R8, ( 8)(REG_P1)
   514  	ADCQ	R8, (16)(REG_P1)
   515  	ADCQ	R9, (24)(REG_P1)
   516  	ADCQ	R10,(32)(REG_P1)
   517  	ADCQ	R11,(40)(REG_P1)
   518  	ADCQ	R12,(48)(REG_P1)
   519  	ADCQ	R13,(56)(REG_P1)
   520  
   521  	RET
   522  
   523  TEXT ·cswapP503(SB),NOSPLIT,$0-17
   524  	// Branch-free conditional swap of the eight 64-bit limbs at x and y (swapped when choice is 1).
   525  	MOVQ	y+8(FP), REG_P2
   526  	MOVQ	x+0(FP), REG_P1
   527  	// Stretch choice (0 or 1) into a mask of all zeros or all ones in AX.
   528  	MOVBLZX	choice+16(FP), AX
   529  	NEGQ	AX
   530  
   531  #ifndef CSWAP_BLOCK
   532  #define CSWAP_BLOCK(idx) 	\
   533  	MOVQ	(idx*8)(REG_P1), R8	\ // R8  = x[idx]
   534  	MOVQ	(idx*8)(REG_P2), R9	\ // R9  = y[idx]
   535  	MOVQ	R8, R10				\ // R10 = x[idx]
   536  	XORQ	R9, R10				\ // R10 = x[idx] ^ y[idx]
   537  	ANDQ	AX, R10				\ // R10 = (x[idx] ^ y[idx]) & mask
   538  	XORQ	R10, R8				\ // R8  = x[idx] if mask is zero, y[idx] otherwise
   539  	XORQ	R10, R9				\ // R9  = y[idx] if mask is zero, x[idx] otherwise
   540  	MOVQ	R8, (idx*8)(REG_P1)	\
   541  	MOVQ	R9, (idx*8)(REG_P2)
   542  #endif
   543  
   544  	CSWAP_BLOCK(0)
   545  	CSWAP_BLOCK(1)
   546  	CSWAP_BLOCK(2)
   547  	CSWAP_BLOCK(3)
   548  	CSWAP_BLOCK(4)
   549  	CSWAP_BLOCK(5)
   550  	CSWAP_BLOCK(6)
   551  	CSWAP_BLOCK(7)
   552  
   553  #ifdef CSWAP_BLOCK
   554  #undef CSWAP_BLOCK
   555  #endif
   556  
   557  	RET
   558  
   559  TEXT ·cmovP503(SB),NOSPLIT,$0-17
   560      // Branch-free conditional move: the eight limbs at y overwrite those at x when choice is 1.
   561      MOVQ    y+8(FP), SI
   562      MOVQ    x+0(FP), DI
   563      // Stretch choice (0 or 1) into a mask of all zeros or all ones in AX.
   564      MOVBLZX choice+16(FP), AX
   565      NEGQ    AX
   566  #ifndef CMOV_BLOCK
   567  #define CMOV_BLOCK(idx)    \
   568      MOVQ    (idx*8)(SI), R9 \ // R9 = y[idx]
   569      MOVQ    (idx*8)(DI), R8 \ // R8 = x[idx]
   570      XORQ    R8, R9          \ // R9 = x[idx] ^ y[idx]
   571      ANDQ    AX, R9          \ // R9 = (x[idx] ^ y[idx]) & mask
   572      XORQ    R9, R8          \ // R8 = x[idx] if mask is zero, y[idx] otherwise
   573      MOVQ    R8, (idx*8)(DI)
   574  #endif
   575      CMOV_BLOCK(0)
   576      CMOV_BLOCK(1)
   577      CMOV_BLOCK(2)
   578      CMOV_BLOCK(3)
   579      CMOV_BLOCK(4)
   580      CMOV_BLOCK(5)
   581      CMOV_BLOCK(6)
   582      CMOV_BLOCK(7)
   583  #ifdef CMOV_BLOCK
   584  #undef CMOV_BLOCK
   585  #endif
   586      RET
   587  
   588  TEXT ·addP503(SB),NOSPLIT,$0-24
   589  	// z <- x + y - p503x2, then p503x2 is added back (masked, no branch) if that borrowed. Eight limbs each.
   590  	MOVQ	z+0(FP), REG_P3
   591  	MOVQ	x+8(FP), REG_P1
   592  	MOVQ	y+16(FP), REG_P2
   593  
   594  	// Used later to calculate a mask
   595  	XORQ    CX, CX
   596  
   597  	// [R8-R15]: z = x + y
   598  	MOVQ	( 0)(REG_P1), R8
   599  	MOVQ	( 8)(REG_P1), R9
   600  	MOVQ	(16)(REG_P1), R10
   601  	MOVQ	(24)(REG_P1), R11
   602  	MOVQ	(32)(REG_P1), R12
   603  	MOVQ	(40)(REG_P1), R13
   604  	MOVQ	(48)(REG_P1), R14
   605  	MOVQ	(56)(REG_P1), R15
   606  	ADDQ	( 0)(REG_P2), R8
   607  	ADCQ	( 8)(REG_P2), R9
   608  	ADCQ	(16)(REG_P2), R10
   609  	ADCQ	(24)(REG_P2), R11
   610  	ADCQ	(32)(REG_P2), R12
   611  	ADCQ	(40)(REG_P2), R13
   612  	ADCQ	(48)(REG_P2), R14
   613  	ADCQ	(56)(REG_P2), R15
   614  	// [R8-R15]: z = z - p503x2; P503X2_2 equals P503X2_1, so AX is not reloaded for R10
   615  	MOVQ    P503X2_0, AX
   616  	SUBQ    AX, R8
   617  	MOVQ    P503X2_1, AX
   618  	SBBQ    AX, R9
   619  	SBBQ    AX, R10
   620  	MOVQ    P503X2_3, AX
   621  	SBBQ    AX, R11
   622  	MOVQ    P503X2_4, AX
   623  	SBBQ    AX, R12
   624  	MOVQ    P503X2_5, AX
   625  	SBBQ    AX, R13
   626  	MOVQ    P503X2_6, AX
   627  	SBBQ    AX, R14
   628  	MOVQ    P503X2_7, AX
   629  	SBBQ    AX, R15
   630  
   631  	// mask: CX = all ones if the subtraction borrowed, else 0
   632  	SBBQ    $0, CX
   633  
   634  	// move z to REG_P3
   635  	MOVQ    R8,  ( 0)(REG_P3)
   636  	MOVQ    R9,  ( 8)(REG_P3)
   637  	MOVQ    R10, (16)(REG_P3)
   638  	MOVQ    R11, (24)(REG_P3)
   639  	MOVQ    R12, (32)(REG_P3)
   640  	MOVQ    R13, (40)(REG_P3)
   641  	MOVQ    R14, (48)(REG_P3)
   642  	MOVQ    R15, (56)(REG_P3)
   643  
   644  	// if z<0 add p503x2 back (R9 serves limbs 1 and 2, which are equal)
   645  	MOVQ    P503X2_0,   R8
   646  	MOVQ    P503X2_1,   R9
   647  	MOVQ    P503X2_3,   R10
   648  	MOVQ    P503X2_4,   R11
   649  	MOVQ    P503X2_5,   R12
   650  	MOVQ    P503X2_6,   R13
   651  	MOVQ    P503X2_7,   R14
   652  	ANDQ    CX, R8
   653  	ANDQ    CX, R9
   654  	ANDQ    CX, R10
   655  	ANDQ    CX, R11
   656  	ANDQ    CX, R12
   657  	ANDQ    CX, R13
   658  	ANDQ    CX, R14
   659  	MOVQ    ( 0)(REG_P3), AX; ADDQ    R8,  AX; MOVQ    AX, ( 0)(REG_P3)
   660  	MOVQ    ( 8)(REG_P3), AX; ADCQ    R9,  AX; MOVQ    AX, ( 8)(REG_P3)
   661  	MOVQ    (16)(REG_P3), AX; ADCQ    R9,  AX; MOVQ    AX, (16)(REG_P3)
   662  	MOVQ    (24)(REG_P3), AX; ADCQ    R10, AX; MOVQ    AX, (24)(REG_P3)
   663  	MOVQ    (32)(REG_P3), AX; ADCQ    R11, AX; MOVQ    AX, (32)(REG_P3)
   664  	MOVQ    (40)(REG_P3), AX; ADCQ    R12, AX; MOVQ    AX, (40)(REG_P3)
   665  	MOVQ    (48)(REG_P3), AX; ADCQ    R13, AX; MOVQ    AX, (48)(REG_P3)
   666  	MOVQ    (56)(REG_P3), AX; ADCQ    R14, AX; MOVQ    AX, (56)(REG_P3)
   667  	RET
   668  
   669  TEXT ·subP503(SB), NOSPLIT, $0-24
   670  	// z <- x - y, then p503x2 is added (masked, no branch) if the subtraction borrowed. Eight limbs each.
   671  	MOVQ    z+0(FP), REG_P3
   672  	MOVQ    x+8(FP), REG_P1
   673  	MOVQ    y+16(FP), REG_P2
   674  
   675  	// Used later to calculate a mask
   676  	XORQ    CX, CX
   677  	// [R8-R15]: x
   678  	MOVQ    ( 0)(REG_P1), R8
   679  	MOVQ    ( 8)(REG_P1), R9
   680  	MOVQ    (16)(REG_P1), R10
   681  	MOVQ    (24)(REG_P1), R11
   682  	MOVQ    (32)(REG_P1), R12
   683  	MOVQ    (40)(REG_P1), R13
   684  	MOVQ    (48)(REG_P1), R14
   685  	MOVQ    (56)(REG_P1), R15
   686  	// [R8-R15]: x - y
   687  	SUBQ    ( 0)(REG_P2), R8
   688  	SBBQ    ( 8)(REG_P2), R9
   689  	SBBQ    (16)(REG_P2), R10
   690  	SBBQ    (24)(REG_P2), R11
   691  	SBBQ    (32)(REG_P2), R12
   692  	SBBQ    (40)(REG_P2), R13
   693  	SBBQ    (48)(REG_P2), R14
   694  	SBBQ    (56)(REG_P2), R15
   695  
   696  	// mask: CX = all ones if x - y borrowed, else 0
   697  	SBBQ    $0, CX
   698  
   699  	// store x-y in REG_P3
   700  	MOVQ    R8,  ( 0)(REG_P3)
   701  	MOVQ    R9,  ( 8)(REG_P3)
   702  	MOVQ    R10, (16)(REG_P3)
   703  	MOVQ    R11, (24)(REG_P3)
   704  	MOVQ    R12, (32)(REG_P3)
   705  	MOVQ    R13, (40)(REG_P3)
   706  	MOVQ    R14, (48)(REG_P3)
   707  	MOVQ    R15, (56)(REG_P3)
   708  
   709  	// if z<0 add p503x2 back (R9 serves limbs 1 and 2, which are equal)
   710  	MOVQ    P503X2_0, R8
   711  	MOVQ    P503X2_1, R9
   712  	MOVQ    P503X2_3, R10
   713  	MOVQ    P503X2_4, R11
   714  	MOVQ    P503X2_5, R12
   715  	MOVQ    P503X2_6, R13
   716  	MOVQ    P503X2_7, R14
   717  	ANDQ    CX, R8
   718  	ANDQ    CX, R9
   719  	ANDQ    CX, R10
   720  	ANDQ    CX, R11
   721  	ANDQ    CX, R12
   722  	ANDQ    CX, R13
   723  	ANDQ    CX, R14
   724  	MOVQ    ( 0)(REG_P3), AX; ADDQ    R8,  AX; MOVQ    AX, ( 0)(REG_P3)
   725  	MOVQ    ( 8)(REG_P3), AX; ADCQ    R9,  AX; MOVQ    AX, ( 8)(REG_P3)
   726  	MOVQ    (16)(REG_P3), AX; ADCQ    R9,  AX; MOVQ    AX, (16)(REG_P3)
   727  	MOVQ    (24)(REG_P3), AX; ADCQ    R10, AX; MOVQ    AX, (24)(REG_P3)
   728  	MOVQ    (32)(REG_P3), AX; ADCQ    R11, AX; MOVQ    AX, (32)(REG_P3)
   729  	MOVQ    (40)(REG_P3), AX; ADCQ    R12, AX; MOVQ    AX, (40)(REG_P3)
   730  	MOVQ    (48)(REG_P3), AX; ADCQ    R13, AX; MOVQ    AX, (48)(REG_P3)
   731  	MOVQ    (56)(REG_P3), AX; ADCQ    R14, AX; MOVQ    AX, (56)(REG_P3)
   732  
   733  	RET
   734  
   735  TEXT ·mulP503(SB), NOSPLIT, $104-24
   736  	MOVQ    z+0(FP), CX
   737  	MOVQ    x+8(FP), REG_P1
   738  	MOVQ    y+16(FP), REG_P2
   739  
   740  	// Check whether to use optimized implementation
   741  	CMPB    ·HasADXandBMI2(SB), $1
   742  	JE      mul_with_mulx_adcx_adox
   743  	CMPB    ·HasBMI2(SB), $1
   744  	JE      mul_with_mulx
   745  
   746  	// Generic x86 implementation (below) uses variant of Karatsuba method.
   747  	//
   748  	// Here we store the destination in CX instead of in REG_P3 because the
   749  	// multiplication instructions use DX as an implicit destination
   750  	// operand: MULQ $REG sets DX:AX <-- AX * $REG.
   751  
   752  	// RAX and RDX will be used for a mask (0-borrow)
   753  	XORQ	AX, AX
   754  
   755  	// RCX[0-3]: U1+U0
   756  	MOVQ	(32)(REG_P1), R8
   757  	MOVQ	(40)(REG_P1), R9
   758  	MOVQ	(48)(REG_P1), R10
   759  	MOVQ	(56)(REG_P1), R11
   760  	ADDQ	( 0)(REG_P1), R8
   761  	ADCQ	( 8)(REG_P1), R9
   762  	ADCQ	(16)(REG_P1), R10
   763  	ADCQ	(24)(REG_P1), R11
   764  	MOVQ	R8,  ( 0)(CX)
   765  	MOVQ	R9,  ( 8)(CX)
   766  	MOVQ	R10, (16)(CX)
   767  	MOVQ	R11, (24)(CX)
   768  
   769  	SBBQ	$0, AX
   770  
   771  	// R12-R15: V1+V0
   772  	XORQ	DX, DX
   773  	MOVQ	(32)(REG_P2), R12
   774  	MOVQ	(40)(REG_P2), R13
   775  	MOVQ	(48)(REG_P2), R14
   776  	MOVQ	(56)(REG_P2), R15
   777  	ADDQ	( 0)(REG_P2), R12
   778  	ADCQ	( 8)(REG_P2), R13
   779  	ADCQ	(16)(REG_P2), R14
   780  	ADCQ	(24)(REG_P2), R15
   781  
   782  	SBBQ	$0, DX
   783  
   784  	// Store carries on stack
   785  	MOVQ	AX, (64)(SP)
   786  	MOVQ	DX, (72)(SP)
   787  
   788  	// (SP[0-3],R8,R9,R10,R11) <- (U0+U1)*(V0+V1).
   789  	// MUL using comba; In comments below U=U0+U1 V=V0+V1
   790  
   791  	// U0*V0
   792  	MOVQ    (CX), AX
   793  	MULQ    R12
   794  	MOVQ    AX, (SP)        // C0
   795  	MOVQ    DX, R8
   796  
   797  	// U0*V1
   798  	XORQ    R9, R9
   799  	MOVQ    (CX), AX
   800  	MULQ    R13
   801  	ADDQ    AX, R8
   802  	ADCQ    DX, R9
   803  
   804  	// U1*V0
   805  	XORQ    R10, R10
   806  	MOVQ    (8)(CX), AX
   807  	MULQ    R12
   808  	ADDQ    AX, R8
   809  	MOVQ    R8, (8)(SP)     // C1
   810  	ADCQ    DX, R9
   811  	ADCQ    $0, R10
   812  
   813  	// U0*V2
   814  	XORQ    R8, R8
   815  	MOVQ    (CX), AX
   816  	MULQ    R14
   817  	ADDQ    AX, R9
   818  	ADCQ    DX, R10
   819  	ADCQ    $0, R8
   820  
   821  	// U2*V0
   822  	MOVQ    (16)(CX), AX
   823  	MULQ    R12
   824  	ADDQ    AX, R9
   825  	ADCQ    DX, R10
   826  	ADCQ    $0, R8
   827  
   828  	// U1*V1
   829  	MOVQ    (8)(CX), AX
   830  	MULQ    R13
   831  	ADDQ    AX, R9
   832  	MOVQ    R9, (16)(SP)        // C2
   833  	ADCQ    DX, R10
   834  	ADCQ    $0, R8
   835  
   836  	// U0*V3
   837  	XORQ    R9, R9
   838  	MOVQ    (CX), AX
   839  	MULQ    R15
   840  	ADDQ    AX, R10
   841  	ADCQ    DX, R8
   842  	ADCQ    $0, R9
   843  
   844  	// U3*V0
   845  	MOVQ    (24)(CX), AX
   846  	MULQ    R12
   847  	ADDQ    AX, R10
   848  	ADCQ    DX, R8
   849  	ADCQ    $0, R9
   850  
   851  	// U1*V2
   852  	MOVQ    (8)(CX), AX
   853  	MULQ    R14
   854  	ADDQ    AX, R10
   855  	ADCQ    DX, R8
   856  	ADCQ    $0, R9
   857  
   858  	// U2*V1
   859  	MOVQ    (16)(CX), AX
   860  	MULQ    R13
   861  	ADDQ    AX, R10
   862  	MOVQ    R10, (24)(SP)       // C3
   863  	ADCQ    DX, R8
   864  	ADCQ    $0, R9
   865  
   866  	// U1*V3
   867  	XORQ    R10, R10
   868  	MOVQ    (8)(CX), AX
   869  	MULQ    R15
   870  	ADDQ    AX, R8
   871  	ADCQ    DX, R9
   872  	ADCQ    $0, R10
   873  
   874  	// U3*V1
   875  	MOVQ    (24)(CX), AX
   876  	MULQ    R13
   877  	ADDQ    AX, R8
   878  	ADCQ    DX, R9
   879  	ADCQ    $0, R10
   880  
   881  	// U2*V2
   882  	MOVQ    (16)(CX), AX
   883  	MULQ    R14
   884  	ADDQ    AX, R8
   885  	MOVQ    R8, (32)(SP)        // C4
   886  	ADCQ    DX, R9
   887  	ADCQ    $0, R10
   888  
   889  	// U2*V3
   890  	XORQ    R11, R11
   891  	MOVQ    (16)(CX), AX
   892  	MULQ    R15
   893  	ADDQ    AX, R9
   894  	ADCQ    DX, R10
   895  	ADCQ    $0, R11
   896  
   897  	// U3*V2
   898  	MOVQ    (24)(CX), AX
   899  	MULQ    R14
   900  	ADDQ    AX, R9              // C5
   901  	ADCQ    DX, R10
   902  	ADCQ    $0, R11
   903  
   904  	// U3*V3
   905  	MOVQ    (24)(CX), AX
   906  	MULQ    R15
   907  	ADDQ    AX, R10             // C6
   908  	ADCQ    DX, R11             // C7
   909  
   910  	MOVQ    (64)(SP), AX
   911  	ANDQ    AX, R12
   912  	ANDQ    AX, R13
   913  	ANDQ    AX, R14
   914  	ANDQ    AX, R15
   915  	ADDQ    R8, R12
   916  	ADCQ    R9, R13
   917  	ADCQ    R10, R14
   918  	ADCQ    R11, R15
   919  
   920  	MOVQ    (72)(SP), AX
   921  	MOVQ    (CX), R8
   922  	MOVQ    (8)(CX), R9
   923  	MOVQ    (16)(CX), R10
   924  	MOVQ    (24)(CX), R11
   925  	ANDQ    AX, R8
   926  	ANDQ    AX, R9
   927  	ANDQ    AX, R10
   928  	ANDQ    AX, R11
   929  	ADDQ    R12, R8
   930  	ADCQ    R13, R9
   931  	ADCQ    R14, R10
   932  	ADCQ    R15, R11
   933  	MOVQ    R8, (32)(SP)
   934  	MOVQ    R9, (40)(SP)
   935  	MOVQ    R10, (48)(SP)
   936  	MOVQ    R11, (56)(SP)
   937  
   938  	// CX[0-7] <- AL*BL
   939  
   940  	// U0*V0
   941  	MOVQ    (REG_P1), R11
   942  	MOVQ    (REG_P2), AX
   943  	MULQ    R11
   944  	XORQ    R9, R9
   945  	MOVQ    AX, (CX)            // C0
   946  	MOVQ    DX, R8
   947  
   948  	// U0*V1
   949  	MOVQ    (16)(REG_P1), R14
   950  	MOVQ    (8)(REG_P2), AX
   951  	MULQ    R11
   952  	XORQ    R10, R10
   953  	ADDQ    AX, R8
   954  	ADCQ    DX, R9
   955  
   956  	// U1*V0
   957  	MOVQ    (8)(REG_P1), R12
   958  	MOVQ    (REG_P2), AX
   959  	MULQ    R12
   960  	ADDQ    AX, R8
   961  	MOVQ    R8, (8)(CX)         // C1
   962  	ADCQ    DX, R9
   963  	ADCQ    $0, R10
   964  
   965  	// U0*V2
   966  	XORQ    R8, R8
   967  	MOVQ    (16)(REG_P2), AX
   968  	MULQ    R11
   969  	ADDQ    AX, R9
   970  	ADCQ    DX, R10
   971  	ADCQ    $0, R8
   972  
   973  	// U2*V0
   974  	MOVQ    (REG_P2), R13
   975  	MOVQ    R14, AX
   976  	MULQ    R13
   977  	ADDQ    AX, R9
   978  	ADCQ    DX, R10
   979  	ADCQ    $0, R8
   980  
   981  	// U1*V1
   982  	MOVQ    (8)(REG_P2), AX
   983  	MULQ    R12
   984  	ADDQ    AX, R9
   985  	MOVQ    R9, (16)(CX)        // C2
   986  	ADCQ    DX, R10
   987  	ADCQ    $0, R8
   988  
   989  	// U0*V3
   990  	XORQ    R9, R9
   991  	MOVQ    (24)(REG_P2), AX
   992  	MULQ    R11
   993  	MOVQ    (24)(REG_P1), R15
   994  	ADDQ    AX, R10
   995  	ADCQ    DX, R8
   996  	ADCQ    $0, R9
   997  
   998  	// U3*V1
   999  	MOVQ    R15, AX
  1000  	MULQ    R13
  1001  	ADDQ    AX, R10
  1002  	ADCQ    DX, R8
  1003  	ADCQ    $0, R9
  1004  
  1005  	// U2*V2
  1006  	MOVQ    (16)(REG_P2), AX
  1007  	MULQ    R12
  1008  	ADDQ    AX, R10
  1009  	ADCQ    DX, R8
  1010  	ADCQ    $0, R9
  1011  
  1012  	// U2*V3
  1013  	MOVQ    (8)(REG_P2), AX
  1014  	MULQ    R14
  1015  	ADDQ    AX, R10
  1016  	MOVQ    R10, (24)(CX)       // C3
  1017  	ADCQ    DX, R8
  1018  	ADCQ    $0, R9
  1019  
  1020  	// U3*V2
  1021  	XORQ    R10, R10
  1022  	MOVQ    (24)(REG_P2), AX
  1023  	MULQ    R12
  1024  	ADDQ    AX, R8
  1025  	ADCQ    DX, R9
  1026  	ADCQ    $0, R10
  1027  
  1028  	// U3*V1
  1029  	MOVQ    (8)(REG_P2), AX
  1030  	MULQ    R15
  1031  	ADDQ    AX, R8
  1032  	ADCQ    DX, R9
  1033  	ADCQ    $0, R10
  1034  
  1035  	// U2*V2
  1036  	MOVQ    (16)(REG_P2), AX
  1037  	MULQ    R14
  1038  	ADDQ    AX, R8
  1039  	MOVQ    R8, (32)(CX)		// C4
  1040  	ADCQ    DX, R9
  1041  	ADCQ    $0, R10
  1042  
  1043  	// U2*V3
  1044  	XORQ    R8, R8
  1045  	MOVQ    (24)(REG_P2), AX
  1046  	MULQ    R14
  1047  	ADDQ    AX, R9
  1048  	ADCQ    DX, R10
  1049  	ADCQ    $0, R8
  1050  
  1051  	// U3*V2
  1052  	MOVQ    (16)(REG_P2), AX
  1053  	MULQ    R15
  1054  	ADDQ    AX, R9
  1055  	MOVQ    R9, (40)(CX)		// C5
  1056  	ADCQ    DX, R10
  1057  	ADCQ    $0, R8
  1058  
  1059  	// U3*V3
  1060  	MOVQ    (24)(REG_P2), AX
  1061  	MULQ    R15
  1062  	ADDQ    AX, R10
  1063  	MOVQ    R10, (48)(CX)		// C6
  1064  	ADCQ    DX, R8
  1065  	MOVQ    R8, (56)(CX)		// C7
  1066  
  1067  	// CX[8-15] <- U1*V1
  1068  	MOVQ    (32)(REG_P1), R11
  1069  	MOVQ    (32)(REG_P2), AX
  1070  	MULQ    R11
  1071  	XORQ    R9, R9
  1072  	MOVQ    AX, (64)(CX)        // C0
  1073  	MOVQ    DX, R8
  1074  
  1075  	MOVQ    (48)(REG_P1), R14
  1076  	MOVQ    (40)(REG_P2), AX
  1077  	MULQ    R11
  1078  	XORQ    R10, R10
  1079  	ADDQ    AX, R8
  1080  	ADCQ    DX, R9
  1081  
  1082  	MOVQ    (40)(REG_P1), R12
  1083  	MOVQ    (32)(REG_P2), AX
  1084  	MULQ    R12
  1085  	ADDQ    AX, R8
  1086  	MOVQ    R8, (72)(CX)        // C1
  1087  	ADCQ    DX, R9
  1088  	ADCQ    $0, R10
  1089  
  1090  	XORQ    R8, R8
  1091  	MOVQ    (48)(REG_P2), AX
  1092  	MULQ    R11
  1093  	ADDQ    AX, R9
  1094  	ADCQ    DX, R10
  1095  	ADCQ    $0, R8
  1096  
  1097  	MOVQ    (32)(REG_P2), R13
  1098  	MOVQ    R14, AX
  1099  	MULQ    R13
  1100  	ADDQ    AX, R9
  1101  	ADCQ    DX, R10
  1102  	ADCQ    $0, R8
  1103  
  1104  	MOVQ    (40)(REG_P2), AX
  1105  	MULQ    R12
  1106  	ADDQ    AX, R9
  1107  	MOVQ    R9, (80)(CX)        // C2
  1108  	ADCQ    DX, R10
  1109  	ADCQ    $0, R8
  1110  
  1111  	XORQ    R9, R9
  1112  	MOVQ    (56)(REG_P2), AX
  1113  	MULQ    R11
  1114  	MOVQ    (56)(REG_P1), R15
  1115  	ADDQ    AX, R10
  1116  	ADCQ    DX, R8
  1117  	ADCQ    $0, R9
  1118  
  1119  	MOVQ    R15, AX
  1120  	MULQ    R13
  1121  	ADDQ    AX, R10
  1122  	ADCQ    DX, R8
  1123  	ADCQ    $0, R9
  1124  
  1125  	MOVQ    (48)(REG_P2), AX
  1126  	MULQ    R12
  1127  	ADDQ    AX, R10
  1128  	ADCQ    DX, R8
  1129  	ADCQ    $0, R9
  1130  
  1131  	MOVQ    (40)(REG_P2), AX
  1132  	MULQ    R14
  1133  	ADDQ    AX, R10
  1134  	MOVQ    R10, (88)(CX)       // C3
  1135  	ADCQ    DX, R8
  1136  	ADCQ    $0, R9
  1137  
  1138  	XORQ    R10, R10
  1139  	MOVQ    (56)(REG_P2), AX
  1140  	MULQ    R12
  1141  	ADDQ    AX, R8
  1142  	ADCQ    DX, R9
  1143  	ADCQ    $0, R10
  1144  
  1145  	MOVQ    (40)(REG_P2), AX
  1146  	MULQ    R15
  1147  	ADDQ    AX, R8
  1148  	ADCQ    DX, R9
  1149  	ADCQ    $0, R10
  1150  
  1151  	MOVQ    (48)(REG_P2), AX
  1152  	MULQ    R14
  1153  	ADDQ    AX, R8
  1154  	MOVQ    R8, (96)(CX)        // C4
  1155  	ADCQ    DX, R9
  1156  	ADCQ    $0, R10
  1157  
  1158  	XORQ    R8, R8
  1159  	MOVQ    (56)(REG_P2), AX
  1160  	MULQ    R14
  1161  	ADDQ    AX, R9
  1162  	ADCQ    DX, R10
  1163  	ADCQ    $0, R8
  1164  
  1165  	MOVQ    (48)(REG_P2), AX
  1166  	MULQ    R15
  1167  	ADDQ    AX, R9
  1168  	MOVQ    R9, (104)(CX)       // C5
  1169  	ADCQ    DX, R10
  1170  	ADCQ    $0, R8
  1171  
  1172  	MOVQ    (56)(REG_P2), AX
  1173  	MULQ    R15
  1174  	ADDQ    AX, R10
  1175  	MOVQ    R10, (112)(CX)      // C6
  1176  	ADCQ    DX, R8
  1177  	MOVQ    R8, (120)(CX)       // C7
  1178  
  1179  	// [R8-R15] <- (U0+U1)*(V0+V1) - U1*V1
  1180  	MOVQ    (SP), R8
  1181  	SUBQ    (CX), R8
  1182  	MOVQ    (8)(SP), R9
  1183  	SBBQ    (8)(CX), R9
  1184  	MOVQ    (16)(SP), R10
  1185  	SBBQ    (16)(CX), R10
  1186  	MOVQ    (24)(SP), R11
  1187  	SBBQ    (24)(CX), R11
  1188  	MOVQ    (32)(SP), R12
  1189  	SBBQ    (32)(CX), R12
  1190  	MOVQ    (40)(SP), R13
  1191  	SBBQ    (40)(CX), R13
  1192  	MOVQ    (48)(SP), R14
  1193  	SBBQ    (48)(CX), R14
  1194  	MOVQ    (56)(SP), R15
  1195  	SBBQ    (56)(CX), R15
  1196  
  1197  	// [R8-R15] <- (U0+U1)*(V0+V1) - U1*V0 - U0*U1
  1198  	MOVQ    ( 64)(CX), AX;	SUBQ    AX, R8
  1199  	MOVQ    ( 72)(CX), AX;	SBBQ    AX, R9
  1200  	MOVQ    ( 80)(CX), AX;	SBBQ    AX, R10
  1201  	MOVQ    ( 88)(CX), AX;	SBBQ    AX, R11
  1202  	MOVQ    ( 96)(CX), AX;	SBBQ    AX, R12
  1203  	MOVQ    (104)(CX), DX;	SBBQ    DX, R13
  1204  	MOVQ    (112)(CX), DI;	SBBQ    DI, R14
  1205  	MOVQ    (120)(CX), SI;	SBBQ    SI, R15
  1206  
  1207  	// Final result
  1208  	ADDQ    (32)(CX), R8;	MOVQ    R8,  (32)(CX)
  1209  	ADCQ    (40)(CX), R9;	MOVQ    R9,  (40)(CX)
  1210  	ADCQ    (48)(CX), R10;	MOVQ    R10, (48)(CX)
  1211  	ADCQ    (56)(CX), R11;	MOVQ    R11, (56)(CX)
  1212  	ADCQ    (64)(CX), R12;	MOVQ    R12, (64)(CX)
  1213  	ADCQ    (72)(CX), R13;	MOVQ    R13, (72)(CX)
  1214  	ADCQ    (80)(CX), R14;	MOVQ    R14, (80)(CX)
  1215  	ADCQ    (88)(CX), R15;	MOVQ    R15, (88)(CX)
  1216  	ADCQ    $0, AX;        	MOVQ    AX,  (96)(CX)
  1217  	ADCQ    $0, DX;        	MOVQ    DX, (104)(CX)
  1218  	ADCQ    $0, DI;         MOVQ    DI, (112)(CX)
  1219  	ADCQ    $0, SI;     	MOVQ    SI, (120)(CX)
  1220  	RET
  1221  
  1222  mul_with_mulx_adcx_adox:
  1223  	// Mul implementation for CPUs supporting two independent carry chain
  1224  	// (ADOX/ADCX) instructions and carry-less MULX multiplier
  1225  	MUL(CX, REG_P1, REG_P2, MULS256_MULX_ADCX_ADOX)
  1226  	RET
  1227  
  1228  mul_with_mulx:
  1229  	// Mul implementation for CPUs supporting carry-less MULX multiplier.
  1230  	MUL(CX, REG_P1, REG_P2, MULS256_MULX)
  1231  	RET
  1232  
  1233  TEXT ·rdcP503(SB), $8-16        // rdcP503(z, x): Montgomery reduction; reads x at byte offsets 0..120 and writes z at byte offsets 0..56
  1234  	MOVQ    z+0(FP), REG_P2         // REG_P2 = z (output pointer)
  1235  	MOVQ    x+8(FP), REG_P1         // REG_P1 = x (input pointer)
  1236  
  1237  	// Check whether to use optimized implementation
  1238  	CMPB    ·HasADXandBMI2(SB), $1
  1239  	JE      redc_with_mulx_adcx_adox
  1240  	CMPB    ·HasBMI2(SB), $1
  1241  	JE      redc_with_mulx
  1242  	// Generic MULQ path: one 64-bit column at a time, accumulated in three rotating registers (R8/R9/R10). Only P503P1_3..P503P1_7 are used as multipliers.
  1243  	MOVQ    (REG_P1), R11           // R11 = x[0]
  1244  	MOVQ    P503P1_3, AX
  1245  	MULQ    R11
  1246  	XORQ    R8, R8
  1247  	ADDQ    (24)(REG_P1), AX        // column 3: low(x[0]*P503P1_3) + x[3]
  1248  	MOVQ    AX, (24)(REG_P2)        // temporary word parked in z[3]; reloaded into R14 below
  1249  	ADCQ    DX, R8                  // R8 = high word + carry, carried into column 4
  1250  	// Column 4: x[0]*P503P1_4 + x[1]*P503P1_3 + x[4] + carry-in (R8)
  1251  	XORQ    R9, R9
  1252  	MOVQ    P503P1_4, AX
  1253  	MULQ    R11
  1254  	XORQ    R10, R10
  1255  	ADDQ    AX, R8
  1256  	ADCQ    DX, R9
  1257  
  1258  	MOVQ    (8)(REG_P1), R12        // R12 = x[1]
  1259  	MOVQ    P503P1_3, AX
  1260  	MULQ    R12
  1261  	ADDQ    AX, R8
  1262  	ADCQ    DX, R9
  1263  	ADCQ    $0, R10
  1264  	ADDQ    (32)(REG_P1), R8
  1265  	MOVQ    R8, (32)(REG_P2)       // Z4 (temporary; reloaded into R15 and later overwritten by the final Z4)
  1266  	ADCQ    $0, R9
  1267  	ADCQ    $0, R10
  1268  	// Column 5: x[0]*P503P1_5 + x[1]*P503P1_4 + x[2]*P503P1_3 + x[5] + carry-in (R9:R10)
  1269  	XORQ    R8, R8
  1270  	MOVQ    P503P1_5, AX
  1271  	MULQ    R11
  1272  	ADDQ    AX, R9
  1273  	ADCQ    DX, R10
  1274  	ADCQ    $0, R8
  1275  
  1276  	MOVQ    P503P1_4, AX
  1277  	MULQ    R12
  1278  	ADDQ    AX, R9
  1279  	ADCQ    DX, R10
  1280  	ADCQ    $0, R8
  1281  
  1282  	MOVQ    (16)(REG_P1), R13       // R13 = x[2]
  1283  	MOVQ    P503P1_3, AX
  1284  	MULQ    R13
  1285  	ADDQ    AX, R9
  1286  	ADCQ    DX, R10
  1287  	ADCQ    $0, R8
  1288  	ADDQ    (40)(REG_P1), R9
  1289  	MOVQ    R9, (40)(REG_P2)       // Z5 (temporary; reloaded into CX and later overwritten by the final Z5)
  1290  	ADCQ    $0, R10
  1291  	ADCQ    $0, R8
  1292  	// Column 6: x[0]*P503P1_6 + x[1]*P503P1_5 + x[2]*P503P1_4 + R14*P503P1_3 + x[6] + carry-in (R10:R8)
  1293  	XORQ    R9, R9
  1294  	MOVQ    P503P1_6, AX
  1295  	MULQ    R11
  1296  	ADDQ    AX, R10
  1297  	ADCQ    DX, R8
  1298  	ADCQ    $0, R9
  1299  
  1300  	MOVQ    P503P1_5, AX
  1301  	MULQ    R12
  1302  	ADDQ    AX, R10
  1303  	ADCQ    DX, R8
  1304  	ADCQ    $0, R9
  1305  
  1306  	MOVQ    P503P1_4, AX
  1307  	MULQ    R13
  1308  	ADDQ    AX, R10
  1309  	ADCQ    DX, R8
  1310  	ADCQ    $0, R9
  1311  
  1312  	MOVQ    (24)(REG_P2), R14       // R14 = temporary word stored at z[3] in column 3
  1313  	MOVQ    P503P1_3, AX
  1314  	MULQ    R14
  1315  	ADDQ    AX, R10
  1316  	ADCQ    DX, R8
  1317  	ADCQ    $0, R9
  1318  	ADDQ    (48)(REG_P1), R10
  1319  	MOVQ    R10, (48)(REG_P2)      // Z6 (temporary; reloaded into R13 and later overwritten by the final Z6)
  1320  	ADCQ    $0, R8
  1321  	ADCQ    $0, R9
  1322  	// Column 7: x[0]*P503P1_7 + x[1]*P503P1_6 + x[2]*P503P1_5 + R14*P503P1_4 + R15*P503P1_3 + x[7] + carry-in (R8:R9)
  1323  	XORQ    R10, R10
  1324  	MOVQ    P503P1_7, AX
  1325  	MULQ    R11
  1326  	ADDQ    AX, R8
  1327  	ADCQ    DX, R9
  1328  	ADCQ    $0, R10
  1329  
  1330  	MOVQ    P503P1_6, AX
  1331  	MULQ    R12
  1332  	ADDQ    AX, R8
  1333  	ADCQ    DX, R9
  1334  	ADCQ    $0, R10
  1335  
  1336  	MOVQ    P503P1_5, AX
  1337  	MULQ    R13
  1338  	ADDQ    AX, R8
  1339  	ADCQ    DX, R9
  1340  	ADCQ    $0, R10
  1341  
  1342  	MOVQ    P503P1_4, AX
  1343  	MULQ    R14
  1344  	ADDQ    AX, R8
  1345  	ADCQ    DX, R9
  1346  	ADCQ    $0, R10
  1347  
  1348  	MOVQ    (32)(REG_P2), R15       // R15 = temporary word stored at z[4] in column 4
  1349  	MOVQ    P503P1_3, AX
  1350  	MULQ    R15
  1351  	ADDQ    AX, R8
  1352  	ADCQ    DX, R9
  1353  	ADCQ    $0, R10
  1354  	ADDQ    (56)(REG_P1), R8
  1355  	MOVQ    R8, (56)(REG_P2)       // Z7 (temporary; reloaded into R14 and later overwritten by the final Z7)
  1356  	ADCQ    $0, R9
  1357  	ADCQ    $0, R10
  1358  	// Column 8: x[1]*P503P1_7 + x[2]*P503P1_6 + R14*P503P1_5 + R15*P503P1_4 + CX*P503P1_3 + x[8] + carry-in -> final z[0]
  1359  	XORQ    R8, R8
  1360  	MOVQ    P503P1_7, AX
  1361  	MULQ    R12
  1362  	ADDQ    AX, R9
  1363  	ADCQ    DX, R10
  1364  	ADCQ    $0, R8
  1365  
  1366  	MOVQ    P503P1_6, AX
  1367  	MULQ    R13
  1368  	ADDQ    AX, R9
  1369  	ADCQ    DX, R10
  1370  	ADCQ    $0, R8
  1371  
  1372  	MOVQ    P503P1_5, AX
  1373  	MULQ    R14
  1374  	ADDQ    AX, R9
  1375  	ADCQ    DX, R10
  1376  	ADCQ    $0, R8
  1377  
  1378  	MOVQ    P503P1_4, AX
  1379  	MULQ    R15
  1380  	ADDQ    AX, R9
  1381  	ADCQ    DX, R10
  1382  	ADCQ    $0, R8
  1383  
  1384  	MOVQ    (40)(REG_P2), CX        // CX = temporary word stored at z[5] in column 5
  1385  	MOVQ    P503P1_3, AX
  1386  	MULQ    CX
  1387  	ADDQ    AX, R9
  1388  	ADCQ    DX, R10
  1389  	ADCQ    $0, R8
  1390  	ADDQ    (64)(REG_P1), R9
  1391  	MOVQ    R9, (REG_P2)           // Z0
  1392  	ADCQ    $0, R10
  1393  	ADCQ    $0, R8
  1394  	// Column 9: x[2]*P503P1_7 + R14*P503P1_6 + R15*P503P1_5 + CX*P503P1_4 + (reloaded R13)*P503P1_3 + x[9] + carry-in -> final z[1]
  1395  	XORQ    R9, R9
  1396  	MOVQ    P503P1_7, AX
  1397  	MULQ    R13
  1398  	ADDQ    AX, R10
  1399  	ADCQ    DX, R8
  1400  	ADCQ    $0, R9
  1401  
  1402  	MOVQ    P503P1_6, AX
  1403  	MULQ    R14
  1404  	ADDQ    AX, R10
  1405  	ADCQ    DX, R8
  1406  	ADCQ    $0, R9
  1407  
  1408  	MOVQ    P503P1_5, AX
  1409  	MULQ    R15
  1410  	ADDQ    AX, R10
  1411  	ADCQ    DX, R8
  1412  	ADCQ    $0, R9
  1413  
  1414  	MOVQ    P503P1_4, AX
  1415  	MULQ    CX
  1416  	ADDQ    AX, R10
  1417  	ADCQ    DX, R8
  1418  	ADCQ    $0, R9
  1419  
  1420  	MOVQ    (48)(REG_P2), R13       // R13 now holds the temporary word stored at z[6] (x[2] is no longer needed)
  1421  	MOVQ    P503P1_3, AX
  1422  	MULQ    R13
  1423  	ADDQ    AX, R10
  1424  	ADCQ    DX, R8
  1425  	ADCQ    $0, R9
  1426  	ADDQ    (72)(REG_P1), R10
  1427  	MOVQ    R10, (8)(REG_P2)       // Z1
  1428  	ADCQ    $0, R8
  1429  	ADCQ    $0, R9
  1430  	// Column 10: R14*P503P1_7 + R15*P503P1_6 + CX*P503P1_5 + R13*P503P1_4 + (reloaded R14)*P503P1_3 + x[10] + carry-in -> final z[2]
  1431  	XORQ    R10, R10
  1432  	MOVQ    P503P1_7, AX
  1433  	MULQ    R14
  1434  	ADDQ    AX, R8
  1435  	ADCQ    DX, R9
  1436  	ADCQ    $0, R10
  1437  
  1438  	MOVQ    P503P1_6, AX
  1439  	MULQ    R15
  1440  	ADDQ    AX, R8
  1441  	ADCQ    DX, R9
  1442  	ADCQ    $0, R10
  1443  
  1444  	MOVQ    P503P1_5, AX
  1445  	MULQ    CX
  1446  	ADDQ    AX, R8
  1447  	ADCQ    DX, R9
  1448  	ADCQ    $0, R10
  1449  
  1450  	MOVQ    P503P1_4, AX
  1451  	MULQ    R13
  1452  	ADDQ    AX, R8
  1453  	ADCQ    DX, R9
  1454  	ADCQ    $0, R10
  1455  
  1456  	MOVQ    (56)(REG_P2), R14       // R14 now holds the temporary word stored at z[7] (the z[3] temporary is no longer needed)
  1457  	MOVQ    P503P1_3, AX
  1458  	MULQ    R14
  1459  	ADDQ    AX, R8
  1460  	ADCQ    DX, R9
  1461  	ADCQ    $0, R10
  1462  	ADDQ    (80)(REG_P1), R8
  1463  	MOVQ    R8, (16)(REG_P2)       // Z2
  1464  	ADCQ    $0, R9
  1465  	ADCQ    $0, R10
  1466  	// Column 11: R15*P503P1_7 + CX*P503P1_6 + R13*P503P1_5 + R14*P503P1_4 + x[11] + carry-in -> final z[3]
  1467  	XORQ    R8, R8
  1468  	MOVQ    P503P1_7, AX
  1469  	MULQ    R15
  1470  	ADDQ    AX, R9
  1471  	ADCQ    DX, R10
  1472  	ADCQ    $0, R8
  1473  
  1474  	MOVQ    P503P1_6, AX
  1475  	MULQ    CX
  1476  	ADDQ    AX, R9
  1477  	ADCQ    DX, R10
  1478  	ADCQ    $0, R8
  1479  
  1480  	MOVQ    P503P1_5, AX
  1481  	MULQ    R13
  1482  	ADDQ    AX, R9
  1483  	ADCQ    DX, R10
  1484  	ADCQ    $0, R8
  1485  
  1486  	MOVQ    P503P1_4, AX
  1487  	MULQ    R14
  1488  	ADDQ    AX, R9
  1489  	ADCQ    DX, R10
  1490  	ADCQ    $0, R8
  1491  	ADDQ    (88)(REG_P1), R9
  1492  	MOVQ    R9, (24)(REG_P2)       // Z3
  1493  	ADCQ    $0, R10
  1494  	ADCQ    $0, R8
  1495  	// Column 12: CX*P503P1_7 + R13*P503P1_6 + R14*P503P1_5 + x[12] + carry-in -> final z[4]
  1496  	XORQ    R9, R9
  1497  	MOVQ    P503P1_7, AX
  1498  	MULQ    CX
  1499  	ADDQ    AX, R10
  1500  	ADCQ    DX, R8
  1501  	ADCQ    $0, R9
  1502  
  1503  	MOVQ    P503P1_6, AX
  1504  	MULQ    R13
  1505  	ADDQ    AX, R10
  1506  	ADCQ    DX, R8
  1507  	ADCQ    $0, R9
  1508  
  1509  	MOVQ    P503P1_5, AX
  1510  	MULQ    R14
  1511  	ADDQ    AX, R10
  1512  	ADCQ    DX, R8
  1513  	ADCQ    $0, R9
  1514  	ADDQ    (96)(REG_P1), R10
  1515  	MOVQ    R10, (32)(REG_P2)      // Z4
  1516  	ADCQ    $0, R8
  1517  	ADCQ    $0, R9
  1518  	// Column 13: R13*P503P1_7 + R14*P503P1_6 + x[13] + carry-in -> final z[5]
  1519  	XORQ    R10, R10
  1520  	MOVQ    P503P1_7, AX
  1521  	MULQ    R13
  1522  	ADDQ    AX, R8
  1523  	ADCQ    DX, R9
  1524  	ADCQ    $0, R10
  1525  
  1526  	MOVQ    P503P1_6, AX
  1527  	MULQ    R14
  1528  	ADDQ    AX, R8
  1529  	ADCQ    DX, R9
  1530  	ADCQ    $0, R10
  1531  	ADDQ    (104)(REG_P1), R8      // Z5
  1532  	MOVQ    R8, (40)(REG_P2)       // Z5
  1533  	ADCQ    $0, R9
  1534  	ADCQ    $0, R10
  1535  	// Column 14: R14*P503P1_7 + x[14] + carry-in -> final z[6]; the remaining high word plus x[15] becomes z[7]
  1536  	MOVQ    P503P1_7, AX
  1537  	MULQ    R14
  1538  	ADDQ    AX, R9
  1539  	ADCQ    DX, R10
  1540  	ADDQ    (112)(REG_P1), R9      // Z6
  1541  	MOVQ    R9, (48)(REG_P2)       // Z6
  1542  	ADCQ    $0, R10
  1543  	ADDQ    (120)(REG_P1), R10     // Z7 (any carry out of this addition is not captured)
  1544  	MOVQ    R10, (56)(REG_P2)      // Z7
  1545  	RET
  1546  
  1547  redc_with_mulx_adcx_adox:
  1548  	// Implementation of the Montgomery reduction for CPUs
  1549  	// supporting two independent carry chain (ADOX/ADCX)
  1550  	// instructions and carry-less MULX multiplier
  1551  	MOVQ BP, 0(SP) // push: BP is Callee-save. Uses the 8-byte frame declared on the TEXT line.
  1552  	REDC(REG_P2, REG_P1, MULS_128x320_MULX_ADCX_ADOX) // REDC and the MULS_* macro are defined outside this excerpt
  1553  	MOVQ 0(SP), BP // pop: BP is Callee-save.
  1554  	RET
  1555  
  1556  redc_with_mulx:
  1557  	// Implementation of the Montgomery reduction for CPUs
  1558  	// supporting carry-less MULX multiplier.
  1559  	MOVQ BP, 0(SP) // push: BP is Callee-save. Uses the 8-byte frame declared on the TEXT line.
  1560  	REDC(REG_P2, REG_P1, MULS_128x320_MULX) // REDC and the MULS_* macro are defined outside this excerpt
  1561  	MOVQ 0(SP), BP // pop: BP is Callee-save.
  1562  	RET
  1563  
  1564  TEXT ·adlP503(SB), NOSPLIT, $0-24
  1565  	// adlP503(z, x, y): z = x + y over sixteen 64-bit words (byte offsets 0..120), computed as one carry chain split into a batch of 11 words and a batch of 5.
  1566  	MOVQ	z+0(FP), REG_P3
  1567  	MOVQ	x+8(FP), REG_P1
  1568  	MOVQ	y+16(FP), REG_P2
  1569  	// Load words 0..10 of x.
  1570  	MOVQ	(REG_P1), R8
  1571  	MOVQ	(8)(REG_P1), R9
  1572  	MOVQ	(16)(REG_P1), R10
  1573  	MOVQ	(24)(REG_P1), R11
  1574  	MOVQ	(32)(REG_P1), R12
  1575  	MOVQ	(40)(REG_P1), R13
  1576  	MOVQ	(48)(REG_P1), R14
  1577  	MOVQ	(56)(REG_P1), R15
  1578  	MOVQ	(64)(REG_P1), AX
  1579  	MOVQ	(72)(REG_P1), BX
  1580  	MOVQ	(80)(REG_P1), CX
  1581  	// Add words 0..10 of y; ADDQ starts the carry chain and each ADCQ propagates it.
  1582  	ADDQ	(REG_P2), R8
  1583  	ADCQ	(8)(REG_P2), R9
  1584  	ADCQ	(16)(REG_P2), R10
  1585  	ADCQ	(24)(REG_P2), R11
  1586  	ADCQ	(32)(REG_P2), R12
  1587  	ADCQ	(40)(REG_P2), R13
  1588  	ADCQ	(48)(REG_P2), R14
  1589  	ADCQ	(56)(REG_P2), R15
  1590  	ADCQ	(64)(REG_P2), AX
  1591  	ADCQ	(72)(REG_P2), BX
  1592  	ADCQ	(80)(REG_P2), CX
  1593  	// Store words 0..10 of z. MOVQ leaves the flags untouched, so the carry out of word 10 stays live for the second batch.
  1594  	MOVQ	R8, (REG_P3)
  1595  	MOVQ	R9, (8)(REG_P3)
  1596  	MOVQ	R10, (16)(REG_P3)
  1597  	MOVQ	R11, (24)(REG_P3)
  1598  	MOVQ	R12, (32)(REG_P3)
  1599  	MOVQ	R13, (40)(REG_P3)
  1600  	MOVQ	R14, (48)(REG_P3)
  1601  	MOVQ	R15, (56)(REG_P3)
  1602  	MOVQ	AX, (64)(REG_P3)
  1603  	MOVQ	BX, (72)(REG_P3)
  1604  	MOVQ	CX, (80)(REG_P3)
  1605  	// Load words 11..15 of x, reusing R8-R12.
  1606  	MOVQ	(88)(REG_P1), R8
  1607  	MOVQ	(96)(REG_P1), R9
  1608  	MOVQ	(104)(REG_P1), R10
  1609  	MOVQ	(112)(REG_P1), R11
  1610  	MOVQ	(120)(REG_P1), R12
  1611  	// Continue the same carry chain (ADCQ, not ADDQ) through words 11..15 of y.
  1612  	ADCQ	(88)(REG_P2), R8
  1613  	ADCQ	(96)(REG_P2), R9
  1614  	ADCQ	(104)(REG_P2), R10
  1615  	ADCQ	(112)(REG_P2), R11
  1616  	ADCQ	(120)(REG_P2), R12
  1617  	// Store words 11..15 of z; the carry out of word 15 is not saved anywhere.
  1618  	MOVQ	R8, (88)(REG_P3)
  1619  	MOVQ	R9, (96)(REG_P3)
  1620  	MOVQ	R10, (104)(REG_P3)
  1621  	MOVQ	R11, (112)(REG_P3)
  1622  	MOVQ	R12, (120)(REG_P3)
  1623  
  1624  	RET
  1625  
  1626  TEXT ·sulP503(SB), NOSPLIT, $0-24
  1627  	// sulP503(z, x, y): z = x - y over sixteen 64-bit words (byte offsets 0..120); if the subtraction borrows, p503 is added to the upper eight words of z.
  1628  	MOVQ z+0(FP), REG_P3
  1629  	MOVQ x+8(FP), REG_P1
  1630  	MOVQ y+16(FP), REG_P2
  1631  	// Used later to store result of 0-borrow
  1632  	XORQ CX, CX
  1633  
  1634  	// SUBC for first 10 limbs (byte offsets 0..72)
  1635  	MOVQ	(REG_P1), R8
  1636  	MOVQ	(8)(REG_P1), R9
  1637  	MOVQ	(16)(REG_P1), R10
  1638  	MOVQ	(24)(REG_P1), R11
  1639  	MOVQ	(32)(REG_P1), R12
  1640  	MOVQ	(40)(REG_P1), R13
  1641  	MOVQ	(48)(REG_P1), R14
  1642  	MOVQ	(56)(REG_P1), R15
  1643  	MOVQ	(64)(REG_P1), AX
  1644  	MOVQ	(72)(REG_P1), BX
  1645  	// SUBQ starts the borrow chain; each SBBQ propagates it.
  1646  	SUBQ	(REG_P2), R8
  1647  	SBBQ	(8)(REG_P2), R9
  1648  	SBBQ	(16)(REG_P2), R10
  1649  	SBBQ	(24)(REG_P2), R11
  1650  	SBBQ	(32)(REG_P2), R12
  1651  	SBBQ	(40)(REG_P2), R13
  1652  	SBBQ	(48)(REG_P2), R14
  1653  	SBBQ	(56)(REG_P2), R15
  1654  	SBBQ	(64)(REG_P2), AX
  1655  	SBBQ	(72)(REG_P2), BX
  1656  	// MOVQ leaves the flags untouched, so the borrow out of word 9 stays live across these stores and the following loads.
  1657  	MOVQ	R8, (REG_P3)
  1658  	MOVQ	R9, (8)(REG_P3)
  1659  	MOVQ	R10, (16)(REG_P3)
  1660  	MOVQ	R11, (24)(REG_P3)
  1661  	MOVQ	R12, (32)(REG_P3)
  1662  	MOVQ	R13, (40)(REG_P3)
  1663  	MOVQ	R14, (48)(REG_P3)
  1664  	MOVQ	R15, (56)(REG_P3)
  1665  	MOVQ	AX, (64)(REG_P3)
  1666  	MOVQ	BX, (72)(REG_P3)
  1667  
  1668  	// SUBC for last 6 limbs (byte offsets 80..120), continuing the same borrow chain
  1669  	MOVQ	(80)(REG_P1), 	R8
  1670  	MOVQ	(88)(REG_P1), 	R9
  1671  	MOVQ	(96)(REG_P1), 	R10
  1672  	MOVQ	(104)(REG_P1), 	R11
  1673  	MOVQ	(112)(REG_P1), 	R12
  1674  	MOVQ	(120)(REG_P1), 	R13
  1675  
  1676  	SBBQ	(80)(REG_P2), R8
  1677  	SBBQ	(88)(REG_P2), R9
  1678  	SBBQ	(96)(REG_P2), R10
  1679  	SBBQ	(104)(REG_P2), R11
  1680  	SBBQ	(112)(REG_P2), R12
  1681  	SBBQ	(120)(REG_P2), R13
  1682  
  1683  	MOVQ	R8, (80)(REG_P3)
  1684  	MOVQ	R9, (88)(REG_P3)
  1685  	MOVQ	R10, (96)(REG_P3)
  1686  	MOVQ	R11, (104)(REG_P3)
  1687  	MOVQ	R12, (112)(REG_P3)
  1688  	MOVQ	R13, (120)(REG_P3)
  1689  
  1690  	// Now the carry flag is 1 if x-y < 0.  If so, add p*2^512.
  1691  	SBBQ	$0, CX                  // CX = 0 - 0 - CF: all ones when the subtraction borrowed, zero otherwise
  1692  
  1693  	// Load p into registers:
  1694  	MOVQ	P503_0, R8
  1695  	// P503_{1,2} = P503_0, so reuse R8
  1696  	MOVQ	P503_3, R9
  1697  	MOVQ	P503_4, R10
  1698  	MOVQ	P503_5, R11
  1699  	MOVQ	P503_6, R12
  1700  	MOVQ	P503_7, R13
  1701  	// Mask p with CX so the addend is either p or 0 without branching on the borrow.
  1702  	ANDQ	CX, R8
  1703  	ANDQ	CX, R9
  1704  	ANDQ	CX, R10
  1705  	ANDQ	CX, R11
  1706  	ANDQ	CX, R12
  1707  	ANDQ	CX, R13
  1708  	// Add the masked p into words 8..15 of z (byte offsets 64..120); the carry out of the last word is not saved.
  1709  	MOVQ   (64   )(REG_P3), AX; ADDQ R8,  AX; MOVQ AX, (64   )(REG_P3)
  1710  	MOVQ   (64+ 8)(REG_P3), AX; ADCQ R8,  AX; MOVQ AX, (64+ 8)(REG_P3)
  1711  	MOVQ   (64+16)(REG_P3), AX; ADCQ R8,  AX; MOVQ AX, (64+16)(REG_P3)
  1712  	MOVQ   (64+24)(REG_P3), AX; ADCQ R9,  AX; MOVQ AX, (64+24)(REG_P3)
  1713  	MOVQ   (64+32)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (64+32)(REG_P3)
  1714  	MOVQ   (64+40)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (64+40)(REG_P3)
  1715  	MOVQ   (64+48)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (64+48)(REG_P3)
  1716  	MOVQ   (64+56)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (64+56)(REG_P3)
  1717  
  1718  	RET