github.com/cloudflare/circl@v1.5.0/dh/sidh/internal/p434/arith_amd64.s (about)

     1  // +build amd64,!purego
     2  
     3  #include "textflag.h"
     4  
     5  // p434 as 64-bit limbs, least-significant first. Limbs 1 and 2 are equal to P434_0, so no separate macros exist for them.
     6  #define P434_0 $0xFFFFFFFFFFFFFFFF
     7  #define P434_3 $0xFDC1767AE2FFFFFF
     8  #define P434_4 $0x7BC65C783158AEA3
     9  #define P434_5 $0x6CFC5FD681C52056
    10  #define P434_6 $0x0002341F27177344
    11  
    12  // p434 x 2, same limb layout. Limb 2 is equal to P434X2_1, so no P434X2_2 macro exists.
    13  #define P434X2_0 $0xFFFFFFFFFFFFFFFE
    14  #define P434X2_1 $0xFFFFFFFFFFFFFFFF
    15  #define P434X2_3 $0xFB82ECF5C5FFFFFF
    16  #define P434X2_4 $0xF78CB8F062B15D47
    17  #define P434X2_5 $0xD9F8BFAD038A40AC
    18  #define P434X2_6 $0x0004683E4E2EE688
    19  
    20  // Redefine P434p1Zeros (presumably the number of all-zero low limbs of p434+1 -- TODO confirm against the Go-side constant)
    21  #define P434_P1_ZEROS 3
    22  
    23  // MULX128x256 performs schoolbook multiplication of a 128-bit number with
    24  // the 256-bit number stored at symbol M1. Uses MULX, ADOX, ADCX instructions.
    25  //
    26  // Uses registers: DX,AX. I1 is also overwritten (it doubles as scratch).
    27  // Calculates:
    28  //   (I0,I1) x [M1][0,1,2,3] = (T0,T1,T2,T3,T4,T5)
    29  //   |-128-| x |--- 256 ---| = |------ 384 ------|
    30  // Assuming the first digit multiplication (I0 x M1[0]) was already performed: on entry DX = I0 and T1 = high word of that product; T0 is not a macro parameter.
    31  #define MULX128x256(I1, M1, T1, T2, T3, T4, T5)    \
    32      MOVQ    M1+ 8(SB), AX       \
    33      MULXQ   AX, T4, T2          \ // T2:T4 = DX * M1[1]
    34      XORQ    AX, AX              \ // clears CF and OF before the ADOX chain starts
    35      MOVQ    M1+16(SB), AX       \
    36      MULXQ   AX, T5, T3          \ // T3:T5 = DX * M1[2]
    37      ADOXQ   T4, T1              \ // T1: interm1
    38      ADOXQ   T5, T2              \ // T2: interm2
    39      MOVQ    M1+24(SB), AX       \
    40      MULXQ   AX, T5, T4          \ // T4:T5 = DX * M1[3]
    41      ADOXQ   T5, T3              \ // T3: interm3
    42      MOVL    $0, AX              \ // AX = 0 via MOV, so the pending OF carry survives
    43      ADOXQ   AX, T4              \ // T4: interm4
    44      \ // second digit: I1 x M1[0..3], added on top of the intermediate values
    45      XORQ    AX, AX              \ // clears CF and OF for the two carry chains below
    46      MOVQ    I1, DX              \ // DX = second digit
    47      MOVQ    M1+ 0(SB), AX       \
    48      MULXQ   AX, T5, I1          \ // T0 <- C0
    49      ADCXQ   T5, T1              \
    50      ADCXQ   I1, T2              \ // T1 <- C1
    51      MOVQ    M1+ 8(SB), AX       \
    52      MULXQ   AX, I1, T5          \
    53      ADCXQ   T5, T3              \
    54      ADOXQ   I1, T2              \ // T2 <- C2
    55      MOVQ    M1+16(SB), AX       \
    56      MULXQ   AX, I1, T5          \
    57      ADCXQ   T5, T4              \
    58      ADOXQ   I1, T3              \ // T3 <- C3
    59      MOVQ    M1+24(SB), AX       \
    60      MULXQ   AX, I1, T5          \
    61      MOVL    $0, AX              \ // AX = 0 via MOV, keeping both CF and OF
    62      ADCXQ   AX, T5              \
    63      ADOXQ   I1, T4              \ // T4 <- C4
    64      ADOXQ   AX, T5                // T5 <- C5
    65  
    66  // MULX64x256 performs schoolbook multiplication of a 64-bit digit (held in DX)
    67  // with the 256-bit number stored at symbol M1. Uses MULX and ADOX instructions.
    68  //
    69  // Uses registers: DX,AX. T5 is scratch only and is not part of the result.
    70  // Calculates:
    71  //   (I0) x [M1][0,1,2,3] = (T0,T1,T2,T3,T4)
    72  //   |64| x |--- 256 ---| = |----- 320 ----|
    73  // Assuming the first digit multiplication (I0 x M1[0]) was already performed: on entry T1 = high word of that product; T0 is not a macro parameter.
    74  #define MULX64x256(M1, T1, T2, T3, T4, T5) \
    75      MOVQ    M1+ 8(SB), AX       \
    76      MULXQ   AX, T4, T2          \ // T2:T4 = DX * M1[1]
    77      XORQ    AX, AX              \ // clears CF and OF before the ADOX chain starts
    78      MOVQ    M1+16(SB), AX       \
    79      MULXQ   AX, T5, T3          \ // T3:T5 = DX * M1[2]
    80      ADOXQ   T4, T1              \ // T1 <- C1
    81      ADOXQ   T5, T2              \ // T2 <- C2
    82      MOVQ    M1+24(SB), AX       \
    83      MULXQ   AX, T5, T4          \ // T4:T5 = DX * M1[3]
    84      ADOXQ   T5, T3              \ // T3 <- C3
    85      MOVL    $0, AX              \ // AX = 0 via MOV, so the pending OF carry survives
    86      ADOXQ   AX, T4                // T4 <- C4
    87  
    88  // MULX192 performs schoolbook multiplication of the two 192-bit numbers
    89  // at (IM0)(M0) and (IM1)(M1). Uses MULX, ADOX and ADCX instructions.
    90  // Result limbs 0-2 are stored at (ID)(MDST); limbs 3-5 are left in T1, T3 and AX.
    91  // Uses registers: DX,AX
    92  #define MULX192(IM0,M0,IM1,M1,ID,MDST,T0,T1,T2,T3,T4,T5,T6) \
    93      MOVQ    (0+IM0)(M0), DX      \
    94      MULXQ   (0+IM1)(M1), T1, T0  \ // T0:T1 = A0*B0
    95      MOVQ    T1,(ID+0)(MDST)      \ // MDST0
    96      MULXQ   (IM1+ 8)(M1), T2, T1 \ // T1:T2 = A0*B1
    97      XORQ    AX, AX               \ // AX = 0, CF = OF = 0
    98      ADOXQ   T2, T0               \
    99      MULXQ   (IM1+16)(M1),T3, T2  \ // T2:T3 = A0*B2
   100      ADOXQ   T3, T1               \
   101      \
   102      MOVQ    (IM0+8)(M0), DX      \
   103      MULXQ   (IM1+0)(M1), T4, T3  \ // T3:T4 = A1*B0
   104      ADOXQ   AX, T2               \ // fold the last OF carry of the A0 row into T2
   105      XORQ    AX, AX               \ // restart both carry chains for the A1 row
   106      \
   107      MULXQ   (IM1+8)(M1), T6, T5  \ // T5:T6 = A1*B1
   108      ADOXQ   T0, T4               \
   109      MOVQ    T4,(ID+8)(MDST)      \ // MDST1
   110      ADCXQ   T6, T3               \
   111      \
   112      MULXQ   (IM1+16)(M1),T0, T6  \ // T6:T0 = A1*B2
   113      ADOXQ   T1, T3               \
   114      ADCXQ   T0, T5               \
   115      ADCXQ   AX, T6               \
   116      ADOXQ   T2, T5               \
   117      \
   118      MOVQ    (IM0+16)(M0),DX      \
   119      MULXQ   (IM1+ 0)(M1), T0, T1 \ // T1:T0 = A2*B0
   120      ADOXQ   AX, T6               \ // fold the last OF carry of the A1 row into T6
   121      XORQ    AX, AX               \ // restart both carry chains for the A2 row
   122      \
   123      MULXQ   (IM1+ 8)(M1), T2, T4 \ // T4:T2 = A2*B1
   124      ADOXQ   T3, T0               \
   125      MOVQ    T0, (ID+16)(MDST)    \ // MDST2
   126      ADCXQ   T5, T1               \
   127      \
   128      MULXQ   (IM1+16)(M1),T3, T0  \ // T0:T3 = A2*B2
   129      ADCXQ   T6, T4               \
   130      ADCXQ   AX, T0               \
   131      ADOXQ   T2, T1               \ // T1 = result limb 3
   132      ADOXQ   T4, T3               \ // T3 = result limb 4
   133      ADOXQ   T0, AX                 // AX = result limb 5 (AX was zero before this add)
   134  
   135  // MULX256 performs schoolbook multiplication of the two 256-bit numbers at (IM0)(M0) and (IM1)(M1). Uses
   136  // MULX, ADOX and ADCX instructions. The 512-bit result (8 limbs) is stored at (ID+0)(MDST) .. (ID+56)(MDST).
   137  //
   138  // Uses registers: DX,AX, plus the scratch registers passed as T0-T9
   139  #define MULX256(IM0,M0,IM1,M1,ID,MDST,T0,T1,T2,T3,T4,T5,T6,T7,T8,T9) \
   140      MOVQ    (IM0+0)(M0), DX      \
   141      MULXQ   (IM1+0)(M1), T1, T0  \ // A0*B[0-3]
   142      MOVQ    T1, (ID+0)(MDST)     \ // result limb 0
   143      MULXQ   (IM1+8)(M1), T2, T1  \
   144      XORQ    AX, AX               \ // AX = 0, CF = OF = 0
   145      ADOXQ   T2, T0               \
   146      MULXQ   (IM1+16)(M1),T3, T2  \
   147      ADOXQ   T3, T1               \
   148      MULXQ   (IM1+24)(M1),T4, T3  \
   149      ADOXQ   T4, T2               \
   150      \
   151      MOVQ    (IM0+8)(M0), DX      \
   152      MULXQ   (IM1+0)(M1), T4, T5  \ // A1*B[0-3]
   153      ADOXQ   AX, T3               \ // fold the last OF carry of the A0 row into T3
   154      XORQ    AX, AX               \ // restart both carry chains for the A1 row
   155      MULXQ   (IM1+8)(M1), T7, T6  \
   156      ADOXQ   T0, T4               \
   157      MOVQ    T4, (ID+8)(MDST)     \ // result limb 1
   158      ADCXQ   T7, T5               \
   159      MULXQ   (IM1+16)(M1),T8, T7  \
   160      ADCXQ   T8, T6               \
   161      ADOXQ   T1, T5               \
   162      MULXQ   (IM1+24)(M1),T9, T8  \
   163      ADCXQ   T9, T7               \
   164      ADCXQ   AX, T8               \
   165      ADOXQ   T2, T6               \
   166      \
   167      MOVQ    (IM0+16)(M0),DX      \ // A2*B[0-3]
   168      MULXQ   (IM1+ 0)(M1), T0, T1 \
   169      ADOXQ   T3, T7               \
   170      ADOXQ   AX, T8               \
   171      XORQ    AX, AX               \ // restart both carry chains for the A2 row
   172      MULXQ   (IM1+8)(M1), T3, T2  \
   173      ADOXQ   T5, T0               \
   174      MOVQ    T0, (ID+16)(MDST)    \ // result limb 2
   175      ADCXQ   T3, T1               \
   176      MULXQ   (IM1+16)(M1),T4, T3  \
   177      ADCXQ   T4, T2               \
   178      ADOXQ   T6, T1               \
   179      MULXQ   (IM1+24)(M1),T9, T4  \
   180      ADCXQ   T9, T3               \
   181      MOVQ    (IM0+24)(M0),DX      \ // DX = A3, loaded early; MOVQ leaves the carry chains alone
   182      ADCXQ   AX, T4               \
   183      \
   184      ADOXQ   T7, T2               \
   185      ADOXQ   T8, T3               \
   186      ADOXQ   AX, T4               \
   187      \
   188      MULXQ   (IM1+ 0)(M1),  T0, T5\ // A3*B[0-3]
   189      XORQ    AX,  AX              \ // restart both carry chains for the A3 row
   190      MULXQ   (IM1+ 8)(M1),  T7, T6\
   191      ADCXQ   T7,  T5              \
   192      ADOXQ   T0,  T1              \
   193      MULXQ   (IM1+16)(M1), T8, T7 \
   194      ADCXQ   T8,  T6              \
   195      ADOXQ   T5,  T2              \
   196      MULXQ   (IM1+24)(M1), T9, T8 \
   197      ADCXQ   T9,  T7              \
   198      ADCXQ   AX,  T8              \
   199      ADOXQ   T6,  T3              \
   200      ADOXQ   T7,  T4              \
   201      ADOXQ   AX,  T8              \
   202      MOVQ    T1,  (ID+24)(MDST)   \ // result limbs 3-7
   203      MOVQ    T2,  (ID+32)(MDST)   \
   204      MOVQ    T3,  (ID+40)(MDST)   \
   205      MOVQ    T4,  (ID+48)(MDST)   \
   206      MOVQ    T8,  (ID+56)(MDST)
   207  
   208  // MUL64x256 performs schoolbook multiplication of the 64-bit digit at (IDX)(M0)
   209  // with the 256-bit number stored at symbol M1, using plain MULQ. Result: C0..C4 (320 bits, C0 least significant).
   210  //
   211  // Uses registers: DX, AX (both overwritten by every MULQ); T0 holds the digit
   212  #define MUL64x256(IDX,M0,M1,C0,C1,C2,C3,C4,T0) \
   213      MOVQ   (IDX)(M0), T0 \ // T0 = digit
   214      \
   215      XORQ   C2, C2        \
   216      MOVQ   M1+0(SB), AX  \
   217      MULQ   T0            \ // DX:AX = T0 * M1[0]
   218      MOVQ   AX, C0        \
   219      MOVQ   DX, C1        \
   220      \
   221      XORQ   C3, C3        \
   222      MOVQ   M1+8(SB), AX  \
   223      MULQ   T0            \ // DX:AX = T0 * M1[1]
   224      ADDQ   AX, C1        \
   225      ADCQ   DX, C2        \
   226      \
   227      XORQ   C4, C4        \
   228      MOVQ   M1+16(SB), AX \
   229      MULQ   T0            \ // DX:AX = T0 * M1[2]
   230      ADDQ   AX, C2        \
   231      ADCQ   DX, C3        \
   232      \
   233      MOVQ   M1+24(SB), AX \
   234      MULQ   T0            \ // DX:AX = T0 * M1[3]
   235      ADDQ   AX, C3        \
   236      ADCQ   DX, C4
   237  
   238  // MUL128x256 performs schoolbook multiplication of the 128-bit number at (IDX)(M0)
   239  // with the 256-bit number stored at symbol M1, using plain MULQ. Destroys RAX and RDX. Result: C0..C5 (384 bits, C0 least significant).
   240  //
   241  // Uses registers: DX, AX; T0 and T1 hold the two input digits (A0 and A1)
   242  #define MUL128x256(IDX,M0,M1,C0,C1,C2,C3,C4,C5,T0,T1) \
   243      \ // A0 x B0
   244      MOVQ   (IDX+0)(M0), T0 \
   245      MOVQ   M1+0(SB), AX    \
   246      MULQ   T0              \
   247      XORQ   C2, C2          \
   248      MOVQ   AX, C0          \
   249      MOVQ   DX, C1          \
   250      \ // A0 x B1
   251      MOVQ   M1+8(SB), AX    \
   252      MULQ   T0              \
   253      XORQ   C3, C3          \
   254      ADDQ   AX, C1          \
   255      ADCQ   DX, C2          \
   256      \ // A1 x B0
   257      MOVQ   (IDX+8)(M0), T1 \
   258      MOVQ   M1+0(SB), AX    \
   259      MULQ   T1              \
   260      ADDQ   AX, C1          \
   261      ADCQ   DX, C2          \
   262      ADCQ   $0, C3          \ // carry out of column 2 goes into column 3
   263      \ // A0 x B2
   264      XORQ   C4, C4          \
   265      MOVQ   M1+16(SB), AX   \
   266      MULQ   T0              \
   267      ADDQ   AX, C2          \
   268      ADCQ   DX, C3          \
   269      ADCQ   $0, C4          \
   270      \ // A1 x B1
   271      MOVQ   M1+8(SB), AX    \
   272      MULQ   T1              \
   273      ADDQ   AX, C2          \
   274      ADCQ   DX, C3          \
   275      ADCQ   $0, C4          \
   276      \ // A0 x B3
   277      MOVQ   M1+24(SB), AX   \
   278      MULQ   T0              \
   279      XORQ   C5, C5          \ // done after MULQ and before ADDQ, so no live flags are lost
   280      ADDQ   AX, C3          \
   281      ADCQ   DX, C4          \
   282      ADCQ   $0, C5          \
   283      \ // A1 x B2
   284      MOVQ   M1+16(SB), AX   \
   285      MULQ   T1              \
   286      ADDQ   AX, C3          \
   287      ADCQ   DX, C4          \
   288      ADCQ   $0, C5          \
   289      \ // A1 x B3
   290      MOVQ   M1+24(SB), AX   \
   291      MULQ   T1              \
   292      ADDQ   AX, C4          \
   293      ADCQ   DX, C5
   294  
   295  //  Montgomery reduction using MULX: reads the 14 input limbs at (DI) and leaves 7 result limbs at (SI); (SI) also serves as scratch along the way and DI itself is overwritten.
   296  //  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
   297  #define REDC_MULX(P1, MUL01, MUL23, MUL45, MUL67) \ // P1: operand multiplied by each input digit; MUL01..MUL67: caller-supplied instruction sequences spliced in below (not defined here)
   298      MOVQ 0x0(DI), DX        \ // DX = input limb 0 (implicit MULX multiplier)
   299      MOVQ 0x8(DI), R14       \ // R14 = input limb 1
   300      MOVQ  P1, AX            \
   301      MULXQ AX, R8, R9        \ // R9:R8 = DX * P1
   302      MUL01                   \
   303      MOVQ 0x10(DI), DX       \ // DX = input limb 2
   304      MOVQ 0x48(DI), CX       \
   305      ADDQ   0x18(DI), R8     \ // add input limbs 3..8 to R8..R13
   306      ADCQ   0x20(DI), R9     \
   307      ADCQ   0x28(DI), R10    \
   308      ADCQ   0x30(DI), R11    \
   309      ADCQ   0x38(DI), R12    \
   310      ADCQ   0x40(DI), R13    \
   311      ADCQ   $0, CX           \
   312      MOVQ  P1, AX            \
   313      MULXQ AX, BX, BP        \ // BP:BX = DX * P1; MULX and the MOVQs below leave CF untouched
   314      MOVQ   R9,   0x0(SI)    \ // park R9..R13, CX in the output buffer as scratch
   315      MOVQ   R10,  0x8(SI)    \
   316      MOVQ   R11, 0x10(SI)    \
   317      MOVQ   R12, 0x18(SI)    \
   318      MOVQ   R13, 0x20(SI)    \
   319      MOVQ   CX,  0x28(SI)    \
   320      MOVQ   0x50(DI), R9     \
   321      MOVQ   0x58(DI), R10    \
   322      MOVQ   0x60(DI), R11    \
   323      MOVQ   0x68(DI), DI     \ // DI now holds input limb 13; the input pointer is not used again
   324      ADCQ   $0, R9           \ // continue the carry chain started at the ADDQ above
   325      ADCQ   $0, R10          \
   326      ADCQ   $0, R11          \
   327      ADCQ   $0, DI           \
   328      MUL23                   \
   329      MOVQ 0x0(SI), DX        \
   330      ADDQ   0x08(SI), BX     \
   331      ADCQ   0x10(SI), BP     \
   332      ADCQ   0x18(SI), R12    \
   333      ADCQ   0x20(SI), R13    \
   334      ADCQ   0x28(SI), R14    \
   335      MOVQ   R14, 0x18(SI)    \
   336      MOVQ   CX, R14          \
   337      MOVQ   $0, CX           \
   338      ADCQ   R9, R14          \
   339      ADCQ   R10, CX          \
   340      MOVQ  P1, AX            \
   341      MULXQ AX, R8, R9        \ // R9:R8 = DX * P1
   342      MOVQ   BP, 0x0(SI)      \
   343      MOVQ   R12, 0x8(SI)     \
   344      MOVQ   R13, 0x10(SI)    \
   345      ADCQ   $0, R11          \
   346      ADCQ   $0, DI           \
   347      MUL45                   \
   348      MOVQ 0x0(SI), DX        \
   349      ADDQ   0x8(SI), R8      \
   350      ADCQ   0x10(SI), R9     \
   351      ADCQ   0x18(SI), R10    \
   352      ADCQ   R14, BP          \
   353      ADCQ   CX, R12          \
   354      ADCQ   R11, R13         \
   355      ADCQ   $0, DI           \
   356      MOVQ  P1, AX            \
   357      MULXQ AX, R14, BX       \ // BX:R14 = DX * P1
   358      MOVQ   R8,   0x0(SI)    \ // output limb 0
   359      MOVQ   R9,   0x8(SI)    \ // output limb 1
   360      MUL67                   \
   361      ADDQ   R10, R14         \
   362      ADCQ   BP, BX           \
   363      ADCQ   R12, R8          \
   364      ADCQ   R13, R9          \
   365      ADCQ   DI, R11          \
   366      MOVQ   R14, 0x10(SI)    \ // output limbs 2..6
   367      MOVQ   BX, 0x18(SI)     \
   368      MOVQ   R8, 0x20(SI)     \
   369      MOVQ   R9, 0x28(SI)     \
   370      MOVQ   R11, 0x30(SI)
   371  
   372  #define REDC_MULQ(MUL01, MUL23, MUL45, MUL67) \ // Reduction driven by four caller-supplied multiply steps: accumulates into the 14-limb buffer at (DI) in place and writes 7 output limbs to (SI)
   373      MUL01                   \ // caller-supplied step; its result is consumed from R8..R13 below
   374      XORQ   CX, CX           \
   375      ADDQ   0x18(DI), R8     \ // add buffer limbs 3..9 to the product
   376      ADCQ   0x20(DI), R9     \
   377      ADCQ   0x28(DI), R10    \
   378      ADCQ   0x30(DI), R11    \
   379      ADCQ   0x38(DI), R12    \
   380      ADCQ   0x40(DI), R13    \
   381      ADCQ   0x48(DI), CX     \
   382      MOVQ   R8, 0x18(DI)     \ // write the sums back over the input buffer
   383      MOVQ   R9, 0x20(DI)     \
   384      MOVQ   R10, 0x28(DI)    \
   385      MOVQ   R11, 0x30(DI)    \
   386      MOVQ   R12, 0x38(DI)    \
   387      MOVQ   R13, 0x40(DI)    \
   388      MOVQ   CX, 0x48(DI)     \
   389      MOVQ   0x50(DI), R8     \
   390      MOVQ   0x58(DI), R9     \
   391      MOVQ   0x60(DI), R10    \
   392      MOVQ   0x68(DI), R11    \
   393      ADCQ   $0, R8           \ // ripple the remaining carry through limbs 10..13 (the MOVQs above keep CF)
   394      ADCQ   $0, R9           \
   395      ADCQ   $0, R10          \
   396      ADCQ   $0, R11          \
   397      MOVQ   R8, 0x50(DI)     \
   398      MOVQ   R9, 0x58(DI)     \
   399      MOVQ   R10, 0x60(DI)    \
   400      MOVQ   R11, 0x68(DI)    \
   401      \ // second step: same pattern, two limbs further up
   402      MUL23                   \
   403      XORQ   CX, CX           \
   404      ADDQ   0x28(DI), R8     \ // add buffer limbs 5..11 to the product
   405      ADCQ   0x30(DI), R9     \
   406      ADCQ   0x38(DI), R10    \
   407      ADCQ   0x40(DI), R11    \
   408      ADCQ   0x48(DI), R12    \
   409      ADCQ   0x50(DI), R13    \
   410      ADCQ   0x58(DI), CX     \
   411      MOVQ   R8, 0x28(DI)     \
   412      MOVQ   R9, 0x30(DI)     \
   413      MOVQ   R10, 0x38(DI)    \
   414      MOVQ   R11, 0x40(DI)    \
   415      MOVQ   R12, 0x48(DI)    \
   416      MOVQ   R13, 0x50(DI)    \
   417      MOVQ   CX, 0x58(DI)     \
   418      MOVQ   0x60(DI), R8     \
   419      MOVQ   0x68(DI), R9     \
   420      ADCQ   $0, R8           \ // ripple the remaining carry through limbs 12..13
   421      ADCQ   $0, R9           \
   422      MOVQ   R8, 0x60(DI)     \
   423      MOVQ   R9, 0x68(DI)     \
   424      \ // third step: the two lowest sums are already final output limbs
   425      MUL45                   \
   426      XORQ   CX, CX           \
   427      ADDQ   0x38(DI), R8     \ // add buffer limbs 7..13 to the product
   428      ADCQ   0x40(DI), R9     \
   429      ADCQ   0x48(DI), R10    \
   430      ADCQ   0x50(DI), R11    \
   431      ADCQ   0x58(DI), R12    \
   432      ADCQ   0x60(DI), R13    \
   433      ADCQ   0x68(DI), CX     \
   434      MOVQ   R8,   0x0(SI)    \ // OUT0
   435      MOVQ   R9,   0x8(SI)    \ // OUT1
   436      MOVQ   R10, 0x48(DI)    \
   437      MOVQ   R11, 0x50(DI)    \
   438      MOVQ   R12, 0x58(DI)    \
   439      MOVQ   R13, 0x60(DI)    \
   440      MOVQ   CX, 0x68(DI)     \
   441      \ // fourth step: its result is consumed from R8..R12 only
   442      MUL67                   \
   443      ADDQ   0x48(DI), R8     \ // add buffer limbs 9..13 to the product
   444      ADCQ   0x50(DI), R9     \
   445      ADCQ   0x58(DI), R10    \
   446      ADCQ   0x60(DI), R11    \
   447      ADCQ   0x68(DI), R12    \
   448      MOVQ   R8,  0x10(SI)    \ // OUT2
   449      MOVQ   R9,  0x18(SI)    \ // OUT3
   450      MOVQ   R10, 0x20(SI)    \ // OUT4
   451      MOVQ   R11, 0x28(SI)    \ // OUT5
   452      MOVQ   R12, 0x30(SI)      // OUT6
   453  
   454  TEXT ·cswapP434(SB),NOSPLIT,$0-17
   455      // Branch-free conditional swap of the seven 64-bit limbs at x and y.
   456      MOVQ    y+8(FP), SI
   457      MOVQ    x+0(FP), DI
   458      // Stretch choice (0 or 1) into a 64-bit mask: 0 or all ones.
   459      MOVBLZX choice+16(FP), AX   // AX = 0 or 1
   460      NEGQ    AX                  // AX = 0x00..00 or 0xff..ff
   461  #ifndef CSWAP_BLOCK
   462  #define CSWAP_BLOCK(idx)    \
   463      MOVQ    (idx*8)(SI), R9  \ // R9  = y[idx]
   464      MOVQ    (idx*8)(DI), R8  \ // R8  = x[idx]
   465      MOVQ    R8, R10          \ // R10 = x[idx]
   466      XORQ    R9, R10          \ // R10 = x[idx] ^ y[idx]
   467      ANDQ    AX, R10          \ // R10 = that difference if the mask is set, else 0
   468      XORQ    R10, R9          \ // R9  = x[idx] if the mask is set, else y[idx]
   469      XORQ    R10, R8          \ // R8  = y[idx] if the mask is set, else x[idx]
   470      MOVQ    R8, (idx*8)(DI)  \
   471      MOVQ    R9, (idx*8)(SI)
   472  #endif
   473      CSWAP_BLOCK(0)
   474      CSWAP_BLOCK(1)
   475      CSWAP_BLOCK(2)
   476      CSWAP_BLOCK(3)
   477      CSWAP_BLOCK(4)
   478      CSWAP_BLOCK(5)
   479      CSWAP_BLOCK(6)
   480  #ifdef CSWAP_BLOCK
   481  #undef CSWAP_BLOCK
   482  #endif
   483      RET
   484  
   485  TEXT ·cmovP434(SB),NOSPLIT,$0-17
   486      // Branch-free conditional move: the seven limbs of x become those of y when choice is 1.
   487      MOVQ    y+8(FP), SI
   488      MOVQ    x+0(FP), DI
   489      // Stretch choice (0 or 1) into a 64-bit mask: 0 or all ones.
   490      MOVBLZX choice+16(FP), AX   // AX = 0 or 1
   491      NEGQ    AX                  // AX = 0x00..00 or 0xff..ff
   492  #ifndef CMOV_BLOCK
   493  #define CMOV_BLOCK(idx)    \
   494      MOVQ    (idx*8)(SI), R9  \ // R9 = y[idx]
   495      MOVQ    (idx*8)(DI), R8  \ // R8 = x[idx]
   496      XORQ    R8, R9           \ // R9 = x[idx] ^ y[idx]
   497      ANDQ    AX, R9           \ // R9 = that difference if the mask is set, else 0
   498      XORQ    R9, R8           \ // R8 = y[idx] if the mask is set, else x[idx]
   499      MOVQ    R8, (idx*8)(DI)
   500  #endif
   501      CMOV_BLOCK(0)
   502      CMOV_BLOCK(1)
   503      CMOV_BLOCK(2)
   504      CMOV_BLOCK(3)
   505      CMOV_BLOCK(4)
   506      CMOV_BLOCK(5)
   507      CMOV_BLOCK(6)
   508  #ifdef CMOV_BLOCK
   509  #undef CMOV_BLOCK
   510  #endif
   511      RET
   512  
   513  TEXT ·addP434(SB),NOSPLIT,$0-24 // z = x + y - 2*p434, with 2*p434 added back if that subtraction borrowed (7 limbs each)
   514      MOVQ    z+0(FP), DX
   515      MOVQ    x+8(FP), DI
   516      MOVQ    y+16(FP), SI
   517  
   518      // Used later to calculate a mask
   519      XORQ    CX, CX
   520  
   521      // [R8-R14]: z = x + y
   522      MOVQ    ( 0)(DI), R8;   ADDQ    ( 0)(SI), R8
   523      MOVQ    ( 8)(DI), R9;   ADCQ    ( 8)(SI), R9
   524      MOVQ    (16)(DI), R10;  ADCQ    (16)(SI), R10
   525      MOVQ    (24)(DI), R11;  ADCQ    (24)(SI), R11
   526      MOVQ    (32)(DI), R12;  ADCQ    (32)(SI), R12
   527      MOVQ    (40)(DI), R13;  ADCQ    (40)(SI), R13
   528      MOVQ    (48)(DI), R14;  ADCQ    (48)(SI), R14
   529  
   530      XORQ    DI, DI // x is fully loaded; DI becomes a zeroed slot for saving a carry further down
   531  
   532      MOVQ    P434X2_0, AX;   SUBQ    AX, R8 // [R8-R14] -= 2*p434
   533      MOVQ    P434X2_1, AX;   SBBQ    AX, R9
   534                              SBBQ    AX, R10 // limb 2 of 2*p434 equals limb 1, AX is reused
   535      MOVQ    P434X2_3, AX;   SBBQ    AX, R11
   536      MOVQ    P434X2_4, AX;   SBBQ    AX, R12
   537      MOVQ    P434X2_5, AX;   SBBQ    AX, R13
   538      MOVQ    P434X2_6, AX;   SBBQ    AX, R14
   539  
   540      // mask: CX = 0 - borrow, i.e. all ones if the subtraction borrowed, else 0
   541      SBBQ    $0, CX
   542  
   543      // if z<0 add P434x2 back
   544      MOVQ    P434X2_0, R15;  ANDQ    CX, R15;
   545      MOVQ    P434X2_1, AX;   ANDQ    CX, AX;
   546  
   547      ADDQ    R8, R15; MOVQ  R15, ( 0)(DX)
   548      ADCQ    AX, R9;  MOVQ   R9, ( 8)(DX)
   549      ADCQ    AX, R10; MOVQ  R10, (16)(DX)
   550  
   551      ADCQ    $0, DI // DI = carry out of limb 2, saved because the ANDQs below overwrite CF
   552      MOVQ    P434X2_3, R15;  ANDQ    CX, R15;
   553      MOVQ    P434X2_4,  R8;  ANDQ    CX, R8;
   554      MOVQ    P434X2_5,  R9;  ANDQ    CX, R9;
   555      MOVQ    P434X2_6, R10;  ANDQ    CX, R10;
   556      BTQ     $0, DI // CF = saved carry
   557  
   558      ADCQ    R11, R15;   MOVQ R15, (24)(DX)
   559      ADCQ    R12, R8;    MOVQ R8,  (32)(DX)
   560      ADCQ    R13, R9;    MOVQ R9,  (40)(DX)
   561      ADCQ    R14, R10;   MOVQ R10, (48)(DX)
   562  
   563      RET
   564  
   565  TEXT ·adlP434(SB),NOSPLIT,$0-24 // z = x + y over 14 limbs, no reduction; the carry out of the top limb is discarded
   566      MOVQ    z+0(FP), DX
   567      MOVQ    x+8(FP), DI
   568      MOVQ    y+16(FP),SI
   569  
   570      MOVQ    ( 0)(DI), R8 // limbs 0..10: one ADDQ, then an unbroken ADCQ chain
   571      ADDQ    ( 0)(SI), R8
   572      MOVQ    ( 8)(DI), R9
   573      ADCQ    ( 8)(SI), R9
   574      MOVQ    (16)(DI), R10
   575      ADCQ    (16)(SI), R10
   576      MOVQ    (24)(DI), R11
   577      ADCQ    (24)(SI), R11
   578      MOVQ    (32)(DI), R12
   579      ADCQ    (32)(SI), R12
   580      MOVQ    (40)(DI), R13
   581      ADCQ    (40)(SI), R13
   582      MOVQ    (48)(DI), R14
   583      ADCQ    (48)(SI), R14
   584      MOVQ    (56)(DI), R15
   585      ADCQ    (56)(SI), R15
   586      MOVQ    (64)(DI), AX
   587      ADCQ    (64)(SI), AX
   588      MOVQ    (72)(DI), BX
   589      ADCQ    (72)(SI), BX
   590      MOVQ    (80)(DI), CX
   591      ADCQ    (80)(SI), CX
   592  
   593      MOVQ    R8, ( 0)(DX) // store limbs 0..10; MOVQ leaves CF intact for the chain continued below
   594      MOVQ    R9, ( 8)(DX)
   595      MOVQ    R10,(16)(DX)
   596      MOVQ    R11,(24)(DX)
   597      MOVQ    R12,(32)(DX)
   598      MOVQ    R13,(40)(DX)
   599      MOVQ    R14,(48)(DX)
   600      MOVQ    R15,(56)(DX)
   601      MOVQ    AX, (64)(DX)
   602      MOVQ    BX, (72)(DX)
   603      MOVQ    CX, (80)(DX)
   604  
   605      MOVQ    (88)(DI), R8 // limbs 11..13: same carry chain, registers reused
   606      ADCQ    (88)(SI), R8
   607      MOVQ    (96)(DI), R9
   608      ADCQ    (96)(SI), R9
   609      MOVQ    (104)(DI), R10
   610      ADCQ    (104)(SI), R10
   611  
   612      MOVQ    R8, (88)(DX)
   613      MOVQ    R9, (96)(DX)
   614      MOVQ    R10,(104)(DX)
   615      RET
   616  
   617  TEXT ·subP434(SB),NOSPLIT,$0-24 // z = x - y, with 2*p434 added back if the subtraction borrowed (7 limbs each)
   618      MOVQ    z+0(FP), DX
   619      MOVQ    x+8(FP), DI
   620      MOVQ    y+16(FP), SI
   621  
   622      // Used later to calculate a mask
   623      XORQ    CX, CX
   624  
   625      MOVQ    ( 0)(DI), R8;  SUBQ    ( 0)(SI), R8 // [R8-R14] = x - y
   626      MOVQ    ( 8)(DI), R9;  SBBQ    ( 8)(SI), R9
   627      MOVQ    (16)(DI), R10; SBBQ    (16)(SI), R10
   628      MOVQ    (24)(DI), R11; SBBQ    (24)(SI), R11
   629      MOVQ    (32)(DI), R12; SBBQ    (32)(SI), R12
   630      MOVQ    (40)(DI), R13; SBBQ    (40)(SI), R13
   631      MOVQ    (48)(DI), R14; SBBQ    (48)(SI), R14
   632  
   633      // mask: CX = 0 - borrow, i.e. all ones if x < y, else 0
   634      SBBQ    $0, CX
   635      XORQ    R15, R15 // R15 = 0; used below to save a carry across flag-clobbering ANDQs
   636  
   637      // if z<0 add p434x2 back (DI and SI are free: x and y are fully loaded)
   638      MOVQ    P434X2_0, DI; ANDQ    CX, DI
   639      MOVQ    P434X2_1, SI; ANDQ    CX, SI
   640      MOVQ    P434X2_3, AX; ANDQ    CX, AX
   641  
   642      ADDQ     DI, R8;  MOVQ     R8, ( 0)(DX)
   643      ADCQ     SI, R9;  MOVQ     R9, ( 8)(DX)
   644      ADCQ     SI, R10; MOVQ    R10, (16)(DX) // limb 2 of 2*p434 equals limb 1, SI is reused
   645      ADCQ     AX, R11; MOVQ    R11, (24)(DX)
   646      ADCQ    $0, R15 // R15 = carry out of limb 3
   647  
   648      MOVQ    P434X2_4, R8;  ANDQ    CX, R8;
   649      MOVQ    P434X2_5, R9;  ANDQ    CX, R9;
   650      MOVQ    P434X2_6, R10; ANDQ    CX, R10
   651  
   652      BTQ     $0, R15 // CF = saved carry
   653  
   654      ADCQ     R8, R12; MOVQ    R12, (32)(DX)
   655      ADCQ     R9, R13; MOVQ    R13, (40)(DX)
   656      ADCQ    R10, R14; MOVQ    R14, (48)(DX)
   657      RET
   658  
   659  TEXT ·sulP434(SB),NOSPLIT,$0-24 // z = x - y over 14 limbs; if that borrowed, p434 is added to the upper 7 limbs of z
   660      MOVQ z+0(FP), DX
   661      MOVQ x+8(FP), DI
   662      MOVQ y+16(FP), SI
   663  
   664      // Used later to store result of 0-borrow
   665      XORQ CX, CX
   666  
   667      // Subtract with borrow: first 10 limbs
   668      MOVQ    ( 0)(DI), R8;  SUBQ    ( 0)(SI), R8
   669      MOVQ    ( 8)(DI), R9;  SBBQ    ( 8)(SI), R9
   670      MOVQ    (16)(DI), R10; SBBQ    (16)(SI), R10
   671      MOVQ    (24)(DI), R11; SBBQ    (24)(SI), R11
   672      MOVQ    (32)(DI), R12; SBBQ    (32)(SI), R12
   673      MOVQ    (40)(DI), R13; SBBQ    (40)(SI), R13
   674      MOVQ    (48)(DI), R14; SBBQ    (48)(SI), R14
   675      MOVQ    (56)(DI), R15; SBBQ    (56)(SI), R15
   676      MOVQ    (64)(DI), AX;  SBBQ    (64)(SI), AX
   677      MOVQ    (72)(DI), BX;  SBBQ    (72)(SI), BX
   678  
   679      MOVQ     R8, ( 0)(DX) // the stores are plain MOVQs, so the borrow in CF stays live
   680      MOVQ     R9, ( 8)(DX)
   681      MOVQ    R10, (16)(DX)
   682      MOVQ    R11, (24)(DX)
   683      MOVQ    R12, (32)(DX)
   684      MOVQ    R13, (40)(DX)
   685      MOVQ    R14, (48)(DX)
   686      MOVQ    R15, (56)(DX)
   687      MOVQ     AX, (64)(DX)
   688      MOVQ     BX, (72)(DX)
   689  
   690      // Subtract with borrow: last 4 limbs, continuing the same borrow chain
   691      MOVQ    ( 80)(DI), R8;  SBBQ    ( 80)(SI), R8
   692      MOVQ    ( 88)(DI), R9;  SBBQ    ( 88)(SI), R9
   693      MOVQ    ( 96)(DI), R10; SBBQ    ( 96)(SI), R10
   694      MOVQ    (104)(DI), R11; SBBQ    (104)(SI), R11
   695  
   696      // Turn the final borrow into a mask: CX = all ones if x < y, else 0
   697      SBBQ    $0, CX
   698  
   699      MOVQ    R8,  ( 80)(DX)
   700      MOVQ    R9,  ( 88)(DX)
   701      MOVQ    R10, ( 96)(DX)
   702      MOVQ    R11, (104)(DX)
   703  
   704      // Load p into registers, masked by CX (so either p or 0):
   705      MOVQ    P434_0, R8;  ANDQ    CX, R8
   706      // P434_{1,2} = P434_0, so reuse R8
   707      MOVQ    P434_3, R9;  ANDQ    CX, R9
   708      MOVQ    P434_4, R10; ANDQ    CX, R10
   709      MOVQ    P434_5, R11; ANDQ    CX, R11
   710      MOVQ    P434_6, R12; ANDQ    CX, R12
   711      // Add the masked p to limbs 7..13 of z (byte offset 56 onwards), reading back what was stored above
   712      MOVQ   (56   )(DX), AX; ADDQ R8,  AX; MOVQ AX, (56   )(DX)
   713      MOVQ   (56+ 8)(DX), AX; ADCQ R8,  AX; MOVQ AX, (56+ 8)(DX)
   714      MOVQ   (56+16)(DX), AX; ADCQ R8,  AX; MOVQ AX, (56+16)(DX)
   715      MOVQ   (56+24)(DX), AX; ADCQ R9,  AX; MOVQ AX, (56+24)(DX)
   716      MOVQ   (56+32)(DX), AX; ADCQ R10, AX; MOVQ AX, (56+32)(DX)
   717      MOVQ   (56+40)(DX), AX; ADCQ R11, AX; MOVQ AX, (56+40)(DX)
   718      MOVQ   (56+48)(DX), AX; ADCQ R12, AX; MOVQ AX, (56+48)(DX)
   719  
   720      RET
   721  
   722  TEXT ·modP434(SB),NOSPLIT,$0-8 // in place: x = x - p434, with p434 added back if that borrowed (a single conditional subtraction, 7 limbs)
   723      MOVQ    x+0(FP), DI
   724  
   725      // Zero AX for later use:
   726      XORQ    AX, AX
   727  
   728      // Set x <- x - p (the MOVQ constant loads are interleaved; they do not disturb the borrow chain)
   729      MOVQ    P434_0, R8
   730      SUBQ    R8,  ( 0)(DI)
   731      // P434_{1,2} = P434_0, so reuse R8
   732      MOVQ    P434_3, R9
   733      SBBQ    R8,  ( 8)(DI)
   734      SBBQ    R8,  (16)(DI)
   735      MOVQ    P434_4, R10
   736      SBBQ    R9,  (24)(DI)
   737      MOVQ    P434_5, R11
   738      SBBQ    R10, (32)(DI)
   739      MOVQ    P434_6, R12
   740      SBBQ    R11, (40)(DI)
   741      SBBQ    R12, (48)(DI)
   742  
   743      // save the borrow as a mask: AX = all ones if x - p went negative, else 0
   744      SBBQ    $0, AX
   745  
   746      // Conditionally add p to x if x-p < 0
   747      ANDQ    AX, R8
   748      ANDQ    AX, R9
   749      ANDQ    AX, R10
   750      ANDQ    AX, R11
   751      ANDQ    AX, R12
   752  
   753      ADDQ    R8, ( 0)(DI)
   754      ADCQ    R8, ( 8)(DI)
   755      ADCQ    R8, (16)(DI)
   756      ADCQ    R9, (24)(DI)
   757      ADCQ    R10,(32)(DI)
   758      ADCQ    R11,(40)(DI)
   759      ADCQ    R12,(48)(DI)
   760      RET
   761  
   762  // 434-bit multiplication using Karatsuba (one level),
   763  // schoolbook (one level).
   764  TEXT ·mulP434(SB),NOSPLIT,$112-24
   765      MOVQ    z+0(FP), CX
   766      MOVQ    x+8(FP), DI
   767      MOVQ    y+16(FP), SI
   768  
   769      // Check whether to use optimized implementation
   770      CMPB    ·HasADXandBMI2(SB), $1
   771      JE      mul_with_mulx_adcx_adox
   772  
   773      // rcx[0-3] <- AH+AL
   774      XORQ         AX, AX
   775      MOVQ   0x20(DI), R8
   776      MOVQ   0x28(DI), R9
   777      MOVQ   0x30(DI), R10
   778      XORQ        R11, R11
   779      ADDQ    0x0(DI), R8
   780      ADCQ    0x8(DI), R9
   781      ADCQ   0x10(DI), R10
   782      ADCQ   0x18(DI), R11
   783      // store AH+AL mask
   784      SBBQ   $0, AX
   785      MOVQ   AX, 0x40(SP)
   786      // store AH+AL in 0-0x18(rcx)
   787      MOVQ    R8,  0x0(CX)
   788      MOVQ    R9,  0x8(CX)
   789      MOVQ   R10, 0x10(CX)
   790      MOVQ   R11, 0x18(CX)
   791  
   792      // r12-r15 <- BH+BL
   793      XORQ         DX, DX
   794      MOVQ   0x20(SI), R12
   795      MOVQ   0x28(SI), R13
   796      MOVQ   0x30(SI), R14
   797      XORQ        R15, R15
   798      ADDQ    0x0(SI), R12
   799      ADCQ    0x8(SI), R13
   800      ADCQ   0x10(SI), R14
   801      ADCQ   0x18(SI), R15
   802      SBBQ         $0, DX
   803  
   804      // store BH+BL mask
   805      MOVQ DX, 0x48(SP)
   806  
   807      // (rsp[0-0x38]) <- (AH+AL)*(BH+BL)
   808      MOVQ   (CX), AX
   809      MULQ   R12
   810      MOVQ   AX, (SP)
   811      MOVQ   DX, R8
   812  
   813      XORQ    R9, R9
   814      MOVQ   (CX), AX
   815      MULQ    R13
   816      ADDQ     AX, R8
   817      ADCQ     DX, R9
   818  
   819      XORQ   R10, R10
   820      MOVQ   0x8(CX), AX
   821      MULQ   R12
   822      ADDQ    AX, R8
   823      MOVQ    R8,  0x8(SP)
   824      ADCQ    DX, R9
   825      ADCQ    $0, R10
   826  
   827      XORQ   R8, R8
   828      MOVQ   (CX), AX
   829      MULQ   R14
   830      ADDQ   AX, R9
   831      ADCQ   DX, R10
   832      ADCQ   $0, R8
   833  
   834      MOVQ   0x10(CX), AX
   835      MULQ   R12
   836      ADDQ   AX, R9
   837      ADCQ   DX, R10
   838      ADCQ   $0, R8
   839  
   840      MOVQ   0x8(CX), AX
   841      MULQ   R13
   842      ADDQ   AX, R9
   843      MOVQ   R9, 0x10(SP)
   844      ADCQ   DX, R10
   845      ADCQ   $0, R8
   846  
   847      XORQ   R9, R9
   848      MOVQ   (CX),AX
   849      MULQ   R15
   850      ADDQ   AX, R10
   851      ADCQ   DX, R8
   852      ADCQ   $0, R9
   853  
   854      MOVQ   0x18(CX), AX
   855      MULQ   R12
   856      ADDQ   AX, R10
   857      ADCQ   DX, R8
   858      ADCQ   $0, R9
   859  
   860      MOVQ   0x8(CX), AX
   861      MULQ   R14
   862      ADDQ   AX, R10
   863      ADCQ   DX, R8
   864      ADCQ   $0, R9
   865  
   866      MOVQ   0x10(CX), AX
   867      MULQ   R13
   868      ADDQ    AX, R10
   869      MOVQ   R10, 0x18(SP)
   870      ADCQ    DX, R8
   871      ADCQ    $0, R9
   872  
   873      XORQ   R10, R10
   874      MOVQ   0x8(CX), AX
   875      MULQ   R15
   876      ADDQ    AX, R8
   877      ADCQ    DX, R9
   878      ADCQ    $0, R10
   879  
   880      MOVQ   0x18(CX), AX
   881      MULQ   R13
   882      ADDQ   AX, R8
   883      ADCQ   DX, R9
   884      ADCQ   $0, R10
   885  
   886      MOVQ   0x10(CX), AX
   887      MULQ   R14
   888      ADDQ    AX, R8
   889      MOVQ    R8, 0x20(SP)
   890      ADCQ    DX, R9
   891      ADCQ    $0, R10
   892  
   893      XORQ   R11, R11
   894      MOVQ   0x10(CX), AX
   895      MULQ   R15
   896      ADDQ    AX, R9
   897      ADCQ    DX, R10
   898      ADCQ    $0, R11
   899  
   900      MOVQ   0x18(CX), AX
   901      MULQ   R14
   902      ADDQ    AX, R9
   903      MOVQ    R9, 0x28(SP)
   904      ADCQ    DX, R10
   905      ADCQ    $0, R11
   906  
   907      MOVQ   0x18(CX), AX
   908      MULQ   R15
   909      ADDQ    AX, R10
   910      MOVQ   R10, 0x30(SP)
   911      ADCQ    DX, R11
   912      MOVQ    R11,0x38(SP)
   913  
   914      // r12-r15 <- masked (BH + BL)
   915      MOVQ   0x40(SP), AX
   916      ANDQ   AX, R12
   917      ANDQ   AX, R13
   918      ANDQ   AX, R14
   919      ANDQ   AX, R15
   920  
   921      // r8-r11 <- masked (AH + AL)
   922      MOVQ   0x48(SP), AX
   923      MOVQ   0x00(CX), R8
   924      ANDQ         AX, R8
   925      MOVQ   0x08(CX), R9
   926      ANDQ         AX, R9
   927      MOVQ   0x10(CX), R10
   928      ANDQ         AX, R10
   929      MOVQ   0x18(CX), R11
   930      ANDQ         AX, R11
   931  
   932      // r12-r15 <- masked (AH + AL) + masked (BH + BL)
   933      ADDQ    R8, R12
   934      ADCQ    R9, R13
   935      ADCQ   R10, R14
   936      ADCQ   R11, R15
   937  
   938      // rsp[0x50-0x68] <- high words of (AH+AL) x (BH+BL) (rsp[0x20-0x38]) + masked sums (r12-r15)
   939      MOVQ   0x20(SP), AX
   940      ADDQ         AX, R12
   941      MOVQ   0x28(SP), AX
   942      ADCQ         AX, R13
   943      MOVQ   0x30(SP), AX
   944      ADCQ         AX, R14
   945      MOVQ   0x38(SP), AX
   946      ADCQ         AX, R15
   947      MOVQ   R12, 0x50(SP)
   948      MOVQ   R13, 0x58(SP)
   949      MOVQ   R14, 0x60(SP)
   950      MOVQ   R15, 0x68(SP)
   951  
   952      // [rcx] <- CL = AL x BL
   953      MOVQ   (DI), R11
   954      MOVQ   (SI), AX
   955      MULQ    R11
   956      XORQ    R9,  R9
   957      MOVQ    AX, (CX)
   958      MOVQ    DX, R8
   959  
   960      MOVQ   0x10(DI), R14
   961      MOVQ   0x8(SI), AX
   962      MULQ   R11
   963      XORQ   R10, R10
   964      ADDQ    AX, R8
   965      ADCQ    DX, R9
   966  
   967      MOVQ   0x8(DI), R12
   968      MOVQ   (SI), AX
   969      MULQ   R12
   970      ADDQ   AX, R8
   971      MOVQ   R8, 0x8(CX)
   972      ADCQ   DX, R9
   973      ADCQ   $0, R10
   974  
   975      XORQ   R8,  R8
   976      MOVQ   0x10(SI), AX
   977      MULQ   R11
   978      ADDQ   AX, R9
   979      ADCQ   DX, R10
   980      ADCQ   $0, R8
   981  
   982      MOVQ   (SI), R13
   983      MOVQ   R14, AX
   984      MULQ   R13
   985      ADDQ    AX, R9
   986      ADCQ    DX, R10
   987      ADCQ    $0, R8
   988  
   989      MOVQ   0x8(SI), AX
   990      MULQ   R12
   991      ADDQ   AX, R9
   992      MOVQ   R9, 0x10(CX)
   993      ADCQ   DX, R10
   994      ADCQ   $0, R8
   995  
   996      XORQ   R9,  R9
   997      MOVQ   0x18(SI), AX
   998      MULQ   R11
   999      MOVQ   0x18(DI), R15
  1000      ADDQ   AX, R10
  1001      ADCQ   DX, R8
  1002      ADCQ   $0, R9
  1003  
  1004      MOVQ   R15, AX
  1005      MULQ   R13
  1006      ADDQ   AX, R10
  1007      ADCQ   DX, R8
  1008      ADCQ   $0, R9
  1009  
  1010      MOVQ   0x10(SI), AX
  1011      MULQ   R12
  1012      ADDQ   AX, R10
  1013      ADCQ   DX, R8
  1014      ADCQ   $0, R9
  1015  
  1016      MOVQ   0x8(SI), AX
  1017      MULQ   R14
  1018      ADDQ    AX, R10
  1019      MOVQ   R10, 0x18(CX)
  1020      ADCQ    DX, R8
  1021      ADCQ    $0, R9
  1022  
  1023      XORQ   R10, R10
  1024      MOVQ   0x18(SI), AX
  1025      MULQ   R12
  1026      ADDQ    AX, R8
  1027      ADCQ    DX, R9
  1028      ADCQ    $0, R10
  1029  
  1030      MOVQ   0x8(SI), AX
  1031      MULQ   R15
  1032      ADDQ    AX, R8
  1033      ADCQ    DX, R9
  1034      ADCQ    $0, R10
  1035  
  1036      MOVQ   0x10(SI), AX
  1037      MULQ   R14
  1038      ADDQ    AX, R8
  1039      MOVQ    R8,  0x20(CX)
  1040      ADCQ    DX, R9
  1041      ADCQ    $0, R10
  1042  
  1043      XORQ   R8, R8
  1044      MOVQ   0x18(SI), AX
  1045      MULQ   R14
  1046      ADDQ    AX, R9
  1047      ADCQ    DX, R10
  1048      ADCQ    $0, R8
  1049  
  1050      MOVQ   0x10(SI), AX
  1051      MULQ   R15
  1052      ADDQ    AX, R9
  1053      MOVQ    R9,  0x28(CX)
  1054      ADCQ    DX, R10
  1055      ADCQ    $0, R8
  1056  
  1057      MOVQ   0x18(SI), AX
  1058      MULQ   R15
  1059      ADDQ    AX, R10
  1060      MOVQ   R10, 0x30(CX)
  1061      ADCQ    DX, R8
  1062      MOVQ    R8, 0x38(CX)
  1063  
  1064      // rcx[0x40-0x68] <- AH*BH
  1065      // multiplies 2 192-bit numbers A,B
  1066      MOVQ   0x20(DI), R11
  1067      MOVQ   0x20(SI), AX
  1068      MULQ   R11
  1069      XORQ    R9,  R9
  1070      MOVQ    AX, 0x40(CX)
  1071      MOVQ    DX, R8
  1072  
  1073      MOVQ   0x30(DI), R14
  1074      MOVQ   0x28(SI), AX
  1075      MULQ   R11
  1076      XORQ   R10, R10
  1077      ADDQ    AX, R8
  1078      ADCQ    DX, R9
  1079  
  1080      MOVQ   0x28(DI), R12
  1081      MOVQ   0x20(SI), AX
  1082      MULQ   R12
  1083      ADDQ    AX, R8
  1084      MOVQ    R8,  0x48(CX)
  1085      ADCQ    DX, R9
  1086      ADCQ    $0, R10
  1087  
  1088      XORQ   R8,  R8
  1089      MOVQ   0x30(SI), AX
  1090      MULQ   R11
  1091      ADDQ    AX, R9
  1092      ADCQ    DX, R10
  1093      ADCQ    $0, R8
  1094  
  1095      MOVQ   0x20(SI), R13
  1096      MOVQ   R14, AX
  1097      MULQ   R13
  1098      ADDQ    AX, R9
  1099      ADCQ    DX, R10
  1100      ADCQ    $0, R8
  1101  
  1102      MOVQ   0x28(SI), AX
  1103      MULQ   R12
  1104      ADDQ    AX, R9
  1105      MOVQ    R9,  0x50(CX)
  1106      ADCQ    DX, R10
  1107      ADCQ    $0, R8
  1108  
  1109      MOVQ   0x30(SI), AX
  1110      MULQ   R12
  1111      XORQ   R12, R12
  1112      ADDQ    AX, R10
  1113      ADCQ    DX, R8
  1114      ADCQ    $0, R12
  1115  
  1116      MOVQ   0x28(SI), AX
  1117      MULQ   R14
  1118      ADDQ    AX, R10
  1119      ADCQ    DX, R8
  1120      ADCQ    $0, R12
  1121      MOVQ   R10, 0x58(CX)
  1122  
  1123      MOVQ    0x30(SI), AX
  1124      MULQ    R14
  1125      ADDQ     AX, R8
  1126      ADCQ     $0, R12
  1127      MOVQ     R8,  0x60(CX)
  1128  
  1129      ADDQ    R12, DX
  1130  
  1131      // [r8-r15] <- (AH+AL)x(BH+BL) - ALxBL
  1132      MOVQ    0x0(SP), R8
  1133      SUBQ    0x0(CX), R8
  1134      MOVQ    0x8(SP), R9
  1135      SBBQ    0x8(CX), R9
  1136      MOVQ   0x10(SP), R10
  1137      SBBQ   0x10(CX), R10
  1138      MOVQ   0x18(SP), R11
  1139      SBBQ   0x18(CX), R11
  1140      MOVQ   0x50(SP), R12
  1141      SBBQ   0x20(CX), R12
  1142      MOVQ   0x58(SP), R13
  1143      SBBQ   0x28(CX), R13
  1144      MOVQ   0x60(SP), R14
  1145      SBBQ   0x30(CX), R14
  1146      MOVQ   0x68(SP), R15
  1147      SBBQ   0x38(CX), R15
  1148  
  1149      // [r8-r15] <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
  1150      MOVQ   0x40(CX), AX
  1151      SUBQ   AX, R8
  1152      MOVQ   0x48(CX), AX
  1153      SBBQ   AX, R9
  1154      MOVQ   0x50(CX), AX
  1155      SBBQ   AX, R10
  1156      MOVQ   0x58(CX), AX
  1157      SBBQ   AX, R11
  1158      MOVQ   0x60(CX), AX
  1159      SBBQ   AX, R12
  1160      SBBQ   DX, R13
  1161      SBBQ   $0, R14
  1162      SBBQ   $0, R15
  1163  
  1164      // Final result
  1165      ADDQ   0x20(CX), R8
  1166      MOVQ    R8, 0x20(CX)    // OUT4
  1167      ADCQ   0x28(CX), R9
  1168      MOVQ    R9, 0x28(CX)    // OUT5
  1169      ADCQ   0x30(CX), R10
  1170      MOVQ   R10, 0x30(CX)    // OUT6
  1171      ADCQ   0x38(CX), R11
  1172      MOVQ   R11, 0x38(CX)    // OUT7
  1173      ADCQ   0x40(CX), R12
  1174      MOVQ   R12, 0x40(CX)    // OUT8
  1175      ADCQ   0x48(CX), R13
  1176      MOVQ   R13, 0x48(CX)    // OUT9
  1177      ADCQ   0x50(CX), R14
  1178      MOVQ   R14, 0x50(CX)    // OUT10
  1179      ADCQ   0x58(CX), R15
  1180      MOVQ   R15, 0x58(CX)    // OUT11
  1181      MOVQ   0x60(CX), R12
  1182      ADCQ    $0, R12
  1183      MOVQ   R12, 0x60(CX)    // OUT12
  1184      ADCQ    $0, DX
  1185      MOVQ    DX, 0x68(CX)    // OUT13
  1186      RET
  1187  
  1188  mul_with_mulx_adcx_adox:
  1189      // Karatsuba-style multiplication path for CPUs supporting two independent carry
  1190      // chains (ADOX/ADCX) and MULX. Reads A via DI and B via SI, writes the product via CX.
  1191      XORQ    AX, AX              // rax <- 0; becomes the (AH+AL) carry mask below
  1192      MOVQ    0x0(DI), R8         // r8-r11 <- AL (words 0-3 of A)
  1193      MOVQ    0x8(DI), R9
  1194      MOVQ   0x10(DI), R10
  1195      MOVQ   0x18(DI), R11
  1196  
  1197      MOVQ   BP, 0x70(SP) // push: BP is callee-saved; it is used as scratch below and restored before RET.
  1198  
  1199      ADDQ   0x20(DI), R8         // r8-r11 <- AL + AH (AH is words 4-6 of A)
  1200      ADCQ   0x28(DI), R9
  1201      ADCQ   0x30(DI), R10
  1202      ADCQ     $0, R11            // AH has no fourth word: only propagate the carry
  1203      SBBQ     $0, AX             // rax <- mask: all ones if AH+AL carried out, else 0
  1204      MOVQ   R8,   0x0(SP)        // rsp[0x00-0x18] <- AH + AL
  1205      MOVQ   R9,   0x8(SP)
  1206      MOVQ   R10, 0x10(SP)
  1207      MOVQ   R11, 0x18(SP)
  1208  
  1209      // r12-r15 <- BH + BL, rbx <- mask (all ones if the sum carried out, else 0)
  1210      XORQ         BX, BX
  1211      MOVQ    0x0(SI), R12
  1212      MOVQ    0x8(SI), R13
  1213      MOVQ   0x10(SI), R14
  1214      MOVQ   0x18(SI), R15
  1215      ADDQ   0x20(SI), R12
  1216      ADCQ   0x28(SI), R13
  1217      ADCQ   0x30(SI), R14
  1218      ADCQ    $0, R15             // BH has no fourth word: only propagate the carry
  1219      SBBQ    $0, BX
  1220      MOVQ   R12, 0x20(SP)        // rsp[0x20-0x38] <- BH + BL
  1221      MOVQ   R13, 0x28(SP)
  1222      MOVQ   R14, 0x30(SP)
  1223      MOVQ   R15, 0x38(SP)
  1224  
  1225      // r12-r15 <- (BH + BL) masked with rax, the carry mask of (AH + AL)
  1226      ANDQ   AX, R12
  1227      ANDQ   AX, R13
  1228      ANDQ   AX, R14
  1229      ANDQ   AX, R15
  1230  
  1231      // r8-r11 <- (AH + AL) masked with rbx, the carry mask of (BH + BL)
  1232      ANDQ   BX, R8
  1233      ANDQ   BX, R9
  1234      ANDQ   BX, R10
  1235      ANDQ   BX, R11
  1236  
  1237      // r8-r11 <- masked (AH + AL) + masked (BH + BL)
  1238      ADDQ   R12, R8
  1239      ADCQ   R13, R9
  1240      ADCQ   R14, R10
  1241      ADCQ   R15, R11
  1242      MOVQ    R8, 0x40(SP)        // rsp[0x40-0x58] <- cross terms contributed by the two carry bits
  1243      MOVQ    R9, 0x48(SP)
  1244      MOVQ   R10, 0x50(SP)
  1245      MOVQ   R11, 0x58(SP)
  1246  
  1247      // [rsp] <- CM = (AH+AL) x (BH+BL), using the four stored words of each sum
  1248      MULX256(0,SP,32,SP,0,SP,R8,R9,R10,R11,R12,R13,R14,R15,BX,BP)
  1249      // [rcx] <- CL = AL x BL (Result c0-c3)
  1250      MULX256(0,DI,0,SI,0,CX,R8,R9,R10,R11,R12,R13,R14,R15,BX,BP)
  1251      // [rcx+64], rbx, rbp, rax <- CH = AH x BH
  1252      MULX192(32,DI,32,SI,64,CX,R8,BX,R10,BP,R12,R13,R14)
  1253  
  1254      // r8-r11 <- masked sums (rsp[0x40-0x58]) + rsp[0x20-0x38] (expected to hold the upper half of CM)
  1255      MOVQ   0x40(SP),  R8
  1256      MOVQ   0x48(SP),  R9
  1257      MOVQ   0x50(SP), R10
  1258      MOVQ   0x58(SP), R11
  1259  
  1260      MOVQ   0x20(SP), DX         // MOVQ leaves the flags alone, so the carry chain is unbroken
  1261      ADDQ   DX, R8
  1262      MOVQ   0x28(SP), DX
  1263      ADCQ   DX, R9
  1264      MOVQ   0x30(SP), DX
  1265      ADCQ   DX, R10
  1266      MOVQ   0x38(SP), DX
  1267      ADCQ   DX, R11
  1268  
  1269      // r12-r15 (low), r8-r11 (high) <- (AH+AL) x (BH+BL) - ALxBL
  1270      MOVQ    0x0(SP), R12
  1271      MOVQ    0x8(SP), R13
  1272      MOVQ   0x10(SP), R14
  1273      MOVQ   0x18(SP), R15
  1274      SUBQ    0x0(CX), R12
  1275      SBBQ    0x8(CX), R13
  1276      SBBQ   0x10(CX), R14
  1277      SBBQ   0x18(CX), R15
  1278      SBBQ   0x20(CX), R8         // borrow continues from the low half into the high half
  1279      SBBQ   0x28(CX), R9
  1280      SBBQ   0x30(CX), R10
  1281      SBBQ   0x38(CX), R11
  1282  
  1283      // r12-r15 (low), r8-r11 (high) <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
  1284      SUBQ   0x40(CX), R12
  1285      SBBQ   0x48(CX), R13
  1286      SBBQ   0x50(CX), R14
  1287      SBBQ   BX, R15              // upper three words of AHxBH are in rbx, rbp, rax
  1288      SBBQ   BP, R8
  1289      SBBQ   AX, R9
  1290      SBBQ   $0, R10              // AHxBH has only six words: just propagate the borrow
  1291      SBBQ   $0, R11
  1292  
  1293      ADDQ   0x20(CX), R12        // add the middle term into the result starting at word 4
  1294      MOVQ   R12, 0x20(CX)    // OUT4
  1295      ADCQ   0x28(CX), R13
  1296      MOVQ   R13, 0x28(CX)    // OUT5
  1297      ADCQ   0x30(CX), R14
  1298      MOVQ   R14, 0x30(CX)    // OUT6
  1299      ADCQ   0x38(CX), R15
  1300      MOVQ   R15, 0x38(CX)    // OUT7
  1301      ADCQ   0x40(CX), R8
  1302      MOVQ   R8, 0x40(CX)     // OUT8
  1303      ADCQ   0x48(CX), R9
  1304      MOVQ   R9, 0x48(CX)     // OUT9
  1305      ADCQ   0x50(CX), R10
  1306      MOVQ   R10, 0x50(CX)    // OUT10
  1307      ADCQ   BX, R11              // words 11-13 take the AHxBH words held in rbx, rbp, rax plus carry
  1308      MOVQ   R11, 0x58(CX)    // OUT11
  1309      ADCQ   $0, BP
  1310      MOVQ   BP, 0x60(CX)    // OUT12
  1311      ADCQ   $0, AX
  1312      MOVQ   AX, 0x68(CX)    // OUT13
  1313  
  1314      MOVQ   0x70(SP), BP // pop: restore callee-saved BP from the slot written on entry.
  1315      RET
  1316  
  1317  TEXT ·rdcP434(SB),$0-16 // rdcP434(z, x): Montgomery reduction for p434; two pointer arguments, no local frame
  1318      MOVQ    z+0(FP), SI         // SI <- z (presumably the output buffer; confirm in REDC_* macros)
  1319      MOVQ    x+8(FP), DI         // DI <- x, the operand the MUL* macros below read at offsets 0/16/32/48
  1320      CMPB    ·HasADXandBMI2(SB), $1 // take the MULX/ADOX/ADCX path when the flag byte equals 1
  1321      JE      redc_bdw            // otherwise fall through to the MULQ-based reduction
  1322  #define MUL01 MUL128x256( 0,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
  1323  #define MUL23 MUL128x256(16,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
  1324  #define MUL45 MUL128x256(32,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
  1325  #define MUL67  MUL64x256(48,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13)
  1326      REDC_MULQ(MUL01, MUL23, MUL45, MUL67)
  1327  #undef MUL01
  1328  #undef MUL23
  1329  #undef MUL45
  1330  #undef MUL67
  1331      RET
  1332  // Both paths address P434p1 starting 8*P434_P1_ZEROS bytes in, i.e. they skip its first P434_P1_ZEROS 64-bit words.
  1333  // 434-bit Montgomery reduction. Uses MULX/ADOX/ADCX instructions
  1334  // available on Broadwell micro-architectures and newer.
  1335  redc_bdw:
  1336  #define MULX01 MULX128x256(R14,·P434p1+(8*P434_P1_ZEROS),R9 ,R10,R11,R12,R13)
  1337  #define MULX23 MULX128x256(R8 ,·P434p1+(8*P434_P1_ZEROS),BP ,R12,R13,R14,CX )
  1338  #define MULX45 MULX128x256(BX ,·P434p1+(8*P434_P1_ZEROS),R9 ,R10,BP ,R12,R13)
  1339  #define MULX67 MULX64x256 (    ·P434p1+(8*P434_P1_ZEROS),BX ,R8 ,R9 ,R11,CX )
  1340      REDC_MULX(·P434p1+(8*P434_P1_ZEROS)+0(SB), MULX01, MULX23, MULX45, MULX67)
  1341  #undef MULX01
  1342  #undef MULX23
  1343  #undef MULX45
  1344  #undef MULX67
  1345      RET