github.com/cloudflare/circl@v1.5.0/math/fp448/fp_amd64.h

// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// CHECK_BMI2ADX dispatches to the bmi2adx code path if supported;
// otherwise, it falls back to the legacy code path.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
    CMPB ·hasBmi2Adx(SB), $0  \
    JE label                  \
    bmi2adx                   \
    RET                       \
    label:                    \
    legacy                    \
    RET
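
// The ·hasBmi2Adx flag is declared and initialized on the Go side of this
// package. As a hedged sketch (not necessarily how this package wires it up),
// the flag could be derived from golang.org/x/sys/cpu feature bits:
//
//	// hasBmi2Adx reports whether the CPU supports both the BMI2 and ADX
//	// instruction-set extensions.
//	var hasBmi2Adx = cpu.X86.HasBMI2 && cpu.X86.HasADX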

// cselect is a conditional move:
// if b=1: it copies y into x;
// if b=0: x keeps its current value;
// if b is neither 0 nor 1: the result is undefined.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ  0+y, DX; CMOVQNE DX, AX; MOVQ AX,  0+x; \
    MOVQ  8+x, AX; MOVQ  8+y, DX; CMOVQNE DX, AX; MOVQ AX,  8+x; \
    MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
    MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x; \
    MOVQ 32+x, AX; MOVQ 32+y, DX; CMOVQNE DX, AX; MOVQ AX, 32+x; \
    MOVQ 40+x, AX; MOVQ 40+y, DX; CMOVQNE DX, AX; MOVQ AX, 40+x; \
    MOVQ 48+x, AX; MOVQ 48+y, DX; CMOVQNE DX, AX; MOVQ AX, 48+x;
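
// For illustration only (not part of this package), an equivalent
// constant-time conditional move in Go, using a mask instead of CMOV;
// cselect7 and its [7]uint64 limb layout are hypothetical names:
//
//	func cselect7(x, y *[7]uint64, b uint64) {
//		mask := -b // all ones if b == 1, all zeros if b == 0
//		for i := range x {
//			x[i] = (x[i] &^ mask) | (y[i] & mask)
//		}
//	}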

// cswap is a conditional swap:
// if b=1: x,y <- y,x;
// if b=0: x,y keep their current values;
// if b is neither 0 nor 1: the result is undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ AX, R8; MOVQ  0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  0+x; MOVQ DX,  0+y; \
    MOVQ  8+x, AX; MOVQ AX, R8; MOVQ  8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  8+x; MOVQ DX,  8+y; \
    MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
    MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y; \
    MOVQ 32+x, AX; MOVQ AX, R8; MOVQ 32+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 32+x; MOVQ DX, 32+y; \
    MOVQ 40+x, AX; MOVQ AX, R8; MOVQ 40+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 40+x; MOVQ DX, 40+y; \
    MOVQ 48+x, AX; MOVQ AX, R8; MOVQ 48+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 48+x; MOVQ DX, 48+y;
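
// Likewise, an illustrative constant-time swap in Go using the XOR-mask
// trick rather than paired CMOVs (cswap7 is a hypothetical name):
//
//	func cswap7(x, y *[7]uint64, b uint64) {
//		mask := -b // b must be 0 or 1
//		for i := range x {
//			t := mask & (x[i] ^ y[i])
//			x[i] ^= t
//			y[i] ^= t
//		}
//	}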

// additionLeg adds x and y and stores the result in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define additionLeg(z,x,y) \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCQ 48+y, R14; \
    MOVQ   $0,  AX;  ADCQ   $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8; MOVQ  $0, AX; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8;  MOVQ  R8,  0+z; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ DX, R11;  MOVQ R11, 24+z; \
    ADCQ $0, R12;  MOVQ R12, 32+z; \
    ADCQ $0, R13;  MOVQ R13, 40+z; \
    ADCQ $0, R14;  MOVQ R14, 48+z;
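
// The two carry folds above use the identity 2^448 = 2^224 + 1 (mod p),
// where p = 2^448 - 2^224 - 1: a carry out of the top limb re-enters as +1
// in limb 0 and as +2^32 in limb 3 (bit 224 = 3*64 + 32). An illustrative
// Go sketch of the same weakly reduced addition (add7 is a hypothetical
// name; limbs are little-endian [7]uint64; uses math/bits):
//
//	func add7(z, x, y *[7]uint64) {
//		var c uint64
//		for i := 0; i < 7; i++ {
//			z[i], c = bits.Add64(x[i], y[i], c)
//		}
//		for k := 0; k < 2; k++ { // fold twice, as the macro does
//			cc := c
//			z[0], c = bits.Add64(z[0], cc, 0)
//			z[1], c = bits.Add64(z[1], 0, c)
//			z[2], c = bits.Add64(z[2], 0, c)
//			z[3], c = bits.Add64(z[3], cc<<32, c)
//			z[4], c = bits.Add64(z[4], 0, c)
//			z[5], c = bits.Add64(z[5], 0, c)
//			z[6], c = bits.Add64(z[6], 0, c)
//		}
//	}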


// additionAdx adds x and y and stores the result in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, adx
#define additionAdx(z,x,y) \
    MOVL $32, R15; \
    XORL DX, DX; \
    MOVQ  0+x,  R8;  ADCXQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCXQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCXQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCXQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCXQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCXQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCXQ 48+y, R14; \
    ;;;;;;;;;;;;;;;  ADCXQ   DX,  DX; \
    XORL AX, AX; \
    ADCXQ DX,  R8; SHLXQ R15, DX, DX; \
    ADCXQ AX,  R9; \
    ADCXQ AX, R10; \
    ADCXQ DX, R11; \
    ADCXQ AX, R12; \
    ADCXQ AX, R13; \
    ADCXQ AX, R14; \
    ADCXQ AX,  AX; \
    XORL  DX,  DX; \
    ADCXQ AX,  R8;  MOVQ  R8,  0+z; SHLXQ R15, AX, AX; \
    ADCXQ DX,  R9;  MOVQ  R9,  8+z; \
    ADCXQ DX, R10;  MOVQ R10, 16+z; \
    ADCXQ AX, R11;  MOVQ R11, 24+z; \
    ADCXQ DX, R12;  MOVQ R12, 32+z; \
    ADCXQ DX, R13;  MOVQ R13, 40+z; \
    ADCXQ DX, R14;  MOVQ R14, 48+z;
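
// Note: ADCXQ updates only CF, and SHLXQ (a BMI2 instruction, guaranteed
// here because dispatch checks BMI2 and ADX together) writes no flags at
// all, so the (carry << 32) term for limb 3 can be built mid-chain without
// disturbing the pending carries; the XORL instructions clear CF to start
// each chain.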

// subtraction subtracts y from x and stores the result in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define subtraction(z,x,y) \
    MOVQ  0+x,  R8;  SUBQ  0+y,  R8; \
    MOVQ  8+x,  R9;  SBBQ  8+y,  R9; \
    MOVQ 16+x, R10;  SBBQ 16+y, R10; \
    MOVQ 24+x, R11;  SBBQ 24+y, R11; \
    MOVQ 32+x, R12;  SBBQ 32+y, R12; \
    MOVQ 40+x, R13;  SBBQ 40+y, R13; \
    MOVQ 48+x, R14;  SBBQ 48+y, R14; \
    MOVQ   $0,  AX;  SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8; MOVQ  $0, AX; \
    SBBQ $0,  R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8;  MOVQ  R8,  0+z; \
    SBBQ $0,  R9;  MOVQ  R9,  8+z; \
    SBBQ $0, R10;  MOVQ R10, 16+z; \
    SBBQ DX, R11;  MOVQ R11, 24+z; \
    SBBQ $0, R12;  MOVQ R12, 32+z; \
    SBBQ $0, R13;  MOVQ R13, 40+z; \
    SBBQ $0, R14;  MOVQ R14, 48+z;
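
// A borrow out of the top limb is compensated by subtracting 2^224 + 1,
// mirroring the addition case. An illustrative Go sketch (sub7 is a
// hypothetical name; uses math/bits):
//
//	func sub7(z, x, y *[7]uint64) {
//		var b uint64
//		for i := 0; i < 7; i++ {
//			z[i], b = bits.Sub64(x[i], y[i], b)
//		}
//		for k := 0; k < 2; k++ { // fold the borrow twice, as the macro does
//			bb := b
//			z[0], b = bits.Sub64(z[0], bb, 0)
//			z[1], b = bits.Sub64(z[1], 0, b)
//			z[2], b = bits.Sub64(z[2], 0, b)
//			z[3], b = bits.Sub64(z[3], bb<<32, b)
//			z[4], b = bits.Sub64(z[4], 0, b)
//			z[5], b = bits.Sub64(z[5], 0, b)
//			z[6], b = bits.Sub64(z[6], 0, b)
//		}
//	}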

// maddBmi2Adx multiplies x and y and accumulates the result in z
// Uses: AX, DX, R8, R15, FLAGS
// Instr: x86_64, bmi2, adx
#define maddBmi2Adx(z,x,y,i,r0,r1,r2,r3,r4,r5,r6) \
    MOVQ   i+y, DX; XORL AX, AX; \
    MULXQ  0+x, AX, R8;  ADOXQ AX, r0;  ADCXQ R8, r1; MOVQ r0,i+z; \
    MULXQ  8+x, AX, r0;  ADOXQ AX, r1;  ADCXQ r0, r2; MOVQ $0, R8; \
    MULXQ 16+x, AX, r0;  ADOXQ AX, r2;  ADCXQ r0, r3; \
    MULXQ 24+x, AX, r0;  ADOXQ AX, r3;  ADCXQ r0, r4; \
    MULXQ 32+x, AX, r0;  ADOXQ AX, r4;  ADCXQ r0, r5; \
    MULXQ 40+x, AX, r0;  ADOXQ AX, r5;  ADCXQ r0, r6; \
    MULXQ 48+x, AX, r0;  ADOXQ AX, r6;  ADCXQ R8, r0; \
    ;;;;;;;;;;;;;;;;;;;  ADOXQ R8, r0;
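
// MULX (BMI2) writes no flags, while ADCX and ADOX (ADX) update only CF
// and OF respectively, so two independent carry chains run interleaved
// through a single pass over x; the trailing ADCXQ/ADOXQ with R8 = 0 flush
// both chains into the top limb r0.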

// integerMulAdx multiplies x and y and stores the result in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
    MOVL    $0,R15; \
    MOVQ   0+y, DX;  XORL AX, AX;  MOVQ $0, R8; \
    MULXQ  0+x, AX,  R9;  MOVQ  AX, 0+z; \
    MULXQ  8+x, AX, R10;  ADCXQ AX,  R9; \
    MULXQ 16+x, AX, R11;  ADCXQ AX, R10; \
    MULXQ 24+x, AX, R12;  ADCXQ AX, R11; \
    MULXQ 32+x, AX, R13;  ADCXQ AX, R12; \
    MULXQ 40+x, AX, R14;  ADCXQ AX, R13; \
    MULXQ 48+x, AX, R15;  ADCXQ AX, R14; \
    ;;;;;;;;;;;;;;;;;;;;  ADCXQ R8, R15; \
    maddBmi2Adx(z,x,y, 8, R9,R10,R11,R12,R13,R14,R15) \
    maddBmi2Adx(z,x,y,16,R10,R11,R12,R13,R14,R15, R9) \
    maddBmi2Adx(z,x,y,24,R11,R12,R13,R14,R15, R9,R10) \
    maddBmi2Adx(z,x,y,32,R12,R13,R14,R15, R9,R10,R11) \
    maddBmi2Adx(z,x,y,40,R13,R14,R15, R9,R10,R11,R12) \
    maddBmi2Adx(z,x,y,48,R14,R15, R9,R10,R11,R12,R13) \
    MOVQ R15,  56+z; \
    MOVQ  R9,  64+z; \
    MOVQ R10,  72+z; \
    MOVQ R11,  80+z; \
    MOVQ R12,  88+z; \
    MOVQ R13,  96+z; \
    MOVQ R14, 104+z;

// maddLegacy multiplies x and y and accumulates the result in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define maddLegacy(z,x,y,i) \
    MOVQ  i+y, R15; \
    MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;; MOVQ DX,  R9; \
    MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
    ADDQ  0+i+z,  R8; MOVQ  R8,  0+i+z; \
    ADCQ  8+i+z,  R9; MOVQ  R9,  8+i+z; \
    ADCQ 16+i+z, R10; MOVQ R10, 16+i+z; \
    ADCQ 24+i+z, R11; MOVQ R11, 24+i+z; \
    ADCQ 32+i+z, R12; MOVQ R12, 32+i+z; \
    ADCQ 40+i+z, R13; MOVQ R13, 40+i+z; \
    ADCQ 48+i+z, R14; MOVQ R14, 48+i+z; \
    ADCQ     $0,  DX; MOVQ  DX, 56+i+z;

// integerMulLeg multiplies x and y and stores the result in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
    MOVQ  0+y, R15; \
    MOVQ  0+x, AX; MULQ R15; MOVQ AX, 0+z; ;;;;;;;;;;;; MOVQ DX,  R8; \
    MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ  R8,  8+z; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; MOVQ  R9, 16+z; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; MOVQ R10, 24+z; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; MOVQ R11, 32+z; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; MOVQ R12, 40+z; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX,56+z; MOVQ R13, 48+z; \
    maddLegacy(z,x,y, 8) \
    maddLegacy(z,x,y,16) \
    maddLegacy(z,x,y,24) \
    maddLegacy(z,x,y,32) \
    maddLegacy(z,x,y,40) \
    maddLegacy(z,x,y,48)
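
// Both multipliers compute the same plain 7x7-limb schoolbook product
// (z has 14 limbs). An illustrative Go reference (mul7 is a hypothetical
// name; uses math/bits):
//
//	func mul7(z *[14]uint64, x, y *[7]uint64) {
//		var t [14]uint64
//		for i := 0; i < 7; i++ {
//			var carry uint64
//			for j := 0; j < 7; j++ {
//				hi, lo := bits.Mul64(x[j], y[i])
//				var c uint64
//				lo, c = bits.Add64(lo, carry, 0)
//				hi += c
//				lo, c = bits.Add64(lo, t[i+j], 0)
//				hi += c
//				t[i+j] = lo
//				carry = hi
//			}
//			t[i+7] = carry
//		}
//		*z = t
//	}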

// integerSqrLeg squares x and stores the result in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
    XORL R15, R15; \
    MOVQ  0+x, CX; \
    MOVQ   CX, AX; MULQ CX; MOVQ AX, 0+z; MOVQ DX, R8; \
    ADDQ   CX, CX; ADCQ $0, R15; \
    MOVQ  8+x, AX; MULQ CX; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ R8, 8+z; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    \
    MOVQ  8+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ  AX, R9; ADCQ $0, DX; MOVQ R9,16+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 8+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ  8+x, CX; ADCQ $0, R15; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 24+z; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R9; \
    \
    MOVQ 16+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 32+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 16+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 16+x, CX; ADCQ $0, R15; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 40+z; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX,R10; \
    \
    MOVQ 24+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 48+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 24+x,AX; ADDQ AX, DX; ADCQ $0,  R9; MOVQ DX, R8; \
    ADDQ 24+x, CX; ADCQ $0, R15; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; MOVQ R14, 56+z; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX,R11; \
    \
    MOVQ 32+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ R9, 64+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 32+x,AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ 32+x, CX; ADCQ $0, R15; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 72+z; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX,R12; \
    \
    XORL R13, R13; \
    XORL R14, R14; \
    MOVQ 40+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 80+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 40+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 40+x, CX; ADCQ $0, R15; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R13; ADCQ $0,R14; \
    \
    XORL   R9, R9; \
    MOVQ 48+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 96+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 48+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8,R14; ADCQ $0, R9; MOVQ R14, 104+z;
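
// The squaring computes each cross product x[i]*x[j] (i < j) only once
// against a doubled multiplier (ADDQ CX, CX), tracking the bit shifted out
// of 2*x[i] in R15 and compensating with a masked add. Functionally it
// agrees with multiplying x by itself, so an illustrative reference is
// simply (sqr7/mul7 are hypothetical names, mul7 as sketched above):
//
//	func sqr7(z *[14]uint64, x *[7]uint64) { mul7(z, x, x) }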


// integerSqrAdx squares x and stores the result in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
    XORL R15, R15; \
    MOVQ  0+x, DX; \
    ;;;;;;;;;;;;;; MULXQ DX, AX, R8; MOVQ AX, 0+z; \
    ADDQ   DX, DX; ADCQ $0, R15; CLC; \
    MULXQ  8+x, AX,  R9; ADCXQ AX,  R8; MOVQ R8, 8+z; \
    MULXQ 16+x, AX, R10; ADCXQ AX,  R9; MOVQ $0, R8;\
    MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
    MULXQ 32+x, AX, R12; ADCXQ AX, R11; \
    MULXQ 40+x, AX, R13; ADCXQ AX, R12; \
    MULXQ 48+x, AX, R14; ADCXQ AX, R13; \
    ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R14; \
    \
    MOVQ  8+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 8+x, R8; \
    ADDQ AX,  R9; MOVQ R9, 16+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R11; \
    ADDQ  8+x,  DX; \
    ADCQ   $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 16+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 24+z; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; MOVQ  $0, R10; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R10, R9; \
    \
    MOVQ 16+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 16+x, R8; \
    ADDQ AX, R11; MOVQ R11, 32+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R13; \
    ADDQ 16+x,  DX; \
    ADCQ   $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 40+z; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; MOVQ  $0, R12; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
    MULXQ 48+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R11,R10; \
    \
    MOVQ 24+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 24+x, R8; \
    ADDQ AX, R13; MOVQ R13, 48+z; \
    ADCQ CX,  R8; \
    ADCQ $0,  R9; \
    ADDQ 24+x,  DX; \
    ADCQ   $0, R15; \
    XORL R13, R13; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R14; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; MOVQ R14, 56+z; \
    MULXQ 40+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; MOVQ  $0, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R12,R11; \
    \
    MOVQ 32+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 32+x, R8; \
    ADDQ AX,  R9; MOVQ R9, 64+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R11; \
    ADDQ 32+x,  DX; \
    ADCQ   $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 72+z; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R13,R12; \
    \
    MOVQ 40+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 40+x, R8; \
    ADDQ AX, R11; MOVQ R11, 80+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R13; \
    ADDQ 40+x,  DX; \
    ADCQ   $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R14,R13; \
    \
    MOVQ 48+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 48+x, R8; \
    XORL R10, R10; ;;;;;;;;;;;;;; ADOXQ CX, R14; \
    ;;;;;;;;;;;;;; ADCXQ AX, R13; ;;;;;;;;;;;;;; MOVQ R13, 96+z; \
    ;;;;;;;;;;;;;; ADCXQ R8, R14; MOVQ R14, 104+z;
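
// Same doubled-multiplier schedule as integerSqrLeg, but with MULX
// producing products flag-free and ADCX/ADOX carrying two independent
// chains, as in maddBmi2Adx above.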

// reduceFromDoubleLeg finds z congruent to x modulo p such that z < 2^448, and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
    /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
    MOVQ 80+x,AX; MOVQ AX,R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8,R10; \
    \
    MOVQ $0,R14; \
    MOVQ 104+x,R13; SHLQ $1,R13,R14; \
    MOVQ  96+x,R12; SHLQ $1,R12,R13; \
    MOVQ  88+x,R11; SHLQ $1,R11,R12; \
    MOVQ  72+x, R9; SHLQ $1,R10,R11; \
    MOVQ  64+x, R8; SHLQ $1,R10; \
    MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
    MOVQ  56+x,R15; \
    \
    ADDQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
    ADCQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
    ADCQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
    ADCQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
    ADCQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
    ADCQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
    ADCQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
    ADCQ   $0,R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
    MOVQ R10, AX; \
    SHRQ $32,R11,R10; \
    SHRQ $32,R12,R11; \
    SHRQ $32,R13,R12; \
    SHRQ $32,R15,R13; \
    SHRQ $32, R8,R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    ADDQ  0+z,R10; \
    ADCQ  8+z,R11; \
    ADCQ 16+z,R12; \
    ADCQ 24+z,R13; \
    ADCQ 32+z,R15; \
    ADCQ 40+z, R8; \
    ADCQ 48+z, R9; \
    ADCQ   $0,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    ADDQ R14,R10; MOVQ  $0,R14; \
    ADCQ  $0,R11; \
    ADCQ  $0,R12; \
    ADCQ  AX,R13; \
    ADCQ  $0,R15; \
    ADCQ  $0, R8; \
    ADCQ  $0, R9; \
    ADCQ  $0,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32,AX; \
    ADDQ R14,R10; MOVQ R10, 0+z; \
    ADCQ  $0,R11; MOVQ R11, 8+z; \
    ADCQ  $0,R12; MOVQ R12,16+z; \
    ADCQ  AX,R13; MOVQ R13,24+z; \
    ADCQ  $0,R15; MOVQ R15,32+z; \
    ADCQ  $0, R8; MOVQ  R8,40+z; \
    ADCQ  $0, R9; MOVQ  R9,48+z;
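
// The reduction rests on 2^448 = 2^224 + 1 (mod p) for the Goldilocks
// prime p = 2^448 - 2^224 - 1: the high half of the 14-limb input is
// folded in as h + h*2^224, and the leftover carries are folded twice
// more. Note the result is only weakly reduced (z < 2^448, possibly
// still >= p). An illustrative, non-constant-time cross-check with
// math/big (reduceRef is a hypothetical name; compare modulo p, since
// the macro may return a non-canonical representative):
//
//	func reduceRef(x *big.Int) *big.Int {
//		one := big.NewInt(1)
//		p := new(big.Int).Lsh(one, 448)      // 2^448
//		p.Sub(p, new(big.Int).Lsh(one, 224)) // - 2^224
//		p.Sub(p, one)                        // - 1
//		return new(big.Int).Mod(x, p)
//	}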

// reduceFromDoubleAdx finds z congruent to x modulo p such that z < 2^448, and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64, adx
#define reduceFromDoubleAdx(z,x) \
    /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
    MOVQ 80+x,AX; MOVQ AX,R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8,R10; \
    \
    MOVQ $0,R14; \
    MOVQ 104+x,R13; SHLQ $1,R13,R14; \
    MOVQ  96+x,R12; SHLQ $1,R12,R13; \
    MOVQ  88+x,R11; SHLQ $1,R11,R12; \
    MOVQ  72+x, R9; SHLQ $1,R10,R11; \
    MOVQ  64+x, R8; SHLQ $1,R10; \
    MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
    MOVQ  56+x,R15; \
    \
    XORL AX,AX; \
    ADCXQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
    ADCXQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
    ADCXQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
    ADCXQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
    ADCXQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
    ADCXQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
    ADCXQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
    ADCXQ   AX,R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
    MOVQ R10, AX; \
    SHRQ $32,R11,R10; \
    SHRQ $32,R12,R11; \
    SHRQ $32,R13,R12; \
    SHRQ $32,R15,R13; \
    SHRQ $32, R8,R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    XORL AX,AX; \
    ADCXQ  0+z,R10; \
    ADCXQ  8+z,R11; \
    ADCXQ 16+z,R12; \
    ADCXQ 24+z,R13; \
    ADCXQ 32+z,R15; \
    ADCXQ 40+z, R8; \
    ADCXQ 48+z, R9; \
    ADCXQ   AX,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14,R10; MOVQ $0,R14; \
    ADCXQ R14,R11; \
    ADCXQ R14,R12; \
    ADCXQ  AX,R13; \
    ADCXQ R14,R15; \
    ADCXQ R14, R8; \
    ADCXQ R14, R9; \
    ADCXQ R14,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14,R10; MOVQ R10, 0+z; MOVQ $0,R14; \
    ADCXQ R14,R11; MOVQ R11, 8+z; \
    ADCXQ R14,R12; MOVQ R12,16+z; \
    ADCXQ  AX,R13; MOVQ R13,24+z; \
    ADCXQ R14,R15; MOVQ R15,32+z; \
    ADCXQ R14, R8; MOVQ  R8,40+z; \
    ADCXQ R14, R9; MOVQ  R9,48+z;
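
// Identical schedule to reduceFromDoubleLeg; ADCXQ confines the carry
// chain to CF, with XORL AX,AX clearing CF before the first two chains and
// CLC used where AX is already holding the shifted carry.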

// addSub performs two operations in place: x,y = x+y, x-y
// Uses: AX, DX, R8-R15, FLAGS
#define addSub(x,y) \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCQ 48+y, R14; \
    MOVQ   $0,  AX;  ADCQ   $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8; MOVQ  $0, AX; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8;  MOVQ  0+x,AX; MOVQ  R8,  0+x; MOVQ AX,  R8; \
    ADCQ $0,  R9;  MOVQ  8+x,AX; MOVQ  R9,  8+x; MOVQ AX,  R9; \
    ADCQ $0, R10;  MOVQ 16+x,AX; MOVQ R10, 16+x; MOVQ AX, R10; \
    ADCQ DX, R11;  MOVQ 24+x,AX; MOVQ R11, 24+x; MOVQ AX, R11; \
    ADCQ $0, R12;  MOVQ 32+x,AX; MOVQ R12, 32+x; MOVQ AX, R12; \
    ADCQ $0, R13;  MOVQ 40+x,AX; MOVQ R13, 40+x; MOVQ AX, R13; \
    ADCQ $0, R14;  MOVQ 48+x,AX; MOVQ R14, 48+x; MOVQ AX, R14; \
    SUBQ  0+y,  R8; \
    SBBQ  8+y,  R9; \
    SBBQ 16+y, R10; \
    SBBQ 24+y, R11; \
    SBBQ 32+y, R12; \
    SBBQ 40+y, R13; \
    SBBQ 48+y, R14; \
    MOVQ   $0,  AX;  SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8; MOVQ  $0, AX; \
    SBBQ $0,  R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8;  MOVQ  R8,  0+y; \
    SBBQ $0,  R9;  MOVQ  R9,  8+y; \
    SBBQ $0, R10;  MOVQ R10, 16+y; \
    SBBQ DX, R11;  MOVQ R11, 24+y; \
    SBBQ $0, R12;  MOVQ R12, 32+y; \
    SBBQ $0, R13;  MOVQ R13, 40+y; \
    SBBQ $0, R14;  MOVQ R14, 48+y;
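
// In effect, addSub runs additionLeg into x and subtraction into y while
// saving the original x limb-by-limb through AX. An illustrative Go sketch
// in terms of the hypothetical helpers above:
//
//	func addSub7(x, y *[7]uint64) {
//		x0 := *x
//		add7(x, &x0, y)
//		sub7(y, &x0, y)
//	}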