github.com/cloudflare/circl@v1.5.0/dh/x448/curve_amd64.s (about)

     1  //go:build amd64 && !purego
     2  // +build amd64,!purego
     3  
     4  #include "textflag.h"
     5  
     6  // Depends on circl/math/fp448 package
     7  #include "../../math/fp448/fp_amd64.h"
     8  #include "curve_amd64.h"
     9  
    10  // CTE_A24 is (A+2)/4 from Curve448
        // (the multiplier applied by the multiplyA24* macros in this file).
    11  #define CTE_A24 39082
    12  
        // Size is the length in bytes of one field element: seven 64-bit words.
        // The macros in this file address those words at byte offsets 0 through 48,
        // and the per-function defines index consecutive elements as k*Size.
    13  #define Size 56
    14  
    15  // multiplyA24Leg multiplies x times CTE_A24 and stores in z
        //   z, x: memory operands, each addressing seven 64-bit words (offsets 0..48).
        // Multiplication: R15 holds CTE_A24; every word of x is multiplied with MULQ
        // and the partial products are chained into R8-R14. The word that overflows
        // past the seventh limb is left in DX.
        // Reduction: that overflow word is folded back by adding it to limb 0 (R8)
        // and, shifted left by 32 bits, to limb 3 (R11), i.e. at bit 0 and bit 224.
        // This matches 2^448 = 2^224 + 1 (mod 2^448 - 2^224 - 1).
        // The carry out of the first fold is collected in DX and folded once more in
        // the same way. DX is cleared with MOVQ $0 because MOVQ leaves the carry flag
        // produced by the preceding ADDQ intact for the ADCQ chain.
        // The carry out of the second fold is not collected. Since CTE_A24 < 2^16 the
        // overflow word is below 2^17, so a carry out of the first fold leaves a small
        // 448-bit value and the second fold cannot carry again.
        // No final conditional subtraction of the modulus is performed here.
    16  // Uses: AX, DX, R8-R15, FLAGS
    17  // Instr: x86_64
    18  #define multiplyA24Leg(z,x) \
    19      MOVQ $CTE_A24, R15; \
    20      MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;;  MOVQ DX,  R9; \
    21      MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX;  MOVQ DX, R10; \
    22      MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX;  MOVQ DX, R11; \
    23      MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX;  MOVQ DX, R12; \
    24      MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX;  MOVQ DX, R13; \
    25      MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX;  MOVQ DX, R14; \
    26      MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
    27      MOVQ DX,  AX; \
    28      SHLQ $32, AX; \
    29      ADDQ DX,  R8; MOVQ $0, DX; \
    30      ADCQ $0,  R9; \
    31      ADCQ $0, R10; \
    32      ADCQ AX, R11; \
    33      ADCQ $0, R12; \
    34      ADCQ $0, R13; \
    35      ADCQ $0, R14; \
    36      ADCQ $0,  DX; \
    37      MOVQ DX,  AX; \
    38      SHLQ $32, AX; \
    39      ADDQ DX,  R8; \
    40      ADCQ $0,  R9; \
    41      ADCQ $0, R10; \
    42      ADCQ AX, R11; \
    43      ADCQ $0, R12; \
    44      ADCQ $0, R13; \
    45      ADCQ $0, R14; \
    46      MOVQ  R8,  0+z; \
    47      MOVQ  R9,  8+z; \
    48      MOVQ R10, 16+z; \
    49      MOVQ R11, 24+z; \
    50      MOVQ R12, 32+z; \
    51      MOVQ R13, 40+z; \
    52      MOVQ R14, 48+z;
    53  
    54  // multiplyA24Adx multiplies x times CTE_A24 and stores in z
        //   z, x: memory operands, each addressing seven 64-bit words (offsets 0..48).
        // Multiplication: DX holds CTE_A24 as the implicit MULXQ multiplier. Each
        // MULXQ yields a low/high pair; the low half (AX) is added into the previous
        // high half with an ADDQ/ADCQ chain. MULXQ does not modify the flags, so the
        // carry survives from one ADCQ to the next across the interleaved multiplies.
        // The last MULXQ writes its high half into DX, overwriting the multiplier,
        // which is no longer needed at that point.
        // Reduction: the overflow word in DX is added to limb 0 (R8) and, shifted
        // left by 32 bits, to limb 3 (R11), i.e. at bit 0 and bit 224. This matches
        // 2^448 = 2^224 + 1 (mod 2^448 - 2^224 - 1). The carry out of
        // that fold is collected in DX (cleared with MOVQ $0, which preserves the
        // carry flag) and folded once more in the same way.
        // No final conditional subtraction of the modulus is performed here.
        // Despite the Adx suffix, only MULX is used; there is no ADCX/ADOX in this
        // macro, which agrees with the "bmi2" requirement listed below.
    55  // Uses: AX, DX, R8-R14, FLAGS
    56  // Instr: x86_64, bmi2
    57  #define multiplyA24Adx(z,x) \
    58      MOVQ $CTE_A24, DX; \
    59      MULXQ  0+x, R8,  R9; \
    60      MULXQ  8+x, AX, R10;  ADDQ AX,  R9; \
    61      MULXQ 16+x, AX, R11;  ADCQ AX, R10; \
    62      MULXQ 24+x, AX, R12;  ADCQ AX, R11; \
    63      MULXQ 32+x, AX, R13;  ADCQ AX, R12; \
    64      MULXQ 40+x, AX, R14;  ADCQ AX, R13; \
    65      MULXQ 48+x, AX,  DX;  ADCQ AX, R14; \
    66      ;;;;;;;;;;;;;;;;;;;;  ADCQ $0,  DX; \
    67      MOVQ DX,  AX; \
    68      SHLQ $32, AX; \
    69      ADDQ DX,  R8; MOVQ $0, DX; \
    70      ADCQ $0,  R9; \
    71      ADCQ $0, R10; \
    72      ADCQ AX, R11; \
    73      ADCQ $0, R12; \
    74      ADCQ $0, R13; \
    75      ADCQ $0, R14; \
    76      ADCQ $0,  DX; \
    77      MOVQ DX,  AX; \
    78      SHLQ $32, AX; \
    79      ADDQ DX,  R8; \
    80      ADCQ $0,  R9; \
    81      ADCQ $0, R10; \
    82      ADCQ AX, R11; \
    83      ADCQ $0, R12; \
    84      ADCQ $0, R13; \
    85      ADCQ $0, R14; \
    86      MOVQ  R8,  0+z; \
    87      MOVQ  R9,  8+z; \
    88      MOVQ R10, 16+z; \
    89      MOVQ R11, 24+z; \
    90      MOVQ R12, 32+z; \
    91      MOVQ R13, 40+z; \
    92      MOVQ R14, 48+z;
    93  
        // mulA24Legacy and mulA24Bmi2Adx instantiate the two multiply-by-CTE_A24
        // macros with the destination z at 0(DI) and the source x at 0(SI).
        // They take no arguments, so they can be passed by name to CHECK_BMI2ADX;
        // DI and SI must already hold the two pointers when they are expanded.
    94  #define mulA24Legacy \
    95      multiplyA24Leg(0(DI),0(SI))
    96  #define mulA24Bmi2Adx \
    97      multiplyA24Adx(0(DI),0(SI))
    98  
    99  // func mulA24Amd64(z, x *fp448.Elt)
        // mulA24Amd64 loads the z and x pointers into DI and SI and hands both
        // multiply-by-CTE_A24 bodies (mulA24Legacy, mulA24Bmi2Adx) to CHECK_BMI2ADX.
        // Frame: no local stack space, 16 bytes of arguments (two pointers).
        // NOTE(review): CHECK_BMI2ADX is defined in an included header that is not
        // visible here; presumably it runs the BMI2/ADX body when the CPU supports
        // it and the legacy body otherwise, using LMA24 as its label, and supplies
        // the RET — confirm against the header.
   100  TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
   101      MOVQ z+0(FP), DI
   102      MOVQ x+8(FP), SI
   103      CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
   104  
   105  // func ladderStepAmd64(w *[5]fp448.Elt, b uint)
   106  // ladderStepAmd64 calculates a point addition and doubling as follows:
   107  // (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
   108  //    w    = {x1,x2,z2,x3,z3} are five fp448.Elt of 56 bytes.
   109  //  stack  = (t0,t1) are two fp.Elt of fp.Size bytes, and
   110  //           (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
        // Frame: 336 bytes of locals = 2*Size (t0,t1) + 2*(2*Size) (b0,b1), and
        // 16 bytes of arguments.
        // The defines below give names to the five elements of w (relative to DI)
        // and to the stack temporaries (relative to SP); they are undefined again at
        // the end so that the following functions can reuse the same names.
        // NOTE(review): ladderStepLeg, ladderStepBmi2Adx and CHECK_BMI2ADX are not
        // defined in this file; presumably they come from the included headers and
        // refer to the names defined here (regWork, regMove, x1..z3, t0..b1) —
        // confirm there, including how regMove (b) is used.
   111  TEXT ·ladderStepAmd64(SB),NOSPLIT,$336-16
   112      // Parameters
   113      #define regWork DI
   114      #define regMove SI
   115      #define x1 0*Size(regWork)
   116      #define x2 1*Size(regWork)
   117      #define z2 2*Size(regWork)
   118      #define x3 3*Size(regWork)
   119      #define z3 4*Size(regWork)
   120      // Local variables
        // t0 and t1 occupy one Size each; b0 and b1 occupy two Size each, hence
        // the offsets 0, 1, 2 and 4 (in units of Size).
   121      #define t0 0*Size(SP)
   122      #define t1 1*Size(SP)
   123      #define b0 2*Size(SP)
   124      #define b1 4*Size(SP)
   125      MOVQ w+0(FP), regWork
   126      MOVQ b+8(FP), regMove
   127      CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
   128      #undef regWork
   129      #undef regMove
   130      #undef x1
   131      #undef x2
   132      #undef z2
   133      #undef x3
   134      #undef z3
   135      #undef t0
   136      #undef t1
   137      #undef b0
   138      #undef b1
   139  
   140  // func diffAddAmd64(w *[5]fp.Elt, b uint)
        // (parameter names as referenced below: w+0(FP) is the work area, b+8(FP)
        // is the swap selector).
   141  // diffAddAmd64 calculates a differential point addition using a precomputed point.
   142  // (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
   143  //    w    = {mu,x1,z1,x2,z2} are five fp448.Elt of 56 bytes, and
   144  //   stack = (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
   145  // This is Equation 7 at https://eprint.iacr.org/2017/264.
        // Frame: 224 bytes of locals = 2*(2*Size) for b0 and b1, and 16 bytes of
        // arguments. The element called mu above is named ui in the defines below.
        // Before the addition, cswap is applied to the pairs (x1,x2) and (z1,z2)
        // with regSwap as its third argument.
        // NOTE(review): cswap, difAddLeg, difAddBmi2Adx and CHECK_BMI2ADX are not
        // defined in this file; presumably cswap exchanges its two operands when
        // regSwap is set, and the other macros come from the included headers and
        // use the names defined here — confirm there.
   146  TEXT ·diffAddAmd64(SB),NOSPLIT,$224-16
   147      // Parameters
   148      #define regWork DI
   149      #define regSwap SI
   150      #define ui 0*Size(regWork)
   151      #define x1 1*Size(regWork)
   152      #define z1 2*Size(regWork)
   153      #define x2 3*Size(regWork)
   154      #define z2 4*Size(regWork)
   155      // Local variables
        // b0 and b1 each occupy two Size, hence the offsets 0 and 2.
   156      #define b0 0*Size(SP)
   157      #define b1 2*Size(SP)
   158      MOVQ w+0(FP), regWork
   159      MOVQ b+8(FP), regSwap
   160      cswap(x1,x2,regSwap)
   161      cswap(z1,z2,regSwap)
   162      CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
   163      #undef regWork
   164      #undef regSwap
   165      #undef ui
   166      #undef x1
   167      #undef z1
   168      #undef x2
   169      #undef z2
   170      #undef b0
   171      #undef b1
   172  
   173  // func doubleAmd64(x, z *fp448.Elt)
   174  // doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
   175  //  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
   176  //          (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
        // Frame: 336 bytes of locals = 2*Size (t0,t1) + 2*(2*Size) (b0,b1), and
        // 16 bytes of arguments (two pointers).
        // x1 and z1 name the memory at the x and z pointers (held in DI and SI).
        // NOTE(review): doubleLeg, doubleBmi2Adx and CHECK_BMI2ADX are not defined
        // in this file; presumably they come from the included headers and operate
        // on the names defined here — confirm there.
   177  TEXT ·doubleAmd64(SB),NOSPLIT,$336-16
   178      // Parameters
   179      #define x1 0(DI)
   180      #define z1 0(SI)
   181      // Local variables
        // t0 and t1 occupy one Size each; b0 and b1 occupy two Size each, hence
        // the offsets 0, 1, 2 and 4 (in units of Size).
   182      #define t0 0*Size(SP)
   183      #define t1 1*Size(SP)
   184      #define b0 2*Size(SP)
   185      #define b1 4*Size(SP)
   186      MOVQ x+0(FP), DI
   187      MOVQ z+8(FP), SI
   188      CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
   189      #undef x1
   190      #undef z1
   191      #undef t0
   192      #undef t1
   193      #undef b0
   194      #undef b1