github.com/cloudflare/circl@v1.5.0/dh/x25519/curve_amd64.s

github.com/cloudflare/circl@v1.5.0/dh/x25519/curve_amd64.s (about)

     1  //go:build amd64 && !purego
     2  // +build amd64,!purego
     3  
     4  #include "textflag.h"
     5  
     6  // Depends on circl/math/fp25519 package
     7  #include "../../math/fp25519/fp_amd64.h"
     8  #include "curve_amd64.h"
     9  
    10  // CTE_A24 is the constant (A+2)/4 = 121666, where A = 486662 is the
        // Montgomery coefficient of Curve25519.
    11  #define CTE_A24 121666
    12  
        // Size is the length in bytes of one field element; the operand offsets
        // defined inside the functions below are multiples of it.
    13  #define Size 32
    14  
    15  // multiplyA24Leg multiplies x by CTE_A24 and stores a result congruent to
        // the product modulo 2^255-19 in z.
        //   z, x: memory operands of four 64-bit limbs at offsets 0, 8, 16 and 24,
        //         least-significant limb first.
        // The 5-limb product is folded back to four limbs using 2^256 = 38
        // (mod 2^255-19); no final subtraction of the prime is performed here.
    16  // Uses: AX, DX, R8-R13, FLAGS
    17  // Instr: x86_64, cmov
    18  #define multiplyA24Leg(z,x) \
    19      MOVL $CTE_A24, AX; MULQ  0+x; MOVQ AX,  R8; MOVQ DX,  R9; /* R9:R8   = A24 * x[0] */ \
    20      MOVL $CTE_A24, AX; MULQ  8+x; MOVQ AX, R12; MOVQ DX, R10; /* R10:R12 = A24 * x[1] */ \
    21      MOVL $CTE_A24, AX; MULQ 16+x; MOVQ AX, R13; MOVQ DX, R11; /* R11:R13 = A24 * x[2] */ \
    22      MOVL $CTE_A24, AX; MULQ 24+x; /* DX:AX = A24 * x[3]; AX is reloaded before every MULQ because MULQ overwrites it */ \
    23      ADDQ R12,  R9; /* add each low half into the high half of the previous product */ \
    24      ADCQ R13, R10; \
    25      ADCQ  AX, R11; \
    26      ADCQ  $0,  DX; /* DX = fifth (overflow) limb; product is DX:R11:R10:R9:R8 */ \
    27      MOVL $38,  AX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
    28      IMULQ AX, DX; /* DX = 38 * overflow limb */ \
    29      ADDQ DX, R8; /* fold the overflow into limb 0 and ripple the carry upward */ \
    30      ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    31      ADCQ $0, R10;  MOVQ R10, 16+z; \
    32      ADCQ $0, R11;  MOVQ R11, 24+z; \
    33      MOVQ $0, DX; /* cleared with a MOV: CF from the ADCQ chain is still needed below */ \
    34      CMOVQCS AX, DX; /* DX = 38 if the chain carried out of limb 3, else 0 */ \
    35      ADDQ DX, R8;  MOVQ  R8,   0+z; /* apply the second fold and store limb 0 last */
    36  
    37  // multiplyA24Adx multiplies x by CTE_A24 and stores a result congruent to
        // the product modulo 2^255-19 in z.
        //   z, x: memory operands of four 64-bit limbs at offsets 0, 8, 16 and 24,
        //         least-significant limb first.
        // Same computation as multiplyA24Leg, but using MULXQ, which leaves FLAGS
        // untouched so the ADDQ/ADCQ carry chain can be interleaved with the
        // multiplications. Despite the name, no ADCX/ADOX instruction is used here.
    38  // Uses: AX, DX, R8-R12, FLAGS
    39  // Instr: x86_64, cmov, bmi2
    40  #define multiplyA24Adx(z,x) \
    41      MOVQ  $CTE_A24, DX; /* DX is the implicit multiplicand of MULXQ */ \
    42      MULXQ  0+x,  R8, R10; /* R10:R8  = A24 * x[0] (operands: source, low, high) */ \
    43      MULXQ  8+x,  R9, R11;  ADDQ R10,  R9; /* R11:R9  = A24 * x[1]; R9  += previous high half */ \
    44      MULXQ 16+x, R10,  AX;  ADCQ R11, R10; /* AX:R10  = A24 * x[2]; R10 += previous high half + CF */ \
    45      MULXQ 24+x, R11, R12;  ADCQ  AX, R11; /* R12:R11 = A24 * x[3]; R11 += previous high half + CF */ \
    46      ;;;;;;;;;;;;;;;;;;;;;  ADCQ  $0, R12; /* R12 = fifth (overflow) limb; the semicolons are empty statements used for alignment */ \
    47      MOVL $38,  DX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
    48      IMULQ DX, R12; /* R12 = 38 * overflow limb */ \
    49      ADDQ R12, R8; /* fold the overflow into limb 0 and ripple the carry upward */ \
    50      ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    51      ADCQ $0, R10;  MOVQ R10, 16+z; \
    52      ADCQ $0, R11;  MOVQ R11, 24+z; \
    53      MOVQ $0, R12; /* cleared with a MOV: CF from the ADCQ chain is still needed below */ \
    54      CMOVQCS DX, R12; /* R12 = 38 if the chain carried out of limb 3, else 0 */ \
    55      ADDQ R12, R8;  MOVQ  R8,  0+z; /* apply the second fold and store limb 0 last */
    56  
    57  #define mulA24Legacy \
    58      multiplyA24Leg(0(DI),0(SI)) /* legacy body: z = 0(DI), x = 0(SI) */
    59  #define mulA24Bmi2Adx \
    60      multiplyA24Adx(0(DI),0(SI)) /* MULX body: z = 0(DI), x = 0(SI) */
    61  
    62  // func mulA24Amd64(z, x *fp255.Elt)
        // mulA24Amd64 loads the two pointer arguments into DI and SI and passes the
        // legacy and MULX-based multiply-by-CTE_A24 bodies to CHECK_BMI2ADX.
        // Frame: no locals, 16 bytes of arguments.
        // NOTE(review): CHECK_BMI2ADX is defined outside this file; presumably it
        // selects one of the two bodies at run time from CPU feature support and
        // ends with RET, since no RET appears here - confirm in the included header.
    63  TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
    64      MOVQ z+0(FP), DI // DI = z, the destination operand of both bodies
    65      MOVQ x+8(FP), SI // SI = x, the source operand of both bodies
    66      CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
    67  
    68  
    69  // func ladderStepAmd64(w *[5]fp255.Elt, b uint)
    70  // ladderStepAmd64 calculates a point addition and doubling as follows:
    71  // (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using (x1,-) as the difference.
    72  //  work  = (x1,x2,z2,x3,z3) are five fp255.Elt of 32 bytes.
    73  //  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
    74  //          (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
        // Frame: 2*32 + 2*64 = 192 bytes of locals, 16 bytes of arguments.
        // NOTE(review): ladderStepLeg, ladderStepBmi2Adx and CHECK_BMI2ADX are
        // defined outside this file; presumably they consume the names defined
        // below and CHECK_BMI2ADX ends with RET - confirm in the included headers.
    75  TEXT ·ladderStepAmd64(SB),NOSPLIT,$192-16
    76      // Parameters: regWork holds w, regMove holds b; x1..z3 are the five elements of w
    77      #define regWork DI
    78      #define regMove SI
    79      #define x1 0*Size(regWork)
    80      #define x2 1*Size(regWork)
    81      #define z2 2*Size(regWork)
    82      #define x3 3*Size(regWork)
    83      #define z3 4*Size(regWork)
    84      // Local variables: t0 at 0(SP), t1 at 32(SP), b0 at 64(SP), b1 at 128(SP)
    85      #define t0 0*Size(SP)
    86      #define t1 1*Size(SP)
    87      #define b0 2*Size(SP)
    88      #define b1 4*Size(SP)
    89      MOVQ w+0(FP), regWork
    90      MOVQ b+8(FP), regMove
    91      CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
        // Release the function-local names so that later functions can redefine them.
    92      #undef regWork
    93      #undef regMove
    94      #undef x1
    95      #undef x2
    96      #undef z2
    97      #undef x3
    98      #undef z3
    99      #undef t0
   100      #undef t1
   101      #undef b0
   102      #undef b1
   103  
   104  // func diffAddAmd64(w *[5]fp255.Elt, b uint)
   105  // diffAddAmd64 calculates a differential point addition using a precomputed point.
   106  // (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
   107  //    w    = (mu,x1,z1,x2,z2) are five fp.Elt, and
   108  //   stack = (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
        // Frame: 2*64 = 128 bytes of locals, 16 bytes of arguments.
        // The first element of w (mu above) is named ui in the definitions below.
        // NOTE(review): cswap, difAddLeg, difAddBmi2Adx and CHECK_BMI2ADX are defined
        // outside this file; presumably cswap exchanges its two operands when b is
        // set and CHECK_BMI2ADX ends with RET - confirm in the included headers.
   109  TEXT ·diffAddAmd64(SB),NOSPLIT,$128-16
   110      // Parameters: regWork holds w, regSwap holds b; ui..z2 are the five elements of w
   111      #define regWork DI
   112      #define regSwap SI
   113      #define ui 0*Size(regWork)
   114      #define x1 1*Size(regWork)
   115      #define z1 2*Size(regWork)
   116      #define x2 3*Size(regWork)
   117      #define z2 4*Size(regWork)
   118      // Local variables: b0 at 0(SP), b1 at 64(SP)
   119      #define b0 0*Size(SP)
   120      #define b1 2*Size(SP)
   121      MOVQ w+0(FP), regWork
   122      MOVQ b+8(FP), regSwap
        // Both coordinate pairs are passed to cswap with b before the addition runs.
   123      cswap(x1,x2,regSwap)
   124      cswap(z1,z2,regSwap)
   125      CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
        // Release the function-local names so that later functions can redefine them.
   126      #undef regWork
   127      #undef regSwap
   128      #undef ui
   129      #undef x1
   130      #undef z1
   131      #undef x2
   132      #undef z2
   133      #undef b0
   134      #undef b1
   135  
   136  // func doubleAmd64(x, z *fp255.Elt)
   137  // doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
   138  //  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
   139  //          (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
        // Frame: 2*32 + 2*64 = 192 bytes of locals, 16 bytes of arguments.
        // NOTE(review): doubleLeg, doubleBmi2Adx and CHECK_BMI2ADX are defined
        // outside this file; presumably they consume the names defined below and
        // CHECK_BMI2ADX ends with RET - confirm in the included headers.
   140  TEXT ·doubleAmd64(SB),NOSPLIT,$192-16
   141      // Parameters: x1 is the element pointed to by x (DI), z1 the one pointed to by z (SI)
   142      #define x1 0(DI)
   143      #define z1 0(SI)
   144      // Local variables: t0 at 0(SP), t1 at 32(SP), b0 at 64(SP), b1 at 128(SP)
   145      #define t0 0*Size(SP)
   146      #define t1 1*Size(SP)
   147      #define b0 2*Size(SP)
   148      #define b1 4*Size(SP)
   149      MOVQ x+0(FP), DI
   150      MOVQ z+8(FP), SI
   151      CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
        // Release the function-local names defined above.
   152      #undef x1
   153      #undef z1
   154      #undef t0
   155      #undef t1
   156      #undef b0
   157      #undef b1