//go:build amd64 && !purego
// +build amd64,!purego

#include "textflag.h"

// Depends on circl/math/fp25519 package
#include "../../math/fp25519/fp_amd64.h"
#include "curve_amd64.h"

// CTE_A24 is (A+2)/4 from Curve25519
#define CTE_A24 121666

// Size is the byte length of one field element; it is used below as the
// stride between consecutive elements in the work array and on the stack.
#define Size 32

// multiplyA24Leg multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64, cmov
//
// x and z are memory operands addressing four 64-bit limbs at byte offsets
// 0, 8, 16 and 24, least-significant limb first.
//
// Steps:
//   1. Four MULQ compute the 64x64->128 partial products limb*CTE_A24.
//      AX is reloaded before each MULQ because MULQ overwrites AX (and DX)
//      with the product.
//   2. The ADDQ/ADCQ chain merges the partial products into R8..R11, with
//      the overflow limb (bits 256 and up) left in DX.
//   3. The overflow limb is multiplied by 38 and added to limb 0; the carry
//      is rippled through limbs 1..3, which are stored as they are finalized.
//   4. If that ripple carries out of limb 3, another 38 is added to limb 0:
//      CMOVQCS picks 38 (still in AX) or leaves 0 in DX.
//
// The MOVQ instructions interleaved with the ADCQ chain do not modify FLAGS,
// so the carry survives the stores and the MOVQ $0 that precedes CMOVQCS.
// Every load from x happens before the first store to z.
// This macro performs no final reduction to a canonical value modulo 2^255-19.
#define multiplyA24Leg(z,x) \
    MOVL $CTE_A24, AX; MULQ  0+x; MOVQ AX,  R8; MOVQ DX,  R9; \
    MOVL $CTE_A24, AX; MULQ  8+x; MOVQ AX, R12; MOVQ DX, R10; \
    MOVL $CTE_A24, AX; MULQ 16+x; MOVQ AX, R13; MOVQ DX, R11; \
    MOVL $CTE_A24, AX; MULQ 24+x; \
    ADDQ R12,  R9; \
    ADCQ R13, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    MOVL $38,  AX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
    IMULQ AX, DX; \
    ADDQ DX, R8; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ $0, R11;  MOVQ R11, 24+z; \
    MOVQ $0, DX; \
    CMOVQCS AX, DX; \
    ADDQ DX, R8;  MOVQ  R8,   0+z;

// multiplyA24Adx multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R12, FLAGS
// Instr: x86_64, cmov, bmi2
//
// Same computation and limb layout as multiplyA24Leg, but the partial
// products come from MULXQ, which takes its implicit multiplicand from DX and
// writes the low/high halves to the two named registers. CTE_A24 is therefore
// loaded into DX only once, and the MULXQ instructions can be interleaved
// with the ADDQ/ADCQ carry chain. The overflow limb is accumulated in R12,
// multiplied by 38 and folded into limb 0; a carry out of limb 3 adds a
// further 38 (DX still holds 38 when CMOVQCS executes).
// Only MULXQ and the ordinary ADDQ/ADCQ forms appear in this macro.
// Every load from x happens before the first store to z.
#define multiplyA24Adx(z,x) \
    MOVQ $CTE_A24, DX; \
    MULXQ  0+x,  R8, R10; \
    MULXQ  8+x,  R9, R11;  ADDQ R10,  R9; \
    MULXQ 16+x, R10,  AX;  ADCQ R11, R10; \
    MULXQ 24+x, R11, R12;  ADCQ  AX, R11; \
    ;;;;;;;;;;;;;;;;;;;;;  ADCQ  $0, R12; \
    MOVL $38,  DX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
    IMULQ DX, R12; \
    ADDQ R12, R8; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ $0, R11;  MOVQ R11, 24+z; \
    MOVQ $0, R12; \
    CMOVQCS DX, R12; \
    ADDQ R12, R8;  MOVQ  R8,  0+z;

// mulA24Legacy and mulA24Bmi2Adx bind the two multiply-by-A24 variants to the
// register convention used by mulA24Amd64: z is addressed through DI and x
// through SI.
#define mulA24Legacy \
    multiplyA24Leg(0(DI),0(SI))
#define mulA24Bmi2Adx \
    multiplyA24Adx(0(DI),0(SI))

// func mulA24Amd64(z, x *fp255.Elt)
// mulA24Amd64 stores x times CTE_A24 in z.
// It needs no local stack frame ($0) and takes 16 bytes of arguments.
// NOTE(review): CHECK_BMI2ADX comes from an included header that is not
// visible here; presumably it selects the legacy or the BMI2/ADX body at run
// time under the given label prefix and supplies the RET -- confirm there.
TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
    MOVQ z+0(FP), DI
    MOVQ x+8(FP), SI
    CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)


// func ladderStepAmd64(w *[5]fp255.Elt, b uint)
// ladderStepAmd64 calculates a point addition and doubling as follows:
// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
//  work  = (x1,x2,z2,x3,z3) are five fp255.Elt of 32 bytes.
//  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
//          (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
// The 192-byte frame is exactly t0 (32) + t1 (32) + b0 (64) + b1 (64).
// NOTE(review): ladderStepLeg and ladderStepBmi2Adx are defined outside this
// file view; presumably they refer to the names #defined below, which is why
// those names are defined before the dispatch and #undef'd right after it.
TEXT ·ladderStepAmd64(SB),NOSPLIT,$192-16
    // Parameters
    #define regWork DI
    #define regMove SI
    #define x1 0*Size(regWork)
    #define x2 1*Size(regWork)
    #define z2 2*Size(regWork)
    #define x3 3*Size(regWork)
    #define z3 4*Size(regWork)
    // Local variables
    #define t0 0*Size(SP)
    #define t1 1*Size(SP)
    #define b0 2*Size(SP)
    #define b1 4*Size(SP)
    MOVQ w+0(FP), regWork
    MOVQ b+8(FP), regMove
    CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
    #undef regWork
    #undef regMove
    #undef x1
    #undef x2
    #undef z2
    #undef x3
    #undef z3
    #undef t0
    #undef t1
    #undef b0
    #undef b1

// func diffAddAmd64(w *[5]fp255.Elt, b uint)
// diffAddAmd64 calculates a differential point addition using a precomputed point.
// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
//    w    = (mu,x1,z1,x2,z2) are five fp.Elt, and
//   stack = (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
TEXT ·diffAddAmd64(SB),NOSPLIT,$128-16
    // The 128-byte frame is exactly b0 (64) + b1 (64).
    // Parameters
    #define regWork DI
    #define regSwap SI
    // Elements of the work array, Size bytes apart; the first one (ui) is
    // the precomputed value.
    #define ui 0*Size(regWork)
    #define x1 1*Size(regWork)
    #define z1 2*Size(regWork)
    #define x2 3*Size(regWork)
    #define z2 4*Size(regWork)
    // Local variables
    #define b0 0*Size(SP)
    #define b1 2*Size(SP)
    MOVQ w+0(FP), regWork
    MOVQ b+8(FP), regSwap
    // NOTE(review): cswap is defined outside this file view; presumably it
    // swaps its first two operands depending on regSwap -- confirm in the
    // included headers.
    cswap(x1,x2,regSwap)
    cswap(z1,z2,regSwap)
    // NOTE(review): CHECK_BMI2ADX, difAddLeg and difAddBmi2Adx are defined
    // outside this file view; presumably the macro picks one of the two
    // bodies at run time and supplies the RET.
    CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
    #undef regWork
    #undef regSwap
    #undef ui
    #undef x1
    #undef z1
    #undef x2
    #undef z2
    #undef b0
    #undef b1

// func doubleAmd64(x, z *fp255.Elt)
// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
//  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
//          (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
// The 192-byte frame is exactly t0 (32) + t1 (32) + b0 (64) + b1 (64).
// NOTE(review): doubleLeg and doubleBmi2Adx are defined outside this file
// view; presumably they refer to the names #defined below.
TEXT ·doubleAmd64(SB),NOSPLIT,$192-16
    // Parameters
    // x is addressed through DI and z through SI.
    #define x1 0(DI)
    #define z1 0(SI)
    // Local variables
    #define t0 0*Size(SP)
    #define t1 1*Size(SP)
    #define b0 2*Size(SP)
    #define b1 4*Size(SP)
    MOVQ x+0(FP), DI
    MOVQ z+8(FP), SI
    CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
    #undef x1
    #undef z1
    #undef t0
    #undef t1
    #undef b0
    #undef b1