// Origin: github.com/cloudflare/circl@v1.5.0/dh/x448/curve_amd64.s

//go:build amd64 && !purego
// +build amd64,!purego

#include "textflag.h"

// Depends on circl/math/fp448 package
#include "../../math/fp448/fp_amd64.h"
#include "curve_amd64.h"

// CTE_A24 is (A+2)/4 from Curve448
#define CTE_A24 39082

// Size is the byte length of one field element (seven 64-bit limbs).
#define Size 56

// multiplyA24Leg multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
//
// x and z are memory operands addressing seven 64-bit limbs each
// (offsets 0..48).
//
// Step 1: each limb of x is multiplied by the constant (held in R15) with
// MULQ; the high half of every product is carried into the next limb, so
// the seven-limb result lands in R8..R14 and the overflow word stays in DX.
//
// Step 2: the overflow word is folded back twice: it is added to limb 0
// (R8) and, shifted left by 32 bits, to limb 3 (R11), i.e. at bit offsets
// 0 and 224. The carry out of the first fold is collected in DX and folded
// the same way a second time.
// NOTE(review): this matches reduction modulo 2^448 - 2^224 - 1, presumably
// the fp448 prime; confirm against the math/fp448 package.
//
// DX is cleared with MOVQ $0 in the middle of the ADDQ/ADCQ chain; the
// following ADCQ instructions consume the carry produced by the ADDQ.
#define multiplyA24Leg(z,x) \
	MOVQ $CTE_A24, R15; \
	MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;;  MOVQ DX,  R9; \
	MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX;  MOVQ DX, R10; \
	MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX;  MOVQ DX, R11; \
	MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX;  MOVQ DX, R12; \
	MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX;  MOVQ DX, R13; \
	MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX;  MOVQ DX, R14; \
	MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
	MOVQ DX,  AX; \
	SHLQ $32, AX; \
	ADDQ DX,  R8; MOVQ $0, DX; \
	ADCQ $0,  R9; \
	ADCQ $0, R10; \
	ADCQ AX, R11; \
	ADCQ $0, R12; \
	ADCQ $0, R13; \
	ADCQ $0, R14; \
	ADCQ $0,  DX; \
	MOVQ DX,  AX; \
	SHLQ $32, AX; \
	ADDQ DX,  R8; \
	ADCQ $0,  R9; \
	ADCQ $0, R10; \
	ADCQ AX, R11; \
	ADCQ $0, R12; \
	ADCQ $0, R13; \
	ADCQ $0, R14; \
	MOVQ  R8,  0+z; \
	MOVQ  R9,  8+z; \
	MOVQ R10, 16+z; \
	MOVQ R11, 24+z; \
	MOVQ R12, 32+z; \
	MOVQ R13, 40+z; \
	MOVQ R14, 48+z;

// multiplyA24Adx multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64, bmi2
//
// Same computation as multiplyA24Leg, but the limb products are formed with
// MULXQ, whose implicit multiplicand is DX (loaded with the constant). The
// low halves are accumulated with a single ADDQ/ADCQ carry chain, and the
// last MULXQ overwrites DX with the top word, which then receives the final
// carry. The two folding rounds and the store to z are identical to the
// legacy macro.
#define multiplyA24Adx(z,x) \
	MOVQ $CTE_A24, DX; \
	MULXQ  0+x, R8,  R9; \
	MULXQ  8+x, AX, R10;  ADDQ AX,  R9; \
	MULXQ 16+x, AX, R11;  ADCQ AX, R10; \
	MULXQ 24+x, AX, R12;  ADCQ AX, R11; \
	MULXQ 32+x, AX, R13;  ADCQ AX, R12; \
	MULXQ 40+x, AX, R14;  ADCQ AX, R13; \
	MULXQ 48+x, AX,  DX;  ADCQ AX, R14; \
	;;;;;;;;;;;;;;;;;;;;  ADCQ $0,  DX; \
	MOVQ DX,  AX; \
	SHLQ $32, AX; \
	ADDQ DX,  R8; MOVQ $0, DX; \
	ADCQ $0,  R9; \
	ADCQ $0, R10; \
	ADCQ AX, R11; \
	ADCQ $0, R12; \
	ADCQ $0, R13; \
	ADCQ $0, R14; \
	ADCQ $0,  DX; \
	MOVQ DX,  AX; \
	SHLQ $32, AX; \
	ADDQ DX,  R8; \
	ADCQ $0,  R9; \
	ADCQ $0, R10; \
	ADCQ AX, R11; \
	ADCQ $0, R12; \
	ADCQ $0, R13; \
	ADCQ $0, R14; \
	MOVQ  R8,  0+z; \
	MOVQ  R9,  8+z; \
	MOVQ R10, 16+z; \
	MOVQ R11, 24+z; \
	MOVQ R12, 32+z; \
	MOVQ R13, 40+z; \
	MOVQ R14, 48+z;

// mulA24Legacy and mulA24Bmi2Adx bind the two multiply macros to the
// register convention of mulA24Amd64: DI points to z, SI points to x.
#define mulA24Legacy \
	multiplyA24Leg(0(DI),0(SI))
#define mulA24Bmi2Adx \
	multiplyA24Adx(0(DI),0(SI))

// func mulA24Amd64(z, x *fp448.Elt)
// mulA24Amd64 stores x times CTE_A24 in z.
// No local frame is reserved ($0); the two pointer arguments take 16 bytes.
// CHECK_BMI2ADX is defined in an included header; it is handed a label
// prefix and both implementations.
// NOTE(review): presumably it picks one implementation at run time from the
// CPU features and emits the RET; confirm in the header.
TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
	MOVQ z+0(FP), DI
	MOVQ x+8(FP), SI
	CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)

// func ladderStepAmd64(w *[5]fp448.Elt, b uint)
// ladderStepAmd64 calculates a point addition and doubling as follows:
// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
//  w     = {x1,x2,z2,x3,z3} are five fp448.Elt of 56 bytes.
//  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
//          (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
TEXT ·ladderStepAmd64(SB),NOSPLIT,$336-16
	// Frame: 336 bytes of locals = t0 (56) + t1 (56) + b0 (112) + b1 (112);
	// 16 bytes of arguments (pointer w and word b).
	// The macros below are only visible to the ladderStepLeg and
	// ladderStepBmi2Adx bodies (defined outside this file) and are
	// undefined again right after the dispatch.
	// Parameters
	#define regWork DI
	#define regMove SI
	#define x1 0*Size(regWork)
	#define x2 1*Size(regWork)
	#define z2 2*Size(regWork)
	#define x3 3*Size(regWork)
	#define z3 4*Size(regWork)
	// Local variables
	#define t0 0*Size(SP)
	#define t1 1*Size(SP)
	#define b0 2*Size(SP)
	#define b1 4*Size(SP)
	MOVQ w+0(FP), regWork
	MOVQ b+8(FP), regMove
	CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
	#undef regWork
	#undef regMove
	#undef x1
	#undef x2
	#undef z2
	#undef x3
	#undef z3
	#undef t0
	#undef t1
	#undef b0
	#undef b1

// func diffAddAmd64(work *[5]fp.Elt, swap uint)
// diffAddAmd64 calculates a differential point addition using a precomputed point.
// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
//  work  = {mu,x1,z1,x2,z2} are five fp448.Elt of 56 bytes, and
//  stack = (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
// This is Equation 7 at https://eprint.iacr.org/2017/264.
//
// The first element of work (mu above) is addressed through the macro
// named ui below.
// NOTE(review): the frame references in the body are spelled w+0(FP) and
// b+8(FP), while the signature above names the parameters work and swap.
// go vet's asmdecl check expects frame names to match the Go declaration;
// confirm which spelling the declaration uses.
TEXT ·diffAddAmd64(SB),NOSPLIT,$224-16
	// Frame: 224 bytes of locals = b0 (112) + b1 (112);
	// 16 bytes of arguments.
	// Parameters
	#define regWork DI
	#define regSwap SI
	#define ui 0*Size(regWork)
	#define x1 1*Size(regWork)
	#define z1 2*Size(regWork)
	#define x2 3*Size(regWork)
	#define z2 4*Size(regWork)
	// Local variables
	#define b0 0*Size(SP)
	#define b1 2*Size(SP)
	MOVQ w+0(FP), regWork
	MOVQ b+8(FP), regSwap
	// cswap is defined outside this file.
	// NOTE(review): presumably it swaps the two elements when regSwap is
	// set; confirm in its definition.
	cswap(x1,x2,regSwap)
	cswap(z1,z2,regSwap)
	CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
	#undef regWork
	#undef regSwap
	#undef ui
	#undef x1
	#undef z1
	#undef x2
	#undef z2
	#undef b0
	#undef b1

// func doubleAmd64(x, z *fp448.Elt)
// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
//  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
//          (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
TEXT ·doubleAmd64(SB),NOSPLIT,$336-16
	// Frame: 336 bytes of locals = t0 (56) + t1 (56) + b0 (112) + b1 (112);
	// 16 bytes of arguments (pointers x and z).
	// x1 and z1 address the two elements in place through DI and SI.
	// The macros are only visible to the doubleLeg and doubleBmi2Adx
	// bodies (defined outside this file) and are undefined again right
	// after the dispatch.
	// Parameters
	#define x1 0(DI)
	#define z1 0(SI)
	// Local variables
	#define t0 0*Size(SP)
	#define t1 1*Size(SP)
	#define b0 2*Size(SP)
	#define b1 4*Size(SP)
	MOVQ x+0(FP), DI
	MOVQ z+8(FP), SI
	CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
	#undef x1
	#undef z1
	#undef t0
	#undef t1
	#undef b0
	#undef b1