// Source: github.com/cloudflare/circl@v1.5.0/math/fp25519/fp_amd64.h
// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// CHECK_BMI2ADX dispatches between two implementations at run time.
// It compares the byte ·hasBmi2Adx(SB) with zero:
//   - if it is zero, control jumps to label and the legacy code runs;
//   - otherwise the bmi2adx code runs.
// Both paths finish with RET, so the macro ends the enclosing routine.
// label is emitted as a jump target; presumably it has to be unique
// within the routine that expands this macro.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
    CMPB ·hasBmi2Adx(SB), $0 \
    JE label \
    bmi2adx \
    RET \
    label: \
    legacy \
    RET

// cselect is a conditional move over four 64-bit words:
//   if b=1: it copies y into x;
//   if b=0: x keeps its value;
//   if b is neither 0 nor 1: undefined.
// The choice is made with TESTQ b, b followed by CMOVQNE, so the same
// instructions execute for either value of b.
// x and y are accessed at byte offsets 0, 8, 16 and 24.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ  0+y, DX; \
    CMOVQNE DX, AX; MOVQ AX,  0+x; \
    MOVQ  8+x, AX; MOVQ  8+y, DX; \
    CMOVQNE DX, AX; MOVQ AX,  8+x; \
    MOVQ 16+x, AX; MOVQ 16+y, DX; \
    CMOVQNE DX, AX; MOVQ AX, 16+x; \
    MOVQ 24+x, AX; MOVQ 24+y, DX; \
    CMOVQNE DX, AX; MOVQ AX, 24+x;

// cswap is a conditional swap:
//   if b=1: x,y <- y,x;
//   if b=0: x,y remain with the same values;
//   if b is neither 0 nor 1: undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
// x and y are read and written at byte offsets 0, 8, 16 and 24.
#define cswap(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ AX, R8; MOVQ  0+y, DX; \
    CMOVQNE DX, AX; CMOVQNE R8, DX; \
    MOVQ AX,  0+x; MOVQ DX,  0+y; \
    MOVQ  8+x, AX; MOVQ AX, R8; MOVQ  8+y, DX; \
    CMOVQNE DX, AX; CMOVQNE R8, DX; \
    MOVQ AX,  8+x; MOVQ DX,  8+y; \
    MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; \
    CMOVQNE DX, AX; CMOVQNE R8, DX; \
    MOVQ AX, 16+x; MOVQ DX, 16+y; \
    MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; \
    CMOVQNE DX, AX; CMOVQNE R8, DX; \
    MOVQ AX, 24+x; MOVQ DX, 24+y;

// additionLeg computes z = x + y on four 64-bit limbs (offset 0 is the
// least significant limb). A carry out of the top limb is folded back by
// adding 38 to the low limb, since 2^256 is congruent to 38 modulo p;
// if that addition carries out again, 38 is added once more.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define additionLeg(z,x,y) \
    MOVL $38, AX; \
    MOVL $0, DX; \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    CMOVQCS AX, DX;                  /* DX = carry ? 38 : 0 */ \
    ADDQ DX,  R8; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ $0, R11;  MOVQ R11, 24+z; \
    MOVL $0, DX; \
    CMOVQCS AX, DX;                  /* second fold: DX = carry ? 38 : 0 */ \
    ADDQ DX,  R8;  MOVQ  R8,  0+z;

// additionAdx computes z = x + y like additionLeg, but drives the carry
// chain with ADCXQ. A carry out of the top limb is folded back by adding
// 38 (2^256 is congruent to 38 modulo p), at most twice.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov, adx
#define additionAdx(z,x,y) \
    MOVL $38, AX; \
    XORL DX, DX;                     /* DX = 0 and CF = 0 */ \
    MOVQ  0+x,  R8;  ADCXQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCXQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCXQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCXQ 24+y, R11; \
    CMOVQCS AX, DX;                  /* DX = carry ? 38 : 0 */ \
    XORL AX, AX;                     /* AX = 0 and CF = 0 */ \
    ADCXQ DX,  R8; \
    ADCXQ AX,  R9;  MOVQ  R9,  8+z; \
    ADCXQ AX, R10;  MOVQ R10, 16+z; \
    ADCXQ AX, R11;  MOVQ R11, 24+z; \
    MOVL $38, DX; \
    CMOVQCS DX, AX;                  /* second fold: AX = carry ? 38 : 0 */ \
    ADDQ AX,  R8;  MOVQ  R8,  0+z;

// subtraction computes z = x - y on four 64-bit limbs. A borrow out of
// the top limb is folded back by subtracting 38 from the low limb
// (2^256 is congruent to 38 modulo p); if that subtraction borrows
// again, 38 is subtracted once more.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define subtraction(z,x,y) \
    MOVL $38, AX; \
    MOVQ  0+x,  R8;  SUBQ  0+y,  R8; \
    MOVQ  8+x,  R9;  SBBQ  8+y,  R9; \
    MOVQ 16+x, R10;  SBBQ 16+y, R10; \
    MOVQ 24+x, R11;  SBBQ 24+y, R11; \
    MOVL $0, DX; \
    CMOVQCS AX, DX;                  /* DX = borrow ? 38 : 0 */ \
    SUBQ DX,  R8; \
    SBBQ $0,  R9;  MOVQ  R9,  8+z; \
    SBBQ $0, R10;  MOVQ R10, 16+z; \
    SBBQ $0, R11;  MOVQ R11, 24+z; \
    MOVL $0, DX; \
    CMOVQCS AX, DX;                  /* second fold: DX = borrow ? 38 : 0 */ \
    SUBQ DX,  R8;  MOVQ  R8,  0+z;

// integerMulAdx computes the full product z = x * y, where x and y have
// four 64-bit limbs and z has eight (byte offsets 0 through 56).
// Each of the four passes multiplies x by one limb of y with MULXQ and
// accumulates through the two independent carry chains of ADCXQ (CF) and
// ADOXQ (OF); XORL AX, AX clears both flags at the start of every pass.
// Low limbs of z are stored while x and y are still being read, so z
// must not overlap x or y.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
    MOVL $0, R15; \
    /* pass 0: x * y[0] */ \
    MOVQ  0+y, DX;  XORL AX, AX; \
    MULXQ  0+x, AX,  R8;  MOVQ  AX, 0+z; \
    MULXQ  8+x, AX,  R9;  ADCXQ AX,  R8; \
    MULXQ 16+x, AX, R10;  ADCXQ AX,  R9; \
    MULXQ 24+x, AX, R11;  ADCXQ AX, R10; \
    MOVL $0, AX;          ADCXQ AX, R11; \
    /* pass 1: x * y[1] */ \
    MOVQ  8+y, DX;  XORL AX, AX; \
    MULXQ  0+x, AX, R12;  ADCXQ  R8,  AX;  MOVQ AX, 8+z; \
    MULXQ  8+x, AX, R13;  ADCXQ  R9, R12;  ADOXQ AX, R12; \
    MULXQ 16+x, AX, R14;  ADCXQ R10, R13;  ADOXQ AX, R13; \
    MULXQ 24+x, AX, R15;  ADCXQ R11, R14;  ADOXQ AX, R14; \
    MOVL $0, AX;          ADCXQ  AX, R15;  ADOXQ AX, R15; \
    /* pass 2: x * y[2] */ \
    MOVQ 16+y, DX;  XORL AX, AX; \
    MULXQ  0+x, AX,  R8;  ADCXQ R12,  AX;  MOVQ AX, 16+z; \
    MULXQ  8+x, AX,  R9;  ADCXQ R13,  R8;  ADOXQ AX,  R8; \
    MULXQ 16+x, AX, R10;  ADCXQ R14,  R9;  ADOXQ AX,  R9; \
    MULXQ 24+x, AX, R11;  ADCXQ R15, R10;  ADOXQ AX, R10; \
    MOVL $0, AX;          ADCXQ  AX, R11;  ADOXQ AX, R11; \
    /* pass 3: x * y[3], storing the upper limbs as they complete */ \
    MOVQ 24+y, DX;  XORL AX, AX; \
    MULXQ  0+x, AX, R12;  ADCXQ  R8,  AX;  MOVQ AX, 24+z; \
    MULXQ  8+x, AX, R13;  ADCXQ  R9, R12;  ADOXQ AX, R12;  MOVQ R12, 32+z; \
    MULXQ 16+x, AX, R14;  ADCXQ R10, R13;  ADOXQ AX, R13;  MOVQ R13, 40+z; \
    MULXQ 24+x, AX, R15;  ADCXQ R11, R14;  ADOXQ AX, R14;  MOVQ R14, 48+z; \
    MOVL $0, AX;          ADCXQ  AX, R15;  ADOXQ AX, R15;  MOVQ R15, 56+z;

// integerMulLeg computes the full product z = x * y using only MULQ and
// ADDQ/ADCQ. x and y have four 64-bit limbs, z has eight.
// Each pass multiplies x by one limb of y and adds the partial product
// to the running sum; z itself serves as scratch for the running sum, so
// z must not overlap x or y.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
    /* pass 0: x * y[0] */ \
    MOVQ  0+y, R8; \
    MOVQ  0+x, AX;  MULQ R8;  MOVQ AX, 0+z;  MOVQ DX, R15; \
    MOVQ  8+x, AX;  MULQ R8;  MOVQ AX, R13;  MOVQ DX, R10; \
    MOVQ 16+x, AX;  MULQ R8;  MOVQ AX, R14;  MOVQ DX, R11; \
    MOVQ 24+x, AX;  MULQ R8; \
    ADDQ R13, R15; \
    ADCQ R14, R10;  MOVQ R10, 16+z; \
    ADCQ  AX, R11;  MOVQ R11, 24+z; \
    ADCQ  $0,  DX;  MOVQ  DX, 32+z; \
    /* pass 1: x * y[1] */ \
    MOVQ  8+y, R8; \
    MOVQ  0+x, AX;  MULQ R8;  MOVQ AX, R12;  MOVQ DX,  R9; \
    MOVQ  8+x, AX;  MULQ R8;  MOVQ AX, R13;  MOVQ DX, R10; \
    MOVQ 16+x, AX;  MULQ R8;  MOVQ AX, R14;  MOVQ DX, R11; \
    MOVQ 24+x, AX;  MULQ R8; \
    ADDQ R12, R15;  MOVQ R15,  8+z; \
    ADCQ R13,  R9; \
    ADCQ R14, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADCQ 16+z,  R9;  MOVQ  R9,  R15; \
    ADCQ 24+z, R10;  MOVQ R10, 24+z; \
    ADCQ 32+z, R11;  MOVQ R11, 32+z; \
    ADCQ   $0,  DX;  MOVQ  DX, 40+z; \
    /* pass 2: x * y[2] */ \
    MOVQ 16+y, R8; \
    MOVQ  0+x, AX;  MULQ R8;  MOVQ AX, R12;  MOVQ DX,  R9; \
    MOVQ  8+x, AX;  MULQ R8;  MOVQ AX, R13;  MOVQ DX, R10; \
    MOVQ 16+x, AX;  MULQ R8;  MOVQ AX, R14;  MOVQ DX, R11; \
    MOVQ 24+x, AX;  MULQ R8; \
    ADDQ R12, R15;  MOVQ R15, 16+z; \
    ADCQ R13,  R9; \
    ADCQ R14, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADCQ 24+z,  R9;  MOVQ  R9,  R15; \
    ADCQ 32+z, R10;  MOVQ R10, 32+z; \
    ADCQ 40+z, R11;  MOVQ R11, 40+z; \
    ADCQ   $0,  DX;  MOVQ  DX, 48+z; \
    /* pass 3: x * y[3] */ \
    MOVQ 24+y, R8; \
    MOVQ  0+x, AX;  MULQ R8;  MOVQ AX, R12;  MOVQ DX,  R9; \
    MOVQ  8+x, AX;  MULQ R8;  MOVQ AX, R13;  MOVQ DX, R10; \
    MOVQ 16+x, AX;  MULQ R8;  MOVQ AX, R14;  MOVQ DX, R11; \
    MOVQ 24+x, AX;  MULQ R8; \
    ADDQ R12, R15;  MOVQ R15, 24+z; \
    ADCQ R13,  R9; \
    ADCQ R14, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADCQ 32+z,  R9;  MOVQ  R9, 32+z; \
    ADCQ 40+z, R10;  MOVQ R10, 40+z; \
    ADCQ 48+z, R11;  MOVQ R11, 48+z; \
    ADCQ   $0,  DX;  MOVQ  DX, 56+z;

// integerSqrLeg computes the full square z = x * x, where x has four
// 64-bit limbs (A[0]..A[3]) and z has eight.
// It sums the six cross products A[i]*A[j] (i<j), doubles that sum with
// a chain of double-register shifts, then adds the four squares A[i]^2.
// The doubled cross products are parked in z before x is read again, so
// z must not overlap x.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
    MOVQ  0+x, R8; \
    MOVQ  8+x, AX;  MULQ R8;  MOVQ AX,  R9;  MOVQ DX, R10;  /* A[0]*A[1] */ \
    MOVQ 16+x, AX;  MULQ R8;  MOVQ AX, R14;  MOVQ DX, R11;  /* A[0]*A[2] */ \
    MOVQ 24+x, AX;  MULQ R8;  MOVQ AX, R15;  MOVQ DX, R12;  /* A[0]*A[3] */ \
    MOVQ 24+x, R8; \
    MOVQ  8+x, AX;  MULQ R8;  MOVQ AX,  CX;  MOVQ DX, R13;  /* A[3]*A[1] */ \
    MOVQ 16+x, AX;  MULQ R8;                                /* A[3]*A[2] */ \
    \
    ADDQ R14, R10; \
    ADCQ R15, R11;  MOVL $0, R15; \
    ADCQ  CX, R12; \
    ADCQ  AX, R13; \
    ADCQ  $0,  DX;  MOVQ DX, R14; \
    MOVQ 8+x, AX;   MULQ 16+x;                              /* A[1]*A[2] */ \
    \
    ADDQ  AX, R11; \
    ADCQ  DX, R12; \
    ADCQ  $0, R13; \
    ADCQ  $0, R14; \
    ADCQ  $0, R15; \
    \
    /* double the cross-product sum, most significant limb first */ \
    SHLQ $1, R14, R15;  MOVQ R15, 56+z; \
    SHLQ $1, R13, R14;  MOVQ R14, 48+z; \
    SHLQ $1, R12, R13;  MOVQ R13, 40+z; \
    SHLQ $1, R11, R12;  MOVQ R12, 32+z; \
    SHLQ $1, R10, R11;  MOVQ R11, 24+z; \
    SHLQ $1,  R9, R10;  MOVQ R10, 16+z; \
    SHLQ $1,  R9;       MOVQ  R9,  8+z; \
    \
    MOVQ  0+x, AX;  MULQ AX;  MOVQ AX, 0+z;  MOVQ DX,  R9;  /* A[0]^2 */ \
    MOVQ  8+x, AX;  MULQ AX;  MOVQ AX, R10;  MOVQ DX, R11;  /* A[1]^2 */ \
    MOVQ 16+x, AX;  MULQ AX;  MOVQ AX, R12;  MOVQ DX, R13;  /* A[2]^2 */ \
    MOVQ 24+x, AX;  MULQ AX;  MOVQ AX, R14;  MOVQ DX, R15;  /* A[3]^2 */ \
    \
    ADDQ  8+z,  R9;  MOVQ  R9,  8+z; \
    ADCQ 16+z, R10;  MOVQ R10, 16+z; \
    ADCQ 24+z, R11;  MOVQ R11, 24+z; \
    ADCQ 32+z, R12;  MOVQ R12, 32+z; \
    ADCQ 40+z, R13;  MOVQ R13, 40+z; \
    ADCQ 48+z, R14;  MOVQ R14, 48+z; \
    ADCQ 56+z, R15;  MOVQ R15, 56+z;

// integerSqrAdx computes the full square z = x * x with MULXQ and the
// ADCXQ/ADOXQ carry chains. x has four 64-bit limbs (A[0]..A[3]), z has
// eight. The cross products are summed, doubled by adding each register
// to itself on the CF chain, and the squares A[i]^2 are then added with
// ADDQ/ADCQ (MULXQ leaves the flags untouched between those additions).
// Low limbs of z are stored before the last limbs of x are read, so z
// must not overlap x.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
    MOVQ   0+x,  DX;                              /* A[0]      */ \
    MULXQ  8+x,  R8, R14;  XORL  R15, R15;        /* A[1]*A[0] */ \
    MULXQ 16+x,  R9, R10;  ADCXQ R14,  R9;        /* A[2]*A[0] */ \
    MULXQ 24+x,  AX,  CX;  ADCXQ  AX, R10;        /* A[3]*A[0] */ \
    MOVQ  24+x,  DX;                              /* A[3]      */ \
    MULXQ  8+x, R11, R12;  ADCXQ  CX, R11;        /* A[1]*A[3] */ \
    MULXQ 16+x,  AX, R13;  ADCXQ  AX, R12;        /* A[2]*A[3] */ \
    MOVQ   8+x,  DX;       ADCXQ R15, R13;        /* A[1]      */ \
    MULXQ 16+x,  AX,  CX;  MOVL   $0, R14;        /* A[2]*A[1] */ \
    ADCXQ R15, R14; \
    XORL  R15, R15;                               /* clears CF and OF */ \
    ADOXQ  AX, R10;  ADCXQ  R8,  R8; \
    ADOXQ  CX, R11;  ADCXQ  R9,  R9; \
    ADOXQ R15, R12;  ADCXQ R10, R10; \
    ADOXQ R15, R13;  ADCXQ R11, R11; \
    ADOXQ R15, R14;  ADCXQ R12, R12; \
    ADCXQ R13, R13; \
    ADCXQ R14, R14; \
    MOVQ  0+x, DX;  MULXQ DX, AX, CX;             /* A[0]^2 */ \
    MOVQ AX, 0+z; \
    ADDQ CX,  R8;  MOVQ  R8,  8+z; \
    MOVQ  8+x, DX;  MULXQ DX, AX, CX;             /* A[1]^2 */ \
    ADCQ AX,  R9;  MOVQ  R9, 16+z; \
    ADCQ CX, R10;  MOVQ R10, 24+z; \
    MOVQ 16+x, DX;  MULXQ DX, AX, CX;             /* A[2]^2 */ \
    ADCQ AX, R11;  MOVQ R11, 32+z; \
    ADCQ CX, R12;  MOVQ R12, 40+z; \
    MOVQ 24+x, DX;  MULXQ DX, AX, CX;             /* A[3]^2 */ \
    ADCQ AX, R13;  MOVQ R13, 48+z; \
    ADCQ CX, R14;  MOVQ R14, 56+z;

// reduceFromDoubleLeg reduces an eight-limb value x (C[0]..C[7]) to a
// four-limb value z congruent to x modulo p.
// The upper half C[4..7] is multiplied by 38 and added to the lower half
// (2^256 is congruent to 38 modulo p); the limb that overflows is folded
// back the same way, followed by a final conditional +38.
// Every limb of x is read before the first store to z.
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
    /* 2*C = 38, congruent to 2^256 mod p */ \
    MOVL $38, AX;  MULQ 32+x;  MOVQ AX,  R8;  MOVQ DX,  R9;  /* 38*C[4] */ \
    MOVL $38, AX;  MULQ 40+x;  MOVQ AX, R12;  MOVQ DX, R10;  /* 38*C[5] */ \
    MOVL $38, AX;  MULQ 48+x;  MOVQ AX, R13;  MOVQ DX, R11;  /* 38*C[6] */ \
    MOVL $38, AX;  MULQ 56+x;                                /* 38*C[7] */ \
    ADDQ R12,  R9; \
    ADCQ R13, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADDQ  0+x,  R8; \
    ADCQ  8+x,  R9; \
    ADCQ 16+x, R10; \
    ADCQ 24+x, R11; \
    ADCQ   $0,  DX; \
    MOVL $38, AX; \
    IMULQ AX, DX;                    /* 38 * overflow limb; CF=0, OF=0 */ \
    ADDQ DX,  R8; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ $0, R11;  MOVQ R11, 24+z; \
    MOVL $0, DX; \
    CMOVQCS AX, DX;                  /* DX = carry ? 38 : 0 */ \
    ADDQ DX,  R8;  MOVQ  R8,  0+z;

// reduceFromDoubleAdx reduces an eight-limb value x (C[0]..C[7]) to a
// four-limb value z congruent to x modulo p, using MULXQ with the
// ADCXQ/ADOXQ carry chains. The upper half is multiplied by 38 and added
// to the lower half (2^256 is congruent to 38 modulo p); the overflow
// limb is folded back the same way, followed by a final conditional +38.
// Every limb of x is read before the first store to z.
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64, bmi2, adx
#define reduceFromDoubleAdx(z,x) \
    MOVL $38, DX;                    /* 2*C = 38, congruent to 2^256 mod p */ \
    MULXQ 32+x,  R8, R10;  XORL AX, AX;      ADOXQ  0+x,  R8;  /* 38*C[4] */ \
    MULXQ 40+x,  R9, R11;  ADCXQ R10,  R9;   ADOXQ  8+x,  R9;  /* 38*C[5] */ \
    MULXQ 48+x, R10, R13;  ADCXQ R11, R10;   ADOXQ 16+x, R10;  /* 38*C[6] */ \
    MULXQ 56+x, R11, R12;  ADCXQ R13, R11;   ADOXQ 24+x, R11;  /* 38*C[7] */ \
    ADCXQ AX, R12;  ADOXQ AX, R12; \
    IMULQ DX, R12;                   /* 38 * overflow limb; CF=0, OF=0 */ \
    ADCXQ R12,  R8; \
    ADCXQ  AX,  R9;  MOVQ  R9,  8+z; \
    ADCXQ  AX, R10;  MOVQ R10, 16+z; \
    ADCXQ  AX, R11;  MOVQ R11, 24+z; \
    MOVL $0, R12; \
    CMOVQCS DX, R12;                 /* R12 = carry ? 38 : 0 */ \
    ADDQ R12,  R8;  MOVQ  R8,  0+z;

// addSub calculates two operations at once: x,y = x+y,x-y
// The original limbs of x are kept in R12-R15 for the difference while
// R8-R11 accumulate the sum. The carry (respectively borrow) out of the
// top limb is folded back by adding (respectively subtracting) 38, at
// most twice each. Both results are stored only after every limb of x
// and y has been read.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, cmov
#define addSub(x,y) \
    MOVL $38, AX; \
    XORL DX, DX; \
    /* sum in R8-R11, copy of x in R12-R15 */ \
    MOVQ  0+x,  R8;  MOVQ  R8, R12;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  MOVQ  R9, R13;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  MOVQ R10, R14;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  MOVQ R11, R15;  ADCQ 24+y, R11; \
    CMOVQCS AX, DX; \
    XORL AX, AX; \
    ADDQ DX,  R8; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ $0, R11; \
    MOVL $38, DX; \
    CMOVQCS DX, AX; \
    ADDQ AX,  R8; \
    /* difference in R12-R15 */ \
    MOVL $38, AX; \
    SUBQ  0+y, R12; \
    SBBQ  8+y, R13; \
    SBBQ 16+y, R14; \
    SBBQ 24+y, R15; \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SBBQ $0, R15; \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R12; \
    /* write back: x = sum, y = difference */ \
    MOVQ  R8,  0+x; \
    MOVQ  R9,  8+x; \
    MOVQ R10, 16+x; \
    MOVQ R11, 24+x; \
    MOVQ R12,  0+y; \
    MOVQ R13,  8+y; \
    MOVQ R14, 16+y; \
    MOVQ R15, 24+y;