github.com/cloudflare/circl@v1.5.0/math/fp448/fp_amd64.h

// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// CHECK_BMI2ADX triggers bmi2adx if supported,
// otherwise it falls back to legacy code.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
    CMPB ·hasBmi2Adx(SB), $0 \
    JE label \
    bmi2adx \
    RET \
    label: \
    legacy \
    RET

// cselect is a conditional move
// if b=1: it copies y into x;
// if b=0: x remains unchanged;
// if b is neither 0 nor 1: undefined.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b \
    MOVQ 0+x, AX; MOVQ 0+y, DX; CMOVQNE DX, AX; MOVQ AX, 0+x; \
    MOVQ 8+x, AX; MOVQ 8+y, DX; CMOVQNE DX, AX; MOVQ AX, 8+x; \
    MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
    MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x; \
    MOVQ 32+x, AX; MOVQ 32+y, DX; CMOVQNE DX, AX; MOVQ AX, 32+x; \
    MOVQ 40+x, AX; MOVQ 40+y, DX; CMOVQNE DX, AX; MOVQ AX, 40+x; \
    MOVQ 48+x, AX; MOVQ 48+y, DX; CMOVQNE DX, AX; MOVQ AX, 48+x;

// cswap is a conditional swap
// if b=1: x,y <- y,x;
// if b=0: x,y remain unchanged;
// if b is neither 0 nor 1: undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
    TESTQ b, b \
    MOVQ 0+x, AX; MOVQ AX, R8; MOVQ 0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 0+x; MOVQ DX, 0+y; \
    MOVQ 8+x, AX; MOVQ AX, R8; MOVQ 8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 8+x; MOVQ DX, 8+y; \
    MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
    MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y; \
    MOVQ 32+x, AX; MOVQ AX, R8; MOVQ 32+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 32+x; MOVQ DX, 32+y; \
    MOVQ 40+x, AX; MOVQ AX, R8; MOVQ 40+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 40+x; MOVQ DX, 40+y; \
    MOVQ 48+x, AX; MOVQ AX, R8; MOVQ 48+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 48+x; MOVQ DX, 48+y;

// additionLeg adds x and y and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define additionLeg(z,x,y) \
    MOVQ 0+x, R8; ADDQ 0+y, R8; \
    MOVQ 8+x, R9; ADCQ 8+y, R9; \
    MOVQ 16+x, R10; ADCQ 16+y, R10; \
    MOVQ 24+x, R11; ADCQ 24+y, R11; \
    MOVQ 32+x, R12; ADCQ 32+y, R12; \
    MOVQ 40+x, R13; ADCQ 40+y, R13; \
    MOVQ 48+x, R14; ADCQ 48+y, R14; \
    MOVQ $0, AX; ADCQ $0, AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    ADDQ AX, R8; MOVQ $0, AX; \
    ADCQ $0, R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0, AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    ADDQ AX, R8; MOVQ R8, 0+z; \
    ADCQ $0, R9; MOVQ R9, 8+z; \
    ADCQ $0, R10; MOVQ R10, 16+z; \
    ADCQ DX, R11; MOVQ R11, 24+z; \
    ADCQ $0, R12; MOVQ R12, 32+z; \
    ADCQ $0, R13; MOVQ R13, 40+z; \
    ADCQ $0, R14; MOVQ R14, 48+z;
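
// Both addition variants end with a weak-reduction fold. The field here is
// GF(p) for p = 2^448 - 2^224 - 1 (the Goldilocks prime of RFC 7748), with
// elements stored as seven 64-bit limbs. Since 2^448 = 2^224 + 1 (mod p), a
// carry bit c out of limb 6 is reabsorbed by adding c at bit 0 and at bit
// 224; bit 224 falls in the middle of limb 3 (224 = 3*64 + 32), which is why
// the carry is shifted left by 32 (SHLQ/SHLXQ) before being added into R11.
// The fold runs twice so the final carry is fully absorbed and the result
// fits in 448 bits (it may still be >= p; the reduction is not canonical).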

// additionAdx adds x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, adx
#define additionAdx(z,x,y) \
    MOVL $32, R15; \
    XORL DX, DX; \
    MOVQ 0+x, R8; ADCXQ 0+y, R8; \
    MOVQ 8+x, R9; ADCXQ 8+y, R9; \
    MOVQ 16+x, R10; ADCXQ 16+y, R10; \
    MOVQ 24+x, R11; ADCXQ 24+y, R11; \
    MOVQ 32+x, R12; ADCXQ 32+y, R12; \
    MOVQ 40+x, R13; ADCXQ 40+y, R13; \
    MOVQ 48+x, R14; ADCXQ 48+y, R14; \
    ;;;;;;;;;;;;;;; ADCXQ DX, DX; \
    XORL AX, AX; \
    ADCXQ DX, R8; SHLXQ R15, DX, DX; \
    ADCXQ AX, R9; \
    ADCXQ AX, R10; \
    ADCXQ DX, R11; \
    ADCXQ AX, R12; \
    ADCXQ AX, R13; \
    ADCXQ AX, R14; \
    ADCXQ AX, AX; \
    XORL DX, DX; \
    ADCXQ AX, R8; MOVQ R8, 0+z; SHLXQ R15, AX, AX; \
    ADCXQ DX, R9; MOVQ R9, 8+z; \
    ADCXQ DX, R10; MOVQ R10, 16+z; \
    ADCXQ AX, R11; MOVQ R11, 24+z; \
    ADCXQ DX, R12; MOVQ R12, 32+z; \
    ADCXQ DX, R13; MOVQ R13, 40+z; \
    ADCXQ DX, R14; MOVQ R14, 48+z;

// subtraction subtracts y from x and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define subtraction(z,x,y) \
    MOVQ 0+x, R8; SUBQ 0+y, R8; \
    MOVQ 8+x, R9; SBBQ 8+y, R9; \
    MOVQ 16+x, R10; SBBQ 16+y, R10; \
    MOVQ 24+x, R11; SBBQ 24+y, R11; \
    MOVQ 32+x, R12; SBBQ 32+y, R12; \
    MOVQ 40+x, R13; SBBQ 40+y, R13; \
    MOVQ 48+x, R14; SBBQ 48+y, R14; \
    MOVQ $0, AX; SETCS AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    SUBQ AX, R8; MOVQ $0, AX; \
    SBBQ $0, R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    SUBQ AX, R8; MOVQ R8, 0+z; \
    SBBQ $0, R9; MOVQ R9, 8+z; \
    SBBQ $0, R10; MOVQ R10, 16+z; \
    SBBQ DX, R11; MOVQ R11, 24+z; \
    SBBQ $0, R12; MOVQ R12, 32+z; \
    SBBQ $0, R13; MOVQ R13, 40+z; \
    SBBQ $0, R14; MOVQ R14, 48+z;

// maddBmi2Adx multiplies x and y and accumulates in z
// Uses: AX, DX, R15, FLAGS
// Instr: x86_64, bmi2, adx
#define maddBmi2Adx(z,x,y,i,r0,r1,r2,r3,r4,r5,r6) \
    MOVQ i+y, DX; XORL AX, AX; \
    MULXQ 0+x, AX, R8; ADOXQ AX, r0; ADCXQ R8, r1; MOVQ r0, i+z; \
    MULXQ 8+x, AX, r0; ADOXQ AX, r1; ADCXQ r0, r2; MOVQ $0, R8; \
    MULXQ 16+x, AX, r0; ADOXQ AX, r2; ADCXQ r0, r3; \
    MULXQ 24+x, AX, r0; ADOXQ AX, r3; ADCXQ r0, r4; \
    MULXQ 32+x, AX, r0; ADOXQ AX, r4; ADCXQ r0, r5; \
    MULXQ 40+x, AX, r0; ADOXQ AX, r5; ADCXQ r0, r6; \
    MULXQ 48+x, AX, r0; ADOXQ AX, r6; ADCXQ R8, r0; \
    ;;;;;;;;;;;;;;;;;;; ADOXQ R8, r0;

// integerMulAdx multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
    MOVL $0, R15; \
    MOVQ 0+y, DX; XORL AX, AX; MOVQ $0, R8; \
    MULXQ 0+x, AX, R9; MOVQ AX, 0+z; \
    MULXQ 8+x, AX, R10; ADCXQ AX, R9; \
    MULXQ 16+x, AX, R11; ADCXQ AX, R10; \
    MULXQ 24+x, AX, R12; ADCXQ AX, R11; \
    MULXQ 32+x, AX, R13; ADCXQ AX, R12; \
    MULXQ 40+x, AX, R14; ADCXQ AX, R13; \
    MULXQ 48+x, AX, R15; ADCXQ AX, R14; \
    ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R15; \
    maddBmi2Adx(z,x,y, 8, R9,R10,R11,R12,R13,R14,R15) \
    maddBmi2Adx(z,x,y,16, R10,R11,R12,R13,R14,R15, R9) \
    maddBmi2Adx(z,x,y,24, R11,R12,R13,R14,R15, R9,R10) \
    maddBmi2Adx(z,x,y,32, R12,R13,R14,R15, R9,R10,R11) \
    maddBmi2Adx(z,x,y,40, R13,R14,R15, R9,R10,R11,R12) \
    maddBmi2Adx(z,x,y,48, R14,R15, R9,R10,R11,R12,R13) \
    MOVQ R15, 56+z; \
    MOVQ R9, 64+z; \
    MOVQ R10, 72+z; \
    MOVQ R11, 80+z; \
    MOVQ R12, 88+z; \
    MOVQ R13, 96+z; \
    MOVQ R14, 104+z;
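
// The Adx multiplier above interleaves two independent carry chains: ADCXQ
// updates only CF and ADOXQ updates only OF, while MULXQ leaves the flags
// untouched, so the low and high halves of each partial product can be
// accumulated in parallel without saving flags. The legacy routines below
// use the widening MULQ (DX:AX = AX * operand) and a single ADDQ/ADCQ chain
// instead, folding each partial product in before the next one is computed.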

// maddLegacy multiplies x and y and accumulates in z
// Uses: AX, DX, R15, FLAGS
// Instr: x86_64
#define maddLegacy(z,x,y,i) \
    MOVQ i+y, R15; \
    MOVQ 0+x, AX; MULQ R15; MOVQ AX, R8; ;;;;;;;;;;;; MOVQ DX, R9; \
    MOVQ 8+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
    ADDQ 0+i+z, R8; MOVQ R8, 0+i+z; \
    ADCQ 8+i+z, R9; MOVQ R9, 8+i+z; \
    ADCQ 16+i+z, R10; MOVQ R10, 16+i+z; \
    ADCQ 24+i+z, R11; MOVQ R11, 24+i+z; \
    ADCQ 32+i+z, R12; MOVQ R12, 32+i+z; \
    ADCQ 40+i+z, R13; MOVQ R13, 40+i+z; \
    ADCQ 48+i+z, R14; MOVQ R14, 48+i+z; \
    ADCQ $0, DX; MOVQ DX, 56+i+z;

// integerMulLeg multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
    MOVQ 0+y, R15; \
    MOVQ 0+x, AX; MULQ R15; MOVQ AX, 0+z; ;;;;;;;;;;;; MOVQ DX, R8; \
    MOVQ 8+x, AX; MULQ R15; ADDQ AX, R8; ADCQ $0, DX; MOVQ DX, R9; MOVQ R8, 8+z; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; MOVQ R9, 16+z; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; MOVQ R10, 24+z; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; MOVQ R11, 32+z; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; MOVQ R12, 40+z; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, 56+z; MOVQ R13, 48+z; \
    maddLegacy(z,x,y, 8) \
    maddLegacy(z,x,y,16) \
    maddLegacy(z,x,y,24) \
    maddLegacy(z,x,y,32) \
    maddLegacy(z,x,y,40) \
    maddLegacy(z,x,y,48)
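
// integerSqrLeg below exploits the symmetry of squaring: each cross product
// x[i]*x[j] (i < j) is computed once against a pre-doubled limb instead of
// twice. The doubled (or carry-incremented) limb is a 65-bit value held as
// (R15:CX); since MULQ only sees the low word CX, the missing contribution
// R15*x[i]*2^64 is reinstated into the product's high word by the NEGQ/ANDQ
// mask, which evaluates to x[i] when R15 = 1 and to 0 otherwise.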

// integerSqrLeg squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
    XORL R15, R15; \
    MOVQ 0+x, CX; \
    MOVQ CX, AX; MULQ CX; MOVQ AX, 0+z; MOVQ DX, R8; \
    ADDQ CX, CX; ADCQ $0, R15; \
    MOVQ 8+x, AX; MULQ CX; ADDQ AX, R8; ADCQ $0, DX; MOVQ DX, R9; MOVQ R8, 8+z; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    \
    MOVQ 8+x, CX; \
    MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ R9, 16+z; \
    MOVQ R15, AX; NEGQ AX; ANDQ 8+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ 8+x, CX; ADCQ $0, R15; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 24+z; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R9; \
    \
    MOVQ 16+x, CX; \
    MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 32+z; \
    MOVQ R15, AX; NEGQ AX; ANDQ 16+x, AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 16+x, CX; ADCQ $0, R15; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 40+z; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; ADDQ R8, R9; ADCQ $0, DX; MOVQ DX, R10; \
    \
    MOVQ 24+x, CX; \
    MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 48+z; \
    MOVQ R15, AX; NEGQ AX; ANDQ 24+x, AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
    ADDQ 24+x, CX; ADCQ $0, R15; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; MOVQ R14, 56+z; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; ADDQ R8, R9; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R11; \
    \
    MOVQ 32+x, CX; \
    MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ R9, 64+z; \
    MOVQ R15, AX; NEGQ AX; ANDQ 32+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ 32+x, CX; ADCQ $0, R15; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 72+z; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R12; \
    \
    XORL R13, R13; \
    XORL R14, R14; \
    MOVQ 40+x, CX; \
    MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 80+z; \
    MOVQ R15, AX; NEGQ AX; ANDQ 40+x, AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 40+x, CX; ADCQ $0, R15; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R13; ADCQ $0, R14; \
    \
    XORL R9, R9; \
    MOVQ 48+x, CX; \
    MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 96+z; \
    MOVQ R15, AX; NEGQ AX; ANDQ 48+x, AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R14; ADCQ $0, R9; MOVQ R14, 104+z;
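
// integerSqrAdx below is the same half-product squaring with MULQ replaced
// by the flag-free MULXQ, the row accumulation split across the ADCXQ (CF)
// and ADOXQ (OF) carry chains, and the 65-bit doubled limb handled with the
// same R15 top-bit and NEGQ/ANDQ mask trick as in the legacy version above.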

// integerSqrAdx squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
    XORL R15, R15; \
    MOVQ 0+x, DX; \
    ;;;;;;;;;;;;;; MULXQ DX, AX, R8; MOVQ AX, 0+z; \
    ADDQ DX, DX; ADCQ $0, R15; CLC; \
    MULXQ 8+x, AX, R9; ADCXQ AX, R8; MOVQ R8, 8+z; \
    MULXQ 16+x, AX, R10; ADCXQ AX, R9; MOVQ $0, R8; \
    MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
    MULXQ 32+x, AX, R12; ADCXQ AX, R11; \
    MULXQ 40+x, AX, R13; ADCXQ AX, R12; \
    MULXQ 48+x, AX, R14; ADCXQ AX, R13; \
    ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R14; \
    \
    MOVQ 8+x, DX; \
    MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
    MULXQ AX, AX, CX; \
    MOVQ R15, R8; NEGQ R8; ANDQ 8+x, R8; \
    ADDQ AX, R9; MOVQ R9, 16+z; \
    ADCQ CX, R8; \
    ADCQ $0, R11; \
    ADDQ 8+x, DX; \
    ADCQ $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 16+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 24+z; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; MOVQ $0, R10; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R10, R9; \
    \
    MOVQ 16+x, DX; \
    MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
    MULXQ AX, AX, CX; \
    MOVQ R15, R8; NEGQ R8; ANDQ 16+x, R8; \
    ADDQ AX, R11; MOVQ R11, 32+z; \
    ADCQ CX, R8; \
    ADCQ $0, R13; \
    ADDQ 16+x, DX; \
    ADCQ $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 40+z; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; MOVQ $0, R12; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R9; ADOXQ CX, R10; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R11, R10; \
    \
    MOVQ 24+x, DX; \
    MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
    MULXQ AX, AX, CX; \
    MOVQ R15, R8; NEGQ R8; ANDQ 24+x, R8; \
    ADDQ AX, R13; MOVQ R13, 48+z; \
    ADCQ CX, R8; \
    ADCQ $0, R9; \
    ADDQ 24+x, DX; \
    ADCQ $0, R15; \
    XORL R13, R13; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R14; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; MOVQ R14, 56+z; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R9; ADOXQ CX, R10; MOVQ $0, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R12, R11; \
    \
    MOVQ 32+x, DX; \
    MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
    MULXQ AX, AX, CX; \
    MOVQ R15, R8; NEGQ R8; ANDQ 32+x, R8; \
    ADDQ AX, R9; MOVQ R9, 64+z; \
    ADCQ CX, R8; \
    ADCQ $0, R11; \
    ADDQ 32+x, DX; \
    ADCQ $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 72+z; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R13, R12; \
    \
    MOVQ 40+x, DX; \
    MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
    MULXQ AX, AX, CX; \
    MOVQ R15, R8; NEGQ R8; ANDQ 40+x, R8; \
    ADDQ AX, R11; MOVQ R11, 80+z; \
    ADCQ CX, R8; \
    ADCQ $0, R13; \
    ADDQ 40+x, DX; \
    ADCQ $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R14, R13; \
    \
    MOVQ 48+x, DX; \
    MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
    MULXQ AX, AX, CX; \
    MOVQ R15, R8; NEGQ R8; ANDQ 48+x, R8; \
    XORL R10, R10; ;;;;;;;;;;;;;; ADOXQ CX, R14; \
    ;;;;;;;;;;;;;; ADCXQ AX, R13; ;;;;;;;;;;;;;; MOVQ R13, 96+z; \
    ;;;;;;;;;;;;;; ADCXQ R8, R14; MOVQ R14, 104+z;
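
// The reducers below fold a 14-limb (896-bit) product into seven limbs
// congruent to it modulo p = 2^448 - 2^224 - 1. Writing the input as
// H*2^448 + L with 448-bit halves, the identity 2^448 = 2^224 + 1 (mod p)
// gives H*2^448 + L = L + H + H*2^224 (mod p); the top 224 bits of H then
// wrap around once more, which is where the doubled words 2C10..2C13 in the
// register sketches come from. The SHRQ $32 double shifts realign the 32-bit
// word boundaries, and the last two single-carry folds leave z < 2^448
// (weakly reduced, not necessarily below p).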

// reduceFromDoubleLeg finds z congruent to x modulo p such that z < 2^448 and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
    /* ( ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11, r10,r9,r8,r15) */ \
    MOVQ 80+x, AX; MOVQ AX, R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8, R10; \
    \
    MOVQ $0, R14; \
    MOVQ 104+x, R13; SHLQ $1, R13, R14; \
    MOVQ 96+x, R12; SHLQ $1, R12, R13; \
    MOVQ 88+x, R11; SHLQ $1, R11, R12; \
    MOVQ 72+x, R9; SHLQ $1, R10, R11; \
    MOVQ 64+x, R8; SHLQ $1, R10; \
    MOVQ $0xFFFFFFFF, R15; ANDQ R15, AX; ORQ AX, R10; \
    MOVQ 56+x, R15; \
    \
    ADDQ 0+x, R15; MOVQ R15, 0+z; MOVQ 56+x, R15; \
    ADCQ 8+x, R8; MOVQ R8, 8+z; MOVQ 64+x, R8; \
    ADCQ 16+x, R9; MOVQ R9, 16+z; MOVQ 72+x, R9; \
    ADCQ 24+x, R10; MOVQ R10, 24+z; MOVQ 80+x, R10; \
    ADCQ 32+x, R11; MOVQ R11, 32+z; MOVQ 88+x, R11; \
    ADCQ 40+x, R12; MOVQ R12, 40+z; MOVQ 96+x, R12; \
    ADCQ 48+x, R13; MOVQ R13, 48+z; MOVQ 104+x, R13; \
    ADCQ $0, R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* ( r9, r8, r15, r13, r12, r11, r10) */ \
    MOVQ R10, AX; \
    SHRQ $32, R11, R10; \
    SHRQ $32, R12, R11; \
    SHRQ $32, R13, R12; \
    SHRQ $32, R15, R13; \
    SHRQ $32, R8, R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    ADDQ 0+z, R10; \
    ADCQ 8+z, R11; \
    ADCQ 16+z, R12; \
    ADCQ 24+z, R13; \
    ADCQ 32+z, R15; \
    ADCQ 40+z, R8; \
    ADCQ 48+z, R9; \
    ADCQ $0, R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    ADDQ R14, R10; MOVQ $0, R14; \
    ADCQ $0, R11; \
    ADCQ $0, R12; \
    ADCQ AX, R13; \
    ADCQ $0, R15; \
    ADCQ $0, R8; \
    ADCQ $0, R9; \
    ADCQ $0, R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    ADDQ R14, R10; MOVQ R10, 0+z; \
    ADCQ $0, R11; MOVQ R11, 8+z; \
    ADCQ $0, R12; MOVQ R12, 16+z; \
    ADCQ AX, R13; MOVQ R13, 24+z; \
    ADCQ $0, R15; MOVQ R15, 32+z; \
    ADCQ $0, R8; MOVQ R8, 40+z; \
    ADCQ $0, R9; MOVQ R9, 48+z;

// reduceFromDoubleAdx finds z congruent to x modulo p such that z < 2^448 and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64, adx
#define reduceFromDoubleAdx(z,x) \
    /* ( ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11, r10,r9,r8,r15) */ \
    MOVQ 80+x, AX; MOVQ AX, R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8, R10; \
    \
    MOVQ $0, R14; \
    MOVQ 104+x, R13; SHLQ $1, R13, R14; \
    MOVQ 96+x, R12; SHLQ $1, R12, R13; \
    MOVQ 88+x, R11; SHLQ $1, R11, R12; \
    MOVQ 72+x, R9; SHLQ $1, R10, R11; \
    MOVQ 64+x, R8; SHLQ $1, R10; \
    MOVQ $0xFFFFFFFF, R15; ANDQ R15, AX; ORQ AX, R10; \
    MOVQ 56+x, R15; \
    \
    XORL AX, AX; \
    ADCXQ 0+x, R15; MOVQ R15, 0+z; MOVQ 56+x, R15; \
    ADCXQ 8+x, R8; MOVQ R8, 8+z; MOVQ 64+x, R8; \
    ADCXQ 16+x, R9; MOVQ R9, 16+z; MOVQ 72+x, R9; \
    ADCXQ 24+x, R10; MOVQ R10, 24+z; MOVQ 80+x, R10; \
    ADCXQ 32+x, R11; MOVQ R11, 32+z; MOVQ 88+x, R11; \
    ADCXQ 40+x, R12; MOVQ R12, 40+z; MOVQ 96+x, R12; \
    ADCXQ 48+x, R13; MOVQ R13, 48+z; MOVQ 104+x, R13; \
    ADCXQ AX, R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* ( r9, r8, r15, r13, r12, r11, r10) */ \
    MOVQ R10, AX; \
    SHRQ $32, R11, R10; \
    SHRQ $32, R12, R11; \
    SHRQ $32, R13, R12; \
    SHRQ $32, R15, R13; \
    SHRQ $32, R8, R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    XORL AX, AX; \
    ADCXQ 0+z, R10; \
    ADCXQ 8+z, R11; \
    ADCXQ 16+z, R12; \
    ADCXQ 24+z, R13; \
    ADCXQ 32+z, R15; \
    ADCXQ 40+z, R8; \
    ADCXQ 48+z, R9; \
    ADCXQ AX, R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14, R10; MOVQ $0, R14; \
    ADCXQ R14, R11; \
    ADCXQ R14, R12; \
    ADCXQ AX, R13; \
    ADCXQ R14, R15; \
    ADCXQ R14, R8; \
    ADCXQ R14, R9; \
    ADCXQ R14, R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14, R10; MOVQ R10, 0+z; MOVQ $0, R14; \
    ADCXQ R14, R11; MOVQ R11, 8+z; \
    ADCXQ R14, R12; MOVQ R12, 16+z; \
    ADCXQ AX, R13; MOVQ R13, 24+z; \
    ADCXQ R14, R15; MOVQ R15, 32+z; \
    ADCXQ R14, R8; MOVQ R8, 40+z; \
    ADCXQ R14, R9; MOVQ R9, 48+z;
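
// addSub below fuses the addition and subtraction kernels for the ladder
// butterfly x,y = x+y, x-y: the sum is written back into x while the old x
// limbs are swapped out into R8-R14 through AX, so the subtraction can reuse
// them without reloading memory. Both halves apply the same two-round
// 2^224+1 carry fold, so both results are weakly reduced (below 2^448).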

// addSub calculates two operations: x,y = x+y,x-y
// Uses: AX, DX, R8-R15, FLAGS
#define addSub(x,y) \
    MOVQ 0+x, R8; ADDQ 0+y, R8; \
    MOVQ 8+x, R9; ADCQ 8+y, R9; \
    MOVQ 16+x, R10; ADCQ 16+y, R10; \
    MOVQ 24+x, R11; ADCQ 24+y, R11; \
    MOVQ 32+x, R12; ADCQ 32+y, R12; \
    MOVQ 40+x, R13; ADCQ 40+y, R13; \
    MOVQ 48+x, R14; ADCQ 48+y, R14; \
    MOVQ $0, AX; ADCQ $0, AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    ADDQ AX, R8; MOVQ $0, AX; \
    ADCQ $0, R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0, AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    ADDQ AX, R8; MOVQ 0+x, AX; MOVQ R8, 0+x; MOVQ AX, R8; \
    ADCQ $0, R9; MOVQ 8+x, AX; MOVQ R9, 8+x; MOVQ AX, R9; \
    ADCQ $0, R10; MOVQ 16+x, AX; MOVQ R10, 16+x; MOVQ AX, R10; \
    ADCQ DX, R11; MOVQ 24+x, AX; MOVQ R11, 24+x; MOVQ AX, R11; \
    ADCQ $0, R12; MOVQ 32+x, AX; MOVQ R12, 32+x; MOVQ AX, R12; \
    ADCQ $0, R13; MOVQ 40+x, AX; MOVQ R13, 40+x; MOVQ AX, R13; \
    ADCQ $0, R14; MOVQ 48+x, AX; MOVQ R14, 48+x; MOVQ AX, R14; \
    SUBQ 0+y, R8; \
    SBBQ 8+y, R9; \
    SBBQ 16+y, R10; \
    SBBQ 24+y, R11; \
    SBBQ 32+y, R12; \
    SBBQ 40+y, R13; \
    SBBQ 48+y, R14; \
    MOVQ $0, AX; SETCS AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    SUBQ AX, R8; MOVQ $0, AX; \
    SBBQ $0, R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX, DX; \
    SHLQ $32, DX; \
    SUBQ AX, R8; MOVQ R8, 0+y; \
    SBBQ $0, R9; MOVQ R9, 8+y; \
    SBBQ $0, R10; MOVQ R10, 16+y; \
    SBBQ DX, R11; MOVQ R11, 24+y; \
    SBBQ $0, R12; MOVQ R12, 32+y; \
    SBBQ $0, R13; MOVQ R13, 40+y; \
    SBBQ $0, R14; MOVQ R14, 48+y;
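
// All of these macros pass intermediate values through fixed registers, so
// callers must treat everything named in a macro's "Uses:" line as
// clobbered. They are intended to be expanded from the package's assembly
// routines, where CHECK_BMI2ADX dispatches at run time between the legacy
// and Adx variants based on the ·hasBmi2Adx flag.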