// Fp503 arithmetic for SIDH/SIKE (amd64 Go assembly).
// From github.com/cloudflare/circl v1.5.0 — dh/sidh/internal/p503/arith_amd64.s.
// All routines are constant-time: conditional operations use arithmetic masks,
// never data-dependent branches.
// +build amd64,!purego

#include "textflag.h"

// p503
#define P503_0 $0xFFFFFFFFFFFFFFFF
#define P503_1 $0xFFFFFFFFFFFFFFFF
#define P503_2 $0xFFFFFFFFFFFFFFFF
#define P503_3 $0xABFFFFFFFFFFFFFF
#define P503_4 $0x13085BDA2211E7A0
#define P503_5 $0x1B9BF6C87B7E7DAF
#define P503_6 $0x6045C6BDDA77A4D0
#define P503_7 $0x004066F541811E1E

// p503+1
#define P503P1_3 $0xAC00000000000000
#define P503P1_4 $0x13085BDA2211E7A0
#define P503P1_5 $0x1B9BF6C87B7E7DAF
#define P503P1_6 $0x6045C6BDDA77A4D0
#define P503P1_7 $0x004066F541811E1E

// p503x2
#define P503X2_0 $0xFFFFFFFFFFFFFFFE
#define P503X2_1 $0xFFFFFFFFFFFFFFFF
#define P503X2_2 $0xFFFFFFFFFFFFFFFF
#define P503X2_3 $0x57FFFFFFFFFFFFFF
#define P503X2_4 $0x2610B7B44423CF41
#define P503X2_5 $0x3737ED90F6FCFB5E
#define P503X2_6 $0xC08B8D7BB4EF49A0
#define P503X2_7 $0x0080CDEA83023C3C

#define REG_P1 DI
#define REG_P2 SI
#define REG_P3 DX

// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses MULX instruction. Macro smashes value in DX.
// Input: I0 and I1.
// Output: O
// All the other arguments are registers, used for storing temporary values
#define MULS256_MULX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	MOVQ  I0, DX        \
	MULXQ I1, T1, T0    \ // T0:T1 = U0*V0
	MOVQ  T1, O         \ // O[0]
	MULXQ 8+I1, T2, T1  \ // T1:T2 = U0*V1
	ADDQ  T2, T0        \
	MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
	ADCQ  T3, T1        \
	MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
	ADCQ  T4, T2        \
	\ // Column U1
	MOVQ  8+I0, DX      \
	ADCQ  $0, T3        \
	MULXQ 0+I1, T4, T5  \ // T5:T4 = U1*V0
	MULXQ 8+I1, T7, T6  \ // T6:T7 = U1*V1
	ADDQ  T7, T5        \
	MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
	ADCQ  T8, T6        \
	MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
	ADCQ  T9, T7        \
	ADCQ  $0, T8        \
	ADDQ  T0, T4        \
	MOVQ  T4, 8+O       \ // O[1]
	ADCQ  T1, T5        \
	ADCQ  T2, T6        \
	ADCQ  T3, T7        \
	\ // Column U2
	MOVQ  16+I0, DX     \
	ADCQ  $0, T8        \
	MULXQ 0+I1, T0, T1  \ // T1:T0 = U2*V0
	MULXQ 8+I1, T3, T2  \ // T2:T3 = U2*V1
	ADDQ  T3, T1        \
	MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
	ADCQ  T4, T2        \
	MULXQ 24+I1, T9, T4 \ // T4:T9 = U2*V3
	ADCQ  T9, T3        \
	\ // Column U3
	MOVQ  24+I0, DX     \
	ADCQ  $0, T4        \
	ADDQ  T5, T0        \
	MOVQ  T0, 16+O      \ // O[2]
	ADCQ  T6, T1        \
	ADCQ  T7, T2        \
	ADCQ  T8, T3        \
	ADCQ  $0, T4        \
	MULXQ 0+I1, T0, T5  \ // T5:T0 = U3*V0
	MULXQ 8+I1, T7, T6  \ // T6:T7 = U3*V1
	ADDQ  T7, T5        \
	MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
	ADCQ  T8, T6        \
	MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
	ADCQ  T9, T7        \
	ADCQ  $0, T8        \
	\ // Add values in remaining columns
	ADDQ  T0, T1        \
	MOVQ  T1, 24+O      \ // O[3]
	ADCQ  T5, T2        \
	MOVQ  T2, 32+O      \ // O[4]
	ADCQ  T6, T3        \
	MOVQ  T3, 40+O      \ // O[5]
	ADCQ  T7, T4        \
	MOVQ  T4, 48+O      \ // O[6]
	ADCQ  $0, T8        \ // O[7]
	MOVQ  T8, 56+O

// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses ADOX, ADCX and MULX instructions. Macro smashes values in AX and DX.
// Input: I0 and I1.
// Output: O
// All the other arguments registers are used for storing temporary values
#define MULS256_MULX_ADCX_ADOX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	\ // U0[0]
	MOVQ  0+I0, DX      \ // MULX requires the multiplier in DX
	\ // T0:T1 = I1*DX
	MULXQ I1, T1, T0    \ // T0:T1 = U0*V0 (low:high)
	MOVQ  T1, O         \ // O0[0]
	MULXQ 8+I1, T2, T1  \ // T2:T1 = U0*V1
	XORQ  AX, AX        \
	ADOXQ T2, T0        \
	MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
	ADOXQ T3, T1        \
	MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
	ADOXQ T4, T2        \
	\ // Column U1
	MOVQ  8+I0, DX      \
	MULXQ I1, T4, T5    \ // T5:T4 = U1*V0
	ADOXQ AX, T3        \
	XORQ  AX, AX        \
	MULXQ 8+I1, T7, T6  \ // T6:T7 = U1*V1
	ADOXQ T0, T4        \
	MOVQ  T4, 8+O       \ // O[1]
	ADCXQ T7, T5        \
	MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
	ADCXQ T8, T6        \
	ADOXQ T1, T5        \
	MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
	ADCXQ T9, T7        \
	ADCXQ AX, T8        \
	ADOXQ T2, T6        \
	\ // Column U2
	MOVQ  16+I0, DX     \
	MULXQ I1, T0, T1    \ // T1:T0 = U2*V0
	ADOXQ T3, T7        \
	ADOXQ AX, T8        \
	XORQ  AX, AX        \
	MULXQ 8+I1, T3, T2  \ // T2:T3 = U2*V1
	ADOXQ T5, T0        \
	MOVQ  T0, 16+O      \ // O[2]
	ADCXQ T3, T1        \
	MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
	ADCXQ T4, T2        \
	ADOXQ T6, T1        \
	MULXQ 24+I1, T9, T4 \ // T9:T4 = U2*V3
	ADCXQ T9, T3        \
	MOVQ  24+I0, DX     \
	ADCXQ AX, T4        \
	\
	ADOXQ T7, T2        \
	ADOXQ T8, T3        \
	ADOXQ AX, T4        \
	\ // Column U3
	MULXQ I1, T0, T5    \ // T5:T0 = U3*V0
	XORQ  AX, AX        \
	MULXQ 8+I1, T7, T6  \ // T6:T7 = U3*V1
	ADCXQ T7, T5        \
	ADOXQ T0, T1        \
	MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
	ADCXQ T8, T6        \
	ADOXQ T5, T2        \
	MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
	ADCXQ T9, T7        \
	ADCXQ AX, T8        \
	\
	ADOXQ T6, T3        \
	ADOXQ T7, T4        \
	ADOXQ AX, T8        \
	MOVQ  T1, 24+O      \ // O[3]
	MOVQ  T2, 32+O      \ // O[4]
	MOVQ  T3, 40+O      \ // O[5]
	MOVQ  T4, 48+O      \ // O[6] and O[7] below
	MOVQ  T8, 56+O

// Template of a macro that performs schoolbook multiplication of 128-bit with 320-bit
// number. It uses MULX instruction This template must be customized with functions
// performing ADD (add1, add2) and ADD-with-carry (adc1, adc2). addX/adcX may or may
// not be instructions that use two independent carry chains.
// Input:
//  * I0 128-bit number
//  * I1 320-bit number
//  * add1, add2: instruction performing integer addition and starting carry chain
//  * adc1, adc2: instruction performing integer addition with carry
// Output: T[0-6] registers
#define MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, add1, add2, adc1, adc2) \
	\ // Column 0
	MOVQ  I0, DX        \
	MOVQ  I1+24(SB), AX \
	MULXQ AX, T0, T1    \
	MOVQ  I1+32(SB), AX \
	MULXQ AX, T4, T2    \
	MOVQ  I1+40(SB), AX \
	MULXQ AX, T5, T3    \
	XORQ  AX, AX        \
	add1  T4, T1        \
	adc1  T5, T2        \
	MOVQ  I1+48(SB), AX \
	MULXQ AX, T7, T4    \
	adc1  T7, T3        \
	MOVQ  I1+56(SB), AX \
	MULXQ AX, T6, T5    \
	adc1  T6, T4        \
	MOVL  $0, AX        \
	adc1  AX, T5        \
	\ // Column 1
	MOVQ  8+I0, DX      \
	MOVQ  I1+24(SB), AX \
	MULXQ AX, T6, T7    \
	add2  T6, T1        \
	adc2  T7, T2        \
	MOVQ  I1+32(SB), AX \
	MULXQ AX, T8, T6    \
	adc2  T6, T3        \
	MOVQ  I1+40(SB), AX \
	MULXQ AX, T7, T9    \
	adc2  T9, T4        \
	MOVQ  I1+48(SB), AX \
	MULXQ AX, T9, T6    \
	adc2  T6, T5        \
	MOVQ  I1+56(SB), AX \
	MULXQ AX, DX, T6    \
	MOVL  $0, AX        \
	adc2  AX, T6        \
	\ // Output
	XORQ  AX, AX        \
	add1  T8, T2        \
	adc1  T7, T3        \
	adc1  T9, T4        \
	adc1  DX, T5        \
	adc1  AX, T6

// Multiplies 128-bit with 320-bit integer. Optimized with MULX instruction.
#define MULS_128x320_MULX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADDQ, ADDQ, ADCQ, ADCQ)

// Multiplies 128-bit with 320-bit integer. Optimized with MULX, ADOX and ADCX instructions
#define MULS_128x320_MULX_ADCX_ADOX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADOXQ, ADCXQ, ADOXQ, ADCXQ)

// Template of a macro performing multiplication of two 512-bit numbers. It uses one
// level of Karatsuba and one level of schoolbook multiplication. Template must be
// customized with macro performing schoolbook multiplication.
// Input:
//  * I0, I1 - two 512-bit numbers
//  * MULS   - either MULS256_MULX or MULS256_MULX_ADCX_ADOX
// Output: OUT - 1024-bit long
#define MUL(OUT, I0, I1, MULS) \
	\ // R[8-11]: U1+U0
	XORQ AX, AX         \
	MOVQ ( 0)(I0), R8   \
	MOVQ ( 8)(I0), R9   \
	MOVQ (16)(I0), R10  \
	MOVQ (24)(I0), R11  \
	ADDQ (32)(I0), R8   \
	ADCQ (40)(I0), R9   \
	ADCQ (48)(I0), R10  \
	ADCQ (56)(I0), R11  \
	SBBQ $0, AX         \ // store mask: AX = 0 - carry(U1+U0)
	MOVQ R8, ( 0)(SP)   \
	MOVQ R9, ( 8)(SP)   \
	MOVQ R10, (16)(SP)  \
	MOVQ R11, (24)(SP)  \
	\
	\ // R[12-15]: V1+V0
	XORQ BX, BX         \
	MOVQ ( 0)(I1), R12  \
	MOVQ ( 8)(I1), R13  \
	MOVQ (16)(I1), R14  \
	MOVQ (24)(I1), R15  \
	ADDQ (32)(I1), R12  \
	ADCQ (40)(I1), R13  \
	ADCQ (48)(I1), R14  \
	ADCQ (56)(I1), R15  \
	SBBQ $0, BX         \ // store mask: BX = 0 - carry(V1+V0)
	MOVQ R12, (32)(SP)  \
	MOVQ R13, (40)(SP)  \
	MOVQ R14, (48)(SP)  \
	MOVQ R15, (56)(SP)  \
	\ // R[12-15] = (V1+V0) mod 2^256 if U1+U0 produced a carry, otherwise 0
	ANDQ AX, R12        \
	ANDQ AX, R13        \
	ANDQ AX, R14        \
	ANDQ AX, R15        \
	\ // R[8-11] = (U1+U0) mod 2^256 if V1+V0 produced a carry, otherwise 0
	ANDQ BX, R8         \
	ANDQ BX, R9         \
	ANDQ BX, R10        \
	ANDQ BX, R11        \
	\ // res = masked(U0+U1) + masked(V0+V1)
	ADDQ R12, R8        \
	ADCQ R13, R9        \
	ADCQ R14, R10       \
	ADCQ R15, R11       \
	\ // SP[64-96] <- res
	MOVQ R8, (64)(SP)   \
	MOVQ R9, (72)(SP)   \
	MOVQ R10, (80)(SP)  \
	MOVQ R11, (88)(SP)  \
	\ // BP will be used for schoolbook multiplication below
	MOVQ BP, 96(SP)     \ // push: BP is Callee-save.
	\ // (U1+U0)*(V1+V0)
	MULS((64)(OUT), 0(SP), 32(SP), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
	\ // U0 x V0
	MULS(0(OUT), 0(I0), 0(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
	\ // U1 x V1
	MULS(0(SP), 32(I0), 32(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
	\ // Recover BP
	MOVQ 96(SP), BP     \ // pop: BP is Callee-save.
	\ // Final part of schoolbook multiplication; R[8-11] = (U0+U1) x (V0+V1)
	MOVQ (64)(SP), R8   \
	MOVQ (72)(SP), R9   \
	MOVQ (80)(SP), R10  \
	MOVQ (88)(SP), R11  \
	MOVQ (96)(OUT), AX  \
	ADDQ AX, R8         \
	MOVQ (104)(OUT), AX \
	ADCQ AX, R9         \
	MOVQ (112)(OUT), AX \
	ADCQ AX, R10        \
	MOVQ (120)(OUT), AX \
	ADCQ AX, R11        \
	\ // R[12-15, 8-11] = (U0+U1) x (V0+V1) - U0xV0
	MOVQ (64)(OUT), R12 \
	MOVQ (72)(OUT), R13 \
	MOVQ (80)(OUT), R14 \
	MOVQ (88)(OUT), R15 \
	SUBQ ( 0)(OUT), R12 \
	SBBQ ( 8)(OUT), R13 \
	SBBQ (16)(OUT), R14 \
	SBBQ (24)(OUT), R15 \
	SBBQ (32)(OUT), R8  \
	SBBQ (40)(OUT), R9  \
	SBBQ (48)(OUT), R10 \
	SBBQ (56)(OUT), R11 \
	\ // r8-r15 <- (U0+U1) x (V0+V1) - U0xV0 - U1xV1
	SUBQ ( 0)(SP), R12  \
	SBBQ ( 8)(SP), R13  \
	SBBQ (16)(SP), R14  \
	SBBQ (24)(SP), R15  \
	SBBQ (32)(SP), R8   \
	SBBQ (40)(SP), R9   \
	SBBQ (48)(SP), R10  \
	SBBQ (56)(SP), R11  \
	\ // Accumulate the middle part into OUT[4-15] (high limbs of U1xV1 are on SP)
	; ADDQ (32)(OUT), R12; MOVQ R12, ( 32)(OUT) \
	; ADCQ (40)(OUT), R13; MOVQ R13, ( 40)(OUT) \
	; ADCQ (48)(OUT), R14; MOVQ R14, ( 48)(OUT) \
	; ADCQ (56)(OUT), R15; MOVQ R15, ( 56)(OUT) \
	MOVQ ( 0)(SP), AX; ADCQ AX, R8; MOVQ R8, ( 64)(OUT)    \
	MOVQ ( 8)(SP), AX; ADCQ AX, R9; MOVQ R9, ( 72)(OUT)    \
	MOVQ (16)(SP), AX; ADCQ AX, R10; MOVQ R10, ( 80)(OUT)  \
	MOVQ (24)(SP), AX; ADCQ AX, R11; MOVQ R11, ( 88)(OUT)  \
	MOVQ (32)(SP), R12; ADCQ $0, R12; MOVQ R12, ( 96)(OUT) \
	MOVQ (40)(SP), R13; ADCQ $0, R13; MOVQ R13, (104)(OUT) \
	MOVQ (48)(SP), R14; ADCQ $0, R14; MOVQ R14, (112)(OUT) \
	MOVQ (56)(SP), R15; ADCQ $0, R15; MOVQ R15, (120)(OUT)

// Template for calculating the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
// customized with schoolbook multiplication for 128 x 320-bit number.
// This macro reuses memory of IN value and *changes* it. Smashes registers
// R[8-15], BX, CX
// Input:
//  * IN: 1024-bit number to be reduced
//  * MULS: either MULS_128x320_MULX or MULS_128x320_MULX_ADCX_ADOX
// Output: OUT 512-bit
#define REDC(OUT, IN, MULS) \
	MULS(0(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \
	XORQ R15, R15       \
	ADDQ (24)(IN), R8   \
	ADCQ (32)(IN), R9   \
	ADCQ (40)(IN), R10  \
	ADCQ (48)(IN), R11  \
	ADCQ (56)(IN), R12  \
	ADCQ (64)(IN), R13  \
	ADCQ (72)(IN), R14  \
	ADCQ (80)(IN), R15  \
	MOVQ R8, (24)(IN)   \
	MOVQ R9, (32)(IN)   \
	MOVQ R10, (40)(IN)  \
	MOVQ R11, (48)(IN)  \
	MOVQ R12, (56)(IN)  \
	MOVQ R13, (64)(IN)  \
	MOVQ R14, (72)(IN)  \
	MOVQ R15, (80)(IN)  \
	MOVQ (88)(IN), R8   \
	MOVQ (96)(IN), R9   \
	MOVQ (104)(IN), R10 \
	MOVQ (112)(IN), R11 \
	MOVQ (120)(IN), R12 \
	ADCQ $0, R8         \
	ADCQ $0, R9         \
	ADCQ $0, R10        \
	ADCQ $0, R11        \
	ADCQ $0, R12        \
	MOVQ R8, (88)(IN)   \
	MOVQ R9, (96)(IN)   \
	MOVQ R10, (104)(IN) \
	MOVQ R11, (112)(IN) \
	MOVQ R12, (120)(IN) \
	\
	MULS(16(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \
	XORQ R15, R15       \
	ADDQ (40)(IN), R8   \
	ADCQ (48)(IN), R9   \
	ADCQ (56)(IN), R10  \
	ADCQ (64)(IN), R11  \
	ADCQ (72)(IN), R12  \
	ADCQ (80)(IN), R13  \
	ADCQ (88)(IN), R14  \
	ADCQ (96)(IN), R15  \
	MOVQ R8, (40)(IN)   \
	MOVQ R9, (48)(IN)   \
	MOVQ R10, (56)(IN)  \
	MOVQ R11, (64)(IN)  \
	MOVQ R12, (72)(IN)  \
	MOVQ R13, (80)(IN)  \
	MOVQ R14, (88)(IN)  \
	MOVQ R15, (96)(IN)  \
	MOVQ (104)(IN), R8  \
	MOVQ (112)(IN), R9  \
	MOVQ (120)(IN), R10 \
	ADCQ $0, R8         \
	ADCQ $0, R9         \
	ADCQ $0, R10        \
	MOVQ R8, (104)(IN)  \
	MOVQ R9, (112)(IN)  \
	MOVQ R10, (120)(IN) \
	\
	MULS(32(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \
	XORQ R15, R15       \
	XORQ BX, BX         \
	ADDQ ( 56)(IN), R8  \
	ADCQ ( 64)(IN), R9  \
	ADCQ ( 72)(IN), R10 \
	ADCQ ( 80)(IN), R11 \
	ADCQ ( 88)(IN), R12 \
	ADCQ ( 96)(IN), R13 \
	ADCQ (104)(IN), R14 \
	ADCQ (112)(IN), R15 \
	ADCQ (120)(IN), BX  \
	MOVQ R8, ( 56)(IN)  \
	MOVQ R10, ( 72)(IN) \
	MOVQ R11, ( 80)(IN) \
	MOVQ R12, ( 88)(IN) \
	MOVQ R13, ( 96)(IN) \
	MOVQ R14, (104)(IN) \
	MOVQ R15, (112)(IN) \
	MOVQ BX, (120)(IN)  \
	MOVQ R9, ( 0)(OUT)  \ // Result: OUT[0]
	\
	MULS(48(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \
	ADDQ ( 72)(IN), R8  \
	ADCQ ( 80)(IN), R9  \
	ADCQ ( 88)(IN), R10 \
	ADCQ ( 96)(IN), R11 \
	ADCQ (104)(IN), R12 \
	ADCQ (112)(IN), R13 \
	ADCQ (120)(IN), R14 \
	MOVQ R8, ( 8)(OUT)  \ // Result: OUT[1]
	MOVQ R9, (16)(OUT)  \ // Result: OUT[2]
	MOVQ R10, (24)(OUT) \ // Result: OUT[3]
	MOVQ R11, (32)(OUT) \ // Result: OUT[4]
	MOVQ R12, (40)(OUT) \ // Result: OUT[5]
	MOVQ R13, (48)(OUT) \ // Result: OUT[6] and OUT[7]
	MOVQ R14, (56)(OUT)

// modP503 reduces x (8 limbs) modulo p503, in place: x <- x mod p503.
// Constant time: subtract p, then conditionally add p back using a borrow mask.
TEXT ·modP503(SB), NOSPLIT, $0-8
	MOVQ x+0(FP), REG_P1

	// Zero AX for later use:
	XORQ AX, AX

	// Load p into registers:
	MOVQ P503_0, R8
	// P503_{1,2} = P503_0, so reuse R8
	MOVQ P503_3, R9
	MOVQ P503_4, R10
	MOVQ P503_5, R11
	MOVQ P503_6, R12
	MOVQ P503_7, R13

	// Set x <- x - p
	SUBQ R8, ( 0)(REG_P1)
	SBBQ R8, ( 8)(REG_P1)
	SBBQ R8, (16)(REG_P1)
	SBBQ R9, (24)(REG_P1)
	SBBQ R10, (32)(REG_P1)
	SBBQ R11, (40)(REG_P1)
	SBBQ R12, (48)(REG_P1)
	SBBQ R13, (56)(REG_P1)

	// Save carry flag indicating x-p < 0 as a mask
	SBBQ $0, AX

	// Conditionally add p to x if x-p < 0
	ANDQ AX, R8
	ANDQ AX, R9
	ANDQ AX, R10
	ANDQ AX, R11
	ANDQ AX, R12
	ANDQ AX, R13

	ADDQ R8, ( 0)(REG_P1)
	ADCQ R8, ( 8)(REG_P1)
	ADCQ R8, (16)(REG_P1)
	ADCQ R9, (24)(REG_P1)
	ADCQ R10, (32)(REG_P1)
	ADCQ R11, (40)(REG_P1)
	ADCQ R12, (48)(REG_P1)
	ADCQ R13, (56)(REG_P1)

	RET

// cswapP503 conditionally swaps the 8-limb values x and y when choice == 1,
// leaves both untouched when choice == 0. Constant time (XOR-mask swap).
TEXT ·cswapP503(SB), NOSPLIT, $0-17

	MOVQ x+0(FP), REG_P1
	MOVQ y+8(FP), REG_P2
	MOVB choice+16(FP), AL // AL = 0 or 1
	MOVBLZX AL, AX         // AX = 0 or 1
	NEGQ AX                // AX = 0x00..00 or 0xff..ff

#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
	MOVQ (idx*8)(REG_P1), BX \ // BX = x[idx]
	MOVQ (idx*8)(REG_P2), CX \ // CX = y[idx]
	MOVQ CX, DX              \ // DX = y[idx]
	XORQ BX, DX              \ // DX = y[idx] ^ x[idx]
	ANDQ AX, DX              \ // DX = (y[idx] ^ x[idx]) & mask
	XORQ DX, BX              \ // BX = (y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
	XORQ DX, CX              \ // CX = (y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx]
	MOVQ BX, (idx*8)(REG_P1) \
	MOVQ CX, (idx*8)(REG_P2)
#endif

	CSWAP_BLOCK(0)
	CSWAP_BLOCK(1)
	CSWAP_BLOCK(2)
	CSWAP_BLOCK(3)
	CSWAP_BLOCK(4)
	CSWAP_BLOCK(5)
	CSWAP_BLOCK(6)
	CSWAP_BLOCK(7)

#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif

	RET

// cmovP503 conditionally copies y into x when choice == 1, leaves x untouched
// when choice == 0. Constant time (XOR-mask move).
TEXT ·cmovP503(SB), NOSPLIT, $0-17

	MOVQ x+0(FP), DI
	MOVQ y+8(FP), SI
	MOVB choice+16(FP), AL // AL = 0 or 1
	MOVBLZX AL, AX         // AX = 0 or 1
	NEGQ AX                // AX = 0x00..00 or 0xff..ff

#ifndef CMOV_BLOCK
#define CMOV_BLOCK(idx) \
	MOVQ (idx*8)(DI), BX \ // BX = x[idx]
	MOVQ (idx*8)(SI), DX \ // DX = y[idx]
	XORQ BX, DX          \ // DX = y[idx] ^ x[idx]
	ANDQ AX, DX          \ // DX = (y[idx] ^ x[idx]) & mask
	XORQ DX, BX          \ // BX = (y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
	MOVQ BX, (idx*8)(DI)
#endif
	CMOV_BLOCK(0)
	CMOV_BLOCK(1)
	CMOV_BLOCK(2)
	CMOV_BLOCK(3)
	CMOV_BLOCK(4)
	CMOV_BLOCK(5)
	CMOV_BLOCK(6)
	CMOV_BLOCK(7)
#ifdef CMOV_BLOCK
#undef CMOV_BLOCK
#endif
	RET

// addP503 computes z = x + y mod 2*p503 (limbs kept in [0, 2*p503)).
// Constant time: add, subtract 2*p503, then conditionally add it back.
TEXT ·addP503(SB), NOSPLIT, $0-24

	MOVQ z+0(FP), REG_P3
	MOVQ x+8(FP), REG_P1
	MOVQ y+16(FP), REG_P2

	// Used later to calculate a mask
	XORQ CX, CX

	// [R8-R15]: z = x + y
	MOVQ ( 0)(REG_P1), R8
	MOVQ ( 8)(REG_P1), R9
	MOVQ (16)(REG_P1), R10
	MOVQ (24)(REG_P1), R11
	MOVQ (32)(REG_P1), R12
	MOVQ (40)(REG_P1), R13
	MOVQ (48)(REG_P1), R14
	MOVQ (56)(REG_P1), R15
	ADDQ ( 0)(REG_P2), R8
	ADCQ ( 8)(REG_P2), R9
	ADCQ (16)(REG_P2), R10
	ADCQ (24)(REG_P2), R11
	ADCQ (32)(REG_P2), R12
	ADCQ (40)(REG_P2), R13
	ADCQ (48)(REG_P2), R14
	ADCQ (56)(REG_P2), R15

	// z = z - 2*p503 (P503X2_1 == P503X2_2, so AX is reused for both limbs)
	MOVQ P503X2_0, AX
	SUBQ AX, R8
	MOVQ P503X2_1, AX
	SBBQ AX, R9
	SBBQ AX, R10
	MOVQ P503X2_3, AX
	SBBQ AX, R11
	MOVQ P503X2_4, AX
	SBBQ AX, R12
	MOVQ P503X2_5, AX
	SBBQ AX, R13
	MOVQ P503X2_6, AX
	SBBQ AX, R14
	MOVQ P503X2_7, AX
	SBBQ AX, R15

	// mask
	SBBQ $0, CX

	// move z to REG_P3
	MOVQ R8, ( 0)(REG_P3)
	MOVQ R9, ( 8)(REG_P3)
	MOVQ R10, (16)(REG_P3)
	MOVQ R11, (24)(REG_P3)
	MOVQ R12, (32)(REG_P3)
	MOVQ R13, (40)(REG_P3)
	MOVQ R14, (48)(REG_P3)
	MOVQ R15, (56)(REG_P3)

	// if z<0 add p503x2 back (R9 = P503X2_1 = P503X2_2 is used twice below)
	MOVQ P503X2_0, R8
	MOVQ P503X2_1, R9
	MOVQ P503X2_3, R10
	MOVQ P503X2_4, R11
	MOVQ P503X2_5, R12
	MOVQ P503X2_6, R13
	MOVQ P503X2_7, R14
	ANDQ CX, R8
	ANDQ CX, R9
	ANDQ CX, R10
	ANDQ CX, R11
	ANDQ CX, R12
	ANDQ CX, R13
	ANDQ CX, R14
	MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX;  MOVQ AX, ( 0)(REG_P3)
	MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX;  MOVQ AX, ( 8)(REG_P3)
	MOVQ (16)(REG_P3), AX; ADCQ R9, AX;  MOVQ AX, (16)(REG_P3)
	MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
	MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
	MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
	MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
	MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
	RET

// subP503 computes z = x - y mod 2*p503 (limbs kept in [0, 2*p503)).
// Constant time: subtract, then conditionally add 2*p503 back on borrow.
TEXT ·subP503(SB), NOSPLIT, $0-24

	MOVQ z+0(FP), REG_P3
	MOVQ x+8(FP), REG_P1
	MOVQ y+16(FP), REG_P2

	// Used later to calculate a mask
	XORQ CX, CX

	MOVQ ( 0)(REG_P1), R8
	MOVQ ( 8)(REG_P1), R9
	MOVQ (16)(REG_P1), R10
	MOVQ (24)(REG_P1), R11
	MOVQ (32)(REG_P1), R12
	MOVQ (40)(REG_P1), R13
	MOVQ (48)(REG_P1), R14
	MOVQ (56)(REG_P1), R15

	SUBQ ( 0)(REG_P2), R8
	SBBQ ( 8)(REG_P2), R9
	SBBQ (16)(REG_P2), R10
	SBBQ (24)(REG_P2), R11
	SBBQ (32)(REG_P2), R12
	SBBQ (40)(REG_P2), R13
	SBBQ (48)(REG_P2), R14
	SBBQ (56)(REG_P2), R15

	// mask
	SBBQ $0, CX

	// store x-y in REG_P3
	MOVQ R8, ( 0)(REG_P3)
	MOVQ R9, ( 8)(REG_P3)
	MOVQ R10, (16)(REG_P3)
	MOVQ R11, (24)(REG_P3)
	MOVQ R12, (32)(REG_P3)
	MOVQ R13, (40)(REG_P3)
	MOVQ R14, (48)(REG_P3)
	MOVQ R15, (56)(REG_P3)

	// if z<0 add p503x2 back (R9 = P503X2_1 = P503X2_2 is used twice below)
	MOVQ P503X2_0, R8
	MOVQ P503X2_1, R9
	MOVQ P503X2_3, R10
	MOVQ P503X2_4, R11
	MOVQ P503X2_5, R12
	MOVQ P503X2_6, R13
	MOVQ P503X2_7, R14
	ANDQ CX, R8
	ANDQ CX, R9
	ANDQ CX, R10
	ANDQ CX, R11
	ANDQ CX, R12
	ANDQ CX, R13
	ANDQ CX, R14
	MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX;  MOVQ AX, ( 0)(REG_P3)
	MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX;  MOVQ AX, ( 8)(REG_P3)
	MOVQ (16)(REG_P3), AX; ADCQ R9, AX;  MOVQ AX, (16)(REG_P3)
	MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
	MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
	MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
	MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
	MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)

	RET

// mulP503 computes the full 1024-bit product z = x * y of two 512-bit inputs
// using one level of Karatsuba. Dispatches at runtime to MULX+ADX, MULX-only,
// or a generic comba implementation depending on CPU features.
TEXT ·mulP503(SB), NOSPLIT, $104-24
	MOVQ z+0(FP), CX
	MOVQ x+8(FP), REG_P1
	MOVQ y+16(FP), REG_P2

	// Check whether to use optimized implementation
	CMPB ·HasADXandBMI2(SB), $1
	JE mul_with_mulx_adcx_adox
	CMPB ·HasBMI2(SB), $1
	JE mul_with_mulx

	// Generic x86 implementation (below) uses variant of Karatsuba method.
	//
	// Here we store the destination in CX instead of in REG_P3 because the
	// multiplication instructions use DX as an implicit destination
	// operand: MULQ $REG sets DX:AX <-- AX * $REG.

	// RAX and RDX will be used for a mask (0-borrow)
	XORQ AX, AX

	// RCX[0-3]: U1+U0
	MOVQ (32)(REG_P1), R8
	MOVQ (40)(REG_P1), R9
	MOVQ (48)(REG_P1), R10
	MOVQ (56)(REG_P1), R11
	ADDQ ( 0)(REG_P1), R8
	ADCQ ( 8)(REG_P1), R9
	ADCQ (16)(REG_P1), R10
	ADCQ (24)(REG_P1), R11
	MOVQ R8, ( 0)(CX)
	MOVQ R9, ( 8)(CX)
	MOVQ R10, (16)(CX)
	MOVQ R11, (24)(CX)

	SBBQ $0, AX

	// R12-R15: V1+V0
	XORQ DX, DX
	MOVQ (32)(REG_P2), R12
	MOVQ (40)(REG_P2), R13
	MOVQ (48)(REG_P2), R14
	MOVQ (56)(REG_P2), R15
	ADDQ ( 0)(REG_P2), R12
	ADCQ ( 8)(REG_P2), R13
	ADCQ (16)(REG_P2), R14
	ADCQ (24)(REG_P2), R15

	SBBQ $0, DX

	// Store carries on stack
	MOVQ AX, (64)(SP)
	MOVQ DX, (72)(SP)

	// (SP[0-3],R8,R9,R10,R11) <- (U0+U1)*(V0+V1).
	// MUL using comba; In comments below U=U0+U1 V=V0+V1

	// U0*V0
	MOVQ (CX), AX
	MULQ R12
	MOVQ AX, (SP) // C0
	MOVQ DX, R8

	// U0*V1
	XORQ R9, R9
	MOVQ (CX), AX
	MULQ R13
	ADDQ AX, R8
	ADCQ DX, R9

	// U1*V0
	XORQ R10, R10
	MOVQ (8)(CX), AX
	MULQ R12
	ADDQ AX, R8
	MOVQ R8, (8)(SP) // C1
	ADCQ DX, R9
	ADCQ $0, R10

	// U0*V2
	XORQ R8, R8
	MOVQ (CX), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U2*V0
	MOVQ (16)(CX), AX
	MULQ R12
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U1*V1
	MOVQ (8)(CX), AX
	MULQ R13
	ADDQ AX, R9
	MOVQ R9, (16)(SP) // C2
	ADCQ DX, R10
	ADCQ $0, R8

	// U0*V3
	XORQ R9, R9
	MOVQ (CX), AX
	MULQ R15
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U3*V0
	MOVQ (24)(CX), AX
	MULQ R12
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V2
	MOVQ (8)(CX), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U2*V1
	MOVQ (16)(CX), AX
	MULQ R13
	ADDQ AX, R10
	MOVQ R10, (24)(SP) // C3
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V3
	XORQ R10, R10
	MOVQ (8)(CX), AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U3*V1
	MOVQ (24)(CX), AX
	MULQ R13
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V2
	MOVQ (16)(CX), AX
	MULQ R14
	ADDQ AX, R8
	MOVQ R8, (32)(SP) // C4
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V3
	XORQ R11, R11
	MOVQ (16)(CX), AX
	MULQ R15
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R11

	// U3*V2
	MOVQ (24)(CX), AX
	MULQ R14
	ADDQ AX, R9 // C5
	ADCQ DX, R10
	ADCQ $0, R11

	// U3*V3
	MOVQ (24)(CX), AX
	MULQ R15
	ADDQ AX, R10 // C6
	ADCQ DX, R11 // C7

	// Apply Karatsuba carry fix-ups using the masks saved on the stack.
	MOVQ (64)(SP), AX
	ANDQ AX, R12
	ANDQ AX, R13
	ANDQ AX, R14
	ANDQ AX, R15
	ADDQ R8, R12
	ADCQ R9, R13
	ADCQ R10, R14
	ADCQ R11, R15

	MOVQ (72)(SP), AX
	MOVQ (CX), R8
	MOVQ (8)(CX), R9
	MOVQ (16)(CX), R10
	MOVQ (24)(CX), R11
	ANDQ AX, R8
	ANDQ AX, R9
	ANDQ AX, R10
	ANDQ AX, R11
	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ R15, R11
	MOVQ R8, (32)(SP)
	MOVQ R9, (40)(SP)
	MOVQ R10, (48)(SP)
	MOVQ R11, (56)(SP)

	// CX[0-7] <- U0*V0 (low halves)

	// U0*V0
	MOVQ (REG_P1), R11
	MOVQ (REG_P2), AX
	MULQ R11
	XORQ R9, R9
	MOVQ AX, (CX) // C0
	MOVQ DX, R8

	// U0*V1
	MOVQ (16)(REG_P1), R14
	MOVQ (8)(REG_P2), AX
	MULQ R11
	XORQ R10, R10
	ADDQ AX, R8
	ADCQ DX, R9

	// U1*V0
	MOVQ (8)(REG_P1), R12
	MOVQ (REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	MOVQ R8, (8)(CX) // C1
	ADCQ DX, R9
	ADCQ $0, R10

	// U0*V2
	XORQ R8, R8
	MOVQ (16)(REG_P2), AX
	MULQ R11
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U2*V0
	MOVQ (REG_P2), R13
	MOVQ R14, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U1*V1
	MOVQ (8)(REG_P2), AX
	MULQ R12
	ADDQ AX, R9
	MOVQ R9, (16)(CX) // C2
	ADCQ DX, R10
	ADCQ $0, R8

	// U0*V3
	XORQ R9, R9
	MOVQ (24)(REG_P2), AX
	MULQ R11
	MOVQ (24)(REG_P1), R15
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U3*V0
	MOVQ R15, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V2
	MOVQ (16)(REG_P2), AX
	MULQ R12
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U2*V1
	MOVQ (8)(REG_P2), AX
	MULQ R14
	ADDQ AX, R10
	MOVQ R10, (24)(CX) // C3
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V3
	XORQ R10, R10
	MOVQ (24)(REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U3*V1
	MOVQ (8)(REG_P2), AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V2
	MOVQ (16)(REG_P2), AX
	MULQ R14
	ADDQ AX, R8
	MOVQ R8, (32)(CX) // C4
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V3
	XORQ R8, R8
	MOVQ (24)(REG_P2), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U3*V2
	MOVQ (16)(REG_P2), AX
	MULQ R15
	ADDQ AX, R9
	MOVQ R9, (40)(CX) // C5
	ADCQ DX, R10
	ADCQ $0, R8

	// U3*V3
	MOVQ (24)(REG_P2), AX
	MULQ R15
	ADDQ AX, R10
	MOVQ R10, (48)(CX) // C6
	ADCQ DX, R8
	MOVQ R8, (56)(CX) // C7

	// CX[8-15] <- U1*V1
	MOVQ (32)(REG_P1), R11
	MOVQ (32)(REG_P2), AX
	MULQ R11
	XORQ R9, R9
	MOVQ AX, (64)(CX) // C0
	MOVQ DX, R8

	MOVQ (48)(REG_P1), R14
	MOVQ (40)(REG_P2), AX
	MULQ R11
	XORQ R10, R10
	ADDQ AX, R8
	ADCQ DX, R9

	MOVQ (40)(REG_P1), R12
	MOVQ (32)(REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	MOVQ R8, (72)(CX) // C1
	ADCQ DX, R9
	ADCQ $0, R10

	XORQ R8, R8
	MOVQ (48)(REG_P2), AX
	MULQ R11
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ (32)(REG_P2), R13
	MOVQ R14, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ (40)(REG_P2), AX
	MULQ R12
	ADDQ AX, R9
	MOVQ R9, (80)(CX) // C2
	ADCQ DX, R10
	ADCQ $0, R8

	XORQ R9, R9
	MOVQ (56)(REG_P2), AX
	MULQ R11
	MOVQ (56)(REG_P1), R15
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ R15, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ (48)(REG_P2), AX
	MULQ R12
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ (40)(REG_P2), AX
	MULQ R14
	ADDQ AX, R10
	MOVQ R10, (88)(CX) // C3
	ADCQ DX, R8
	ADCQ $0, R9

	XORQ R10, R10
	MOVQ (56)(REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ (40)(REG_P2), AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ (48)(REG_P2), AX
	MULQ R14
	ADDQ AX, R8
	MOVQ R8, (96)(CX) // C4
	ADCQ DX, R9
	ADCQ $0, R10

	XORQ R8, R8
	MOVQ (56)(REG_P2), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ (48)(REG_P2), AX
	MULQ R15
	ADDQ AX, R9
	MOVQ R9, (104)(CX) // C5
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ (56)(REG_P2), AX
	MULQ R15
	ADDQ AX, R10
	MOVQ R10, (112)(CX) // C6
	ADCQ DX, R8
	MOVQ R8, (120)(CX) // C7

	// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0
	MOVQ (SP), R8
	SUBQ (CX), R8
	MOVQ (8)(SP), R9
	SBBQ (8)(CX), R9
	MOVQ (16)(SP), R10
	SBBQ (16)(CX), R10
	MOVQ (24)(SP), R11
	SBBQ (24)(CX), R11
	MOVQ (32)(SP), R12
	SBBQ (32)(CX), R12
	MOVQ (40)(SP), R13
	SBBQ (40)(CX), R13
	MOVQ (48)(SP), R14
	SBBQ (48)(CX), R14
	MOVQ (56)(SP), R15
	SBBQ (56)(CX), R15

	// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0 - U1*V1
	// NOTE: DI/SI (REG_P1/REG_P2) are reused as scratch from here on.
	MOVQ ( 64)(CX), AX; SUBQ AX, R8
	MOVQ ( 72)(CX), AX; SBBQ AX, R9
	MOVQ ( 80)(CX), AX; SBBQ AX, R10
	MOVQ ( 88)(CX), AX; SBBQ AX, R11
	MOVQ ( 96)(CX), AX; SBBQ AX, R12
	MOVQ (104)(CX), DX; SBBQ DX, R13
	MOVQ (112)(CX), DI; SBBQ DI, R14
	MOVQ (120)(CX), SI; SBBQ SI, R15

	// Final result
	ADDQ (32)(CX), R8;  MOVQ R8, (32)(CX)
	ADCQ (40)(CX), R9;  MOVQ R9, (40)(CX)
	ADCQ (48)(CX), R10; MOVQ R10, (48)(CX)
	ADCQ (56)(CX), R11; MOVQ R11, (56)(CX)
	ADCQ (64)(CX), R12; MOVQ R12, (64)(CX)
	ADCQ (72)(CX), R13; MOVQ R13, (72)(CX)
	ADCQ (80)(CX), R14; MOVQ R14, (80)(CX)
	ADCQ (88)(CX), R15; MOVQ R15, (88)(CX)
	ADCQ $0, AX; MOVQ AX, (96)(CX)
	ADCQ $0, DX; MOVQ DX, (104)(CX)
	ADCQ $0, DI; MOVQ DI, (112)(CX)
	ADCQ $0, SI; MOVQ SI, (120)(CX)
	RET

mul_with_mulx_adcx_adox:
	// Mul implementation for CPUs supporting two independent carry chain
	// (ADOX/ADCX) instructions and carry-less MULX multiplier
	MUL(CX, REG_P1, REG_P2, MULS256_MULX_ADCX_ADOX)
	RET

mul_with_mulx:
	// Mul implementation for CPUs supporting carry-less MULX multiplier.
	MUL(CX, REG_P1, REG_P2, MULS256_MULX)
	RET

// rdcP503 performs the Montgomery reduction of the 1024-bit input x into the
// 512-bit result z (algorithm from section 5.2.3 of eprint.iacr.org/2017/1015).
// NOTE: x is used as scratch space and is destroyed by the optimized paths.
TEXT ·rdcP503(SB), $8-16
	MOVQ z+0(FP), REG_P2
	MOVQ x+8(FP), REG_P1

	// Check whether to use optimized implementation
	CMPB ·HasADXandBMI2(SB), $1
	JE redc_with_mulx_adcx_adox
	CMPB ·HasBMI2(SB), $1
	JE redc_with_mulx

	MOVQ (REG_P1), R11
	MOVQ P503P1_3, AX
	MULQ R11
	XORQ R8, R8
	ADDQ (24)(REG_P1), AX
	MOVQ AX, (24)(REG_P2)
	ADCQ DX, R8

	XORQ R9, R9
	MOVQ P503P1_4, AX
	MULQ R11
	XORQ R10, R10
	ADDQ AX, R8
	ADCQ DX, R9

	MOVQ (8)(REG_P1), R12
	MOVQ P503P1_3, AX
	MULQ R12
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10
	ADDQ (32)(REG_P1), R8
	MOVQ R8, (32)(REG_P2) // Z4
	ADCQ $0, R9
	ADCQ $0, R10

	XORQ R8, R8
	MOVQ P503P1_5, AX
	MULQ R11
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ P503P1_4, AX
	MULQ R12
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ (16)(REG_P1), R13
	MOVQ P503P1_3, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8
	ADDQ (40)(REG_P1), R9
	MOVQ R9, (40)(REG_P2) // Z5
	ADCQ $0, R10
	ADCQ $0, R8

	XORQ R9, R9
	MOVQ P503P1_6, AX
	MULQ R11
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ P503P1_5, AX
	MULQ R12
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ P503P1_4, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ (24)(REG_P2), R14
	MOVQ P503P1_3, AX
	MULQ R14
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9
	ADDQ (48)(REG_P1), R10
	MOVQ R10, (48)(REG_P2) // Z6
	ADCQ $0, R8
	ADCQ $0, R9

	XORQ R10, R10
	MOVQ P503P1_7, AX
	MULQ R11
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ P503P1_6, AX
	MULQ R12
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ P503P1_5, AX
	MULQ R13
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ P503P1_4, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ (32)(REG_P2), R15
	MOVQ P503P1_3, AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10
	ADDQ (56)(REG_P1), R8
	MOVQ R8, (56)(REG_P2) // Z7
	ADCQ $0, R9
	ADCQ $0, R10

	XORQ R8, R8
	MOVQ P503P1_7, AX
	MULQ R12
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ P503P1_6, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ P503P1_5, AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ P503P1_4, AX
	MULQ R15
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ (40)(REG_P2), CX
	MOVQ P503P1_3, AX
	MULQ CX
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8
	ADDQ (64)(REG_P1), R9
	MOVQ R9, (REG_P2) // Z0
	ADCQ $0, R10
	ADCQ $0, R8

	XORQ R9, R9
	MOVQ P503P1_7, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ P503P1_6, AX
	MULQ R14
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ P503P1_5, AX
	MULQ R15
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ P503P1_4, AX
	MULQ CX
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ (48)(REG_P2), R13
	MOVQ P503P1_3, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9
	ADDQ (72)(REG_P1), R10
	MOVQ R10, (8)(REG_P2) // Z1
	ADCQ $0, R8
	ADCQ $0, R9

	XORQ R10, R10
	MOVQ P503P1_7, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ P503P1_6, AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ P503P1_5, AX
	MULQ CX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ P503P1_4, AX
	MULQ R13
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ (56)(REG_P2), R14
	MOVQ P503P1_3, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10
	ADDQ (80)(REG_P1), R8
	MOVQ R8, (16)(REG_P2) // Z2
	ADCQ $0, R9
	ADCQ $0, R10

	XORQ R8, R8
	MOVQ P503P1_7, AX
	MULQ R15
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ P503P1_6, AX
	MULQ CX
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ P503P1_5, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	MOVQ P503P1_4, AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8
	ADDQ (88)(REG_P1), R9
	MOVQ R9, (24)(REG_P2) // Z3
	ADCQ $0, R10
	ADCQ $0, R8

	XORQ R9, R9
	MOVQ P503P1_7, AX
	MULQ CX
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ P503P1_6, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	MOVQ P503P1_5, AX
	MULQ R14
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9
	ADDQ (96)(REG_P1), R10
	MOVQ R10, (32)(REG_P2) // Z4
	ADCQ $0, R8
	ADCQ $0, R9

	XORQ R10, R10
	MOVQ P503P1_7, AX
	MULQ R13
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	MOVQ P503P1_6, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10
	ADDQ (104)(REG_P1), R8 // Z5
	MOVQ R8, (40)(REG_P2)  // Z5
	ADCQ $0, R9
	ADCQ $0, R10

	MOVQ P503P1_7, AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADDQ (112)(REG_P1), R9 // Z6
	MOVQ R9, (48)(REG_P2)  // Z6
	ADCQ $0, R10
	ADDQ (120)(REG_P1), R10 // Z7
	MOVQ R10, (56)(REG_P2)  // Z7
	RET

redc_with_mulx_adcx_adox:
	// Implementation of the Montgomery reduction for CPUs
	// supporting two independent carry chain (ADOX/ADCX)
	// instructions and carry-less MULX multiplier
	MOVQ BP, 0(SP) // push: BP is Callee-save.
1552 REDC(REG_P2, REG_P1, MULS_128x320_MULX_ADCX_ADOX) 1553 MOVQ 0(SP), BP // pop: BP is Callee-save. 1554 RET 1555 1556 redc_with_mulx: 1557 // Implementation of the Montgomery reduction for CPUs 1558 // supporting carry-less MULX multiplier. 1559 MOVQ BP, 0(SP) // push: BP is Callee-save. 1560 REDC(REG_P2, REG_P1, MULS_128x320_MULX) 1561 MOVQ 0(SP), BP // pop: BP is Callee-save. 1562 RET 1563 1564 TEXT ·adlP503(SB), NOSPLIT, $0-24 1565 1566 MOVQ z+0(FP), REG_P3 1567 MOVQ x+8(FP), REG_P1 1568 MOVQ y+16(FP), REG_P2 1569 1570 MOVQ (REG_P1), R8 1571 MOVQ (8)(REG_P1), R9 1572 MOVQ (16)(REG_P1), R10 1573 MOVQ (24)(REG_P1), R11 1574 MOVQ (32)(REG_P1), R12 1575 MOVQ (40)(REG_P1), R13 1576 MOVQ (48)(REG_P1), R14 1577 MOVQ (56)(REG_P1), R15 1578 MOVQ (64)(REG_P1), AX 1579 MOVQ (72)(REG_P1), BX 1580 MOVQ (80)(REG_P1), CX 1581 1582 ADDQ (REG_P2), R8 1583 ADCQ (8)(REG_P2), R9 1584 ADCQ (16)(REG_P2), R10 1585 ADCQ (24)(REG_P2), R11 1586 ADCQ (32)(REG_P2), R12 1587 ADCQ (40)(REG_P2), R13 1588 ADCQ (48)(REG_P2), R14 1589 ADCQ (56)(REG_P2), R15 1590 ADCQ (64)(REG_P2), AX 1591 ADCQ (72)(REG_P2), BX 1592 ADCQ (80)(REG_P2), CX 1593 1594 MOVQ R8, (REG_P3) 1595 MOVQ R9, (8)(REG_P3) 1596 MOVQ R10, (16)(REG_P3) 1597 MOVQ R11, (24)(REG_P3) 1598 MOVQ R12, (32)(REG_P3) 1599 MOVQ R13, (40)(REG_P3) 1600 MOVQ R14, (48)(REG_P3) 1601 MOVQ R15, (56)(REG_P3) 1602 MOVQ AX, (64)(REG_P3) 1603 MOVQ BX, (72)(REG_P3) 1604 MOVQ CX, (80)(REG_P3) 1605 1606 MOVQ (88)(REG_P1), R8 1607 MOVQ (96)(REG_P1), R9 1608 MOVQ (104)(REG_P1), R10 1609 MOVQ (112)(REG_P1), R11 1610 MOVQ (120)(REG_P1), R12 1611 1612 ADCQ (88)(REG_P2), R8 1613 ADCQ (96)(REG_P2), R9 1614 ADCQ (104)(REG_P2), R10 1615 ADCQ (112)(REG_P2), R11 1616 ADCQ (120)(REG_P2), R12 1617 1618 MOVQ R8, (88)(REG_P3) 1619 MOVQ R9, (96)(REG_P3) 1620 MOVQ R10, (104)(REG_P3) 1621 MOVQ R11, (112)(REG_P3) 1622 MOVQ R12, (120)(REG_P3) 1623 1624 RET 1625 1626 TEXT ·sulP503(SB), NOSPLIT, $0-24 1627 1628 MOVQ z+0(FP), REG_P3 1629 MOVQ x+8(FP), REG_P1 1630 MOVQ 
y+16(FP), REG_P2 1631 // Used later to store result of 0-borrow 1632 XORQ CX, CX 1633 1634 // SUBC for first 11 limbs 1635 MOVQ (REG_P1), R8 1636 MOVQ (8)(REG_P1), R9 1637 MOVQ (16)(REG_P1), R10 1638 MOVQ (24)(REG_P1), R11 1639 MOVQ (32)(REG_P1), R12 1640 MOVQ (40)(REG_P1), R13 1641 MOVQ (48)(REG_P1), R14 1642 MOVQ (56)(REG_P1), R15 1643 MOVQ (64)(REG_P1), AX 1644 MOVQ (72)(REG_P1), BX 1645 1646 SUBQ (REG_P2), R8 1647 SBBQ (8)(REG_P2), R9 1648 SBBQ (16)(REG_P2), R10 1649 SBBQ (24)(REG_P2), R11 1650 SBBQ (32)(REG_P2), R12 1651 SBBQ (40)(REG_P2), R13 1652 SBBQ (48)(REG_P2), R14 1653 SBBQ (56)(REG_P2), R15 1654 SBBQ (64)(REG_P2), AX 1655 SBBQ (72)(REG_P2), BX 1656 1657 MOVQ R8, (REG_P3) 1658 MOVQ R9, (8)(REG_P3) 1659 MOVQ R10, (16)(REG_P3) 1660 MOVQ R11, (24)(REG_P3) 1661 MOVQ R12, (32)(REG_P3) 1662 MOVQ R13, (40)(REG_P3) 1663 MOVQ R14, (48)(REG_P3) 1664 MOVQ R15, (56)(REG_P3) 1665 MOVQ AX, (64)(REG_P3) 1666 MOVQ BX, (72)(REG_P3) 1667 1668 // SUBC for last 5 limbs 1669 MOVQ (80)(REG_P1), R8 1670 MOVQ (88)(REG_P1), R9 1671 MOVQ (96)(REG_P1), R10 1672 MOVQ (104)(REG_P1), R11 1673 MOVQ (112)(REG_P1), R12 1674 MOVQ (120)(REG_P1), R13 1675 1676 SBBQ (80)(REG_P2), R8 1677 SBBQ (88)(REG_P2), R9 1678 SBBQ (96)(REG_P2), R10 1679 SBBQ (104)(REG_P2), R11 1680 SBBQ (112)(REG_P2), R12 1681 SBBQ (120)(REG_P2), R13 1682 1683 MOVQ R8, (80)(REG_P3) 1684 MOVQ R9, (88)(REG_P3) 1685 MOVQ R10, (96)(REG_P3) 1686 MOVQ R11, (104)(REG_P3) 1687 MOVQ R12, (112)(REG_P3) 1688 MOVQ R13, (120)(REG_P3) 1689 1690 // Now the carry flag is 1 if x-y < 0. If so, add p*2^512. 
1691 SBBQ $0, CX 1692 1693 // Load p into registers: 1694 MOVQ P503_0, R8 1695 // P503_{1,2} = P503_0, so reuse R8 1696 MOVQ P503_3, R9 1697 MOVQ P503_4, R10 1698 MOVQ P503_5, R11 1699 MOVQ P503_6, R12 1700 MOVQ P503_7, R13 1701 1702 ANDQ CX, R8 1703 ANDQ CX, R9 1704 ANDQ CX, R10 1705 ANDQ CX, R11 1706 ANDQ CX, R12 1707 ANDQ CX, R13 1708 1709 MOVQ (64 )(REG_P3), AX; ADDQ R8, AX; MOVQ AX, (64 )(REG_P3) 1710 MOVQ (64+ 8)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+ 8)(REG_P3) 1711 MOVQ (64+16)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+16)(REG_P3) 1712 MOVQ (64+24)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (64+24)(REG_P3) 1713 MOVQ (64+32)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (64+32)(REG_P3) 1714 MOVQ (64+40)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (64+40)(REG_P3) 1715 MOVQ (64+48)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (64+48)(REG_P3) 1716 MOVQ (64+56)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (64+56)(REG_P3) 1717 1718 RET