github.com/cloudflare/circl@v1.5.0/dh/sidh/internal/p434/arith_amd64.s

// +build amd64,!purego

#include "textflag.h"

// p434
#define P434_0 $0xFFFFFFFFFFFFFFFF
#define P434_3 $0xFDC1767AE2FFFFFF
#define P434_4 $0x7BC65C783158AEA3
#define P434_5 $0x6CFC5FD681C52056
#define P434_6 $0x0002341F27177344

// p434 x 2
#define P434X2_0 $0xFFFFFFFFFFFFFFFE
#define P434X2_1 $0xFFFFFFFFFFFFFFFF
#define P434X2_3 $0xFB82ECF5C5FFFFFF
#define P434X2_4 $0xF78CB8F062B15D47
#define P434X2_5 $0xD9F8BFAD038A40AC
#define P434X2_6 $0x0004683E4E2EE688

// Redefine P434p1Zeros
#define P434_P1_ZEROS 3

// Performs schoolbook multiplication of a 128-bit with a 256-bit
// number. Uses MULX, ADOX and ADCX instructions.
//
// Uses registers: DX,AX
// Calculates:
// (I0,I1) x [M1][0,1,2,3] = (T0,T1,T2,T3,T4,T5)
// |-128-| x |--- 256 ---| = |------ 384 ------|
// Assuming the first digit multiplication was already performed.
#define MULX128x256(I1, M1, T1, T2, T3, T4, T5) \
	MOVQ	M1+ 8(SB), AX	\
	MULXQ	AX, T4, T2	\
	XORQ	AX, AX		\
	MOVQ	M1+16(SB), AX	\
	MULXQ	AX, T5, T3	\
	ADOXQ	T4, T1		\ // T1: interm1
	ADOXQ	T5, T2		\ // T2: interm2
	MOVQ	M1+24(SB), AX	\
	MULXQ	AX, T5, T4	\
	ADOXQ	T5, T3		\ // T3: interm3
	MOVL	$0, AX		\
	ADOXQ	AX, T4		\ // T4: interm4
	\
	XORQ	AX, AX		\
	MOVQ	I1, DX		\
	MOVQ	M1+ 0(SB), AX	\
	MULXQ	AX, T5, I1	\ // T0 <- C0
	ADCXQ	T5, T1		\
	ADCXQ	I1, T2		\ // T1 <- C1
	MOVQ	M1+ 8(SB), AX	\
	MULXQ	AX, I1, T5	\
	ADCXQ	T5, T3		\
	ADOXQ	I1, T2		\ // T2 <- C2
	MOVQ	M1+16(SB), AX	\
	MULXQ	AX, I1, T5	\
	ADCXQ	T5, T4		\
	ADOXQ	I1, T3		\ // T3 <- C3
	MOVQ	M1+24(SB), AX	\
	MULXQ	AX, I1, T5	\
	MOVL	$0, AX		\
	ADCXQ	AX, T5		\
	ADOXQ	I1, T4		\ // T4 <- C4
	ADOXQ	AX, T5		// T5 <- C5

// Performs schoolbook multiplication of a 64-bit with a 256-bit
// number. Uses MULX and ADOX instructions.
//
// Uses registers: DX,AX
// Calculates:
// (I0) x [M1][0,1,2,3] = (T0,T1,T2,T3,T4)
// |64| x |--- 256 ---| = |----- 320 ----|
// Assuming the first digit multiplication was already performed.
#define MULX64x256(M1, T1, T2, T3, T4, T5) \
	MOVQ	M1+ 8(SB), AX	\
	MULXQ	AX, T4, T2	\
	XORQ	AX, AX		\
	MOVQ	M1+16(SB), AX	\
	MULXQ	AX, T5, T3	\
	ADOXQ	T4, T1		\ // T1 <- C1
	ADOXQ	T5, T2		\ // T2 <- C2
	MOVQ	M1+24(SB), AX	\
	MULXQ	AX, T5, T4	\
	ADOXQ	T5, T3		\ // T3 <- C3
	MOVL	$0, AX		\
	ADOXQ	AX, T4		// T4 <- C4
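// The macros above interleave two independent carry chains: ADCX propagates
// carries through CF only, ADOX through OF only, and MULX writes a 128-bit
// product without touching either flag, so the two chains can run in
// parallel. As a reference model, the same 64 x 256-bit schoolbook step can
// be written in plain Go (an illustrative sketch, not part of this package;
// mul64x256 is a hypothetical name, using math/bits):
//
//	func mul64x256(a uint64, b *[4]uint64) (r [5]uint64) {
//		hi0, lo0 := bits.Mul64(a, b[0])
//		hi1, lo1 := bits.Mul64(a, b[1])
//		hi2, lo2 := bits.Mul64(a, b[2])
//		hi3, lo3 := bits.Mul64(a, b[3])
//		var c uint64
//		r[0] = lo0
//		r[1], c = bits.Add64(lo1, hi0, 0)
//		r[2], c = bits.Add64(lo2, hi1, c)
//		r[3], c = bits.Add64(lo3, hi2, c)
//		r[4] = hi3 + c // cannot overflow: hi3 <= 2^64 - 2
//		return
//	}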
// Performs schoolbook multiplication of two 192-bit numbers.
// Uses MULX and ADOX instructions.
//
// Uses registers: DX,AX
#define MULX192(IM0,M0,IM1,M1,ID,MDST,T0,T1,T2,T3,T4,T5,T6) \
	MOVQ	(0+IM0)(M0), DX		\
	MULXQ	(0+IM1)(M1), T1, T0	\ // T0:T1 = A0*B0
	MOVQ	T1, (ID+0)(MDST)	\ // MDST0
	MULXQ	(IM1+ 8)(M1), T2, T1	\ // T1:T2 = A0*B1
	XORQ	AX, AX			\
	ADOXQ	T2, T0			\
	MULXQ	(IM1+16)(M1), T3, T2	\ // T2:T3 = A0*B2
	ADOXQ	T3, T1			\
	\
	MOVQ	(IM0+8)(M0), DX		\
	MULXQ	(IM1+0)(M1), T4, T3	\ // T3:T4 = A1*B0
	ADOXQ	AX, T2			\
	XORQ	AX, AX			\
	\
	MULXQ	(IM1+8)(M1), T6, T5	\ // T5:T6 = A1*B1
	ADOXQ	T0, T4			\
	MOVQ	T4, (ID+8)(MDST)	\ // MDST1
	ADCXQ	T6, T3			\
	\
	MULXQ	(IM1+16)(M1), T0, T6	\ // T6:T0 = A1*B2
	ADOXQ	T1, T3			\
	ADCXQ	T0, T5			\
	ADCXQ	AX, T6			\
	ADOXQ	T2, T5			\
	\
	MOVQ	(IM0+16)(M0), DX	\
	MULXQ	(IM1+ 0)(M1), T0, T1	\ // T1:T0 = A2*B0
	ADOXQ	AX, T6			\
	XORQ	AX, AX			\
	\
	MULXQ	(IM1+ 8)(M1), T2, T4	\ // T4:T2 = A2*B1
	ADOXQ	T3, T0			\
	MOVQ	T0, (ID+16)(MDST)	\ // MDST2
	ADCXQ	T5, T1			\
	\
	MULXQ	(IM1+16)(M1), T3, T0	\ // T0:T3 = A2*B2
	ADCXQ	T6, T4			\
	ADCXQ	AX, T0			\
	ADOXQ	T2, T1			\
	ADOXQ	T4, T3			\
	ADOXQ	T0, AX

// Performs schoolbook multiplication of two 256-bit numbers. Uses
// MULX instruction. The 512-bit result is stored at (ID)(MDST).
//
// Uses registers: DX,AX
#define MULX256(IM0,M0,IM1,M1,ID,MDST,T0,T1,T2,T3,T4,T5,T6,T7,T8,T9) \
	MOVQ	(IM0+0)(M0), DX		\
	MULXQ	(IM1+0)(M1), T1, T0	\ // A0*B[0-3]
	MOVQ	T1, (ID+0)(MDST)	\
	MULXQ	(IM1+8)(M1), T2, T1	\
	XORQ	AX, AX			\
	ADOXQ	T2, T0			\
	MULXQ	(IM1+16)(M1), T3, T2	\
	ADOXQ	T3, T1			\
	MULXQ	(IM1+24)(M1), T4, T3	\
	ADOXQ	T4, T2			\
	\
	MOVQ	(IM0+8)(M0), DX		\
	MULXQ	(IM1+0)(M1), T4, T5	\ // A1*B[0-3]
	ADOXQ	AX, T3			\
	XORQ	AX, AX			\
	MULXQ	(IM1+8)(M1), T7, T6	\
	ADOXQ	T0, T4			\
	MOVQ	T4, (ID+8)(MDST)	\
	ADCXQ	T7, T5			\
	MULXQ	(IM1+16)(M1), T8, T7	\
	ADCXQ	T8, T6			\
	ADOXQ	T1, T5			\
	MULXQ	(IM1+24)(M1), T9, T8	\
	ADCXQ	T9, T7			\
	ADCXQ	AX, T8			\
	ADOXQ	T2, T6			\
	\
	MOVQ	(IM0+16)(M0), DX	\ // A2*B[0-3]
	MULXQ	(IM1+ 0)(M1), T0, T1	\
	ADOXQ	T3, T7			\
	ADOXQ	AX, T8			\
	XORQ	AX, AX			\
	MULXQ	(IM1+8)(M1), T3, T2	\
	ADOXQ	T5, T0			\
	MOVQ	T0, (ID+16)(MDST)	\
	ADCXQ	T3, T1			\
	MULXQ	(IM1+16)(M1), T4, T3	\
	ADCXQ	T4, T2			\
	ADOXQ	T6, T1			\
	MULXQ	(IM1+24)(M1), T9, T4	\
	ADCXQ	T9, T3			\
	MOVQ	(IM0+24)(M0), DX	\
	ADCXQ	AX, T4			\
	\
	ADOXQ	T7, T2			\
	ADOXQ	T8, T3			\
	ADOXQ	AX, T4			\
	\
	MULXQ	(IM1+ 0)(M1), T0, T5	\ // A3*B[0-3]
	XORQ	AX, AX			\
	MULXQ	(IM1+ 8)(M1), T7, T6	\
	ADCXQ	T7, T5			\
	ADOXQ	T0, T1			\
	MULXQ	(IM1+16)(M1), T8, T7	\
	ADCXQ	T8, T6			\
	ADOXQ	T5, T2			\
	MULXQ	(IM1+24)(M1), T9, T8	\
	ADCXQ	T9, T7			\
	ADCXQ	AX, T8			\
	ADOXQ	T6, T3			\
	ADOXQ	T7, T4			\
	ADOXQ	AX, T8			\
	MOVQ	T1, (ID+24)(MDST)	\
	MOVQ	T2, (ID+32)(MDST)	\
	MOVQ	T3, (ID+40)(MDST)	\
	MOVQ	T4, (ID+48)(MDST)	\
	MOVQ	T8, (ID+56)(MDST)
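// Note on operand order: in Go assembler syntax, MULXQ src, lo, hi computes
// hi:lo = src * DX (the multiplicand is implicitly DX and no flags are
// modified). So MULXQ (IM1+0)(M1), T1, T0 above leaves the low half of
// A0*B0 in T1 and the high half in T0.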
// Performs schoolbook multiplication of a 64-bit with a 256-bit
// number.
//
// Uses registers: DX, AX
#define MUL64x256(IDX,M0,M1,C0,C1,C2,C3,C4,T0) \
	MOVQ	(IDX)(M0), T0	\
	\
	XORQ	C2, C2		\
	MOVQ	M1+0(SB), AX	\
	MULQ	T0		\
	MOVQ	AX, C0		\
	MOVQ	DX, C1		\
	\
	XORQ	C3, C3		\
	MOVQ	M1+8(SB), AX	\
	MULQ	T0		\
	ADDQ	AX, C1		\
	ADCQ	DX, C2		\
	\
	XORQ	C4, C4		\
	MOVQ	M1+16(SB), AX	\
	MULQ	T0		\
	ADDQ	AX, C2		\
	ADCQ	DX, C3		\
	\
	MOVQ	M1+24(SB), AX	\
	MULQ	T0		\
	ADDQ	AX, C3		\
	ADCQ	DX, C4

// Performs schoolbook multiplication of a 128-bit with a 256-bit
// number. Destroys RAX and RDX.
//
// Uses registers: DX, AX
#define MUL128x256(IDX,M0,M1,C0,C1,C2,C3,C4,C5,T0,T1) \
	\ // A0 x B0
	MOVQ	(IDX+0)(M0), T0	\
	MOVQ	M1+0(SB), AX	\
	MULQ	T0		\
	XORQ	C2, C2		\
	MOVQ	AX, C0		\
	MOVQ	DX, C1		\
	\ // A0 x B1
	MOVQ	M1+8(SB), AX	\
	MULQ	T0		\
	XORQ	C3, C3		\
	ADDQ	AX, C1		\
	ADCQ	DX, C2		\
	\ // A1 x B0
	MOVQ	(IDX+8)(M0), T1	\
	MOVQ	M1+0(SB), AX	\
	MULQ	T1		\
	ADDQ	AX, C1		\
	ADCQ	DX, C2		\
	ADCQ	$0, C3		\
	\ // A0 x B2
	XORQ	C4, C4		\
	MOVQ	M1+16(SB), AX	\
	MULQ	T0		\
	ADDQ	AX, C2		\
	ADCQ	DX, C3		\
	ADCQ	$0, C4		\
	\ // A1 x B1
	MOVQ	M1+8(SB), AX	\
	MULQ	T1		\
	ADDQ	AX, C2		\
	ADCQ	DX, C3		\
	ADCQ	$0, C4		\
	\ // A0 x B3
	MOVQ	M1+24(SB), AX	\
	MULQ	T0		\
	XORQ	C5, C5		\
	ADDQ	AX, C3		\
	ADCQ	DX, C4		\
	ADCQ	$0, C5		\
	\ // A1 x B2
	MOVQ	M1+16(SB), AX	\
	MULQ	T1		\
	ADDQ	AX, C3		\
	ADCQ	DX, C4		\
	ADCQ	$0, C5		\
	\ // A1 x B3
	MOVQ	M1+24(SB), AX	\
	MULQ	T1		\
	ADDQ	AX, C4		\
	ADCQ	DX, C5

// Montgomery reduction
// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
#define REDC_MULX(P1, MUL01, MUL23, MUL45, MUL67) \
	MOVQ	0x0(DI), DX	\
	MOVQ	0x8(DI), R14	\
	MOVQ	P1, AX		\
	MULXQ	AX, R8, R9	\
	MUL01			\
	MOVQ	0x10(DI), DX	\
	MOVQ	0x48(DI), CX	\
	ADDQ	0x18(DI), R8	\
	ADCQ	0x20(DI), R9	\
	ADCQ	0x28(DI), R10	\
	ADCQ	0x30(DI), R11	\
	ADCQ	0x38(DI), R12	\
	ADCQ	0x40(DI), R13	\
	ADCQ	$0, CX		\
	MOVQ	P1, AX		\
	MULXQ	AX, BX, BP	\
	MOVQ	R9, 0x0(SI)	\
	MOVQ	R10, 0x8(SI)	\
	MOVQ	R11, 0x10(SI)	\
	MOVQ	R12, 0x18(SI)	\
	MOVQ	R13, 0x20(SI)	\
	MOVQ	CX, 0x28(SI)	\
	MOVQ	0x50(DI), R9	\
	MOVQ	0x58(DI), R10	\
	MOVQ	0x60(DI), R11	\
	MOVQ	0x68(DI), DI	\
	ADCQ	$0, R9		\
	ADCQ	$0, R10		\
	ADCQ	$0, R11		\
	ADCQ	$0, DI		\
	MUL23			\
	MOVQ	0x0(SI), DX	\
	ADDQ	0x08(SI), BX	\
	ADCQ	0x10(SI), BP	\
	ADCQ	0x18(SI), R12	\
	ADCQ	0x20(SI), R13	\
	ADCQ	0x28(SI), R14	\
	MOVQ	R14, 0x18(SI)	\
	MOVQ	CX, R14		\
	MOVQ	$0, CX		\
	ADCQ	R9, R14		\
	ADCQ	R10, CX		\
	MOVQ	P1, AX		\
	MULXQ	AX, R8, R9	\
	MOVQ	BP, 0x0(SI)	\
	MOVQ	R12, 0x8(SI)	\
	MOVQ	R13, 0x10(SI)	\
	ADCQ	$0, R11		\
	ADCQ	$0, DI		\
	MUL45			\
	MOVQ	0x0(SI), DX	\
	ADDQ	0x8(SI), R8	\
	ADCQ	0x10(SI), R9	\
	ADCQ	0x18(SI), R10	\
	ADCQ	R14, BP		\
	ADCQ	CX, R12		\
	ADCQ	R11, R13	\
	ADCQ	$0, DI		\
	MOVQ	P1, AX		\
	MULXQ	AX, R14, BX	\
	MOVQ	R8, 0x0(SI)	\
	MOVQ	R9, 0x8(SI)	\
	MUL67			\
	ADDQ	R10, R14	\
	ADCQ	BP, BX		\
	ADCQ	R12, R8		\
	ADCQ	R13, R9		\
	ADCQ	DI, R11		\
	MOVQ	R14, 0x10(SI)	\
	MOVQ	BX, 0x18(SI)	\
	MOVQ	R8, 0x20(SI)	\
	MOVQ	R9, 0x28(SI)	\
	MOVQ	R11, 0x30(SI)
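// Why only (p+1)>>192 appears in these reductions: p434 = 2^216*3^137 - 1,
// so p = -1 (mod 2^192) and the three low limbs of p+1 are zero (hence
// P434_P1_ZEROS = 3). This also means -1/p = 1 (mod 2^64), so each
// Montgomery quotient digit is simply the corresponding accumulator limb,
// and a digit step reduces to one 64 x 256-bit product added back at limb
// offset +3; limb i is cancelled by the implicit -m*2^(64i) term and is
// never read again. A reference model of one digit step in Go (an
// illustrative sketch reusing the hypothetical mul64x256 helper from the
// comment further above):
//
//	// acc is the 14-limb double-width input; p1 holds (p+1)>>192.
//	func redcStep(acc []uint64, i int, p1 *[4]uint64) {
//		t := mul64x256(acc[i], p1) // quotient digit is acc[i] itself
//		var c uint64
//		for j, w := range t {
//			acc[i+3+j], c = bits.Add64(acc[i+3+j], w, c)
//		}
//		for k := i + 8; c != 0 && k < len(acc); k++ {
//			acc[k], c = bits.Add64(acc[k], 0, c)
//		}
//	}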
#define REDC_MULQ(MUL01, MUL23, MUL45, MUL67) \
	MUL01			\
	XORQ	CX, CX		\
	ADDQ	0x18(DI), R8	\
	ADCQ	0x20(DI), R9	\
	ADCQ	0x28(DI), R10	\
	ADCQ	0x30(DI), R11	\
	ADCQ	0x38(DI), R12	\
	ADCQ	0x40(DI), R13	\
	ADCQ	0x48(DI), CX	\
	MOVQ	R8, 0x18(DI)	\
	MOVQ	R9, 0x20(DI)	\
	MOVQ	R10, 0x28(DI)	\
	MOVQ	R11, 0x30(DI)	\
	MOVQ	R12, 0x38(DI)	\
	MOVQ	R13, 0x40(DI)	\
	MOVQ	CX, 0x48(DI)	\
	MOVQ	0x50(DI), R8	\
	MOVQ	0x58(DI), R9	\
	MOVQ	0x60(DI), R10	\
	MOVQ	0x68(DI), R11	\
	ADCQ	$0, R8		\
	ADCQ	$0, R9		\
	ADCQ	$0, R10		\
	ADCQ	$0, R11		\
	MOVQ	R8, 0x50(DI)	\
	MOVQ	R9, 0x58(DI)	\
	MOVQ	R10, 0x60(DI)	\
	MOVQ	R11, 0x68(DI)	\
	\
	MUL23			\
	XORQ	CX, CX		\
	ADDQ	0x28(DI), R8	\
	ADCQ	0x30(DI), R9	\
	ADCQ	0x38(DI), R10	\
	ADCQ	0x40(DI), R11	\
	ADCQ	0x48(DI), R12	\
	ADCQ	0x50(DI), R13	\
	ADCQ	0x58(DI), CX	\
	MOVQ	R8, 0x28(DI)	\
	MOVQ	R9, 0x30(DI)	\
	MOVQ	R10, 0x38(DI)	\
	MOVQ	R11, 0x40(DI)	\
	MOVQ	R12, 0x48(DI)	\
	MOVQ	R13, 0x50(DI)	\
	MOVQ	CX, 0x58(DI)	\
	MOVQ	0x60(DI), R8	\
	MOVQ	0x68(DI), R9	\
	ADCQ	$0, R8		\
	ADCQ	$0, R9		\
	MOVQ	R8, 0x60(DI)	\
	MOVQ	R9, 0x68(DI)	\
	\
	MUL45			\
	XORQ	CX, CX		\
	ADDQ	0x38(DI), R8	\
	ADCQ	0x40(DI), R9	\
	ADCQ	0x48(DI), R10	\
	ADCQ	0x50(DI), R11	\
	ADCQ	0x58(DI), R12	\
	ADCQ	0x60(DI), R13	\
	ADCQ	0x68(DI), CX	\
	MOVQ	R8, 0x0(SI)	\ // OUT0
	MOVQ	R9, 0x8(SI)	\ // OUT1
	MOVQ	R10, 0x48(DI)	\
	MOVQ	R11, 0x50(DI)	\
	MOVQ	R12, 0x58(DI)	\
	MOVQ	R13, 0x60(DI)	\
	MOVQ	CX, 0x68(DI)	\
	\
	MUL67			\
	ADDQ	0x48(DI), R8	\
	ADCQ	0x50(DI), R9	\
	ADCQ	0x58(DI), R10	\
	ADCQ	0x60(DI), R11	\
	ADCQ	0x68(DI), R12	\
	MOVQ	R8, 0x10(SI)	\ // OUT2
	MOVQ	R9, 0x18(SI)	\ // OUT3
	MOVQ	R10, 0x20(SI)	\ // OUT4
	MOVQ	R11, 0x28(SI)	\ // OUT5
	MOVQ	R12, 0x30(SI)	// OUT6

TEXT ·cswapP434(SB),NOSPLIT,$0-17

	MOVQ	x+0(FP), DI
	MOVQ	y+8(FP), SI
	MOVB	choice+16(FP), AL	// AL = 0 or 1
	MOVBLZX	AL, AX			// AX = 0 or 1
	NEGQ	AX			// AX = 0x00..00 or 0xff..ff
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
	MOVQ	(idx*8)(DI), BX	\ // BX = x[idx]
	MOVQ	(idx*8)(SI), CX	\ // CX = y[idx]
	MOVQ	CX, DX		\ // DX = y[idx]
	XORQ	BX, DX		\ // DX = y[idx] ^ x[idx]
	ANDQ	AX, DX		\ // DX = (y[idx] ^ x[idx]) & mask
	XORQ	DX, BX		\ // BX = ((y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
	XORQ	DX, CX		\ // CX = ((y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx]
	MOVQ	BX, (idx*8)(DI)	\
	MOVQ	CX, (idx*8)(SI)
#endif
	CSWAP_BLOCK(0)
	CSWAP_BLOCK(1)
	CSWAP_BLOCK(2)
	CSWAP_BLOCK(3)
	CSWAP_BLOCK(4)
	CSWAP_BLOCK(5)
	CSWAP_BLOCK(6)
#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
	RET

TEXT ·cmovP434(SB),NOSPLIT,$0-17

	MOVQ	x+0(FP), DI
	MOVQ	y+8(FP), SI
	MOVB	choice+16(FP), AL	// AL = 0 or 1
	MOVBLZX	AL, AX			// AX = 0 or 1
	NEGQ	AX			// AX = 0x00..00 or 0xff..ff
#ifndef CMOV_BLOCK
#define CMOV_BLOCK(idx) \
	MOVQ	(idx*8)(DI), BX	\ // BX = x[idx]
	MOVQ	(idx*8)(SI), DX	\ // DX = y[idx]
	XORQ	BX, DX		\ // DX = y[idx] ^ x[idx]
	ANDQ	AX, DX		\ // DX = (y[idx] ^ x[idx]) & mask
	XORQ	DX, BX		\ // BX = ((y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
	MOVQ	BX, (idx*8)(DI)
#endif
	CMOV_BLOCK(0)
	CMOV_BLOCK(1)
	CMOV_BLOCK(2)
	CMOV_BLOCK(3)
	CMOV_BLOCK(4)
	CMOV_BLOCK(5)
	CMOV_BLOCK(6)
#ifdef CMOV_BLOCK
#undef CMOV_BLOCK
#endif
	RET
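// Both cswapP434 and cmovP434 rely on the standard branch-free selection
// trick: NEGQ turns choice (0 or 1) into an all-zeros or all-ones mask, and
// (x^y)&mask is then XORed back in. The equivalent per-limb logic in Go
// (illustrative):
//
//	mask := -uint64(choice) // 0x00...0 or 0xFF...F
//	t := (x[i] ^ y[i]) & mask
//	x[i] ^= t // x[i] if choice == 0, otherwise y[i]
//	y[i] ^= t // swap only; cmov leaves y unchanged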
TEXT ·addP434(SB),NOSPLIT,$0-24
	MOVQ	z+0(FP), DX
	MOVQ	x+8(FP), DI
	MOVQ	y+16(FP), SI

	// Used later to calculate a mask
	XORQ	CX, CX

	// [R8-R14]: z = x + y
	MOVQ	( 0)(DI), R8;	ADDQ	( 0)(SI), R8
	MOVQ	( 8)(DI), R9;	ADCQ	( 8)(SI), R9
	MOVQ	(16)(DI), R10;	ADCQ	(16)(SI), R10
	MOVQ	(24)(DI), R11;	ADCQ	(24)(SI), R11
	MOVQ	(32)(DI), R12;	ADCQ	(32)(SI), R12
	MOVQ	(40)(DI), R13;	ADCQ	(40)(SI), R13
	MOVQ	(48)(DI), R14;	ADCQ	(48)(SI), R14

	XORQ	DI, DI

	MOVQ	P434X2_0, AX;	SUBQ	AX, R8
	MOVQ	P434X2_1, AX;	SBBQ	AX, R9
	SBBQ	AX, R10
	MOVQ	P434X2_3, AX;	SBBQ	AX, R11
	MOVQ	P434X2_4, AX;	SBBQ	AX, R12
	MOVQ	P434X2_5, AX;	SBBQ	AX, R13
	MOVQ	P434X2_6, AX;	SBBQ	AX, R14

	// mask
	SBBQ	$0, CX

	// if z<0 add P434x2 back
	MOVQ	P434X2_0, R15;	ANDQ	CX, R15
	MOVQ	P434X2_1, AX;	ANDQ	CX, AX

	ADDQ	R8, R15;	MOVQ	R15, ( 0)(DX)
	ADCQ	AX, R9;		MOVQ	R9, ( 8)(DX)
	ADCQ	AX, R10;	MOVQ	R10, (16)(DX)

	ADCQ	$0, DI
	MOVQ	P434X2_3, R15;	ANDQ	CX, R15
	MOVQ	P434X2_4, R8;	ANDQ	CX, R8
	MOVQ	P434X2_5, R9;	ANDQ	CX, R9
	MOVQ	P434X2_6, R10;	ANDQ	CX, R10
	BTQ	$0, DI

	ADCQ	R11, R15;	MOVQ	R15, (24)(DX)
	ADCQ	R12, R8;	MOVQ	R8, (32)(DX)
	ADCQ	R13, R9;	MOVQ	R9, (40)(DX)
	ADCQ	R14, R10;	MOVQ	R10, (48)(DX)

	RET

TEXT ·adlP434(SB),NOSPLIT,$0-24
	MOVQ	z+0(FP), DX
	MOVQ	x+8(FP), DI
	MOVQ	y+16(FP), SI

	MOVQ	( 0)(DI), R8
	ADDQ	( 0)(SI), R8
	MOVQ	( 8)(DI), R9
	ADCQ	( 8)(SI), R9
	MOVQ	(16)(DI), R10
	ADCQ	(16)(SI), R10
	MOVQ	(24)(DI), R11
	ADCQ	(24)(SI), R11
	MOVQ	(32)(DI), R12
	ADCQ	(32)(SI), R12
	MOVQ	(40)(DI), R13
	ADCQ	(40)(SI), R13
	MOVQ	(48)(DI), R14
	ADCQ	(48)(SI), R14
	MOVQ	(56)(DI), R15
	ADCQ	(56)(SI), R15
	MOVQ	(64)(DI), AX
	ADCQ	(64)(SI), AX
	MOVQ	(72)(DI), BX
	ADCQ	(72)(SI), BX
	MOVQ	(80)(DI), CX
	ADCQ	(80)(SI), CX

	MOVQ	R8, ( 0)(DX)
	MOVQ	R9, ( 8)(DX)
	MOVQ	R10, (16)(DX)
	MOVQ	R11, (24)(DX)
	MOVQ	R12, (32)(DX)
	MOVQ	R13, (40)(DX)
	MOVQ	R14, (48)(DX)
	MOVQ	R15, (56)(DX)
	MOVQ	AX, (64)(DX)
	MOVQ	BX, (72)(DX)
	MOVQ	CX, (80)(DX)

	MOVQ	(88)(DI), R8
	ADCQ	(88)(SI), R8
	MOVQ	(96)(DI), R9
	ADCQ	(96)(SI), R9
	MOVQ	(104)(DI), R10
	ADCQ	(104)(SI), R10

	MOVQ	R8, (88)(DX)
	MOVQ	R9, (96)(DX)
	MOVQ	R10, (104)(DX)
	RET

TEXT ·subP434(SB),NOSPLIT,$0-24
	MOVQ	z+0(FP), DX
	MOVQ	x+8(FP), DI
	MOVQ	y+16(FP), SI

	// Used later to calculate a mask
	XORQ	CX, CX

	MOVQ	( 0)(DI), R8;	SUBQ	( 0)(SI), R8
	MOVQ	( 8)(DI), R9;	SBBQ	( 8)(SI), R9
	MOVQ	(16)(DI), R10;	SBBQ	(16)(SI), R10
	MOVQ	(24)(DI), R11;	SBBQ	(24)(SI), R11
	MOVQ	(32)(DI), R12;	SBBQ	(32)(SI), R12
	MOVQ	(40)(DI), R13;	SBBQ	(40)(SI), R13
	MOVQ	(48)(DI), R14;	SBBQ	(48)(SI), R14

	// mask
	SBBQ	$0, CX
	XORQ	R15, R15

	// if z<0 add p434x2 back
	MOVQ	P434X2_0, DI;	ANDQ	CX, DI
	MOVQ	P434X2_1, SI;	ANDQ	CX, SI
	MOVQ	P434X2_3, AX;	ANDQ	CX, AX

	ADDQ	DI, R8;		MOVQ	R8, ( 0)(DX)
	ADCQ	SI, R9;		MOVQ	R9, ( 8)(DX)
	ADCQ	SI, R10;	MOVQ	R10, (16)(DX)
	ADCQ	AX, R11;	MOVQ	R11, (24)(DX)
	ADCQ	$0, R15

	MOVQ	P434X2_4, R8;	ANDQ	CX, R8
	MOVQ	P434X2_5, R9;	ANDQ	CX, R9
	MOVQ	P434X2_6, R10;	ANDQ	CX, R10

	BTQ	$0, R15

	ADCQ	R8, R12;	MOVQ	R12, (32)(DX)
	ADCQ	R9, R13;	MOVQ	R13, (40)(DX)
	ADCQ	R10, R14;	MOVQ	R14, (48)(DX)
	RET
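// addP434 and subP434 keep results in [0, 2*p434): after the raw add/sub,
// SBBQ $0, CX turns the borrow into a mask, and 2*p434 ANDed with that mask
// is conditionally added back. The same pattern in Go (illustrative; p434x2
// stands for the P434X2_* constants):
//
//	var borrow uint64
//	for i := range z {
//		z[i], borrow = bits.Sub64(x[i], y[i], borrow)
//	}
//	mask := -borrow // 0 if no underflow, 0xFF...F otherwise
//	var carry uint64
//	for i := range z {
//		z[i], carry = bits.Add64(z[i], p434x2[i]&mask, carry)
//	}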
TEXT ·sulP434(SB),NOSPLIT,$0-24
	MOVQ	z+0(FP), DX
	MOVQ	x+8(FP), DI
	MOVQ	y+16(FP), SI

	// Used later to store result of 0-borrow
	XORQ	CX, CX

	// SUBC for first 10 limbs
	MOVQ	( 0)(DI), R8;	SUBQ	( 0)(SI), R8
	MOVQ	( 8)(DI), R9;	SBBQ	( 8)(SI), R9
	MOVQ	(16)(DI), R10;	SBBQ	(16)(SI), R10
	MOVQ	(24)(DI), R11;	SBBQ	(24)(SI), R11
	MOVQ	(32)(DI), R12;	SBBQ	(32)(SI), R12
	MOVQ	(40)(DI), R13;	SBBQ	(40)(SI), R13
	MOVQ	(48)(DI), R14;	SBBQ	(48)(SI), R14
	MOVQ	(56)(DI), R15;	SBBQ	(56)(SI), R15
	MOVQ	(64)(DI), AX;	SBBQ	(64)(SI), AX
	MOVQ	(72)(DI), BX;	SBBQ	(72)(SI), BX

	MOVQ	R8, ( 0)(DX)
	MOVQ	R9, ( 8)(DX)
	MOVQ	R10, (16)(DX)
	MOVQ	R11, (24)(DX)
	MOVQ	R12, (32)(DX)
	MOVQ	R13, (40)(DX)
	MOVQ	R14, (48)(DX)
	MOVQ	R15, (56)(DX)
	MOVQ	AX, (64)(DX)
	MOVQ	BX, (72)(DX)

	// SUBC for last 4 limbs
	MOVQ	( 80)(DI), R8;	SBBQ	( 80)(SI), R8
	MOVQ	( 88)(DI), R9;	SBBQ	( 88)(SI), R9
	MOVQ	( 96)(DI), R10;	SBBQ	( 96)(SI), R10
	MOVQ	(104)(DI), R11;	SBBQ	(104)(SI), R11

	// Store carry flag
	SBBQ	$0, CX

	MOVQ	R8, ( 80)(DX)
	MOVQ	R9, ( 88)(DX)
	MOVQ	R10, ( 96)(DX)
	MOVQ	R11, (104)(DX)

	// Load p into registers:
	MOVQ	P434_0, R8;	ANDQ	CX, R8
	// P434_{1,2} = P434_0, so reuse R8
	MOVQ	P434_3, R9;	ANDQ	CX, R9
	MOVQ	P434_4, R10;	ANDQ	CX, R10
	MOVQ	P434_5, R11;	ANDQ	CX, R11
	MOVQ	P434_6, R12;	ANDQ	CX, R12

	MOVQ	(56 )(DX), AX;	ADDQ	R8, AX;		MOVQ	AX, (56 )(DX)
	MOVQ	(56+ 8)(DX), AX;	ADCQ	R8, AX;		MOVQ	AX, (56+ 8)(DX)
	MOVQ	(56+16)(DX), AX;	ADCQ	R8, AX;		MOVQ	AX, (56+16)(DX)
	MOVQ	(56+24)(DX), AX;	ADCQ	R9, AX;		MOVQ	AX, (56+24)(DX)
	MOVQ	(56+32)(DX), AX;	ADCQ	R10, AX;	MOVQ	AX, (56+32)(DX)
	MOVQ	(56+40)(DX), AX;	ADCQ	R11, AX;	MOVQ	AX, (56+40)(DX)
	MOVQ	(56+48)(DX), AX;	ADCQ	R12, AX;	MOVQ	AX, (56+48)(DX)

	RET

TEXT ·modP434(SB),NOSPLIT,$0-8
	MOVQ	x+0(FP), DI

	// Zero AX for later use:
	XORQ	AX, AX

	// Set x <- x - p
	MOVQ	P434_0, R8
	SUBQ	R8, ( 0)(DI)
	// P434_{1,2} = P434_0, so reuse R8
	MOVQ	P434_3, R9
	SBBQ	R8, ( 8)(DI)
	SBBQ	R8, (16)(DI)
	MOVQ	P434_4, R10
	SBBQ	R9, (24)(DI)
	MOVQ	P434_5, R11
	SBBQ	R10, (32)(DI)
	MOVQ	P434_6, R12
	SBBQ	R11, (40)(DI)
	SBBQ	R12, (48)(DI)

	// save carry
	SBBQ	$0, AX

	// Conditionally add p to x if x-p < 0
	ANDQ	AX, R8
	ANDQ	AX, R9
	ANDQ	AX, R10
	ANDQ	AX, R11
	ANDQ	AX, R12

	ADDQ	R8, ( 0)(DI)
	ADCQ	R8, ( 8)(DI)
	ADCQ	R8, (16)(DI)
	ADCQ	R9, (24)(DI)
	ADCQ	R10, (32)(DI)
	ADCQ	R11, (40)(DI)
	ADCQ	R12, (48)(DI)
	RET

// 434-bit multiplication using Karatsuba (one level),
// schoolbook (one level).
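// One Karatsuba level splits each 7-limb (448-bit) operand at 256 bits,
// A = AL + 2^256*AH and B = BL + 2^256*BH (AH and BH are 192 bits wide),
// so that
//
//	A*B = AL*BL + 2^256*((AH+AL)*(BH+BL) - AL*BL - AH*BH) + 2^512*AH*BH
//
// which costs three schoolbook products instead of four.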
TEXT ·mulP434(SB),NOSPLIT,$112-24
	MOVQ	z+0(FP), CX
	MOVQ	x+8(FP), DI
	MOVQ	y+16(FP), SI

	// Check whether to use optimized implementation
	CMPB	·HasADXandBMI2(SB), $1
	JE	mul_with_mulx_adcx_adox

	// rcx[0-3] <- AH+AL
	XORQ	AX, AX
	MOVQ	0x20(DI), R8
	MOVQ	0x28(DI), R9
	MOVQ	0x30(DI), R10
	XORQ	R11, R11
	ADDQ	0x0(DI), R8
	ADCQ	0x8(DI), R9
	ADCQ	0x10(DI), R10
	ADCQ	0x18(DI), R11
	// store AH+AL mask
	SBBQ	$0, AX
	MOVQ	AX, 0x40(SP)
	// store AH+AL in 0-0x18(rcx)
	MOVQ	R8, 0x0(CX)
	MOVQ	R9, 0x8(CX)
	MOVQ	R10, 0x10(CX)
	MOVQ	R11, 0x18(CX)

	// r12-r15 <- BH+BL
	XORQ	DX, DX
	MOVQ	0x20(SI), R12
	MOVQ	0x28(SI), R13
	MOVQ	0x30(SI), R14
	XORQ	R15, R15
	ADDQ	0x0(SI), R12
	ADCQ	0x8(SI), R13
	ADCQ	0x10(SI), R14
	ADCQ	0x18(SI), R15
	SBBQ	$0, DX

	// store BH+BL mask
	MOVQ	DX, 0x48(SP)

	// (rsp[0-0x38]) <- (AH+AL)*(BH+BL)
	MOVQ	(CX), AX
	MULQ	R12
	MOVQ	AX, (SP)
	MOVQ	DX, R8

	XORQ	R9, R9
	MOVQ	(CX), AX
	MULQ	R13
	ADDQ	AX, R8
	ADCQ	DX, R9

	XORQ	R10, R10
	MOVQ	0x8(CX), AX
	MULQ	R12
	ADDQ	AX, R8
	MOVQ	R8, 0x8(SP)
	ADCQ	DX, R9
	ADCQ	$0, R10

	XORQ	R8, R8
	MOVQ	(CX), AX
	MULQ	R14
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x10(CX), AX
	MULQ	R12
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x8(CX), AX
	MULQ	R13
	ADDQ	AX, R9
	MOVQ	R9, 0x10(SP)
	ADCQ	DX, R10
	ADCQ	$0, R8

	XORQ	R9, R9
	MOVQ	(CX), AX
	MULQ	R15
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R9

	MOVQ	0x18(CX), AX
	MULQ	R12
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R9

	MOVQ	0x8(CX), AX
	MULQ	R14
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R9

	MOVQ	0x10(CX), AX
	MULQ	R13
	ADDQ	AX, R10
	MOVQ	R10, 0x18(SP)
	ADCQ	DX, R8
	ADCQ	$0, R9

	XORQ	R10, R10
	MOVQ	0x8(CX), AX
	MULQ	R15
	ADDQ	AX, R8
	ADCQ	DX, R9
	ADCQ	$0, R10

	MOVQ	0x18(CX), AX
	MULQ	R13
	ADDQ	AX, R8
	ADCQ	DX, R9
	ADCQ	$0, R10

	MOVQ	0x10(CX), AX
	MULQ	R14
	ADDQ	AX, R8
	MOVQ	R8, 0x20(SP)
	ADCQ	DX, R9
	ADCQ	$0, R10

	XORQ	R11, R11
	MOVQ	0x10(CX), AX
	MULQ	R15
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R11

	MOVQ	0x18(CX), AX
	MULQ	R14
	ADDQ	AX, R9
	MOVQ	R9, 0x28(SP)
	ADCQ	DX, R10
	ADCQ	$0, R11

	MOVQ	0x18(CX), AX
	MULQ	R15
	ADDQ	AX, R10
	MOVQ	R10, 0x30(SP)
	ADCQ	DX, R11
	MOVQ	R11, 0x38(SP)

	// r12-r15 <- masked (BH + BL)
	MOVQ	0x40(SP), AX
	ANDQ	AX, R12
	ANDQ	AX, R13
	ANDQ	AX, R14
	ANDQ	AX, R15

	// r8-r11 <- masked (AH + AL)
	MOVQ	0x48(SP), AX
	MOVQ	0x00(CX), R8
	ANDQ	AX, R8
	MOVQ	0x08(CX), R9
	ANDQ	AX, R9
	MOVQ	0x10(CX), R10
	ANDQ	AX, R10
	MOVQ	0x18(CX), R11
	ANDQ	AX, R11

	// r12-r15 <- masked (AH + AL) + masked (BH + BL)
	ADDQ	R8, R12
	ADCQ	R9, R13
	ADCQ	R10, R14
	ADCQ	R11, R15

	// rsp[0x20-0x38] <- (AH+AL) x (BH+BL) high
	MOVQ	0x20(SP), AX
	ADDQ	AX, R12
	MOVQ	0x28(SP), AX
	ADCQ	AX, R13
	MOVQ	0x30(SP), AX
	ADCQ	AX, R14
	MOVQ	0x38(SP), AX
	ADCQ	AX, R15
	MOVQ	R12, 0x50(SP)
	MOVQ	R13, 0x58(SP)
	MOVQ	R14, 0x60(SP)
	MOVQ	R15, 0x68(SP)
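	// The masks stored at 0x40(SP)/0x48(SP) handle the carries of AH+AL
	// and BH+BL: writing AH+AL = Sa + ca*2^256 and BH+BL = Sb + cb*2^256
	// with ca,cb in {0,1} gives
	//
	//	(AH+AL)*(BH+BL) = Sa*Sb + 2^256*(ca*Sb + cb*Sa) + 2^512*ca*cb
	//
	// and the masked additions just performed fold the ca*Sb and cb*Sa
	// cross terms into the upper half of Sa*Sb.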
	// [rcx] <- CL = AL x BL
	MOVQ	(DI), R11
	MOVQ	(SI), AX
	MULQ	R11
	XORQ	R9, R9
	MOVQ	AX, (CX)
	MOVQ	DX, R8

	MOVQ	0x10(DI), R14
	MOVQ	0x8(SI), AX
	MULQ	R11
	XORQ	R10, R10
	ADDQ	AX, R8
	ADCQ	DX, R9

	MOVQ	0x8(DI), R12
	MOVQ	(SI), AX
	MULQ	R12
	ADDQ	AX, R8
	MOVQ	R8, 0x8(CX)
	ADCQ	DX, R9
	ADCQ	$0, R10

	XORQ	R8, R8
	MOVQ	0x10(SI), AX
	MULQ	R11
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	(SI), R13
	MOVQ	R14, AX
	MULQ	R13
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x8(SI), AX
	MULQ	R12
	ADDQ	AX, R9
	MOVQ	R9, 0x10(CX)
	ADCQ	DX, R10
	ADCQ	$0, R8

	XORQ	R9, R9
	MOVQ	0x18(SI), AX
	MULQ	R11
	MOVQ	0x18(DI), R15
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R9

	MOVQ	R15, AX
	MULQ	R13
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R9

	MOVQ	0x10(SI), AX
	MULQ	R12
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R9

	MOVQ	0x8(SI), AX
	MULQ	R14
	ADDQ	AX, R10
	MOVQ	R10, 0x18(CX)
	ADCQ	DX, R8
	ADCQ	$0, R9

	XORQ	R10, R10
	MOVQ	0x18(SI), AX
	MULQ	R12
	ADDQ	AX, R8
	ADCQ	DX, R9
	ADCQ	$0, R10

	MOVQ	0x8(SI), AX
	MULQ	R15
	ADDQ	AX, R8
	ADCQ	DX, R9
	ADCQ	$0, R10

	MOVQ	0x10(SI), AX
	MULQ	R14
	ADDQ	AX, R8
	MOVQ	R8, 0x20(CX)
	ADCQ	DX, R9
	ADCQ	$0, R10

	XORQ	R8, R8
	MOVQ	0x18(SI), AX
	MULQ	R14
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x10(SI), AX
	MULQ	R15
	ADDQ	AX, R9
	MOVQ	R9, 0x28(CX)
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x18(SI), AX
	MULQ	R15
	ADDQ	AX, R10
	MOVQ	R10, 0x30(CX)
	ADCQ	DX, R8
	MOVQ	R8, 0x38(CX)

	// rcx[0x40-0x68] <- AH*BH
	// multiplies 2 192-bit numbers A,B
	MOVQ	0x20(DI), R11
	MOVQ	0x20(SI), AX
	MULQ	R11
	XORQ	R9, R9
	MOVQ	AX, 0x40(CX)
	MOVQ	DX, R8

	MOVQ	0x30(DI), R14
	MOVQ	0x28(SI), AX
	MULQ	R11
	XORQ	R10, R10
	ADDQ	AX, R8
	ADCQ	DX, R9

	MOVQ	0x28(DI), R12
	MOVQ	0x20(SI), AX
	MULQ	R12
	ADDQ	AX, R8
	MOVQ	R8, 0x48(CX)
	ADCQ	DX, R9
	ADCQ	$0, R10

	XORQ	R8, R8
	MOVQ	0x30(SI), AX
	MULQ	R11
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x20(SI), R13
	MOVQ	R14, AX
	MULQ	R13
	ADDQ	AX, R9
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x28(SI), AX
	MULQ	R12
	ADDQ	AX, R9
	MOVQ	R9, 0x50(CX)
	ADCQ	DX, R10
	ADCQ	$0, R8

	MOVQ	0x30(SI), AX
	MULQ	R12
	XORQ	R12, R12
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R12

	MOVQ	0x28(SI), AX
	MULQ	R14
	ADDQ	AX, R10
	ADCQ	DX, R8
	ADCQ	$0, R12
	MOVQ	R10, 0x58(CX)

	MOVQ	0x30(SI), AX
	MULQ	R14
	ADDQ	AX, R8
	ADCQ	$0, R12
	MOVQ	R8, 0x60(CX)

	ADDQ	R12, DX

	// [r8-r15] <- (AH+AL)x(BH+BL) - ALxBL
	MOVQ	0x0(SP), R8
	SUBQ	0x0(CX), R8
	MOVQ	0x8(SP), R9
	SBBQ	0x8(CX), R9
	MOVQ	0x10(SP), R10
	SBBQ	0x10(CX), R10
	MOVQ	0x18(SP), R11
	SBBQ	0x18(CX), R11
	MOVQ	0x50(SP), R12
	SBBQ	0x20(CX), R12
	MOVQ	0x58(SP), R13
	SBBQ	0x28(CX), R13
	MOVQ	0x60(SP), R14
	SBBQ	0x30(CX), R14
	MOVQ	0x68(SP), R15
	SBBQ	0x38(CX), R15

	// [r8-r15] <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
	MOVQ	0x40(CX), AX
	SUBQ	AX, R8
	MOVQ	0x48(CX), AX
	SBBQ	AX, R9
	MOVQ	0x50(CX), AX
	SBBQ	AX, R10
	MOVQ	0x58(CX), AX
	SBBQ	AX, R11
	MOVQ	0x60(CX), AX
	SBBQ	AX, R12
	SBBQ	DX, R13
	SBBQ	$0, R14
	SBBQ	$0, R15
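	// r8-r15 now hold the middle Karatsuba term
	// M = (AH+AL)*(BH+BL) - AL*BL - AH*BH. The final pass below adds M,
	// weighted by 2^256, into AL*BL + 2^512*AH*BH already sitting at
	// [rcx], rippling the last carry into the two top limbs.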
	// Final result
	ADDQ	0x20(CX), R8
	MOVQ	R8, 0x20(CX)	// OUT4
	ADCQ	0x28(CX), R9
	MOVQ	R9, 0x28(CX)	// OUT5
	ADCQ	0x30(CX), R10
	MOVQ	R10, 0x30(CX)	// OUT6
	ADCQ	0x38(CX), R11
	MOVQ	R11, 0x38(CX)	// OUT7
	ADCQ	0x40(CX), R12
	MOVQ	R12, 0x40(CX)	// OUT8
	ADCQ	0x48(CX), R13
	MOVQ	R13, 0x48(CX)	// OUT9
	ADCQ	0x50(CX), R14
	MOVQ	R14, 0x50(CX)	// OUT10
	ADCQ	0x58(CX), R15
	MOVQ	R15, 0x58(CX)	// OUT11
	MOVQ	0x60(CX), R12
	ADCQ	$0, R12
	MOVQ	R12, 0x60(CX)	// OUT12
	ADCQ	$0, DX
	MOVQ	DX, 0x68(CX)	// OUT13
	RET

mul_with_mulx_adcx_adox:
	// Mul implementation for CPUs supporting two independent carry chains
	// (ADOX/ADCX) and the MULX multiplier, which leaves flags untouched
	XORQ	AX, AX
	MOVQ	0x0(DI), R8
	MOVQ	0x8(DI), R9
	MOVQ	0x10(DI), R10
	MOVQ	0x18(DI), R11

	MOVQ	BP, 0x70(SP)	// push: BP is callee-save.

	ADDQ	0x20(DI), R8
	ADCQ	0x28(DI), R9
	ADCQ	0x30(DI), R10
	ADCQ	$0, R11
	SBBQ	$0, AX
	MOVQ	R8, 0x0(SP)
	MOVQ	R9, 0x8(SP)
	MOVQ	R10, 0x10(SP)
	MOVQ	R11, 0x18(SP)

	// r12-r15 <- BH + BL, rbx <- mask
	XORQ	BX, BX
	MOVQ	0x0(SI), R12
	MOVQ	0x8(SI), R13
	MOVQ	0x10(SI), R14
	MOVQ	0x18(SI), R15
	ADDQ	0x20(SI), R12
	ADCQ	0x28(SI), R13
	ADCQ	0x30(SI), R14
	ADCQ	$0, R15
	SBBQ	$0, BX
	MOVQ	R12, 0x20(SP)
	MOVQ	R13, 0x28(SP)
	MOVQ	R14, 0x30(SP)
	MOVQ	R15, 0x38(SP)

	// r12-r15 <- masked (BH + BL)
	ANDQ	AX, R12
	ANDQ	AX, R13
	ANDQ	AX, R14
	ANDQ	AX, R15

	// r8-r11 <- masked (AH + AL)
	ANDQ	BX, R8
	ANDQ	BX, R9
	ANDQ	BX, R10
	ANDQ	BX, R11

	// r8-r11 <- masked (AH + AL) + masked (BH + BL)
	ADDQ	R12, R8
	ADCQ	R13, R9
	ADCQ	R14, R10
	ADCQ	R15, R11
	MOVQ	R8, 0x40(SP)
	MOVQ	R9, 0x48(SP)
	MOVQ	R10, 0x50(SP)
	MOVQ	R11, 0x58(SP)

	// [rsp] <- CM = (AH+AL) x (BH+BL)
	MULX256(0,SP,32,SP,0,SP,R8,R9,R10,R11,R12,R13,R14,R15,BX,BP)
	// [rcx] <- CL = AL x BL (Result c0-c3)
	MULX256(0,DI,0,SI,0,CX,R8,R9,R10,R11,R12,R13,R14,R15,BX,BP)
	// [rcx+64], rbx, rbp, rax <- CH = AH x BH
	MULX192(32,DI,32,SI,64,CX,R8,BX,R10,BP,R12,R13,R14)

	// r8-r11 <- (AH+AL) x (BH+BL), final step
	MOVQ	0x40(SP), R8
	MOVQ	0x48(SP), R9
	MOVQ	0x50(SP), R10
	MOVQ	0x58(SP), R11

	MOVQ	0x20(SP), DX
	ADDQ	DX, R8
	MOVQ	0x28(SP), DX
	ADCQ	DX, R9
	MOVQ	0x30(SP), DX
	ADCQ	DX, R10
	MOVQ	0x38(SP), DX
	ADCQ	DX, R11

	// [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
	MOVQ	0x0(SP), R12
	MOVQ	0x8(SP), R13
	MOVQ	0x10(SP), R14
	MOVQ	0x18(SP), R15
	SUBQ	0x0(CX), R12
	SBBQ	0x8(CX), R13
	SBBQ	0x10(CX), R14
	SBBQ	0x18(CX), R15
	SBBQ	0x20(CX), R8
	SBBQ	0x28(CX), R9
	SBBQ	0x30(CX), R10
	SBBQ	0x38(CX), R11

	// r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
	SUBQ	0x40(CX), R12
	SBBQ	0x48(CX), R13
	SBBQ	0x50(CX), R14
	SBBQ	BX, R15
	SBBQ	BP, R8
	SBBQ	AX, R9
	SBBQ	$0, R10
	SBBQ	$0, R11

	ADDQ	0x20(CX), R12
	MOVQ	R12, 0x20(CX)	// OUT4
	ADCQ	0x28(CX), R13
	MOVQ	R13, 0x28(CX)	// OUT5
	ADCQ	0x30(CX), R14
	MOVQ	R14, 0x30(CX)	// OUT6
	ADCQ	0x38(CX), R15
	MOVQ	R15, 0x38(CX)	// OUT7
	ADCQ	0x40(CX), R8
	MOVQ	R8, 0x40(CX)	// OUT8
	ADCQ	0x48(CX), R9
	MOVQ	R9, 0x48(CX)	// OUT9
	ADCQ	0x50(CX), R10
	MOVQ	R10, 0x50(CX)	// OUT10
	ADCQ	BX, R11
	MOVQ	R11, 0x58(CX)	// OUT11
	ADCQ	$0, BP
	MOVQ	BP, 0x60(CX)	// OUT12
	ADCQ	$0, AX
	MOVQ	AX, 0x68(CX)	// OUT13

	MOVQ	0x70(SP), BP	// pop: BP is callee-save.
	RET
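// rdcP434 below folds the seven Montgomery quotient digits in pairs:
// MUL01/MUL23/MUL45 each consume two input limbs (via MUL128x256 or
// MULX128x256) and MUL67 consumes the last one (via MUL64x256), with
// REDC_MULQ/REDC_MULX stitching the partial products together.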
TEXT ·rdcP434(SB),$0-16
	MOVQ	z+0(FP), SI
	MOVQ	x+8(FP), DI
	CMPB	·HasADXandBMI2(SB), $1
	JE	redc_bdw
#define MUL01 MUL128x256( 0,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
#define MUL23 MUL128x256(16,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
#define MUL45 MUL128x256(32,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
#define MUL67 MUL64x256 (48,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13)
	REDC_MULQ(MUL01, MUL23, MUL45, MUL67)
#undef MUL01
#undef MUL23
#undef MUL45
#undef MUL67
	RET

// 434-bit Montgomery reduction. Uses MULX/ADOX/ADCX instructions,
// available on Broadwell micro-architectures and newer.
redc_bdw:
#define MULX01 MULX128x256(R14,·P434p1+(8*P434_P1_ZEROS),R9 ,R10,R11,R12,R13)
#define MULX23 MULX128x256(R8 ,·P434p1+(8*P434_P1_ZEROS),BP ,R12,R13,R14,CX )
#define MULX45 MULX128x256(BX ,·P434p1+(8*P434_P1_ZEROS),R9 ,R10,BP ,R12,R13)
#define MULX67 MULX64x256 (    ·P434p1+(8*P434_P1_ZEROS),BX ,R8 ,R9 ,R11,CX )
	REDC_MULX(·P434p1+(8*P434_P1_ZEROS)+0(SB), MULX01, MULX23, MULX45, MULX67)
#undef MULX01
#undef MULX23
#undef MULX45
#undef MULX67
	RET