// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Origin: github.com/mattn/go@v0.0.0-20171011075504-07f7db3ea99f/src/crypto/elliptic/p256_asm_s390x.s

#include "textflag.h"

// p256ordK0: 4-byte constant loaded into element 3 of K0 by p256OrdMul and
// multiplied with the low product word to derive MK0 in every round.
// NOTE(review): presumably -1/ord mod 2^32 (Montgomery constant) - confirm.
DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f

// p256ord: 32-byte modulus used by p256OrdMul (offset 0 -> M1, offset 16 -> M0).
DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551

// p256: constant pool used by p256FromMont. Offsets 0x00-0x1f hold the
// modulus (high half first); the remaining entries are VPERM selectors.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0

// p256mul: constant pool addressed through CPOOL by p256NegCond, p256MulAsm,
// p256MulInternal and p256PointAddAffineAsm. Offsets 0x00-0x1f hold the
// modulus, 0x20-0x7f the VPERM selectors loaded by p256MulInternal
// (offsets 32..112), and 0x80-0x9f the value 2^256 mod P256.
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256

// Flag value 8 is RODATA in textflag.h; the last operand is the symbol size in bytes.
GLOBL p256ordK0<>(SB), 8, $4
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160

// func hasVectorFacility() bool
//
// Stores 1 in ret only if both checks below pass, otherwise 0:
//   1. STFLE (hand-encoded with WORD) fills the 24-byte frame with three
//      doublewords of facility bits; the byte at frame offset 16
//      (z-8(SP), i.e. x-24(SP)+16) must have mask 0x40 set.
//      NOTE(review): that position should correspond to facility bit 129
//      (vector facility) - confirm against the z/Architecture facility list.
//   2. A vector round trip works: 0xF is inserted into byte 0 of V16 and
//      read back into R1.
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD  $x-24(SP), R1
	XC    $24, 0(R1), 0(R1) // clear the storage
	MOVD  $2, R0            // R0 is the number of double words stored -1
	WORD  $0xB2B01000       // STFLE 0(R1)
	XOR   R0, R0            // reset the value of R0
	MOVBZ z-8(SP), R1       // third doubleword, first byte
	AND   $0x40, R1
	BEQ   novector

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, novector
	MOVB   $1, ret+0(FP)      // have vx
	RET

novector:
	MOVB $0, ret+0(FP) // no vx
	RET

// ---------------------------------------
// iff cond == 1 val <- -val
// func p256NegCond(val *p256Point, cond int)
//
// Only bytes 32..63 of *val (the Y coordinate, high half first) are read and
// rewritten. The selection mask is built by comparing cond with zero, so as
// written ANY non-zero cond selects P256 - Y; cond == 0 leaves Y unchanged.
// Both candidates are always computed and chosen with VSEL (no branch on cond).
#define P1ptr   R1
#define CPOOL   R4

#define Y1L   V0
#define Y1H   V1
#define T1L   V2
#define T1H   V3

#define PL    V30
#define PH    V31

#define ZER   V4
#define SEL1  V5
#define CAR1  V6
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD val+0(FP), P1ptr

	// PH:PL <- modulus from the p256mul pool
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL
	VL   0(CPOOL), PH

	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L

	// SEL1 <- all ones in both doublewords iff cond == 0
	VLREPG cond+8(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// T1H:T1L <- PH:PL - Y1H:Y1L (256-bit subtract with borrow chained through CAR1)
	VSCBIQ Y1L, PL, CAR1
	VSQ    Y1L, PL, T1L
	VSBIQ  PH, Y1H, CAR1, T1H

	// keep Y where the mask is set (cond == 0), otherwise take P - Y
	VSEL Y1L, T1L, SEL1, Y1L
	VSEL Y1H, T1H, SEL1, Y1H

	VST Y1H, 32(P1ptr)
	VST Y1L, 48(P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef ZER
#undef SEL1
#undef CAR1

// ---------------------------------------
// if cond == 0 res <- b; else res <- a
// func p256MovCond(res, a, b *p256Point, cond int)
//
// Copies 96 bytes (X, Y, Z; each stored high half first) into *res. Both
// source points are always loaded in full and the choice is made with VSEL
// under a mask derived from cond == 0, so there is no branch on cond.
#define P3ptr   R1
#define P1ptr   R2
#define P2ptr   R3

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ZER   V18
#define SEL1  V19
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD   res+0(FP), P3ptr
	MOVD   a+8(FP), P1ptr
	MOVD   b+16(FP), P2ptr
	VLREPG cond+24(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1   // SEL1 <- all ones iff cond == 0

	// point a
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	VL 64(P1ptr), Z1H
	VL 80(P1ptr), Z1L

	// point b
	VL 0(P2ptr), X2H
	VL 16(P2ptr), X2L
	VL 32(P2ptr), Y2H
	VL 48(P2ptr), Y2L
	VL 64(P2ptr), Z2H
	VL 80(P2ptr), Z2L

	// mask set (cond == 0) -> b, otherwise a
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ZER
#undef SEL1

// ---------------------------------------
// Constant time table access
// Indexed from 1 to 16, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point *p256Point, table []p256Point, idx int)
//
// The loop always reads 16 consecutive 96-byte entries starting at the
// table's data pointer; the slice length is never consulted, so the caller
// must supply at least 16 entries. Only the least significant byte of idx
// is used (it is loaded from idx+(32+7)(FP)). Entry k (1-based) is copied to
// *point when that byte equals k; if nothing matches, *point is all zero.
#define P3ptr   R1
#define P1ptr   R2
#define COUNT   R4

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD   point+0(FP), P3ptr
	MOVD   table+8(FP), P1ptr
	VLREPB idx+(32+7)(FP), IDX // low byte of idx replicated into every byte
	VREPIB $1, ONE
	VREPIB $1, SEL2            // SEL2 = running 1-based position, per byte
	MOVD   $1, COUNT

	// result starts out as all zero
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	VCEQG SEL2, IDX, SEL1 // SEL1 <- all ones iff position == idx byte

	// take the table entry under the mask, otherwise keep the accumulator
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VAB  SEL2, ONE, SEL2
	ADDW $1, COUNT
	ADD  $96, P1ptr      // next 96-byte entry
	CMPW COUNT, $17      // COUNT runs 1..16
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// ---------------------------------------
// Constant time table access
// Indexed from 1 to 64, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256SelectBase(point *p256Point, table []p256Point, idx int)
//
// Same scan as p256Select, but the loop bound is 65, so 64 consecutive
// 96-byte entries are always read; the slice length is not consulted.
#define P3ptr   R1
#define P1ptr   R2
#define COUNT   R4

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
TEXT ·p256SelectBase(SB), NOSPLIT, $0
	MOVD   point+0(FP), P3ptr
	MOVD   table+8(FP), P1ptr
	VLREPB idx+(32+7)(FP), IDX // low byte of idx replicated into every byte
	VREPIB $1, ONE
	VREPIB $1, SEL2            // SEL2 = running 1-based position, per byte
	MOVD   $1, COUNT

	// result starts out as all zero
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	VCEQG SEL2, IDX, SEL1 // SEL1 <- all ones iff position == idx byte

	// take the table entry under the mask, otherwise keep the accumulator
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VAB  SEL2, ONE, SEL2
	ADDW $1, COUNT
	ADD  $96, P1ptr      // next 96-byte entry
	CMPW COUNT, $65      // COUNT runs 1..64
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// ---------------------------------------
// func p256FromMont(res, in []byte)
//
// Reads 32 bytes from in (high half at offset 0), runs four identical
// reduction rounds against the p256<> pool, performs one final conditional
// subtraction of the modulus PH:PL, and writes 32 bytes to res. Only the
// data pointers of the two slices are used.
// NOTE(review): by its name this converts a value out of Montgomery form;
// that intent is not stated anywhere in this file - confirm with the caller.
#define res_ptr R1
#define x_ptr   R2
#define CPOOL   R4

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define SEL1  V7
#define SEL2  V8
#define CAR1  V9
#define CAR2  V10
#define RED1  V11
#define RED2  V12
#define PL    V13
#define PH    V14

TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+24(FP), x_ptr

	VZERO T2
	VZERO ZER
	MOVD  $p256<>+0x00(SB), CPOOL
	VL    16(CPOOL), PL
	VL    0(CPOOL), PH
	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL1

	VL (1*16)(x_ptr), T0
	VL (0*16)(x_ptr), T1

	// Each round: build RED2:RED1 from words of the accumulator (VPERM),
	// realign T2:T1:T0 by 8 bytes (VSLDB), then add RED2:RED1 with the
	// carry propagated into T2.

	// First round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	// TT1:TT0 <- T1:T0 - PH:PL; the borrow is folded into T2, which then
	// serves as the VSEL mask below.
	VSCBIQ  PL, T0, CAR1
	VSQ     PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ   T1, PH, CAR1, TT1
	VSBIQ   T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH

// ---------------------------------------
// func p256OrdMul(res, in1, in2 []byte)
//
// Multiplies the two 32-byte operands modulo the 32-byte value in p256ord<>
// (M1:M0), interleaving the multiplication with a reduction step: the eight
// 32-bit words of in2 are consumed one per round, from element 3 of Y0 up to
// element 0 of Y1, and each round derives a multiplier MK0 from the low
// product word and K0 (p256ordK0<>). A final conditional subtraction of
// M1:M0 produces the 32-byte result stored to res.
// NOTE(review): this has the shape of a word-serial Montgomery multiplication,
// which would make the inputs and output Montgomery-form values - confirm.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define M0    V4
#define M1    V5
#define T0    V6
#define T1    V7
#define T2    V8
#define YDIG  V9

#define ADD1  V16
#define ADD1H V17
#define ADD2  V18
#define ADD2H V19
#define RED1  V20
#define RED1H V21
#define RED2  V22
#define RED2H V23
#define CAR1  V24
#define CAR1M V25

#define MK0   V30
#define K0    V31
TEXT ·p256OrdMul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+24(FP), x_ptr
	MOVD in2+48(FP), y_ptr

	VZERO T2
	MOVD  $p256ordK0<>+0x00(SB), R4

	// The next three directives hand-encode the instruction named in the
	// following comment (only element 3 of K0 is loaded; only element 3 of
	// the VMLF result is used, via VREPF $3).
	// VLEF    $3, 0(R4), K0
	WORD $0xE7F40000
	BYTE $0x38
	BYTE $0x03
	MOVD $p256ord<>+0x00(SB), R4
	VL   16(R4), M0
	VL   0(R4), M1

	VL (1*16)(x_ptr), X0
	VL (0*16)(x_ptr), X1
	VL (1*16)(y_ptr), Y0
	VL (0*16)(y_ptr), Y1

	// ---------------------------------------------------------------------------/
	// Round 1 (word 3 of Y0). There is no previous accumulator, so the
	// products use VMLF/VMLHF instead of the multiply-and-add forms.
	VREPF $3, Y0, YDIG
	VMLF  X0, YDIG, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMLF  X1, YDIG, ADD2
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// NOTE(review): the picture below was re-aligned from a copy whose
	// whitespace had been collapsed; compare with the upstream file.
	/* *
	 *   ---+--------+--------+
	 *    T2|   T1   |   T0   |
	 *   ---+--------+--------+
	 *           *(add)*
	 *      +--------+--------+
	 *      |   X1   |   X0   |
	 *      +--------+--------+
	 *           *(mul)*
	 *      +--------+--------+
	 *      |  YDIG  |  YDIG  |
	 *      +--------+--------+
	 *           *(add)*
	 *      +--------+--------+
	 *      |   M1   |   M0   |
	 *      +--------+--------+
	 *           *(mul)*
	 *      +--------+--------+
	 *      |   MK0  |   MK0  |
	 *      +--------+--------+
	 *
	 *   ---------------------
	 *
	 *      +--------+--------+
	 *      |  ADD2  |  ADD1  |
	 *      +--------+--------+
	 *    +--------+--------+
	 *    | ADD2H  | ADD1H  |
	 *    +--------+--------+
	 *      +--------+--------+
	 *      |  RED2  |  RED1  |
	 *      +--------+--------+
	 *    +--------+--------+
	 *    | RED2H  | RED1H  |
	 *    +--------+--------+
	 */
	// Round 2 (word 2 of Y0); rounds 2..8 are identical except for the
	// word selected by the first VREPF.
	VREPF $2, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	VREPF $1, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	VREPF $0, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	VREPF $3, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	VREPF $2, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	VREPF $1, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	VREPF $0, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------

	// ADD2:ADD1 <- T1:T0 - M1:M0; the borrow is folded into T2, which then
	// serves as the VSEL mask below. RED1 is reused as a zero vector.
	VZERO   RED1
	VSCBIQ  M0, T0, CAR1
	VSQ     M0, T0, ADD1
	VSBCBIQ T1, M1, CAR1, CAR1M
	VSBIQ   T1, M1, CAR1, ADD2
	VSBIQ   T2, RED1, CAR1M, T2

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL T0, ADD1, T2, T0
	VSEL T1, ADD2, T2, T1

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0

// ---------------------------------------
// p256MulInternal
// V0-V3,V30,V31 - Not Modified
// V4-V15 - Volatile
//
// Register-level calling convention (no Go stack arguments): operands in
// X1:X0 and Y1:Y0, modulus in P1:P0, CPOOL pointing at p256mul<>; the
// product is returned in T1:T0. Several aliases below deliberately name the
// same vector register; the "Overloaded with" notes list the sharers.

#define CPOOL   R4

// Parameters
#define X0    V0 // Not modified
#define X1    V1 // Not modified
#define Y0    V2 // Not modified
#define Y1    V3 // Not modified
#define T0    V4
#define T1    V5
#define P0    V30 // Not modified
#define P1    V31 // Not modified

// Temporaries
#define YDIG  V6 // Overloaded with CAR1, ZER, SEL4
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3  V9 // Overloaded with SEL2,SEL5
#define ADD4  V10 // Overloaded with SEL3,SEL6
#define RED1  V11 // Overloaded with CAR2
#define RED2  V12
#define RED3  V13 // Overloaded with SEL1
#define T2    V14
// Overloaded temporaries
#define ADD1  V4 // Overloaded with T0
#define ADD2  V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER   V6 // Overloaded with YDIG, CAR1, SEL4
#define CAR1  V6 // Overloaded with YDIG, ZER, SEL4
#define CAR2  V11 // Overloaded with RED1
// Constant Selects
#define SEL1  V13 // Overloaded with RED3
#define SEL2  V9 // Overloaded with ADD3,SEL5
#define SEL3  V10 // Overloaded with ADD4,SEL6
#define SEL4  V6 // Overloaded with YDIG,CAR1,ZER
#define SEL5  V9 // Overloaded with ADD3,SEL2
#define SEL6  V10 // Overloaded with ADD4,SEL3

// NOTE(review): the two pictures below were re-aligned from a copy whose
// whitespace had been collapsed. Box and arrow columns are a best-effort
// reconstruction; compare with the upstream file before relying on them.
/* *
 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
 * With you, SIMD be...
 *
 *                                            +--------+--------+
 *                                   +--------|  RED2  |  RED1  |
 *                                   |        +--------+--------+
 *                                   |       ---+--------+--------+
 *                                   |  +---- T2|   T1   |   T0   |--+
 *                                   |  |    ---+--------+--------+  |
 *                                   |  |                            |
 *                                   |  |    ======================= |
 *                                   |  |                            |
 *                                   |  |       +--------+--------+<-+
 *                                   |  +-------|  ADD2  |  ADD1  |--|-----+
 *                                   |  |       +--------+--------+  |     |
 *                                   |  |     +--------+--------+<---+     |
 *                                   |  |     | ADD2H  | ADD1H  |--+       |
 *                                   |  |     +--------+--------+  |       |
 *                                   |  |     +--------+--------+<-+       |
 *                                   |  |     |  ADD4  |  ADD3  |--|-+     |
 *                                   |  |     +--------+--------+  | |     |
 *                                   |  |   +--------+--------+<---+ |     |
 *                                   |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
 *                                   |  |   +--------+--------+      | |   V
 *                                   |  | ------------------------   | | +--------+
 *                                   |  |                            | | |  RED3  |  [d0 0 0 d0]
 *                                   |  |                            | | +--------+
 *                                   |  +---->+--------+--------+    | |   |
 *   (T2[1w]||ADD2[4w]||ADD1[3w])    +--------|   T1   |   T0   |    | |   |
 *                                   |        +--------+--------+    | |   |
 *                                   +---->---+--------+--------+    | |   |
 *                                          T2|   T1   |   T0   |----+ |   |
 *                                         ---+--------+--------+    | |   |
 *                                         ---+--------+--------+<---+ |   |
 *                                     +--- T2|   T1   |   T0   |----------+
 *                                     |   ---+--------+--------+      |   |
 *                                     |  +--------+--------+<-------------+
 *                                     |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
 *                                     |  +--------+--------+     |    |   |
 *                                     |  +--------+<----------------------+
 *                                     |  |  RED3  |--------------+    |     [0 0 d1 d0]
 *                                     |  +--------+              |    |
 *                                     +--->+--------+--------+   |    |
 *                                          |   T1   |   T0   |--------+
 *                                          +--------+--------+   |    |
 *                                    --------------------------- |    |
 *                                                                |    |
 *                                        +--------+--------+<----+    |
 *                                        |  RED2  |  RED1  |          |
 *                                        +--------+--------+          |
 *                                       ---+--------+--------+<-------+
 *                                        T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
 *                                       ---+--------+--------+
 *
 *                                                                *Mi obra de arte de siglo XXI @vpaprots
 *
 *
 * First group is special, doesnt get the two inputs:
 *                                              +--------+--------+<-+
 *                                      +-------|  ADD2  |  ADD1  |--|-----+
 *                                      |       +--------+--------+  |     |
 *                                      |     +--------+--------+<---+     |
 *                                      |     | ADD2H  | ADD1H  |--+       |
 *                                      |     +--------+--------+  |       |
 *                                      |     +--------+--------+<-+       |
 *                                      |     |  ADD4  |  ADD3  |--|-+     |
 *                                      |     +--------+--------+  | |     |
 *                                      |   +--------+--------+<---+ |     |
 *                                      |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
 *                                      |   +--------+--------+      | |   V
 *                                      | ------------------------   | | +--------+
 *                                      |                            | | |  RED3  |  [d0 0 0 d0]
 *                                      |                            | | +--------+
 *                                      +---->+--------+--------+    | |   |
 *   (T2[1w]||ADD2[4w]||ADD1[3w])             |   T1   |   T0   |----+ |   |
 *                                            +--------+--------+    | |   |
 *                                         ---+--------+--------+<---+ |   |
 *                                     +--- T2|   T1   |   T0   |----------+
 *                                     |   ---+--------+--------+      |   |
 *                                     |  +--------+--------+<-------------+
 *                                     |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
 *                                     |  +--------+--------+     |    |   |
 *                                     |  +--------+<----------------------+
 *                                     |  |  RED3  |--------------+    |     [0 0 d1 d0]
 *                                     |  +--------+              |    |
 *                                     +--->+--------+--------+   |    |
 *                                          |   T1   |   T0   |--------+
 *                                          +--------+--------+   |    |
 *                                    --------------------------- |    |
 *                                                                |    |
 *                                        +--------+--------+<----+    |
 *                                        |  RED2  |  RED1  |          |
 *                                        +--------+--------+          |
 *                                       ---+--------+--------+<-------+
 *                                        T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
 *                                       ---+--------+--------+
 *
 * Last 'group' needs to RED2||RED1 shifted less
 */
// p256MulInternal multiplies X1:X0 by Y1:Y0 and leaves the reduced result in
// T1:T0. The eight 32-bit words of Y are consumed two per group (four
// groups), each group interleaving partial products with a reduction step,
// followed by one conditional subtraction of P1:P0.
// Selector constants are re-loaded from CPOOL inside every group because
// their registers double as arithmetic temporaries.
TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
	// NOTE(review): V13, V9, V10 and V6 are all overwritten (reloaded or used
	// as ADD3/ADD4/YDIG destinations) before being read, so these four loads
	// look redundant - possibly cache warming; confirm before removing.
	VL 32(CPOOL), SEL1
	VL 48(CPOOL), SEL2
	VL 64(CPOOL), SEL3
	VL 80(CPOOL), SEL4

	// ---------------------------------------------------
	// Group 1: words 3 and 2 of Y0 (no incoming accumulator)

	VREPF $3, Y0, YDIG
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H
	VMLF  X0, YDIG, ADD1
	VMLF  X1, YDIG, ADD2

	VREPF  $2, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, ZER, ADD2, T1  // ADD2 Free

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0       // ADD3 Free
	VACCCQ T1, ADD4, CAR1, T2
	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 2: words 1 and 0 of Y0

	VREPF  $1, Y0, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2

	VREPF  $0, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 3: words 3 and 2 of Y1

	VREPF  $3, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $2, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, T2, ADD2, T1   // ADD2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 4: words 1 and 0 of Y1; uses SEL5/SEL6 instead of SEL2..SEL4
	// and adds RED2:RED1 at the end of the group.

	VREPF  $1, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $0, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0
	VSLDB $12, T2, ADD2, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    96(CPOOL), SEL5
	VL    112(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	// ADD2H:ADD1H <- T1:T0 - P1:P0; the borrow is folded into T2, which
	// then serves as the VSEL mask. RED3 is reused as a zero vector.
	VZERO   RED3
	VSCBIQ  P0, T0, CAR1
	VSQ     P0, T0, ADD1H
	VSBCBIQ T1, P1, CAR1, CAR2
	VSBIQ   T1, P1, CAR1, ADD2H
	VSBIQ   T2, RED3, CAR2, T2

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL T0, ADD1H, T2, T0
	VSEL T1, ADD2H, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

// The three macros below expand inline and rely on register aliases that
// must be #defined at the point of use: ZER, CAR1, SEL1, TT0, TT1, PL and PH
// (all three), plus T2 and CAR2 where noted. All of those are clobbered.

// p256SubInternal: T1:T0 <- X1:X0 - Y1:Y0, then PH:PL is added to a copy
// (TT1:TT0) and VSEL picks between the two under a mask (SEL1) derived from
// the borrow indication of the high-half subtraction.
// NOTE(review): the intent is presumably "add the modulus back iff the
// subtraction borrowed"; the mask polarity depends on VSBCBIQ - confirm.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VZERO   ZER                \
	VSCBIQ  Y0, X0, CAR1       \
	VSQ     Y0, X0, T0         \
	VSBCBIQ X1, Y1, CAR1, SEL1 \
	VSBIQ   X1, Y1, CAR1, T1   \
	VSQ     SEL1, ZER, SEL1    \
	                           \
	VACCQ   T0, PL, CAR1       \
	VAQ     T0, PL, TT0        \
	VACQ    T1, PH, CAR1, TT1  \
	                           \
	VSEL    T0, TT0, SEL1, T0  \
	VSEL    T1, TT1, SEL1, T1  \

// p256AddInternal: T1:T0 <- X1:X0 + Y1:Y0 with the carry-out kept in T2,
// then PH:PL is subtracted into TT1:TT0 and VSEL picks between the two under
// a mask (SEL1) derived from T2 and the subtraction's borrow. Also uses T2
// and CAR2.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VACCQ   X0, Y0, CAR1        \
	VAQ     X0, Y0, T0          \
	VACCCQ  X1, Y1, CAR1, T2    \
	VACQ    X1, Y1, CAR1, T1    \
	                            \
	VZERO   ZER                 \
	VSCBIQ  PL, T0, CAR1        \
	VSQ     PL, T0, TT0         \
	VSBCBIQ T1, PH, CAR1, CAR2  \
	VSBIQ   T1, PH, CAR1, TT1   \
	VSBIQ   T2, ZER, CAR2, SEL1 \
	                            \
	VSEL    T0, TT0, SEL1, T0   \
	VSEL    T1, TT1, SEL1, T1

// p256HalfInternal: builds a mask (SEL1) from X0, conditionally replaces
// X1:X0 by X1:X0 + PH:PL (carry-out in T2), then shifts the T2:T1:T0 value
// right by one bit: VSRL by 1 within each quadword, with the bit crossing a
// quadword boundary recovered through VSLDB $15 / VSL by 7 and merged by VO.
// Also uses T2.
// NOTE(review): presumably the mask reflects the parity of X0 so that the
// modulus is added only to odd inputs before halving - confirm.
#define p256HalfInternal(T1, T0, X1, X0) \
	VZERO  ZER                \
	VSBIQ  ZER, ZER, X0, SEL1 \
	                          \
	VACCQ  X0, PL, CAR1       \
	VAQ    X0, PL, T0         \
	VACCCQ X1, PH, CAR1, T2   \
	VACQ   X1, PH, CAR1, T1   \
	                          \
	VSEL   X0, T0, SEL1, T0   \
	VSEL   X1, T1, SEL1, T1   \
	VSEL   ZER, T2, SEL1, T2  \
	                          \
	VSLDB  $15, T2, ZER, TT1  \
	VSLDB  $15, T1, ZER, TT0  \
	VREPIB $1, SEL1           \
	VSRL   SEL1, T0, T0       \
	VSRL   SEL1, T1, T1       \
	VREPIB $7, SEL1           \
	VSL    SEL1, TT0, TT0     \
	VSL    SEL1, TT1, TT1     \
	VO     T0, TT0, T0        \
	VO     T1, TT1, T1

// ---------------------------------------
// func p256MulAsm(res, in1, in2 []byte)
//
// Go-callable wrapper around p256MulInternal: loads the two 32-byte operands
// (high half at offset 0) into X1:X0 and Y1:Y0, loads the modulus from the
// start of p256mul<> into P1:P0, leaves CPOOL pointing at p256mul<> as the
// callee expects, and stores the 32-byte result from T1:T0 to res. Only the
// data pointers of the three slices are used.
#define res_ptr R1
#define x_ptr   R2
#define y_ptr   R3
#define CPOOL   R4

// Parameters
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

// Constants
#define P0    V30
#define P1    V31
TEXT ·p256MulAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+24(FP), x_ptr
	MOVD in2+48(FP), y_ptr

	VL (1*16)(x_ptr), X0
	VL (0*16)(x_ptr), X1
	VL (1*16)(y_ptr), Y0
	VL (0*16)(y_ptr), Y1

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), P0
	VL   0(CPOOL), P1

	CALL p256MulInternal<>(SB)

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

// Point add with P2 being affine point
// If sign == 1 -> P2 = -P2
// If sel == 0 -> P3 = P1
// if zero == 0 -> P3 = P2
// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
#define P3ptr   R1
#define P1ptr   R2
#define P2ptr   R3
#define CPOOL   R4

// Temporaries in REGs
#define Y2L    V15
#define Y2H    V16
#define T1L    V17
#define T1H    V18
#define T2L    V19
#define T2H    V20
#define T3L    V21
#define T3H    V22
#define T4L    V23
#define T4H    V24

// Temps for Sub and Add
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

#define PL    V30
#define PH    V31

// Names for zero/sel selects
#define X1L    V0
#define X1H    V1
#define Y1L    V2 // p256MulAsmParmY
#define Y1H    V3 // p256MulAsmParmY
#define Z1L    V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V28
#define Z3H V29

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * T1 = Z1²
 * T2 = T1*Z1
 * T1 = T1*X2
 * T2 = T2*Y2
 * T1 = T1-X1
 * T2 = T2-Y1
 * Z3 = Z1*T1
 * T3 = T1²
 * T4 = T3*T1
 * T3 = T3*X1
 * T1 = 2*T3
 * X3 = T2²
 * X3 = X3-T1
 * X3 = X3-T4
 * T3 = T3-X3
 * T3 = T3*T2
 * T4 = T4*Y1
 * Y3 = T3-T4

 * Three operand formulas, but with MulInternal X,Y used to store temps
 * (X, Y are the multiplier inputs, T its output; "-" means "left unchanged";
 * the right-hand columns list the temporaries that are live after the step)
X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
X=Z1; Y-  ; MUL;Z3:=T// Z3 = Z1*T1         T2
X=Y;  Y-  ; MUL;X=T  // T3 = T1*T1         T2
X-  ; Y-  ; MUL;T4=T // T4 = T3*T1         T2        T4
X-  ; Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
X-  ; Y-  ; MUL;T3=T // T3 = T3*T2         T2   T3   T4
X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4

	*/
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
	MOVD P3+0(FP), P3ptr
	MOVD P1+8(FP), P1ptr
	MOVD P2+16(FP), P2ptr

	// PH:PL = the P256 constant stored in the first 32 bytes of p256mul<>.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	//	if (sign == 1) {
	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
	//	}

	VL 32(P2ptr), Y2H
	VL 48(P2ptr), Y2L

	// SEL1 = all ones when sign == 0 (keep Y2), all zeros otherwise (take P-Y2).
	VLREPG sign+24(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	// T1H:T1L = P - Y2, computed unconditionally.
	VSCBIQ Y2L, PL, CAR1
	VSQ Y2L, PL, T1L
	VSBIQ PH, Y2H, CAR1, T1H

	VSEL Y2L, T1L, SEL1, Y2L
	VSEL Y2H, T1H, SEL1, Y2H

	/* *
	 * Three operand formula:
	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
	 */
	// X=Z1; Y=Z1; MUL; T-  // T1 = Z1²      T1
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	VLR X0, Y0
	VLR X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
	VLR T0, X0
	VLR T1, X1
	CALL p256MulInternal<>(SB)
	VLR T0, T2L
	VLR T1, T2H

	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
	VL 0(P2ptr), Y1 // X2H
	VL 16(P2ptr), Y0 // X2L
	CALL p256MulInternal<>(SB)
	VLR T0, T1L
	VLR T1, T1H

	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
	VLR T2L, X0
	VLR T2H, X1
	VLR Y2L, Y0
	VLR Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y-  ; MUL; Z3:=T// Z3 = Z1*T1         T2
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	CALL p256MulInternal<>(SB)

	// Z3 is parked in registers instead of being stored right away; all
	// three result coordinates are written together at the end, after the
	// sel/zero overrides.
	// VST T1, 64(P3ptr)
	// VST T0, 80(P3ptr)
	VLR T0, Z3L
	VLR T1, Z3H

	// X=Y;  Y-  ; MUL; X=T  // T3 = T1*T1         T2
	VLR Y0, X0
	VLR Y1, X1
	CALL p256MulInternal<>(SB)
	VLR T0, X0
	VLR T1, X1

	// X-  ; Y-  ; MUL; T4=T // T4 = T3*T1         T2        T4
	CALL p256MulInternal<>(SB)
	VLR T0, T4L
	VLR T1, T4H

	// X-  ; Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
	VL 0(P1ptr), Y1 // X1H
	VL 16(P1ptr), Y0 // X1L
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
	VLR T2L, X0
	VLR T2H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VLR T0, X3L
	VLR T1, X3H

	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X-  ; Y-  ; MUL; T3=T // T3 = T3*T2         T2   T3   T4
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
	VLR T4L, X0
	VLR T4H, X1
	VL 32(P1ptr), Y1 // Y1H
	VL 48(P1ptr), Y0 // Y1L
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	//	if (sel == 0) {
	//		copy(P3.x[:], X1)
	//		copy(P3.y[:], Y1)
	//		copy(P3.z[:], Z1)
	//	}

	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L

	// Y1 already loaded, left over from addition
	// NOTE(review): Y1L/Y1H are V2/V3, loaded for the last multiply above;
	// this relies on p256MulInternal<> leaving its Y inputs intact - confirm.
	VL 64(P1ptr), Z1H
	VL 80(P1ptr), Z1L

	// SEL1 = all ones when sel == 0 (take P1), all zeros otherwise.
	VLREPG sel+32(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	VSEL X1L, X3L, SEL1, X3L
	VSEL X1H, X3H, SEL1, X3H
	VSEL Y1L, Y3L, SEL1, Y3L
	VSEL Y1H, Y3H, SEL1, Y3H
	VSEL Z1L, Z3L, SEL1, Z3L
	VSEL Z1H, Z3H, SEL1, Z3H

	//	if (zero == 0) {
	//		copy(P3.x[:], X2)
	//		copy(P3.y[:], Y2)
	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
	//	}
	VL 0(P2ptr), X2H
	VL 16(P2ptr), X2L

	// Y2 already loaded
	// NOTE(review): Y2L/Y2H (V15/V16) were set at the top of the routine
	// (possibly negated); this relies on p256MulInternal<> not clobbering
	// V15/V16 - confirm.
	// Z2 = the (1*2^256)%P256 constant at p256mul<>+0x80.
	VL 128(CPOOL), Z2H
	VL 144(CPOOL), Z2L

	// SEL1 = all ones when zero == 0 (take P2), all zeros otherwise.
	VLREPG zero+40(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	VSEL X2L, X3L, SEL1, X3L
	VSEL X2H, X3H, SEL1, X3H
	VSEL Y2L, Y3L, SEL1, Y3L
	VSEL Y2H, Y3H, SEL1, Y3H
	VSEL Z2L, Z3L, SEL1, Z3L
	VSEL Z2H, Z3H, SEL1, Z3H

	// All done, store out the result!!!
	VST X3H, 0(P3ptr)
	VST X3L, 16(P3ptr)
	VST Y3H, 32(P3ptr)
	VST Y3L, 48(P3ptr)
	VST Z3H, 64(P3ptr)
	VST Z3L, 80(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// p256PointDoubleAsm(P3, P1 *p256Point)
//
// Point doubling: P3 = 2*P1. Coordinates are read from P1 at byte offsets
// 0 (X), 32 (Y) and 64 (Z), high 16 bytes first, and written to the same
// offsets in P3. Each result coordinate is stored only after the last load of
// the input coordinate it would overwrite if P3 and P1 were the same point.
//
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
#define P3ptr R1
#define P1ptr R2
#define CPOOL R4

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

// NOTE(review): Z1H aliases V11, the same register as TT0 below, which the
// Sub/Add/Half macros overwrite. This is harmless as written because Z1L/Z1H
// are never referenced in this routine (Z1 is loaded straight into X/Y), but
// Z1H must not be used to hold a value across a macro expansion.
#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Z3L/Z3H alias T3L/T3H (V23/V24); they are not referenced in this routine,
// Z3 is stored directly from T1:T0.
#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
/*
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
 * 	B  = 2Y₁
 * 	Z₃ = B×Z₁
 * 	C  = B²
 * 	D  = C×X₁
 * 	X₃ = A²-2D
 * 	Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 *       T1 = Z1²
 *       T2 = X1-T1
 *       T1 = X1+T1
 *       T2 = T2*T1
 *       T2 = 3*T2
 *       Y3 = 2*Y1
 *       Z3 = Y3*Z1
 *       Y3 = Y3²
 *       T3 = Y3*X1
 *       Y3 = Y3²
 *       Y3 = half*Y3
 *       X3 = T2²
 *       T1 = 2*T3
 *       X3 = X3-T1
 *       T1 = T3-X3
 *       T1 = T1*T2
 *       Y3 = T1-Y3
 */

TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
	MOVD P3+0(FP), P3ptr
	MOVD P1+8(FP), P1ptr

	// PH:PL = the P256 constant stored in the first 32 bytes of p256mul<>.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	VLR X0, Y0
	VLR X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T)            // T2 = X1-T1
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T)            // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	VL 64(P1ptr), Y1 // Z1H
	VL 80(P1ptr), Y0 // Z1L
	CALL p256MulInternal<>(SB)
	// Z1 is not read again after this point, so Z3 can be stored now.
	VST T1, 64(P3ptr)
	VST T0, 80(P3ptr)

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
	VLR T0, X0
	VLR T1, X1
	VL 0(P1ptr), Y1
	VL 16(P1ptr), Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T)              // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T-    // X3 = T2²
	VLR T2L, X0
	VLR T2H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3)          // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
	VST X3H, 0(P3ptr)
	VST X3L, 16(P3ptr)

	// SUB(X<T3-X3)           // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	VST Y3H, 32(P3ptr)
	VST Y3L, 48(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// p256PointAddAsm(P3, P1, P2 *p256Point)
//
// General point addition: P3 = P1 + P2, with both inputs carrying a Z
// coordinate. Coordinates are read at byte offsets 0 (X), 32 (Y) and 64 (Z),
// high 16 bytes first, and written to the same offsets in P3.
//
// Besides writing P3, the routine stores an 8-byte result at ret+24(FP)
// (presumably an int return value on the Go side - confirm against the Go
// declaration): 1 when both H and R below are equal to 0 or to P, otherwise 0.
// NOTE(review): H and R both being zero mod P looks like the "P1 and P2 are the
// same point" case, which this addition formula cannot handle; presumably the
// caller checks the flag and doubles instead - verify against the caller.
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3
#define CPOOL R4
#define ISZERO R5
#define TRUE R6

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL V24
#define HH V25
#define RL V26
#define RH V27

// Temps for Sub and Add
#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
/*
 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A = X₁×Z₂²
 * B = Y₁×Z₂³
 * C = X₂×Z₁²-A
 * D = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H  = X2*T1
 * H  = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R  = Z1*T1
 * R  = Y2*R
 * R  = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	// SUB(H<H-T)            // H  = H-U1
	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	// SUB(R<T-S1)           // R  = R-S1
	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	// SUB(T<T-T2)           // X3 = X3-T2
	// ADD(X<U1+U1)          // T1 = 2*U1
	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	// SUB(Y<U1-T)           // Y3 = U1-X3
	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	*/
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	MOVD P3+0(FP), P3ptr
	MOVD P1+8(FP), P1ptr
	MOVD P2+16(FP), P2ptr

	// PH:PL = the P256 constant stored in the first 32 bytes of p256mul<>.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	VLR X0, Y0
	VLR X1, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, RL
	VLR T1, RH

	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	VL 0(P2ptr), X1 // X2H
	VL 16(P2ptr), X0 // X2L
	CALL p256MulInternal<>(SB)
	VLR T0, HL
	VLR T1, HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	VL 64(P2ptr), X1 // Z2H
	VL 80(P2ptr), X0 // Z2L
	VLR X0, Y0
	VLR X1, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	VL 0(P1ptr), X1 // X1H
	VL 16(P1ptr), X0 // X1L
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// SUB(H<H-T)            // H  = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	// Both encodings of zero (0 and P itself) are tested: OR the two halves
	// together and compare with zero, first for H and then for H xor P.
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO HL, HH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX HL, PL, T1L
	VX HH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	MOVD ISZERO, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	VL 64(P2ptr), Y1 // Z2H
	VL 80(P2ptr), Y0 // Z2L
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VLR T0, X0
	VLR T1, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256MulInternal<>(SB)
	// Z1 and Z2 are not read again after this point, so Z3 can be stored
	// even if P3 shares storage with P1 or P2.
	VST T1, 64(P3ptr)
	VST T0, 80(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	VL 32(P1ptr), X1
	VL 48(P1ptr), X0
	VLR S1L, Y0
	VLR S1H, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	VL 32(P2ptr), X1
	VL 48(P2ptr), X0
	VLR RL, Y0
	VLR RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R  = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Same zero test as for H; the outcome is ANDed into the flag already
	// stored at ret+24(FP), so ret stays 1 only when both H and R are zero.
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO RL, RH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX RL, PL, T1L
	VX RH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	AND ret+24(FP), ISZERO
	MOVD ISZERO, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VLR HL, X0
	VLR HH, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, T2L
	VLR T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VLR U1L, X0
	VLR U1H, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VLR RL, X0
	VLR RH, X1
	VLR RL, Y0
	VLR RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	VST T1, 0(P3ptr)
	VST T0, 16(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VLR RL, X0
	VLR RH, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VLR S1L, X0
	VLR S1H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	VST T1, 32(P3ptr)
	VST T0, 48(P3ptr)

	RET