// github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_s390x.s
// This is a port of the NIST P256 s390x asm implementation to SM2 P256.
//
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
#include "go_asm.h"

// p256ordK0: 32-bit constant loaded into K0 (V31) by p256OrdMul/p256OrdSqr and
// multiplied with the low product word in sm2p256OrdMulInternal to form MK0.
DATA p256ordK0<>+0x00(SB)/4, $0x72350975
// p256ord: 256-bit modulus used by the Ord* routines, most-significant
// doubleword first. Loaded with VLM as M1 (bytes 0-15) and M0 (bytes 16-31).
DATA p256ord<>+0x00(SB)/8, $0xfffffffeffffffff
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x18(SB)/8, $0x53bbf40939d54123
// p256: constant pool read by p256FromMont (modulus at +0x00/+0x10, VPERM
// selector at +0x20). The +0x30 mask is not referenced in the code visible here.
DATA p256<>+0x00(SB)/8, $0xfffffffeffffffff // P256
DATA p256<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x10(SB)/8, $0xffffffff00000000 // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0000000000000000 // SEL 0 0 d1 d0
DATA p256<>+0x28(SB)/8, $0x18191a1b1c1d1e1f // SEL 0 0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0706050403020100 // LE2BE permute mask
DATA p256<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
// p256mul: constant pool addressed through CPOOL by the field routines.
// +0x00/+0x10 hold the modulus (high/low 128 bits); +0x20 and +0x30 are the
// VPERM selectors read at 32(CPOOL) and 48(CPOOL) by sm2p256MulInternal.
DATA p256mul<>+0x00(SB)/8, $0xfffffffeffffffff // P256
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000000 // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0405060708090a0b // SEL 0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x0000000100000000 // (1*2^256)%P256
DATA p256mul<>+0x48(SB)/8, $0x0000000000000000 // (1*2^256)%P256
DATA p256mul<>+0x50(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
DATA p256mul<>+0x58(SB)/8, $0x0000000000000001 // (1*2^256)%P256
GLOBL p256ordK0<>(SB), 8, $4
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $64
GLOBL p256mul<>(SB), 8, $96

// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
// Tail-jumps to p256BigToLittle, which performs the conversion.
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
	JMP ·p256BigToLittle(SB)

// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
// Tail-jumps to p256BigToLittle, which performs the conversion.
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
	JMP ·p256BigToLittle(SB)

// ---------------------------------------
// func p256LittleToBig(res *[32]byte, in *p256Element)
// Tail-jumps to p256BigToLittle, which performs the conversion.
TEXT ·p256LittleToBig(SB), NOSPLIT, $0
	JMP ·p256BigToLittle(SB)

// func p256BigToLittle(res *p256Element, in *[32]byte)
// Copies 32 bytes from in to res with the order of the four 64-bit
// doublewords reversed; bytes inside each doubleword are left untouched.
// The same reversal is used for both directions, which is why the three
// entry points above jump here.
#define res_ptr R1
#define in_ptr R2
#define T1L V3
#define T1H V2
#define T1L2 V1

TEXT ·p256BigToLittle(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), in_ptr

	// T1H (V2) <- in[0:16], T1L (V3) <- in[16:32]
	VLM (in_ptr), T1H, T1L

	// VPDI $0x4 with identical sources swaps the two doublewords of a register.
	VPDI $0x4, T1L, T1L, T1L2
	VPDI $0x4, T1H, T1H, T1H

	// Store V1 then V2: the swapped second half first, swapped first half last.
	VSTM T1L2, T1H, (res_ptr)

	RET

#undef res_ptr
#undef in_ptr
#undef T1L
#undef T1H
#undef T1L2

// ---------------------------------------
// iff cond == 1 val <- -val
// func p256NegCond(val *p256Element, cond int)
//
// Branch-free: P - val is always computed and the stored result is chosen
// with VSEL. The select mask is built by comparing cond with zero, so the
// original value is kept only when cond == 0 and any other cond stores P - val.
#define P1ptr R1
#define CPOOL R4

#define Y1L V0
#define Y1H V1
#define T1L V2
#define T1H V3

#define PL V31
#define PH V30

#define ZER V4
#define SEL1 V5
#define CAR1 V6
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD val+0(FP), P1ptr

	// PH (V30) <- p256mul[0:16], PL (V31) <- p256mul[16:32]
	MOVD $p256mul<>+0x00(SB), CPOOL
	VLM (CPOOL), PH, PL

	// Load val and swap doublewords within each register.
	VLM (P1ptr), Y1L, Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VPDI $0x4, Y1L, Y1L, Y1L

	// SEL1 <- (cond == 0), evaluated per doubleword on a replicated cond.
	VLREPG cond+8(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	// T1H||T1L <- PH||PL - Y1H||Y1L (256-bit subtract with borrow in CAR1)
	VSCBIQ Y1L, PL, CAR1
	VSQ Y1L, PL, T1L
	VSBIQ PH, Y1H, CAR1, T1H

	// Keep Y where SEL1 is set, otherwise take P - Y.
	VSEL Y1L, T1L, SEL1, Y1L
	VSEL Y1H, T1H, SEL1, Y1H

	// Undo the doubleword swap and write the result back in place.
	VPDI $0x4, Y1H, Y1H, Y1H
	VPDI $0x4, Y1L, Y1L, Y1L
	VSTM Y1L, Y1H, (P1ptr)

	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef ZER
#undef SEL1
#undef CAR1
// ---------------------------------------
// if cond == 0 res <- b; else res <- a
// func p256MovCond(res, a, b *P256Point, cond int)
//
// Branch-free 96-byte select: both points are loaded in full and every
// 16-byte lane is chosen with VSEL under the same mask.
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3

#define X1L V1
#define X1H V0
#define Y1L V3
#define Y1H V2
#define Z1L V5
#define Z1H V4
#define X2L V7
#define X2H V6
#define Y2L V9
#define Y2H V8
#define Z2L V11
#define Z2H V10

#define ZER V18
#define SEL1 V19
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	// SEL1 <- (cond == 0)
	VLREPG cond+24(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	// V0..V5 <- a, V6..V11 <- b
	VLM (P1ptr), X1H, Z1L
	VLM (P2ptr), X2H, Z2L

	// Take b's lane where SEL1 is set (cond == 0), otherwise keep a's lane.
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VSTM X1H, Z1L, (P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ZER
#undef SEL1

// ---------------------------------------
// Constant time table access
// Indexed from 1 to limit, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(res *P256Point, table *p256Table, idx int, limit int)
//
// Every one of the `limit` 96-byte entries is read; the entry whose 1-based
// position equals idx is accumulated through VSEL. If no position matches
// (e.g. idx == 0) the all-zero accumulator is stored.
// Only the least-significant byte of idx is loaded for the comparison.
#define P3ptr R1
#define P1ptr R2
#define LIMIT R3
#define COUNT R4

#define X1L V1
#define X1H V0
#define Y1L V3
#define Y1H V2
#define Z1L V5
#define Z1H V4
#define X2L V7
#define X2H V6
#define Y2L V9
#define Y2H V8
#define Z2L V11
#define Z2H V10

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD limit+24(FP), LIMIT
	// Replicate the byte at idx+7 (low byte of the big-endian int) into IDX.
	VLREPB idx+(16+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // running 1-based position, replicated in every byte
	MOVD $1, COUNT

	// Accumulator starts as all zeroes.
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VLM (P1ptr), X2H, Z2L

	// SEL1 <- (position == idx)
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VAB SEL2, ONE, SEL2 // position++
	ADD $96, P1ptr      // next table entry
	ADD $1, COUNT
	CMPBLE COUNT, LIMIT, loop_select

	VSTM X1H, Z1L, (P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef LIMIT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// ---------------------------------------

// func p256FromMont(res, in *p256Element)
//
// Runs four identical reduction rounds over the 256-bit input (each round
// consumes the low 64 bits and shifts the accumulator right by 8 bytes),
// then performs one conditional subtraction of P and stores the result.
// Input and output use the same doubleword-swapped memory layout as the
// other routines in this file (VPDI $0x4 on load and on store).
#define res_ptr R1
#define x_ptr R2
#define CPOOL R4

#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4

#define ZER V6
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PH V13
#define PL V14
#define SEL1 V15

TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	VZERO T2
	VZERO ZER
	// PH <- p256[0:16], PL <- p256[16:32], SEL1 <- p256[32:48] (VPERM selector)
	MOVD $p256<>+0x00(SB), CPOOL
	VLM (CPOOL), PH, SEL1

	VLM (x_ptr), T0, T1
	VPDI $0x4, T0, T0, T0
	VPDI $0x4, T1, T1, T1

	// First round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// Second round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// Third round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// Last round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// ---------------------------------------------------

	// TT1||TT0 <- T1||T0 - P; the borrow is folded into T2, which then
	// serves as the select mask below.
	VSCBIQ PL, T0, CAR1
	VSQ PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ T1, PH, CAR1, TT1
	VSBIQ T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VPDI $0x4, T0, T0, TT0
	VPDI $0x4, T1, T1, TT1
	VSTM TT0, TT1, (res_ptr)

	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH

// Constant time table access
// Indexed from 1 to 32, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Reads all 32 entries of 64 bytes each and accumulates, through VSEL, the
// one whose 1-based position equals idx. If no position matches (e.g.
// idx == 0) the all-zero accumulator is stored.
// Only the least-significant byte of idx is loaded for the comparison.
// The entry count is fixed by the `CMPW COUNT, $33` loop bound.

#define P3ptr R1
#define P1ptr R2
#define COUNT R4

#define X1L V1
#define X1H V0
#define Y1L V3
#define Y1H V2
#define X2L V7
#define X2H V6
#define Y2L V9
#define Y2H V8

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21

TEXT ·p256SelectAffine(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	// Replicate the byte at idx+7 (low byte of the big-endian int) into IDX.
	VLREPB idx+(16+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // running 1-based position, replicated in every byte
	MOVD $1, COUNT

	// Accumulator starts as all zeroes.
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L

loop_select:
	VLM (P1ptr), X2H, Y2L

	// SEL1 <- (position == idx)
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H

	VAB SEL2, ONE, SEL2 // position++
	ADDW $1, COUNT
	ADD $64, P1ptr      // next table entry
	CMPW COUNT, $33     // loop while COUNT < 33, i.e. 32 iterations
	BLT loop_select

	VSTM X1H, Y1L, (P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// ---------------------------------------
// sm2p256OrdMulInternal
// Register contract (taken from the #defines below and the callers in this file):
//   in : X1||X0 (V1,V0), Y1||Y0 (V3,V2) operands
//        M1||M0 (V4,V5)  modulus, loaded from p256ord<> by the callers
//        K0 (V31)        word element 3 loaded from p256ordK0<> by the callers
//   out: T1||T0 (V7,V6)
//   scratch: T2 (V8), YDIG (V9), V16-V25, MK0 (V30)
// Eight unrolled rounds, one per 32-bit word of Y (Y0 words 3..0, then
// Y1 words 3..0), each followed by an 8-byte... see per-round code: the
// accumulator is shifted with VSLDB $12, i.e. by one 32-bit word per round.
// A final conditional subtraction of M1||M0 selects the returned value.
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M1 V4
#define M0 V5
#define T0 V6
#define T1 V7
#define T2 V8
#define YDIG V9

#define ADD1 V16
#define ADD1H V17
#define ADD2 V18
#define ADD2H V19
#define RED1 V20
#define RED1H V21
#define RED2 V22
#define RED2H V23
#define CAR1 V24
#define CAR1M V25

#define MK0 V30
#define K0 V31
TEXT sm2p256OrdMulInternal<>(SB), NOSPLIT, $0-0
	// ---------------------------------------------------------------------------/
	// Round 1 (Y0 word 3). No accumulator exists yet, so the products are
	// formed with VMLF/VMLHF instead of the multiply-and-add forms.
	VREPF $3, Y0, YDIG
	VMLF X0, YDIG, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0 // only word element 3 of the product is used

	VMLF X1, YDIG, ADD2
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VZERO T2
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
/* *
 * ---+--------+--------+
 *  T2|   T1   |   T0   |
 * ---+--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   X1   |   X0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |  YDIG  |  YDIG  |
 *    +--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   M1   |   M0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |   MK0  |   MK0  |
 *    +--------+--------+
 *
 *   ---------------------
 *
 *    +--------+--------+
 *    |  ADD2  |  ADD1  |
 *    +--------+--------+
 *  +--------+--------+
 *  |  ADD2H |  ADD1H |
 *  +--------+--------+
 *    +--------+--------+
 *    |  RED2  |  RED1  |
 *    +--------+--------+
 *  +--------+--------+
 *  |  RED2H |  RED1H |
 *  +--------+--------+
 */
	// Round 2 (Y0 word 2). From here on each round adds into T1||T0.
	VREPF $2, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 3 (Y0 word 1)
	VREPF $1, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 4 (Y0 word 0)
	VREPF $0, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 5 (Y1 word 3)
	VREPF $3, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 6 (Y1 word 2)
	VREPF $2, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 7 (Y1 word 1)
	VREPF $1, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 8 (Y1 word 0)
	VREPF $0, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------

	// ADD2||ADD1 <- T1||T0 - M1||M0; the borrow is folded into T2, which
	// then serves as the select mask below.
	VZERO RED1
	VSCBIQ M0, T0, CAR1
	VSQ M0, T0, ADD1
	VSBCBIQ T1, M1, CAR1, CAR1M
	VSBIQ T1, M1, CAR1, ADD2
	VSBIQ T2, RED1, CAR1M, T2

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL T0, ADD1, T2, T0
	VSEL T1, ADD2, T2, T1

	RET

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0

// ---------------------------------------

// sm2p256OrdSqrInternal copies X1||X0 into Y1||Y0 and tail-branches to
// sm2p256OrdMulInternal, so it shares that routine's register contract.
// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3

TEXT sm2p256OrdSqrInternal<>(SB), NOFRAME|NOSPLIT, $0
	VLR X0, Y0
	VLR X1, Y1
	BR sm2p256OrdMulInternal<>(SB)

#undef X0
#undef X1
#undef Y0
#undef Y1

// ---------------------------------------

// func p256OrdMul(res, in1, in2 *p256OrdElement)
// Loads both operands (with the doubleword swap used throughout this file),
// the p256ord<> modulus and the p256ordK0<> constant, calls
// sm2p256OrdMulInternal, and stores T1||T0 to res.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M0 V5
#define M1 V4
#define T0 V6
#define T1 V7
#define K0 V31
TEXT ·p256OrdMul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	MOVD $p256ordK0<>+0x00(SB), R4

	// Only word element 3 of K0 is loaded; the internal routine only
	// consumes element 3 of K0's product (VREPF $3).
	VLEF $3, 0(R4), K0
	//WORD $0xE7F40000
	//BYTE $0x38
	//BYTE $0x03
	// M1 (V4) <- p256ord[0:16], M0 (V5) <- p256ord[16:32]
	MOVD $p256ord<>+0x00(SB), R4
	VLM (R4), M1, M0

	VLM (x_ptr), X0, X1
	VPDI $0x4, X0, X0, X0
	VPDI $0x4, X1, X1, X1
	VLM (y_ptr), Y0, Y1
	VPDI $0x4, Y0, Y0, Y0
	VPDI $0x4, Y1, Y1, Y1

	CALL sm2p256OrdMulInternal<>(SB)

	VPDI $0x4, T0, T0, T0
	VPDI $0x4, T1, T1, T1
	VSTM T0, T1, (res_ptr)

	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0

// ---------------------------------------
// func p256OrdSqr(res, in *p256OrdElement, n int)
// Squares in repeatedly via sm2p256OrdSqrInternal and stores the result.
// The loop body runs before the counter is tested, so one squaring is
// always performed, even when n <= 0. COUNT and N are compared as 32-bit
// values (CMPW).
#define res_ptr R1
#define x_ptr R2
#define COUNT R5
#define N R6
#define X0 V0
#define X1 V1
#define M0 V5
#define M1 V4
#define T0 V6
#define T1 V7
#define K0 V31
TEXT ·p256OrdSqr(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD n+16(FP), N

	MOVD $0, COUNT

	MOVD $p256ordK0<>+0x00(SB), R4

	VLEF $3, 0(R4), K0
	//WORD $0xE7F40000
	//BYTE $0x38
	//BYTE $0x03
	MOVD $p256ord<>+0x00(SB), R4
	VLM (R4), M1, M0

	VLM (x_ptr), X0, X1
	VPDI $0x4, X0, X0, X0
	VPDI $0x4, X1, X1, X1

loop:
	CALL sm2p256OrdSqrInternal<>(SB)
	// Feed the result back as the next input.
	VLR T0, X0
	VLR T1, X1
	ADDW $1, COUNT
	CMPW COUNT, N
	BLT loop

	VPDI $0x4, T0, T0, T0
	VPDI $0x4, T1, T1, T1
	VSTM T0, T1, (res_ptr)

	RET

#undef res_ptr
#undef x_ptr
#undef COUNT
#undef N
#undef X0
#undef X1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0

// ---------------------------------------
// sm2p256MulInternal
// V0-V3,V30,V31 - Not Modified
// V4-V14 - Volatile

#define CPOOL R4

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4
#define T1 V5
#define P0 V31 // Not modified
#define P1 V30 // Not modified

// Temporaries
#define YDIG V6 // Overloaded with CAR2, ZER
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3 V9 // Overloaded with SEL2,SEL5
#define ADD4 V10 // Overloaded with SEL3,SEL6
#define RED1 V11 // Overloaded with CAR2
#define RED2 V12
#define RED3 V13 // Overloaded with SEL1
#define T2 V14
// Overloaded temporaries
#define ADD1 V4 // Overloaded with T0
#define ADD2 V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER V6 // Overloaded with YDIG, CAR2
#define CAR1 V6 // Overloaded with YDIG, ZER
#define CAR2 V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9 // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6 // Overloaded with YDIG,CAR2,ZER
#define SEL5 V9 // Overloaded with ADD3,SEL2
#define SEL6 V10 // Overloaded with ADD4,SEL3

// sm2p256MulInternal
//   in : X1||X0 (V1,V0), Y1||Y0 (V3,V2), P1||P0 (V30,V31) modulus,
//        CPOOL (R4) = address of p256mul<> (the VPERM selectors are read
//        from 32(CPOOL) and 48(CPOOL) in every round)
//   out: T1||T0 (V5,V4)
// Four unrolled rounds, each consuming two 32-bit words of Y
// (Y0 words 3,2 / Y0 words 1,0 / Y1 words 3,2 / Y1 words 1,0), followed by a
// conditional subtraction of P1||P0.
// NOTE(review): ZER and CAR1 are both V6, so ZER holds a carry value after
// the first VACCQ of each round; the later `VSLDB $4, ..., ZER, ...` uses
// appear to rely on only the upper bytes of V6 being shifted in - confirm
// against the ISA before reordering anything here.
TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-0
	// ---------------------------------------------------
	// Round 1: Y0 words 3 and 2. No accumulator yet, so the first products
	// use VMLF/VMLHF rather than the multiply-and-add forms.

	VREPF $3, Y0, YDIG
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H
	VMLF X0, YDIG, ADD1
	VMLF X1, YDIG, ADD2

	VREPF $2, Y0, YDIG
	VMALF X0, YDIG, ADD1H, ADD3
	VMALF X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL 32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, ZER, ADD2, T1 // ADD2 Free

	VACCQ T0, ADD3, CAR1
	VAQ T0, ADD3, T0 // ADD3 Free
	VACCCQ T1, ADD4, CAR1, T2
	VACQ T1, ADD4, CAR1, T1 // ADD4 Free

	VL 48(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
	VSLDB $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDB $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSCBIQ RED3, RED1, CAR1
	VSQ RED3, RED1, RED1
	VSBIQ RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1 // T2 Free

	VACCQ T0, ADD3H, CAR1
	VAQ T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ T1, ADD4H, CAR1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2
	// ---------------------------------------------------
	// Round 2: Y0 words 1 and 0.

	VREPF $1, Y0, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
	VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2

	VREPF $0, Y0, YDIG
	VMALF X0, YDIG, ADD1H, ADD3
	VMALF X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER

	VZERO ZER
	VL 32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
	VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free

	VACCQ T0, ADD3, CAR1
	VAQ T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, T2
	VACQ T1, ADD4, CAR1, T1

	VL 48(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
	VSLDB $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDB $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSCBIQ RED3, RED1, CAR1
	VSQ RED3, RED1, RED1
	VSBIQ RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ T0, ADD3H, CAR1
	VAQ T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ T1, ADD4H, CAR1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2
	// ---------------------------------------------------
	// Round 3: Y1 words 3 and 2.

	VREPF $3, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF X0, YDIG, T0, ADD1
	VMALF X1, YDIG, T1, ADD2

	VREPF $2, Y1, YDIG
	VMALF X0, YDIG, ADD1H, ADD3
	VMALF X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL 32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free

	VACCQ T0, ADD3, CAR1
	VAQ T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, T2
	VACQ T1, ADD4, CAR1, T1

	VL 48(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
	VSLDB $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDB $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSCBIQ RED3, RED1, CAR1
	VSQ RED3, RED1, RED1
	VSBIQ RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ T0, ADD3H, CAR1
	VAQ T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ T1, ADD4H, CAR1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2
	// ---------------------------------------------------
	// Round 4: Y1 words 1 and 0.

	VREPF $1, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF X0, YDIG, T0, ADD1
	VMALF X1, YDIG, T1, ADD2

	VREPF $0, Y1, YDIG
	VMALF X0, YDIG, ADD1H, ADD3
	VMALF X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H

	VZERO ZER
	VL 32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free

	VACCQ T0, ADD3, CAR1
	VAQ T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, T2
	VACQ T1, ADD4, CAR1, T1

	VL 48(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
	VSLDB $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDB $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSCBIQ RED3, RED1, CAR1
	VSQ RED3, RED1, RED1
	VSBIQ RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ T0, ADD3H, CAR1
	VAQ T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ T1, ADD4H, CAR1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// ---------------------------------------------------

	// ADD2H||ADD1H <- T1||T0 - P1||P0; the borrow is folded into T2, which
	// then serves as the select mask below.
	VZERO RED3
	VSCBIQ P0, T0, CAR1
	VSQ P0, T0, ADD1H
	VSBCBIQ T1, P1, CAR1, CAR2
	VSBIQ T1, P1, CAR1, ADD2H
	VSBIQ T2, RED3, CAR2, T2

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL T0, ADD1H, T2, T0
	VSEL T1, ADD2H, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

// ---------------------------------------

// sm2p256SqrInternal copies X1||X0 into Y1||Y0 and tail-branches to
// sm2p256MulInternal, so it shares that routine's register contract
// (note that Y0/Y1 are overwritten here).
// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3

TEXT sm2p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
	VLR X0, Y0
	VLR X1, Y1
	BR sm2p256MulInternal<>(SB)

#undef X0
#undef X1
#undef Y0
#undef Y1

// p256SubInternal: T1||T0 <- X1||X0 - Y1||Y0, with PH||PL added back when
// the subtraction borrows (the borrow is turned into the VSEL mask SEL1).
// Relies on the including function to #define ZER, CAR1, SEL1, TT0, TT1,
// PL and PH; all of those except PL/PH are overwritten.
// The trailing continuation on the last line joins the following blank line.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VZERO ZER \
	VSCBIQ Y0, X0, CAR1 \
	VSQ Y0, X0, T0 \
	VSBCBIQ X1, Y1, CAR1, SEL1 \
	VSBIQ X1, Y1, CAR1, T1 \
	VSQ SEL1, ZER, SEL1 \
	\
	VACCQ T0, PL, CAR1 \
	VAQ T0, PL, TT0 \
	VACQ T1, PH, CAR1, TT1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1 \

// p256AddInternal: T1||T0 <- X1||X0 + Y1||Y0, followed by a trial
// subtraction of PH||PL; VSEL picks between the raw sum and the reduced
// value using a mask derived from the carry (T2) and the borrow.
// Relies on the including function to #define T2, ZER, CAR1, CAR2, SEL1,
// TT0, TT1, PL and PH.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VACCQ X0, Y0, CAR1 \
	VAQ X0, Y0, T0 \
	VACCCQ X1, Y1, CAR1, T2 \
	VACQ X1, Y1, CAR1, T1 \
	\
	VZERO ZER \
	VSCBIQ PL, T0, CAR1 \
	VSQ PL, T0, TT0 \
	VSBCBIQ T1, PH, CAR1, CAR2 \
	VSBIQ T1, PH, CAR1, TT1 \
	VSBIQ T2, ZER, CAR2, SEL1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1

// p256HalfInternal: builds a select mask from X0, chooses between X and
// X + P (keeping the carry in T2), then shifts the 257-bit value T2||T1||T0
// right by one bit: each 128-bit half is shifted with VSRL by 1, and the bit
// crossing the register boundary is re-inserted via VSLDB $15 / VSL by 7 / VO.
// NOTE(review): presumably the mask selects X + P when X is odd so that the
// halved value stays an integer - confirm the VSBIQ borrow-in semantics.
// Relies on the including function to #define T2, ZER, CAR1, SEL1, TT0,
// TT1, PL and PH.
#define p256HalfInternal(T1, T0, X1, X0) \
	VZERO ZER \
	VSBIQ ZER, ZER, X0, SEL1 \
	\
	VACCQ X0, PL, CAR1 \
	VAQ X0, PL, T0 \
	VACCCQ X1, PH, CAR1, T2 \
	VACQ X1, PH, CAR1, T1 \
	\
	VSEL X0, T0, SEL1, T0 \
	VSEL X1, T1, SEL1, T1 \
	VSEL ZER, T2, SEL1, T2 \
	\
	VSLDB $15, T2, ZER, TT1 \
	VSLDB $15, T1, ZER, TT0 \
	VREPIB $1, SEL1 \
	VSRL SEL1, T0, T0 \
	VSRL SEL1, T1, T1 \
	VREPIB $7, SEL1 \
	VSL SEL1, TT0, TT0 \
	VSL SEL1, TT1, TT1 \
	VO T0, TT0, T0 \
	VO T1, TT1, T1

// ---------------------------------------
// func p256Mul(res, in1, in2 *p256Element)
// Loads both operands (with the doubleword swap used throughout this file)
// and the modulus from p256mul<>, calls sm2p256MulInternal, and stores
// T1||T0 to res. CPOOL is left pointing at p256mul<> for the callee.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define CPOOL R4

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V31
#define P1 V30
TEXT ·p256Mul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	VLM (x_ptr), X0, X1
	VPDI $0x4, X0, X0, X0
	VPDI $0x4, X1, X1, X1
	VLM (y_ptr), Y0, Y1
	VPDI $0x4, Y0, Y0, Y0
	VPDI $0x4, Y1, Y1, Y1

	// P1 (V30) <- p256mul[0:16], P0 (V31) <- p256mul[16:32]
	MOVD $p256mul<>+0x00(SB), CPOOL
	VLM (CPOOL), P1, P0

	CALL sm2p256MulInternal<>(SB)

	VPDI $0x4, T0, T0, T0
	VPDI $0x4, T1, T1, T1
	VSTM T0, T1, (res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

// ---------------------------------------
// func p256Sqr(res, in *p256Element, n int)
// Squares in repeatedly via sm2p256SqrInternal and stores the result.
// The loop body runs before the counter is tested, so one squaring is
// always performed, even when n <= 0. COUNT and N are compared as 32-bit
// values (CMPW). y_ptr is defined but not used by this function.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define CPOOL R4
#define COUNT R5
#define N R6

// Parameters
#define X0 V0
#define X1 V1
#define T0 V4
#define T1 V5

// Constants
#define P0 V31
#define P1 V30
TEXT ·p256Sqr(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	VLM (x_ptr), X0, X1
	VPDI $0x4, X0, X0, X0
	VPDI $0x4, X1, X1, X1

	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $0, COUNT
	MOVD n+16(FP), N
	VLM (CPOOL), P1, P0

loop:
	CALL sm2p256SqrInternal<>(SB)
	// Feed the result back as the next input.
	VLR T0, X0
	VLR T1, X1
	ADDW $1, COUNT
	CMPW COUNT, N
	BLT loop

	VPDI $0x4, T0, T0, T0
	VPDI $0x4, T1, T1, T1
	VSTM T0, T1, (res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef COUNT
#undef N

#undef X0
#undef X1
#undef T0
#undef T1
#undef P0
#undef P1

// Point add with P2 being affine point
// If sign == 1 -> P2 = -P2
// If sel == 0 -> P3 = P1
// if zero == 0 -> P3 = P2
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3
#define CPOOL R4

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V31
#define PH V30

// Names for zero/sel selects
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V28
#define Z3H V29

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2  V9
/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * T1 = Z1²
 * T2 = T1*Z1
 * T1 = T1*X2
 * T2 = T2*Y2
 * T1 = T1-X1
 * T2 = T2-Y1
 * Z3 = Z1*T1
 * T3 = T1²
 * T4 = T3*T1
 * T3 = T3*X1
 * T1 = 2*T3
 * X3 = T2²
 * X3 = X3-T1
 * X3 = X3-T4
 * T3 = T3-X3
 * T3 = T3*T2
 * T4 = T4*Y1
 * Y3 = T3-T4

 * Three operand formulas, but with MulInternal X,Y used to store temps
X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
X=Z1; Y-  ; MUL;Z3:=T// Z3 = Z1*T1         T2
X=Y;  Y-  ; MUL;X=T  // T3 = T1*T1         T2
X-  ; Y-  ; MUL;T4=T // T4 = T3*T1         T2        T4
X-  ; Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
X-  ; Y-  ; MUL;T3=T // T3 = T3*T2         T2   T3   T4
X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4

	*/
// p256PointAddAffineAsm computes res = in1 + (+/-)in2 following the schedule
// in the comment above, then applies the two overrides described in the
// function header (sel == 0 -> in1, zero == 0 -> in2) with VSEL masks
// instead of branches. The result is kept in registers (X3, Y3, Z3) until
// the final stores, which happen only after the last load from in1/in2.
// Point layout as read by the loads below: X at offset 0, Y at 32, Z at 64,
// each as two 16-byte halves whose doublewords are swapped with VPDI $0x4.
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	// PH <- first 16 bytes of the constant pool, PL <- next 16 bytes.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL
	VL   0(CPOOL), PH

	//	if (sign == 1) {
	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
	//	}

	VL   48(P2ptr), Y2H
	VPDI $0x4, Y2H, Y2H, Y2H
	VL   32(P2ptr), Y2L
	VPDI $0x4, Y2L, Y2L, Y2L

	// SEL1 = all ones in each doubleword when sign == 0 (keep Y2),
	// zero otherwise (take P-Y2).
	VLREPG sign+24(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// T1H||T1L = P - Y2, computed unconditionally.
	VSCBIQ Y2L, PL, CAR1
	VSQ    Y2L, PL, T1L
	VSBIQ  PH, Y2H, CAR1, T1H

	VSEL Y2L, T1L, SEL1, Y2L
	VSEL Y2H, T1H, SEL1, Y2H

/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 */
	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
	VL   80(P1ptr), X1       // Z1H
	VPDI $0x4, X1, X1, X1
	VL   64(P1ptr), X0       // Z1L
	VPDI $0x4, X0, X0, X0
	VLR  X0, Y0
	VLR  X1, Y1
	CALL sm2p256SqrInternal<>(SB)

	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
	VLR  T0, X0
	VLR  T1, X1
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, T2L
	VLR  T1, T2H

	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
	VL   16(P2ptr), Y1       // X2H
	VPDI $0x4, Y1, Y1, Y1
	VL   0(P2ptr), Y0        // X2L
	VPDI $0x4, Y0, Y0, Y0
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, T1L
	VLR  T1, T1H

	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
	VLR  T2L, X0
	VLR  T2H, X1
	VLR  Y2L, Y0
	VLR  Y2H, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
	VL   48(P1ptr), Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VL   32(P1ptr), Y1L
	VPDI $0x4, Y1L, Y1L, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
	VL   16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL   0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y-  ; MUL; Z3:=T// Z3 = Z1*T1         T2
	VL   80(P1ptr), X1       // Z1H
	VPDI $0x4, X1, X1, X1
	VL   64(P1ptr), X0       // Z1L
	VPDI $0x4, X0, X0, X0
	CALL sm2p256MulInternal<>(SB)

	// Z3 is held in registers rather than stored here; it is written out
	// with the rest of the result at the end of the function.
	// VST T1, 64(P3ptr)
	// VST T0, 80(P3ptr)
	VLR T0, Z3L
	VLR T1, Z3H

	// X=Y;  Y-  ; MUL; X=T  // T3 = T1*T1         T2
	VLR  Y0, X0
	VLR  Y1, X1
	CALL sm2p256SqrInternal<>(SB)
	VLR  T0, X0
	VLR  T1, X1

	// X-  ; Y-  ; MUL; T4=T // T4 = T3*T1         T2        T4
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, T4L
	VLR  T1, T4H

	// X-  ; Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
	VL   16(P1ptr), Y1       // X1H
	VPDI $0x4, Y1, Y1, Y1
	VL   0(P1ptr), Y0        // X1L
	VPDI $0x4, Y0, Y0, Y0
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, T3L
	VLR  T1, T3H

	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
	VLR  T2L, X0
	VLR  T2H, X1
	VLR  T2L, Y0
	VLR  T2H, Y1
	CALL sm2p256SqrInternal<>(SB)

	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VLR T0, X3L
	VLR T1, X3H

	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X-  ; Y-  ; MUL; T3=T // T3 = T3*T2         T2   T3   T4
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, T3L
	VLR  T1, T3H

	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
	VLR  T4L, X0
	VLR  T4H, X1
	VL   48(P1ptr), Y1       // Y1H
	VPDI $0x4, Y1, Y1, Y1
	VL   32(P1ptr), Y0       // Y1L
	VPDI $0x4, Y0, Y0, Y0
	CALL sm2p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	//	if (sel == 0) {
	//		copy(P3.x[:], X1)
	//		copy(P3.y[:], Y1)
	//		copy(P3.z[:], Z1)
	//	}

	VL   16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL   0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L

	// Y1 already loaded, left over from addition
	// (Y1L/Y1H are V2/V3, i.e. the Y operand loaded for the last multiply)
	VL   80(P1ptr), Z1H
	VPDI $0x4, Z1H, Z1H, Z1H
	VL   64(P1ptr), Z1L
	VPDI $0x4, Z1L, Z1L, Z1L

	VLREPG sel+32(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	VSEL X1L, X3L, SEL1, X3L
	VSEL X1H, X3H, SEL1, X3H
	VSEL Y1L, Y3L, SEL1, Y3L
	VSEL Y1H, Y3H, SEL1, Y3H
	VSEL Z1L, Z3L, SEL1, Z3L
	VSEL Z1H, Z3H, SEL1, Z3H

	//	if (zero == 0) {
	//		copy(P3.x[:], X2)
	//		copy(P3.y[:], Y2)
	//		copy(P3.z[:], one)
	//	}
	// where "one" is the (1*2^256)%P256 constant of this (SM2) curve, read
	// from the constant pool at p256mul<>+0x40..0x5f:
	//	Z2H = 0x0000000100000000_0000000000000000
	//	Z2L = 0x00000000ffffffff_0000000000000001
	VL   16(P2ptr), X2H
	VPDI $0x4, X2H, X2H, X2H
	VL   0(P2ptr), X2L
	VPDI $0x4, X2L, X2L, X2L

	// Y2 already loaded
	VL 64(CPOOL), Z2H
	VL 80(CPOOL), Z2L

	VLREPG zero+40(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	VSEL X2L, X3L, SEL1, X3L
	VSEL X2H, X3H, SEL1, X3H
	VSEL Y2L, Y3L, SEL1, Y3L
	VSEL Y2H, Y3H, SEL1, Y3H
	VSEL Z2L, Z3L, SEL1, Z3L
	VSEL Z2H, Z3H, SEL1, Z3H

	// All done, store out the result!!!
	VPDI $0x4, X3H, X3H, X3H
	VST  X3H, 16(P3ptr)
	VPDI $0x4, X3L, X3L, X3L
	VST  X3L, 0(P3ptr)
	VPDI $0x4, Y3H, Y3H, Y3H
	VST  Y3H, 48(P3ptr)
	VPDI $0x4, Y3L, Y3L, Y3L
	VST  Y3L, 32(P3ptr)
	VPDI $0x4, Z3H, Z3H, Z3H
	VST  Z3H, 80(P3ptr)
	VPDI $0x4, Z3L, Z3L, Z3L
	VST  Z3L, 64(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// func p256PointDoubleAsm(res, in *P256Point)
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
#define P3ptr   R1
#define P1ptr   R2
#define CPOOL   R4

// Temporaries in REGs
#define X3L    V15
#define X3H    V16
#define Y3L    V17
#define Y3H    V18
#define T1L    V19
#define T1H    V20
#define T2L    V21
#define T2H    V22
#define T3L    V23
#define T3H    V24

// Z1L/Z1H are not referenced by the doubling code below (Z1 is loaded
// straight into X1||X0 or Y1||Y0). Note that Z1H and TT0 are both V11.
#define X1L    V6
#define X1H    V7
#define Y1L    V8
#define Y1H    V9
#define Z1L    V10
#define Z1H    V11

// Temps for Sub and Add
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

#define PL    V31
#define PH    V30

// Z3L/Z3H share V23/V24 with T3L/T3H and are not referenced by the doubling
// code below, which stores Z3 directly from the multiply result.
#define Z3L    V23
#define Z3H    V24

#define ZER   V26
#define SEL1  V27
#define CAR1  V28
#define CAR2  V29
/*
 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
 * 	B  = 2Y₁
 * 	Z₃ = B×Z₁
 * 	C  = B²
 * 	D  = C×X₁
 * 	X₃ = A²-2D
 * 	Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 *       T1 = Z1²
 *       T2 = X1-T1
 *       T1 = X1+T1
 *       T2 = T2*T1
 *       T2 = 3*T2
 *       Y3 = 2*Y1
 *       Z3 = Y3*Z1
 *       Y3 = Y3²
 *       T3 = Y3*X1
 *       Y3 = Y3²
 *       Y3 = half*Y3
 *       X3 = T2²
 *       T1 = 2*T3
 *       X3 = X3-T1
 *       T1 = T3-X3
 *       T1 = T1*T2
 *       Y3 = T1-Y3
 */

// p256PointDoubleRound performs one point doubling following the
// three-operand schedule above: it reads the point at P1ptr (X at offset 0,
// Y at 32, Z at 64) and writes the doubled point to P3ptr.
// PH||PL must already hold the modulus; the macro expands p256SubInternal,
// p256AddInternal and p256HalfInternal and calls sm2p256MulInternal /
// sm2p256SqrInternal.
// Store order is Z3, then X3, then Y3. The only load from P1ptr after the
// Z3 store is of X (offset 0), and nothing is loaded from P1ptr after the
// X3 store, so passing the same register for P1ptr and P3ptr (as
// p256PointDouble6TimesAsm does) doubles the point in place.
// NOTE: the last line of the macro ends in a continuation backslash, so the
// blank line that follows it is part of the macro and must stay blank.
#define p256PointDoubleRound(P1ptr, P3ptr) \
	\ // X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
	VLM  64(P1ptr), X0, X1 \ // Z1L, Z1H
	VPDI $0x4, X1, X1, X1 \
	VPDI $0x4, X0, X0, X0 \
	VLR  X0, Y0 \
	VLR  X1, Y1 \
	CALL sm2p256SqrInternal<>(SB) \
	\
	\ // SUB(X<X1-T)            // T2 = X1-T1
	VLM  (P1ptr), X1L, X1H \
	VPDI $0x4, X1H, X1H, X1H \
	VPDI $0x4, X1L, X1L, X1L \
	p256SubInternal(X1,X0,X1H,X1L,T1,T0) \
	\
	\ // ADD(Y<X1+T)            // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0) \
	\
	\ // X-  ; Y-  ; MUL; T-    // T2 = T2*T1
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0) \
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0) \
	\
	\// ADD(X<Y1+Y1)           // Y3 = 2*Y1
	VLM  32(P1ptr), Y1L, Y1H \
	VPDI $0x4, Y1H, Y1H, Y1H \
	VPDI $0x4, Y1L, Y1L, Y1L \
	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L) \
	\
	\// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	VLM  64(P1ptr), Y0, Y1 \ // Z1L, Z1H
	VPDI $0x4, Y1, Y1, Y1 \
	VPDI $0x4, Y0, Y0, Y0 \
	CALL sm2p256MulInternal<>(SB) \
	VPDI $0x4, T1, T1, TT1 \
	VPDI $0x4, T0, T0, TT0 \
	VSTM TT0, TT1, 64(P3ptr) \
	\
	\ // X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VLR  X0, Y0 \
	VLR  X1, Y1 \
	CALL sm2p256SqrInternal<>(SB) \
	\
	\ // X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
	VLR  T0, X0 \
	VLR  T1, X1 \
	VLM  0(P1ptr), Y0, Y1 \
	VPDI $0x4, Y1, Y1, Y1 \
	VPDI $0x4, Y0, Y0, Y0 \
	CALL sm2p256MulInternal<>(SB) \
	VLR  T0, T3L \
	VLR  T1, T3H \
	\
	\ // X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VLR  X0, Y0 \
	VLR  X1, Y1 \
	CALL sm2p256SqrInternal<>(SB) \
	\
	\ // HAL(Y3<T)              // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0) \
	\
	\ // X=T2; Y=T2; MUL; T-    // X3 = T2²
	VLR  T2L, X0 \
	VLR  T2H, X1 \
	VLR  T2L, Y0 \
	VLR  T2H, Y1 \
	CALL sm2p256SqrInternal<>(SB) \
	\
	\ // ADD(T1<T3+T3)          // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L) \
	\
	\ // SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L) \
	VPDI $0x4, X3H, X3H, TT1 \
	VPDI $0x4, X3L, X3L, TT0 \
	VSTM TT0, TT1, (P3ptr) \
	\
	\ // SUB(X<T3-X3)           // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L) \
	\
	\ // X-  ; Y-  ; MUL; T-    // T1 = T1*T2
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // SUB(Y3<T-Y3)           // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L) \
	\
	VPDI $0x4, Y3H, Y3H, Y3H \
	VPDI $0x4, Y3L, Y3L, Y3L \
	VSTM Y3L, Y3H, 32(P3ptr) \

// p256PointDoubleAsm: res = 2*in, a single doubling round.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	// PH <- first 16 bytes of the constant pool, PL <- next 16 bytes.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VLM  (CPOOL), PH, PL

	p256PointDoubleRound(P1ptr, P3ptr)
	RET

// p256PointDouble6TimesAsm applies six doubling rounds. It reads its
// arguments as res+0(FP) and in+8(FP); the Go declaration is not visible in
// this file -- presumably it matches p256PointDoubleAsm (TODO confirm).
// The first round reads from in and writes to res; the remaining five
// rounds double res in place.
TEXT ·p256PointDouble6TimesAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VLM  (CPOOL), PH, PL

	p256PointDoubleRound(P1ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// func p256PointAddAsm(res, in1, in2 *P256Point) int
#define P3ptr  R1
#define P1ptr  R2
#define P2ptr  R3
#define CPOOL  R4
#define ISZERO R5
#define TRUE   R6

// Temporaries in REGs
#define T1L   V16
#define T1H   V17
#define T2L   V18
#define T2H   V19
#define U1L   V20
#define U1H   V21
#define S1L   V22
#define S1H   V23
#define HL    V24
#define HH    V25
#define RL    V26
#define RH    V27

// Temps for Sub and Add
#define ZER   V6
#define SEL1  V7
#define CAR1  V8
#define CAR2  V9
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

#define PL    V31
#define PH    V30
/*
 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A = X₁×Z₂²
 * B = Y₁×Z₂³
 * C = X₂×Z₁²-A
 * D = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H  = X2*T1
 * H  = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R  = Z1*T1
 * R  = Y2*R
 * R  = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	// SUB(H<H-T)            // H  = H-U1
	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	// SUB(R<T-S1)           // R  = R-S1
	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	// SUB(T<T-T2)           // X3 = X3-T2
	// ADD(X<U1+U1)          // T1 = 2*U1
	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	// SUB(Y<U1-T)           // Y3 = U1-X3
	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	*/
// p256PointAddAsm computes res = in1 + in2 for two Jacobian points following
// the schedule above and writes an int result to ret+24(FP).
// ret is 1 only when both H and R are 0 or equal to P (the two checks in the
// body, ANDed together), otherwise 0 -- presumably the signal that the two
// inputs are the same point so the caller can double instead (TODO confirm
// against the Go caller).
// Z3 is stored as soon as it is known and X3/Y3 later; no coordinate of
// in1/in2 is loaded after the store that would overwrite it, so res may
// share storage with an input.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	// PH <- first 16 bytes of the constant pool, PL <- next 16 bytes.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VLM  (CPOOL), PH, PL

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	VLM  64(P1ptr), X0, X1   // Z1L, Z1H
	VPDI $0x4, X1, X1, X1
	VPDI $0x4, X0, X0, X0
	VLR  X0, Y0
	VLR  X1, Y1
	CALL sm2p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	VLR  T0, Y0
	VLR  T1, Y1
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, RL
	VLR  T1, RH

	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	VLM  (P2ptr), X0, X1     // X2L, X2H
	VPDI $0x4, X1, X1, X1
	VPDI $0x4, X0, X0, X0
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, HL
	VLR  T1, HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	VLM  64(P2ptr), X0, X1   // Z2L, Z2H
	VPDI $0x4, X1, X1, X1
	VPDI $0x4, X0, X0, X0
	VLR  X0, Y0
	VLR  X1, Y1
	CALL sm2p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VLR  T0, Y0
	VLR  T1, Y1
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, S1L
	VLR  T1, S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	VLM  (P1ptr), X0, X1     // X1L, X1H
	VPDI $0x4, X1, X1, X1
	VPDI $0x4, X0, X0, X0
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, U1L
	VLR  T1, U1H

	// SUB(H<H-T)            // H  = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD   $0, ISZERO
	MOVD   $1, TRUE
	VZERO  ZER
	VO     HL, HH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX     HL, PL, T1L
	VX     HH, PH, T1H
	VO     T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	MOVD   ISZERO, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	VLM  64(P1ptr), X0, X1   // Z1L, Z1H
	VPDI $0x4, X1, X1, X1
	VPDI $0x4, X0, X0, X0
	VLM  64(P2ptr), Y0, Y1   // Z2L, Z2H
	VPDI $0x4, Y1, Y1, Y1
	VPDI $0x4, Y0, Y0, Y0
	CALL sm2p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VLR  T0, X0
	VLR  T1, X1
	VLR  HL, Y0
	VLR  HH, Y1
	CALL sm2p256MulInternal<>(SB)
	VPDI $0x4, T1, T1, TT1
	VPDI $0x4, T0, T0, TT0
	VSTM TT0, TT1, 64(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	VLM  32(P1ptr), X0, X1
	VPDI $0x4, X1, X1, X1
	VPDI $0x4, X0, X0, X0
	VLR  S1L, Y0
	VLR  S1H, Y1
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, S1L
	VLR  T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	VLM  32(P2ptr), X0, X1
	VPDI $0x4, X1, X1, X1
	VPDI $0x4, X0, X0, X0
	VLR  RL, Y0
	VLR  RH, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R  = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	MOVD   $0, ISZERO
	MOVD   $1, TRUE
	VZERO  ZER
	VO     RL, RH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX     RL, PL, T1L
	VX     RH, PH, T1H
	VO     T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	AND    ret+24(FP), ISZERO
	MOVD   ISZERO, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VLR  HL, X0
	VLR  HH, X1
	VLR  HL, Y0
	VLR  HH, Y1
	CALL sm2p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VLR  T0, Y0
	VLR  T1, Y1
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, T2L
	VLR  T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VLR  U1L, X0
	VLR  U1H, X1
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, U1L
	VLR  T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VLR  RL, X0
	VLR  RH, X1
	VLR  RL, Y0
	VLR  RH, Y1
	CALL sm2p256SqrInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	VPDI $0x4, T1, T1, TT1
	VPDI $0x4, T0, T0, TT0
	VSTM TT0, TT1, (P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VLR  RL, X0
	VLR  RH, X1
	CALL sm2p256MulInternal<>(SB)
	VLR  T0, U1L
	VLR  T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VLR  S1L, X0
	VLR  S1H, X1
	VLR  T2L, Y0
	VLR  T2H, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	VPDI $0x4, T1, T1, T1
	VPDI $0x4, T0, T0, T0
	VSTM T0, T1, 32(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL
#undef ISZERO
#undef TRUE
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef U1L
#undef U1H
#undef S1L
#undef S1H
#undef HL
#undef HH
#undef RL
#undef RH
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH

//func p256OrdReduce(s *p256OrdElement)
//
// Reduces *s in place by at most one subtraction of the group order: s-ord is
// computed unconditionally and VSEL then keeps either s or s-ord, using a
// mask (T2) derived from the final borrow of the subtraction. There is no
// data-dependent branch.
#define res_ptr R1
#define CPOOL   R4

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define CAR1  V7
#define CAR2  V8
#define PL    V10
#define PH    V9

TEXT ·p256OrdReduce(SB),NOSPLIT,$0
	// The frame reference must use the Go parameter name (s); vet's asmdecl
	// check rejects names that do not match the declaration.
	MOVD s+0(FP), res_ptr

	VZERO T2
	VZERO ZER
	// PH <- first 16 bytes of p256ord<>, PL <- next 16 bytes.
	MOVD  $p256ord<>+0x00(SB), CPOOL
	VLM   (CPOOL), PH, PL

	VLM   (res_ptr), T0, T1
	VPDI  $0x4, T0, T0, T0
	VPDI  $0x4, T1, T1, T1

	// TT1||TT0 = T1||T0 - PH||PL, with the borrow chain ending in T2.
	VSCBIQ  PL, T0, CAR1
	VSQ     PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ   T1, PH, CAR1, TT1
	VSBIQ   T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VPDI  $0x4, T0, T0, TT0
	VPDI  $0x4, T1, T1, TT1
	VSTM  TT0, TT1, (res_ptr)

	RET
#undef res_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef CAR1
#undef CAR2
#undef PL
#undef PH