github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/crypto/internal/nistec/p256_asm_s390x.s (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "textflag.h" 6 #include "go_asm.h" 7 8 DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f 9 DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 10 DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff 11 DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 12 DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551 13 DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256 14 DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256 15 DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256 16 DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256 17 DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 18 DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 19 DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0 20 DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0 21 DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 22 DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 23 DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask 24 DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask 25 DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256 26 DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256 27 DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256 28 DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256 29 DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0 30 DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0 31 DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0 32 DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0 33 DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1 34 DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1 35 DATA p256mul<>+0x50(SB)/8, 
$0x0405060704050607 // SEL 0 0 d1 d0 36 DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0 37 DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 38 DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 39 DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0 40 DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0 41 DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256 42 DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256 43 DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256 44 DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256 45 GLOBL p256ordK0<>(SB), 8, $4 46 GLOBL p256ord<>(SB), 8, $32 47 GLOBL p256<>(SB), 8, $96 48 GLOBL p256mul<>(SB), 8, $160 49 50 // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) 51 TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0 52 JMP ·p256BigToLittle(SB) 53 54 // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) 55 TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0 56 JMP ·p256BigToLittle(SB) 57 58 // --------------------------------------- 59 // func p256LittleToBig(res *[32]byte, in *p256Element) 60 TEXT ·p256LittleToBig(SB), NOSPLIT, $0 61 JMP ·p256BigToLittle(SB) 62 63 // func p256BigToLittle(res *p256Element, in *[32]byte) 64 #define res_ptr R1 65 #define in_ptr R2 66 #define T1L V2 67 #define T1H V3 68 69 TEXT ·p256BigToLittle(SB), NOSPLIT, $0 70 MOVD res+0(FP), res_ptr 71 MOVD in+8(FP), in_ptr 72 73 VL 0(in_ptr), T1H 74 VL 16(in_ptr), T1L 75 76 VPDI $0x4, T1L, T1L, T1L 77 VPDI $0x4, T1H, T1H, T1H 78 79 VST T1L, 0(res_ptr) 80 VST T1H, 16(res_ptr) 81 RET 82 83 #undef res_ptr 84 #undef in_ptr 85 #undef T1L 86 #undef T1H 87 88 // --------------------------------------- 89 // iff cond == 1 val <- -val 90 // func p256NegCond(val *p256Element, cond int) 91 #define P1ptr R1 92 #define CPOOL R4 93 94 #define Y1L V0 95 #define Y1H V1 96 #define T1L V2 97 #define T1H V3 98 99 #define PL V30 100 #define PH V31 101 102 #define 
ZER V4 103 #define SEL1 V5 104 #define CAR1 V6 105 TEXT ·p256NegCond(SB), NOSPLIT, $0 106 MOVD val+0(FP), P1ptr 107 108 MOVD $p256mul<>+0x00(SB), CPOOL 109 VL 16(CPOOL), PL 110 VL 0(CPOOL), PH 111 112 VL 16(P1ptr), Y1H 113 VPDI $0x4, Y1H, Y1H, Y1H 114 VL 0(P1ptr), Y1L 115 VPDI $0x4, Y1L, Y1L, Y1L 116 117 VLREPG cond+8(FP), SEL1 118 VZERO ZER 119 VCEQG SEL1, ZER, SEL1 120 121 VSCBIQ Y1L, PL, CAR1 122 VSQ Y1L, PL, T1L 123 VSBIQ PH, Y1H, CAR1, T1H 124 125 VSEL Y1L, T1L, SEL1, Y1L 126 VSEL Y1H, T1H, SEL1, Y1H 127 128 VPDI $0x4, Y1H, Y1H, Y1H 129 VST Y1H, 16(P1ptr) 130 VPDI $0x4, Y1L, Y1L, Y1L 131 VST Y1L, 0(P1ptr) 132 RET 133 134 #undef P1ptr 135 #undef CPOOL 136 #undef Y1L 137 #undef Y1H 138 #undef T1L 139 #undef T1H 140 #undef PL 141 #undef PH 142 #undef ZER 143 #undef SEL1 144 #undef CAR1 145 146 // --------------------------------------- 147 // if cond == 0 res <- b; else res <- a 148 // func p256MovCond(res, a, b *P256Point, cond int) 149 #define P3ptr R1 150 #define P1ptr R2 151 #define P2ptr R3 152 153 #define X1L V0 154 #define X1H V1 155 #define Y1L V2 156 #define Y1H V3 157 #define Z1L V4 158 #define Z1H V5 159 #define X2L V6 160 #define X2H V7 161 #define Y2L V8 162 #define Y2H V9 163 #define Z2L V10 164 #define Z2H V11 165 166 #define ZER V18 167 #define SEL1 V19 168 TEXT ·p256MovCond(SB), NOSPLIT, $0 169 MOVD res+0(FP), P3ptr 170 MOVD a+8(FP), P1ptr 171 MOVD b+16(FP), P2ptr 172 VLREPG cond+24(FP), SEL1 173 VZERO ZER 174 VCEQG SEL1, ZER, SEL1 175 176 VL 0(P1ptr), X1H 177 VL 16(P1ptr), X1L 178 VL 32(P1ptr), Y1H 179 VL 48(P1ptr), Y1L 180 VL 64(P1ptr), Z1H 181 VL 80(P1ptr), Z1L 182 183 VL 0(P2ptr), X2H 184 VL 16(P2ptr), X2L 185 VL 32(P2ptr), Y2H 186 VL 48(P2ptr), Y2L 187 VL 64(P2ptr), Z2H 188 VL 80(P2ptr), Z2L 189 190 VSEL X2L, X1L, SEL1, X1L 191 VSEL X2H, X1H, SEL1, X1H 192 VSEL Y2L, Y1L, SEL1, Y1L 193 VSEL Y2H, Y1H, SEL1, Y1H 194 VSEL Z2L, Z1L, SEL1, Z1L 195 VSEL Z2H, Z1H, SEL1, Z1H 196 197 VST X1H, 0(P3ptr) 198 VST X1L, 16(P3ptr) 199 VST Y1H, 32(P3ptr) 200 
VST Y1L, 48(P3ptr) 201 VST Z1H, 64(P3ptr) 202 VST Z1L, 80(P3ptr) 203 204 RET 205 206 #undef P3ptr 207 #undef P1ptr 208 #undef P2ptr 209 #undef X1L 210 #undef X1H 211 #undef Y1L 212 #undef Y1H 213 #undef Z1L 214 #undef Z1H 215 #undef X2L 216 #undef X2H 217 #undef Y2L 218 #undef Y2H 219 #undef Z2L 220 #undef Z2H 221 #undef ZER 222 #undef SEL1 223 224 // --------------------------------------- 225 // Constant time table access 226 // Indexed from 1 to 15, with -1 offset 227 // (index 0 is implicitly point at infinity) 228 // func p256Select(res *P256Point, table *p256Table, idx int) 229 #define P3ptr R1 230 #define P1ptr R2 231 #define COUNT R4 232 233 #define X1L V0 234 #define X1H V1 235 #define Y1L V2 236 #define Y1H V3 237 #define Z1L V4 238 #define Z1H V5 239 #define X2L V6 240 #define X2H V7 241 #define Y2L V8 242 #define Y2H V9 243 #define Z2L V10 244 #define Z2H V11 245 246 #define ONE V18 247 #define IDX V19 248 #define SEL1 V20 249 #define SEL2 V21 250 TEXT ·p256Select(SB), NOSPLIT, $0 251 MOVD res+0(FP), P3ptr 252 MOVD table+8(FP), P1ptr 253 VLREPB idx+(16+7)(FP), IDX 254 VREPIB $1, ONE 255 VREPIB $1, SEL2 256 MOVD $1, COUNT 257 258 VZERO X1H 259 VZERO X1L 260 VZERO Y1H 261 VZERO Y1L 262 VZERO Z1H 263 VZERO Z1L 264 265 loop_select: 266 VL 0(P1ptr), X2H 267 VL 16(P1ptr), X2L 268 VL 32(P1ptr), Y2H 269 VL 48(P1ptr), Y2L 270 VL 64(P1ptr), Z2H 271 VL 80(P1ptr), Z2L 272 273 VCEQG SEL2, IDX, SEL1 274 275 VSEL X2L, X1L, SEL1, X1L 276 VSEL X2H, X1H, SEL1, X1H 277 VSEL Y2L, Y1L, SEL1, Y1L 278 VSEL Y2H, Y1H, SEL1, Y1H 279 VSEL Z2L, Z1L, SEL1, Z1L 280 VSEL Z2H, Z1H, SEL1, Z1H 281 282 VAB SEL2, ONE, SEL2 283 ADDW $1, COUNT 284 ADD $96, P1ptr 285 CMPW COUNT, $17 286 BLT loop_select 287 288 VST X1H, 0(P3ptr) 289 VST X1L, 16(P3ptr) 290 VST Y1H, 32(P3ptr) 291 VST Y1L, 48(P3ptr) 292 VST Z1H, 64(P3ptr) 293 VST Z1L, 80(P3ptr) 294 RET 295 296 #undef P3ptr 297 #undef P1ptr 298 #undef COUNT 299 #undef X1L 300 #undef X1H 301 #undef Y1L 302 #undef Y1H 303 #undef Z1L 304 #undef Z1H 
305 #undef X2L 306 #undef X2H 307 #undef Y2L 308 #undef Y2H 309 #undef Z2L 310 #undef Z2H 311 #undef ONE 312 #undef IDX 313 #undef SEL1 314 #undef SEL2 315 316 // --------------------------------------- 317 318 // func p256FromMont(res, in *p256Element) 319 #define res_ptr R1 320 #define x_ptr R2 321 #define CPOOL R4 322 323 #define T0 V0 324 #define T1 V1 325 #define T2 V2 326 #define TT0 V3 327 #define TT1 V4 328 329 #define ZER V6 330 #define SEL1 V7 331 #define SEL2 V8 332 #define CAR1 V9 333 #define CAR2 V10 334 #define RED1 V11 335 #define RED2 V12 336 #define PL V13 337 #define PH V14 338 339 TEXT ·p256FromMont(SB), NOSPLIT, $0 340 MOVD res+0(FP), res_ptr 341 MOVD in+8(FP), x_ptr 342 343 VZERO T2 344 VZERO ZER 345 MOVD $p256<>+0x00(SB), CPOOL 346 VL 16(CPOOL), PL 347 VL 0(CPOOL), PH 348 VL 48(CPOOL), SEL2 349 VL 64(CPOOL), SEL1 350 351 VL (0*16)(x_ptr), T0 352 VPDI $0x4, T0, T0, T0 353 VL (1*16)(x_ptr), T1 354 VPDI $0x4, T1, T1, T1 355 356 // First round 357 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 358 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 359 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 360 361 VSLDB $8, T1, T0, T0 362 VSLDB $8, T2, T1, T1 363 364 VACCQ T0, RED1, CAR1 365 VAQ T0, RED1, T0 366 VACCCQ T1, RED2, CAR1, CAR2 367 VACQ T1, RED2, CAR1, T1 368 VAQ T2, CAR2, T2 369 370 // Second round 371 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 372 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 373 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 374 375 VSLDB $8, T1, T0, T0 376 VSLDB $8, T2, T1, T1 377 378 VACCQ T0, RED1, CAR1 379 VAQ T0, RED1, T0 380 VACCCQ T1, RED2, CAR1, CAR2 381 VACQ T1, RED2, CAR1, T1 382 VAQ T2, CAR2, T2 383 384 // Third round 385 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 386 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 387 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 388 389 VSLDB $8, T1, T0, T0 390 VSLDB $8, T2, T1, T1 391 392 VACCQ T0, RED1, CAR1 393 VAQ T0, RED1, T0 394 VACCCQ T1, RED2, CAR1, CAR2 395 VACQ T1, RED2, CAR1, T1 396 VAQ 
T2, CAR2, T2 397 398 // Last round 399 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 400 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 401 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 402 403 VSLDB $8, T1, T0, T0 404 VSLDB $8, T2, T1, T1 405 406 VACCQ T0, RED1, CAR1 407 VAQ T0, RED1, T0 408 VACCCQ T1, RED2, CAR1, CAR2 409 VACQ T1, RED2, CAR1, T1 410 VAQ T2, CAR2, T2 411 412 // --------------------------------------------------- 413 414 VSCBIQ PL, T0, CAR1 415 VSQ PL, T0, TT0 416 VSBCBIQ T1, PH, CAR1, CAR2 417 VSBIQ T1, PH, CAR1, TT1 418 VSBIQ T2, ZER, CAR2, T2 419 420 // what output to use, TT1||TT0 or T1||T0? 421 VSEL T0, TT0, T2, T0 422 VSEL T1, TT1, T2, T1 423 424 VPDI $0x4, T0, T0, TT0 425 VST TT0, (0*16)(res_ptr) 426 VPDI $0x4, T1, T1, TT1 427 VST TT1, (1*16)(res_ptr) 428 RET 429 430 #undef res_ptr 431 #undef x_ptr 432 #undef CPOOL 433 #undef T0 434 #undef T1 435 #undef T2 436 #undef TT0 437 #undef TT1 438 #undef ZER 439 #undef SEL1 440 #undef SEL2 441 #undef CAR1 442 #undef CAR2 443 #undef RED1 444 #undef RED2 445 #undef PL 446 #undef PH 447 448 // Constant time table access 449 // Indexed from 1 to 15, with -1 offset 450 // (index 0 is implicitly point at infinity) 451 // func p256SelectBase(point *p256Point, table []p256Point, idx int) 452 // new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) 453 454 #define P3ptr R1 455 #define P1ptr R2 456 #define COUNT R4 457 #define CPOOL R5 458 459 #define X1L V0 460 #define X1H V1 461 #define Y1L V2 462 #define Y1H V3 463 #define Z1L V4 464 #define Z1H V5 465 #define X2L V6 466 #define X2H V7 467 #define Y2L V8 468 #define Y2H V9 469 #define Z2L V10 470 #define Z2H V11 471 #define LE2BE V12 472 473 #define ONE V18 474 #define IDX V19 475 #define SEL1 V20 476 #define SEL2 V21 477 478 TEXT ·p256SelectAffine(SB), NOSPLIT, $0 479 MOVD res+0(FP), P3ptr 480 MOVD table+8(FP), P1ptr 481 MOVD $p256<>+0x00(SB), CPOOL 482 VLREPB idx+(16+7)(FP), IDX 483 VREPIB $1, ONE 484 VREPIB $1, SEL2 485 MOVD $1, 
COUNT 486 VL 80(CPOOL), LE2BE 487 488 VZERO X1H 489 VZERO X1L 490 VZERO Y1H 491 VZERO Y1L 492 493 loop_select: 494 VL 0(P1ptr), X2H 495 VL 16(P1ptr), X2L 496 VL 32(P1ptr), Y2H 497 VL 48(P1ptr), Y2L 498 499 VCEQG SEL2, IDX, SEL1 500 501 VSEL X2L, X1L, SEL1, X1L 502 VSEL X2H, X1H, SEL1, X1H 503 VSEL Y2L, Y1L, SEL1, Y1L 504 VSEL Y2H, Y1H, SEL1, Y1H 505 506 VAB SEL2, ONE, SEL2 507 ADDW $1, COUNT 508 ADD $64, P1ptr 509 CMPW COUNT, $65 510 BLT loop_select 511 VST X1H, 0(P3ptr) 512 VST X1L, 16(P3ptr) 513 VST Y1H, 32(P3ptr) 514 VST Y1L, 48(P3ptr) 515 516 RET 517 518 #undef P3ptr 519 #undef P1ptr 520 #undef COUNT 521 #undef X1L 522 #undef X1H 523 #undef Y1L 524 #undef Y1H 525 #undef Z1L 526 #undef Z1H 527 #undef X2L 528 #undef X2H 529 #undef Y2L 530 #undef Y2H 531 #undef Z2L 532 #undef Z2H 533 #undef ONE 534 #undef IDX 535 #undef SEL1 536 #undef SEL2 537 #undef CPOOL 538 539 // --------------------------------------- 540 541 // func p256OrdMul(res, in1, in2 *p256OrdElement) 542 #define res_ptr R1 543 #define x_ptr R2 544 #define y_ptr R3 545 #define X0 V0 546 #define X1 V1 547 #define Y0 V2 548 #define Y1 V3 549 #define M0 V4 550 #define M1 V5 551 #define T0 V6 552 #define T1 V7 553 #define T2 V8 554 #define YDIG V9 555 556 #define ADD1 V16 557 #define ADD1H V17 558 #define ADD2 V18 559 #define ADD2H V19 560 #define RED1 V20 561 #define RED1H V21 562 #define RED2 V22 563 #define RED2H V23 564 #define CAR1 V24 565 #define CAR1M V25 566 567 #define MK0 V30 568 #define K0 V31 569 TEXT ·p256OrdMul<>(SB), NOSPLIT, $0 570 MOVD res+0(FP), res_ptr 571 MOVD in1+8(FP), x_ptr 572 MOVD in2+16(FP), y_ptr 573 574 VZERO T2 575 MOVD $p256ordK0<>+0x00(SB), R4 576 577 // VLEF $3, 0(R4), K0 578 WORD $0xE7F40000 579 BYTE $0x38 580 BYTE $0x03 581 MOVD $p256ord<>+0x00(SB), R4 582 VL 16(R4), M0 583 VL 0(R4), M1 584 585 VL (0*16)(x_ptr), X0 586 VPDI $0x4, X0, X0, X0 587 VL (1*16)(x_ptr), X1 588 VPDI $0x4, X1, X1, X1 589 VL (0*16)(y_ptr), Y0 590 VPDI $0x4, Y0, Y0, Y0 591 VL (1*16)(y_ptr), Y1 592 
VPDI $0x4, Y1, Y1, Y1 593 594 // ---------------------------------------------------------------------------/ 595 VREPF $3, Y0, YDIG 596 VMLF X0, YDIG, ADD1 597 VMLF ADD1, K0, MK0 598 VREPF $3, MK0, MK0 599 600 VMLF X1, YDIG, ADD2 601 VMLHF X0, YDIG, ADD1H 602 VMLHF X1, YDIG, ADD2H 603 604 VMALF M0, MK0, ADD1, RED1 605 VMALHF M0, MK0, ADD1, RED1H 606 VMALF M1, MK0, ADD2, RED2 607 VMALHF M1, MK0, ADD2, RED2H 608 609 VSLDB $12, RED2, RED1, RED1 610 VSLDB $12, T2, RED2, RED2 611 612 VACCQ RED1, ADD1H, CAR1 613 VAQ RED1, ADD1H, T0 614 VACCQ RED1H, T0, CAR1M 615 VAQ RED1H, T0, T0 616 617 // << ready for next MK0 618 619 VACQ RED2, ADD2H, CAR1, T1 620 VACCCQ RED2, ADD2H, CAR1, CAR1 621 VACCCQ RED2H, T1, CAR1M, T2 622 VACQ RED2H, T1, CAR1M, T1 623 VAQ CAR1, T2, T2 624 625 // --------------------------------------------------- 626 /* * 627 * ---+--------+--------+ 628 * T2| T1 | T0 | 629 * ---+--------+--------+ 630 * *(add)* 631 * +--------+--------+ 632 * | X1 | X0 | 633 * +--------+--------+ 634 * *(mul)* 635 * +--------+--------+ 636 * | YDIG | YDIG | 637 * +--------+--------+ 638 * *(add)* 639 * +--------+--------+ 640 * | M1 | M0 | 641 * +--------+--------+ 642 * *(mul)* 643 * +--------+--------+ 644 * | MK0 | MK0 | 645 * +--------+--------+ 646 * 647 * --------------------- 648 * 649 * +--------+--------+ 650 * | ADD2 | ADD1 | 651 * +--------+--------+ 652 * +--------+--------+ 653 * | ADD2H | ADD1H | 654 * +--------+--------+ 655 * +--------+--------+ 656 * | RED2 | RED1 | 657 * +--------+--------+ 658 * +--------+--------+ 659 * | RED2H | RED1H | 660 * +--------+--------+ 661 */ 662 VREPF $2, Y0, YDIG 663 VMALF X0, YDIG, T0, ADD1 664 VMLF ADD1, K0, MK0 665 VREPF $3, MK0, MK0 666 667 VMALF X1, YDIG, T1, ADD2 668 VMALHF X0, YDIG, T0, ADD1H 669 VMALHF X1, YDIG, T1, ADD2H 670 671 VMALF M0, MK0, ADD1, RED1 672 VMALHF M0, MK0, ADD1, RED1H 673 VMALF M1, MK0, ADD2, RED2 674 VMALHF M1, MK0, ADD2, RED2H 675 676 VSLDB $12, RED2, RED1, RED1 677 VSLDB $12, T2, RED2, RED2 678 
679 VACCQ RED1, ADD1H, CAR1 680 VAQ RED1, ADD1H, T0 681 VACCQ RED1H, T0, CAR1M 682 VAQ RED1H, T0, T0 683 684 // << ready for next MK0 685 686 VACQ RED2, ADD2H, CAR1, T1 687 VACCCQ RED2, ADD2H, CAR1, CAR1 688 VACCCQ RED2H, T1, CAR1M, T2 689 VACQ RED2H, T1, CAR1M, T1 690 VAQ CAR1, T2, T2 691 692 // --------------------------------------------------- 693 VREPF $1, Y0, YDIG 694 VMALF X0, YDIG, T0, ADD1 695 VMLF ADD1, K0, MK0 696 VREPF $3, MK0, MK0 697 698 VMALF X1, YDIG, T1, ADD2 699 VMALHF X0, YDIG, T0, ADD1H 700 VMALHF X1, YDIG, T1, ADD2H 701 702 VMALF M0, MK0, ADD1, RED1 703 VMALHF M0, MK0, ADD1, RED1H 704 VMALF M1, MK0, ADD2, RED2 705 VMALHF M1, MK0, ADD2, RED2H 706 707 VSLDB $12, RED2, RED1, RED1 708 VSLDB $12, T2, RED2, RED2 709 710 VACCQ RED1, ADD1H, CAR1 711 VAQ RED1, ADD1H, T0 712 VACCQ RED1H, T0, CAR1M 713 VAQ RED1H, T0, T0 714 715 // << ready for next MK0 716 717 VACQ RED2, ADD2H, CAR1, T1 718 VACCCQ RED2, ADD2H, CAR1, CAR1 719 VACCCQ RED2H, T1, CAR1M, T2 720 VACQ RED2H, T1, CAR1M, T1 721 VAQ CAR1, T2, T2 722 723 // --------------------------------------------------- 724 VREPF $0, Y0, YDIG 725 VMALF X0, YDIG, T0, ADD1 726 VMLF ADD1, K0, MK0 727 VREPF $3, MK0, MK0 728 729 VMALF X1, YDIG, T1, ADD2 730 VMALHF X0, YDIG, T0, ADD1H 731 VMALHF X1, YDIG, T1, ADD2H 732 733 VMALF M0, MK0, ADD1, RED1 734 VMALHF M0, MK0, ADD1, RED1H 735 VMALF M1, MK0, ADD2, RED2 736 VMALHF M1, MK0, ADD2, RED2H 737 738 VSLDB $12, RED2, RED1, RED1 739 VSLDB $12, T2, RED2, RED2 740 741 VACCQ RED1, ADD1H, CAR1 742 VAQ RED1, ADD1H, T0 743 VACCQ RED1H, T0, CAR1M 744 VAQ RED1H, T0, T0 745 746 // << ready for next MK0 747 748 VACQ RED2, ADD2H, CAR1, T1 749 VACCCQ RED2, ADD2H, CAR1, CAR1 750 VACCCQ RED2H, T1, CAR1M, T2 751 VACQ RED2H, T1, CAR1M, T1 752 VAQ CAR1, T2, T2 753 754 // --------------------------------------------------- 755 VREPF $3, Y1, YDIG 756 VMALF X0, YDIG, T0, ADD1 757 VMLF ADD1, K0, MK0 758 VREPF $3, MK0, MK0 759 760 VMALF X1, YDIG, T1, ADD2 761 VMALHF X0, YDIG, T0, ADD1H 762 
VMALHF X1, YDIG, T1, ADD2H 763 764 VMALF M0, MK0, ADD1, RED1 765 VMALHF M0, MK0, ADD1, RED1H 766 VMALF M1, MK0, ADD2, RED2 767 VMALHF M1, MK0, ADD2, RED2H 768 769 VSLDB $12, RED2, RED1, RED1 770 VSLDB $12, T2, RED2, RED2 771 772 VACCQ RED1, ADD1H, CAR1 773 VAQ RED1, ADD1H, T0 774 VACCQ RED1H, T0, CAR1M 775 VAQ RED1H, T0, T0 776 777 // << ready for next MK0 778 779 VACQ RED2, ADD2H, CAR1, T1 780 VACCCQ RED2, ADD2H, CAR1, CAR1 781 VACCCQ RED2H, T1, CAR1M, T2 782 VACQ RED2H, T1, CAR1M, T1 783 VAQ CAR1, T2, T2 784 785 // --------------------------------------------------- 786 VREPF $2, Y1, YDIG 787 VMALF X0, YDIG, T0, ADD1 788 VMLF ADD1, K0, MK0 789 VREPF $3, MK0, MK0 790 791 VMALF X1, YDIG, T1, ADD2 792 VMALHF X0, YDIG, T0, ADD1H 793 VMALHF X1, YDIG, T1, ADD2H 794 795 VMALF M0, MK0, ADD1, RED1 796 VMALHF M0, MK0, ADD1, RED1H 797 VMALF M1, MK0, ADD2, RED2 798 VMALHF M1, MK0, ADD2, RED2H 799 800 VSLDB $12, RED2, RED1, RED1 801 VSLDB $12, T2, RED2, RED2 802 803 VACCQ RED1, ADD1H, CAR1 804 VAQ RED1, ADD1H, T0 805 VACCQ RED1H, T0, CAR1M 806 VAQ RED1H, T0, T0 807 808 // << ready for next MK0 809 810 VACQ RED2, ADD2H, CAR1, T1 811 VACCCQ RED2, ADD2H, CAR1, CAR1 812 VACCCQ RED2H, T1, CAR1M, T2 813 VACQ RED2H, T1, CAR1M, T1 814 VAQ CAR1, T2, T2 815 816 // --------------------------------------------------- 817 VREPF $1, Y1, YDIG 818 VMALF X0, YDIG, T0, ADD1 819 VMLF ADD1, K0, MK0 820 VREPF $3, MK0, MK0 821 822 VMALF X1, YDIG, T1, ADD2 823 VMALHF X0, YDIG, T0, ADD1H 824 VMALHF X1, YDIG, T1, ADD2H 825 826 VMALF M0, MK0, ADD1, RED1 827 VMALHF M0, MK0, ADD1, RED1H 828 VMALF M1, MK0, ADD2, RED2 829 VMALHF M1, MK0, ADD2, RED2H 830 831 VSLDB $12, RED2, RED1, RED1 832 VSLDB $12, T2, RED2, RED2 833 834 VACCQ RED1, ADD1H, CAR1 835 VAQ RED1, ADD1H, T0 836 VACCQ RED1H, T0, CAR1M 837 VAQ RED1H, T0, T0 838 839 // << ready for next MK0 840 841 VACQ RED2, ADD2H, CAR1, T1 842 VACCCQ RED2, ADD2H, CAR1, CAR1 843 VACCCQ RED2H, T1, CAR1M, T2 844 VACQ RED2H, T1, CAR1M, T1 845 VAQ CAR1, T2, T2 846 
847 // --------------------------------------------------- 848 VREPF $0, Y1, YDIG 849 VMALF X0, YDIG, T0, ADD1 850 VMLF ADD1, K0, MK0 851 VREPF $3, MK0, MK0 852 853 VMALF X1, YDIG, T1, ADD2 854 VMALHF X0, YDIG, T0, ADD1H 855 VMALHF X1, YDIG, T1, ADD2H 856 857 VMALF M0, MK0, ADD1, RED1 858 VMALHF M0, MK0, ADD1, RED1H 859 VMALF M1, MK0, ADD2, RED2 860 VMALHF M1, MK0, ADD2, RED2H 861 862 VSLDB $12, RED2, RED1, RED1 863 VSLDB $12, T2, RED2, RED2 864 865 VACCQ RED1, ADD1H, CAR1 866 VAQ RED1, ADD1H, T0 867 VACCQ RED1H, T0, CAR1M 868 VAQ RED1H, T0, T0 869 870 // << ready for next MK0 871 872 VACQ RED2, ADD2H, CAR1, T1 873 VACCCQ RED2, ADD2H, CAR1, CAR1 874 VACCCQ RED2H, T1, CAR1M, T2 875 VACQ RED2H, T1, CAR1M, T1 876 VAQ CAR1, T2, T2 877 878 // --------------------------------------------------- 879 880 VZERO RED1 881 VSCBIQ M0, T0, CAR1 882 VSQ M0, T0, ADD1 883 VSBCBIQ T1, M1, CAR1, CAR1M 884 VSBIQ T1, M1, CAR1, ADD2 885 VSBIQ T2, RED1, CAR1M, T2 886 887 // what output to use, ADD2||ADD1 or T1||T0? 
888 VSEL T0, ADD1, T2, T0 889 VSEL T1, ADD2, T2, T1 890 891 VPDI $0x4, T0, T0, T0 892 VST T0, (0*16)(res_ptr) 893 VPDI $0x4, T1, T1, T1 894 VST T1, (1*16)(res_ptr) 895 RET 896 897 #undef res_ptr 898 #undef x_ptr 899 #undef y_ptr 900 #undef X0 901 #undef X1 902 #undef Y0 903 #undef Y1 904 #undef M0 905 #undef M1 906 #undef T0 907 #undef T1 908 #undef T2 909 #undef YDIG 910 911 #undef ADD1 912 #undef ADD1H 913 #undef ADD2 914 #undef ADD2H 915 #undef RED1 916 #undef RED1H 917 #undef RED2 918 #undef RED2H 919 #undef CAR1 920 #undef CAR1M 921 922 #undef MK0 923 #undef K0 924 925 // --------------------------------------- 926 // p256MulInternal 927 // V0-V3,V30,V31 - Not Modified 928 // V4-V15 - Volatile 929 930 #define CPOOL R4 931 932 // Parameters 933 #define X0 V0 // Not modified 934 #define X1 V1 // Not modified 935 #define Y0 V2 // Not modified 936 #define Y1 V3 // Not modified 937 #define T0 V4 938 #define T1 V5 939 #define P0 V30 // Not modified 940 #define P1 V31 // Not modified 941 942 // Temporaries 943 #define YDIG V6 // Overloaded with CAR2, ZER 944 #define ADD1H V7 // Overloaded with ADD3H 945 #define ADD2H V8 // Overloaded with ADD4H 946 #define ADD3 V9 // Overloaded with SEL2,SEL5 947 #define ADD4 V10 // Overloaded with SEL3,SEL6 948 #define RED1 V11 // Overloaded with CAR2 949 #define RED2 V12 950 #define RED3 V13 // Overloaded with SEL1 951 #define T2 V14 952 // Overloaded temporaries 953 #define ADD1 V4 // Overloaded with T0 954 #define ADD2 V5 // Overloaded with T1 955 #define ADD3H V7 // Overloaded with ADD1H 956 #define ADD4H V8 // Overloaded with ADD2H 957 #define ZER V6 // Overloaded with YDIG, CAR2 958 #define CAR1 V6 // Overloaded with YDIG, ZER 959 #define CAR2 V11 // Overloaded with RED1 960 // Constant Selects 961 #define SEL1 V13 // Overloaded with RED3 962 #define SEL2 V9 // Overloaded with ADD3,SEL5 963 #define SEL3 V10 // Overloaded with ADD4,SEL6 964 #define SEL4 V6 // Overloaded with YDIG,CAR2,ZER 965 #define SEL5 V9 // Overloaded with 
ADD3,SEL2 966 #define SEL6 V10 // Overloaded with ADD4,SEL3 967 968 /* * 969 * To follow the flow of bits, for your own sanity a stiff drink, need you shall. 970 * Of a single round, a 'helpful' picture, here is. Meaning, column position has. 971 * With you, SIMD be... 972 * 973 * +--------+--------+ 974 * +--------| RED2 | RED1 | 975 * | +--------+--------+ 976 * | ---+--------+--------+ 977 * | +---- T2| T1 | T0 |--+ 978 * | | ---+--------+--------+ | 979 * | | | 980 * | | ======================= | 981 * | | | 982 * | | +--------+--------+<-+ 983 * | +-------| ADD2 | ADD1 |--|-----+ 984 * | | +--------+--------+ | | 985 * | | +--------+--------+<---+ | 986 * | | | ADD2H | ADD1H |--+ | 987 * | | +--------+--------+ | | 988 * | | +--------+--------+<-+ | 989 * | | | ADD4 | ADD3 |--|-+ | 990 * | | +--------+--------+ | | | 991 * | | +--------+--------+<---+ | | 992 * | | | ADD4H | ADD3H |------|-+ |(+vzero) 993 * | | +--------+--------+ | | V 994 * | | ------------------------ | | +--------+ 995 * | | | | | RED3 | [d0 0 0 d0] 996 * | | | | +--------+ 997 * | +---->+--------+--------+ | | | 998 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | | 999 * | +--------+--------+ | | | 1000 * +---->---+--------+--------+ | | | 1001 * T2| T1 | T0 |----+ | | 1002 * ---+--------+--------+ | | | 1003 * ---+--------+--------+<---+ | | 1004 * +--- T2| T1 | T0 |----------+ 1005 * | ---+--------+--------+ | | 1006 * | +--------+--------+<-------------+ 1007 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 1008 * | +--------+--------+ | | | 1009 * | +--------+<----------------------+ 1010 * | | RED3 |--------------+ | [0 0 d1 d0] 1011 * | +--------+ | | 1012 * +--->+--------+--------+ | | 1013 * | T1 | T0 |--------+ 1014 * +--------+--------+ | | 1015 * --------------------------- | | 1016 * | | 1017 * +--------+--------+<----+ | 1018 * | RED2 | RED1 | | 1019 * +--------+--------+ | 1020 * ---+--------+--------+<-------+ 1021 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 
1022 * ---+--------+--------+ 1023 * 1024 * *Mi obra de arte de siglo XXI @vpaprots 1025 * 1026 * 1027 * First group is special, doesn't get the two inputs: 1028 * +--------+--------+<-+ 1029 * +-------| ADD2 | ADD1 |--|-----+ 1030 * | +--------+--------+ | | 1031 * | +--------+--------+<---+ | 1032 * | | ADD2H | ADD1H |--+ | 1033 * | +--------+--------+ | | 1034 * | +--------+--------+<-+ | 1035 * | | ADD4 | ADD3 |--|-+ | 1036 * | +--------+--------+ | | | 1037 * | +--------+--------+<---+ | | 1038 * | | ADD4H | ADD3H |------|-+ |(+vzero) 1039 * | +--------+--------+ | | V 1040 * | ------------------------ | | +--------+ 1041 * | | | | RED3 | [d0 0 0 d0] 1042 * | | | +--------+ 1043 * +---->+--------+--------+ | | | 1044 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | | 1045 * +--------+--------+ | | | 1046 * ---+--------+--------+<---+ | | 1047 * +--- T2| T1 | T0 |----------+ 1048 * | ---+--------+--------+ | | 1049 * | +--------+--------+<-------------+ 1050 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 1051 * | +--------+--------+ | | | 1052 * | +--------+<----------------------+ 1053 * | | RED3 |--------------+ | [0 0 d1 d0] 1054 * | +--------+ | | 1055 * +--->+--------+--------+ | | 1056 * | T1 | T0 |--------+ 1057 * +--------+--------+ | | 1058 * --------------------------- | | 1059 * | | 1060 * +--------+--------+<----+ | 1061 * | RED2 | RED1 | | 1062 * +--------+--------+ | 1063 * ---+--------+--------+<-------+ 1064 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 
1065 * ---+--------+--------+ 1066 * 1067 * Last 'group' needs to RED2||RED1 shifted less 1068 */ 1069 TEXT p256MulInternal<>(SB), NOSPLIT, $0-0 1070 VL 32(CPOOL), SEL1 1071 VL 48(CPOOL), SEL2 1072 VL 64(CPOOL), SEL3 1073 VL 80(CPOOL), SEL4 1074 1075 // --------------------------------------------------- 1076 1077 VREPF $3, Y0, YDIG 1078 VMLHF X0, YDIG, ADD1H 1079 VMLHF X1, YDIG, ADD2H 1080 VMLF X0, YDIG, ADD1 1081 VMLF X1, YDIG, ADD2 1082 1083 VREPF $2, Y0, YDIG 1084 VMALF X0, YDIG, ADD1H, ADD3 1085 VMALF X1, YDIG, ADD2H, ADD4 1086 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free 1087 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free 1088 1089 VZERO ZER 1090 VL 32(CPOOL), SEL1 1091 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1092 1093 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free 1094 VSLDB $12, ZER, ADD2, T1 // ADD2 Free 1095 1096 VACCQ T0, ADD3, CAR1 1097 VAQ T0, ADD3, T0 // ADD3 Free 1098 VACCCQ T1, ADD4, CAR1, T2 1099 VACQ T1, ADD4, CAR1, T1 // ADD4 Free 1100 1101 VL 48(CPOOL), SEL2 1102 VL 64(CPOOL), SEL3 1103 VL 80(CPOOL), SEL4 1104 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1105 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1106 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1107 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1108 1109 VSLDB $12, T1, T0, T0 1110 VSLDB $12, T2, T1, T1 1111 1112 VACCQ T0, ADD3H, CAR1 1113 VAQ T0, ADD3H, T0 1114 VACCCQ T1, ADD4H, CAR1, T2 1115 VACQ T1, ADD4H, CAR1, T1 1116 1117 // --------------------------------------------------- 1118 1119 VREPF $1, Y0, YDIG 1120 VMALHF X0, YDIG, T0, ADD1H 1121 VMALHF X1, YDIG, T1, ADD2H 1122 VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1 1123 VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2 1124 1125 VREPF $0, Y0, YDIG 1126 VMALF X0, YDIG, ADD1H, ADD3 1127 VMALF X1, YDIG, ADD2H, ADD4 1128 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H 1129 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER 1130 1131 VZERO ZER 1132 VL 32(CPOOL), SEL1 1133 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1134 1135 VSLDB $12, 
ADD2, ADD1, T0 // ADD1 Free->T0 1136 VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free 1137 1138 VACCQ T0, RED1, CAR1 1139 VAQ T0, RED1, T0 1140 VACCCQ T1, RED2, CAR1, T2 1141 VACQ T1, RED2, CAR1, T1 1142 1143 VACCQ T0, ADD3, CAR1 1144 VAQ T0, ADD3, T0 1145 VACCCQ T1, ADD4, CAR1, CAR2 1146 VACQ T1, ADD4, CAR1, T1 1147 VAQ T2, CAR2, T2 1148 1149 VL 48(CPOOL), SEL2 1150 VL 64(CPOOL), SEL3 1151 VL 80(CPOOL), SEL4 1152 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1153 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1154 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1155 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1156 1157 VSLDB $12, T1, T0, T0 1158 VSLDB $12, T2, T1, T1 1159 1160 VACCQ T0, ADD3H, CAR1 1161 VAQ T0, ADD3H, T0 1162 VACCCQ T1, ADD4H, CAR1, T2 1163 VACQ T1, ADD4H, CAR1, T1 1164 1165 // --------------------------------------------------- 1166 1167 VREPF $3, Y1, YDIG 1168 VMALHF X0, YDIG, T0, ADD1H 1169 VMALHF X1, YDIG, T1, ADD2H 1170 VMALF X0, YDIG, T0, ADD1 1171 VMALF X1, YDIG, T1, ADD2 1172 1173 VREPF $2, Y1, YDIG 1174 VMALF X0, YDIG, ADD1H, ADD3 1175 VMALF X1, YDIG, ADD2H, ADD4 1176 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free 1177 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free 1178 1179 VZERO ZER 1180 VL 32(CPOOL), SEL1 1181 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1182 1183 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free 1184 VSLDB $12, T2, ADD2, T1 // ADD2 Free 1185 1186 VACCQ T0, RED1, CAR1 1187 VAQ T0, RED1, T0 1188 VACCCQ T1, RED2, CAR1, T2 1189 VACQ T1, RED2, CAR1, T1 1190 1191 VACCQ T0, ADD3, CAR1 1192 VAQ T0, ADD3, T0 1193 VACCCQ T1, ADD4, CAR1, CAR2 1194 VACQ T1, ADD4, CAR1, T1 1195 VAQ T2, CAR2, T2 1196 1197 VL 48(CPOOL), SEL2 1198 VL 64(CPOOL), SEL3 1199 VL 80(CPOOL), SEL4 1200 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1201 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1202 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1203 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1204 1205 VSLDB $12, T1, T0, T0 1206 VSLDB $12, T2, T1, T1 1207 1208 VACCQ T0, 
ADD3H, CAR1 1209 VAQ T0, ADD3H, T0 1210 VACCCQ T1, ADD4H, CAR1, T2 1211 VACQ T1, ADD4H, CAR1, T1 1212 1213 // --------------------------------------------------- 1214 1215 VREPF $1, Y1, YDIG 1216 VMALHF X0, YDIG, T0, ADD1H 1217 VMALHF X1, YDIG, T1, ADD2H 1218 VMALF X0, YDIG, T0, ADD1 1219 VMALF X1, YDIG, T1, ADD2 1220 1221 VREPF $0, Y1, YDIG 1222 VMALF X0, YDIG, ADD1H, ADD3 1223 VMALF X1, YDIG, ADD2H, ADD4 1224 VMALHF X0, YDIG, ADD1H, ADD3H 1225 VMALHF X1, YDIG, ADD2H, ADD4H 1226 1227 VZERO ZER 1228 VL 32(CPOOL), SEL1 1229 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1230 1231 VSLDB $12, ADD2, ADD1, T0 1232 VSLDB $12, T2, ADD2, T1 1233 1234 VACCQ T0, RED1, CAR1 1235 VAQ T0, RED1, T0 1236 VACCCQ T1, RED2, CAR1, T2 1237 VACQ T1, RED2, CAR1, T1 1238 1239 VACCQ T0, ADD3, CAR1 1240 VAQ T0, ADD3, T0 1241 VACCCQ T1, ADD4, CAR1, CAR2 1242 VACQ T1, ADD4, CAR1, T1 1243 VAQ T2, CAR2, T2 1244 1245 VL 96(CPOOL), SEL5 1246 VL 112(CPOOL), SEL6 1247 VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0] 1248 VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0] 1249 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 1250 1251 VSLDB $12, T1, T0, T0 1252 VSLDB $12, T2, T1, T1 1253 1254 VACCQ T0, ADD3H, CAR1 1255 VAQ T0, ADD3H, T0 1256 VACCCQ T1, ADD4H, CAR1, T2 1257 VACQ T1, ADD4H, CAR1, T1 1258 1259 VACCQ T0, RED1, CAR1 1260 VAQ T0, RED1, T0 1261 VACCCQ T1, RED2, CAR1, CAR2 1262 VACQ T1, RED2, CAR1, T1 1263 VAQ T2, CAR2, T2 1264 1265 // --------------------------------------------------- 1266 1267 VZERO RED3 1268 VSCBIQ P0, T0, CAR1 1269 VSQ P0, T0, ADD1H 1270 VSBCBIQ T1, P1, CAR1, CAR2 1271 VSBIQ T1, P1, CAR1, ADD2H 1272 VSBIQ T2, RED3, CAR2, T2 1273 1274 // what output to use, ADD2H||ADD1H or T1||T0? 
// Tail of p256MulInternal: T2 is the select mask that chooses between
// T1||T0 and ADD2H||ADD1H for the value left in T1||T0.
	VSEL T0, ADD1H, T2, T0
	VSEL T1, ADD2H, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

// ---------------------------------------

// p256SqrInternal squares the value held in X1||X0 (V1||V0).
// It copies X into Y (V3||V2) and branches to p256MulInternal without
// linking, so p256MulInternal's RET returns straight to our caller.
// The register contract is therefore that of p256MulInternal: the callers in
// this file set CPOOL (R4) and V30/V31 before the call and read the result
// from V4/V5 afterwards.

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3

TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
	VLR X0, Y0
	VLR X1, Y1
	BR p256MulInternal<>(SB)

#undef X0
#undef X1
#undef Y0
#undef Y1

// p256SubInternal(T1, T0, X1, X0, Y1, Y0): T1||T0 = X1||X0 - Y1||Y0, with
// PH||PL added back when the 256-bit subtraction borrows.
//   - The borrow indication of the high half is turned into an all-ones /
//     all-zeros mask in SEL1 (0 - SEL1), which then selects between the raw
//     difference and difference+P.
//   - Relies on the enclosing function's #defines for ZER, SEL1, CAR1, TT0,
//     TT1 (all overwritten) and PL, PH (read only).
//   - NOTE(review): presumably both operands are expected to be < P already;
//     nothing here checks that.
//   - The final line of this macro ends in a backslash, so the macro
//     continues onto the following empty line. Keep that line empty.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VZERO ZER \
	VSCBIQ Y0, X0, CAR1 \
	VSQ Y0, X0, T0 \
	VSBCBIQ X1, Y1, CAR1, SEL1 \
	VSBIQ X1, Y1, CAR1, T1 \
	VSQ SEL1, ZER, SEL1 \
	\
	VACCQ T0, PL, CAR1 \
	VAQ T0, PL, TT0 \
	VACQ T1, PH, CAR1, TT1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1 \

// p256AddInternal(T1, T0, X1, X0, Y1, Y0): T1||T0 = X1||X0 + Y1||Y0, followed
// by a conditional subtraction of PH||PL.
//   - The sum is formed with its carry-out in T2; sum-P is formed in
//     TT1||TT0; SEL1 (derived from T2 and the final borrow indication)
//     selects which of the two is kept.
//   - Relies on the enclosing function's #defines for ZER, SEL1, CAR1, CAR2,
//     TT0, TT1, T2 (all overwritten) and PL, PH (read only).
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VACCQ X0, Y0, CAR1 \
	VAQ X0, Y0, T0 \
	VACCCQ X1, Y1, CAR1, T2 \
	VACQ X1, Y1, CAR1, T1 \
	\
	VZERO ZER \
	VSCBIQ PL, T0, CAR1 \
	VSQ PL, T0, TT0 \
	VSBCBIQ T1, PH, CAR1, CAR2 \
	VSBIQ T1, PH, CAR1, TT1 \
	VSBIQ T2, ZER, CAR2, SEL1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1

// p256HalfInternal(T1, T0, X1, X0): T1||T0 = X1||X0 / 2 (used for the
// "Y3 = half*Y3" step of point doubling).
//   - SEL1 is derived from X0 so that X is kept as is or replaced by X+P
//     (carry-out in T2); NOTE(review): this looks like "add P when X is odd"
//     so that the value shifted below is even - confirm against the ISA
//     definition of VSBIQ.
//   - The 257-bit value T2||T1||T0 is then shifted right by one bit: each
//     half is shifted right by 1 (VSRL) and the bit that falls out of the
//     next-higher part is OR-ed into its top position (VSLDB $15 + VSL 7).
//   - Relies on the enclosing function's #defines for ZER, SEL1, CAR1, TT0,
//     TT1, T2 (all overwritten) and PL, PH (read only).
#define p256HalfInternal(T1, T0, X1, X0) \
	VZERO ZER \
	VSBIQ ZER, ZER, X0, SEL1 \
	\
	VACCQ X0, PL, CAR1 \
	VAQ X0, PL, T0 \
	VACCCQ X1, PH, CAR1, T2 \
	VACQ X1, PH, CAR1, T1 \
	\
	VSEL X0, T0, SEL1, T0 \
	VSEL X1, T1, SEL1, T1 \
	VSEL ZER, T2, SEL1, T2 \
	\
	VSLDB $15, T2, ZER, TT1 \
	VSLDB $15, T1, ZER, TT0 \
	VREPIB $1, SEL1 \
	VSRL SEL1, T0, T0 \
	VSRL SEL1, T1, T1 \
	VREPIB $7, SEL1 \
	VSL SEL1, TT0, TT0 \
	VSL SEL1, TT1, TT1 \
	VO T0, TT0, T0 \
	VO T1, TT1, T1

// ---------------------------------------
// func p256Mul(res, in1, in2 *p256Element)
//
// Stores the p256MulInternal product of in1 and in2 in res.
// Each 32-byte element is handled as two 16-byte vectors (offsets 0 and 16).
// VPDI $0x4, V, V, V exchanges the two 64-bit halves of a vector; it is
// applied to every vector after loading and again before storing.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define CPOOL R4

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31
TEXT ·p256Mul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	VL (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1
	VL (0*16)(y_ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	VL (1*16)(y_ptr), Y1
	VPDI $0x4, Y1, Y1, Y1

	// CPOOL points at the p256mul<> constant pool; its first 32 bytes are
	// loaded into P1 (offset 0) and P0 (offset 16) for p256MulInternal.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), P0
	VL 0(CPOOL), P1

	CALL p256MulInternal<>(SB)

	VPDI $0x4, T0, T0, T0
	VST T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST T1, (1*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

// ---------------------------------------
// func p256Sqr(res, in *p256Element, n int)
//
// Applies p256SqrInternal to in repeatedly, feeding each result back in as
// the next input, and stores the final value in res.
// y_ptr is defined for symmetry with p256Mul but is not used here.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define CPOOL R4
#define COUNT R5
#define N R6

// Parameters
#define X0 V0
#define X1 V1
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31
TEXT ·p256Sqr(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	VL (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1

	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $0, COUNT
	MOVD n+16(FP), N
	VL 16(CPOOL), P0
	VL 0(CPOOL), P1

	// The squaring is performed before COUNT is compared with N, so the body
	// always runs at least once (one squaring even when n <= 0).
	// NOTE(review): ADDW/CMPW are used on COUNT and N although n is loaded
	// with MOVD - confirm callers only pass small positive counts.
loop:
	CALL p256SqrInternal<>(SB)
	VLR T0, X0
	VLR T1, X1
	ADDW $1, COUNT
	CMPW COUNT, N
	BLT loop

	VPDI $0x4, T0, T0, T0
	VST T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST T1, (1*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef COUNT
#undef N

#undef X0
#undef X1
#undef T0
#undef T1
#undef P0
#undef P1

// Point add with P2 being affine point
// If sign == 1 -> P2 = -P2
// If sel == 0 -> P3 = P1
// if zero == 0 -> P3 = P2
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Notes on the implementation below:
//   - sign is only compared with 0, so any non-zero value negates Y2.
//   - The zero == 0 select is applied after the sel == 0 select, so it wins
//     when both are 0; Z3 is then the 32-byte constant at p256mul<>+0x80.
//   - Coordinates are read at byte offsets 0 (X), 32 (Y) and 64 (Z) of in1 and
//     0 (X), 32 (Y) of in2, each as a low vector followed by a high vector.
//   - Nothing is stored to res until the final store sequence.
//   - Several "Y-" / "X-" steps reuse whatever the previous step left in
//     Y1||Y0 / X1||X0 across a CALL; NOTE(review): this presumably relies on
//     p256MulInternal leaving its inputs unchanged - its body is not fully
//     visible here.
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3
#define CPOOL R4

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V28
#define Z3H V29

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * T1 = Z1²
 * T2 = T1*Z1
 * T1 = T1*X2
 * T2 = T2*Y2
 * T1 = T1-X1
 * T2 = T2-Y1
 * Z3 = Z1*T1
 * T3 = T1²
 * T4 = T3*T1
 * T3 = T3*X1
 * T1 = 2*T3
 * X3 = T2²
 * X3 = X3-T1
 * X3 = X3-T4
 * T3 = T3-X3
 * T3 = T3*T2
 * T4 = T4*Y1
 * Y3 = T3-T4

 * Three operand formulas, but with MulInternal X,Y used to store temps
 * (the trailing columns list the temporaries that are live after each step)
X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
X=Z1; Y-  ; MUL;Z3:=T// Z3 = Z1*T1         T2
X=Y;  Y-  ; MUL;X=T  // T3 = T1*T1         T2
X-  ; Y-  ; MUL;T4=T // T4 = T3*T1         T2        T4
X-  ; Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
X-  ; Y-  ; MUL;T3=T // T3 = T3*T2         T2   T3   T4
X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4

	*/
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// if (sign == 1) {
	//     Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
	// }

	VL 48(P2ptr), Y2H
	VPDI $0x4, Y2H, Y2H, Y2H
	VL 32(P2ptr), Y2L
	VPDI $0x4, Y2L, Y2L, Y2L

	// SEL1 = all ones in each 64-bit lane when sign == 0.
	VLREPG sign+24(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	// T1H||T1L = P - Y2
	VSCBIQ Y2L, PL, CAR1
	VSQ Y2L, PL, T1L
	VSBIQ PH, Y2H, CAR1, T1H

	// Keep Y2 when sign == 0, otherwise take P - Y2.
	VSEL Y2L, T1L, SEL1, Y2L
	VSEL Y2H, T1H, SEL1, Y2H

	/* *
	 * Three operand formula:
	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
	 */
	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
	VLR T0, X0
	VLR T1, X1
	CALL p256MulInternal<>(SB)
	VLR T0, T2L
	VLR T1, T2H

	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
	VL 16(P2ptr), Y1 // X2H
	VPDI $0x4, Y1, Y1, Y1
	VL 0(P2ptr), Y0 // X2L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T1L
	VLR T1, T1H

	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
	VLR T2L, X0
	VLR T2H, X1
	VLR Y2L, Y0
	VLR Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
	VL 48(P1ptr), Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VL 32(P1ptr), Y1L
	VPDI $0x4, Y1L, Y1L, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
	VL 16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL 0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y-  ; MUL; Z3:=T// Z3 = Z1*T1         T2
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	CALL p256MulInternal<>(SB)

	// Z3 is kept in registers (Z3L/Z3H) instead of being stored here; the
	// stores below are intentionally left commented out.
	// VST T1, 64(P3ptr)
	// VST T0, 80(P3ptr)
	VLR T0, Z3L
	VLR T1, Z3H

	// X=Y;  Y-  ; MUL; X=T  // T3 = T1*T1         T2
	VLR Y0, X0
	VLR Y1, X1
	CALL p256SqrInternal<>(SB)
	VLR T0, X0
	VLR T1, X1

	// X-  ; Y-  ; MUL; T4=T // T4 = T3*T1         T2        T4
	CALL p256MulInternal<>(SB)
	VLR T0, T4L
	VLR T1, T4H

	// X-  ; Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
	VL 16(P1ptr), Y1 // X1H
	VPDI $0x4, Y1, Y1, Y1
	VL 0(P1ptr), Y0 // X1L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
	VLR T2L, X0
	VLR T2H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VLR T0, X3L
	VLR T1, X3H

	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X-  ; Y-  ; MUL; T3=T // T3 = T3*T2         T2   T3   T4
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
	VLR T4L, X0
	VLR T4H, X1
	VL 48(P1ptr), Y1 // Y1H
	VPDI $0x4, Y1, Y1, Y1
	VL 32(P1ptr), Y0 // Y1L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	//	if (sel == 0) {
	//		copy(P3.x[:], X1)
	//		copy(P3.y[:], Y1)
	//		copy(P3.z[:], Z1)
	//	}

	VL 16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL 0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L

	// Y1 already loaded, left over from addition
	// (Y1L/Y1H are V2/V3, i.e. the Y operand loaded for the last multiply)
	VL 80(P1ptr), Z1H
	VPDI $0x4, Z1H, Z1H, Z1H
	VL 64(P1ptr), Z1L
	VPDI $0x4, Z1L, Z1L, Z1L

	// SEL1 = all ones when sel == 0: take (X1, Y1, Z1) instead of the sum.
	VLREPG sel+32(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	VSEL X1L, X3L, SEL1, X3L
	VSEL X1H, X3H, SEL1, X3H
	VSEL Y1L, Y3L, SEL1, Y3L
	VSEL Y1H, Y3H, SEL1, Y3H
	VSEL Z1L, Z3L, SEL1, Z3L
	VSEL Z1H, Z3H, SEL1, Z3H

	//	if (zero == 0) {
	//		copy(P3.x[:], X2)
	//		copy(P3.y[:], Y2)
	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
	//	}
	VL 16(P2ptr), X2H
	VPDI $0x4, X2H, X2H, X2H
	VL 0(P2ptr), X2L
	VPDI $0x4, X2L, X2L, X2L

	// Y2 already loaded
	// (Y2L/Y2H still hold the possibly negated Y2 from the sign step above;
	// the Z2 constant is loaded without a VPDI swap)
	VL 128(CPOOL), Z2H
	VL 144(CPOOL), Z2L

	// SEL1 = all ones when zero == 0: take (X2, Y2, Z2 constant).
	VLREPG zero+40(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	VSEL X2L, X3L, SEL1, X3L
	VSEL X2H, X3H, SEL1, X3H
	VSEL Y2L, Y3L, SEL1, Y3L
	VSEL Y2H, Y3H, SEL1, Y3H
	VSEL Z2L, Z3L, SEL1, Z3L
	VSEL Z2H, Z3H, SEL1, Z3H

	// All done, store out the result!!!
	VPDI $0x4, X3H, X3H, X3H
	VST X3H, 16(P3ptr)
	VPDI $0x4, X3L, X3L, X3L
	VST X3L, 0(P3ptr)
	VPDI $0x4, Y3H, Y3H, Y3H
	VST Y3H, 48(P3ptr)
	VPDI $0x4, Y3L, Y3L, Y3L
	VST Y3L, 32(P3ptr)
	VPDI $0x4, Z3H, Z3H, Z3H
	VST Z3H, 80(P3ptr)
	VPDI $0x4, Z3L, Z3L, Z3L
	VST Z3L, 64(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// func p256PointDoubleAsm(res, in *P256Point)
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
//
// Notes on the implementation below:
//   - Coordinates are read at byte offsets 0 (X), 32 (Y) and 64 (Z) of in and
//     written at the same offsets of res.
//   - Z3 is stored to res before X1 is re-read from in, and X3 is stored
//     before Y3 is computed. In both cases only coordinates that are no
//     longer read from in have been overwritten at that point.
//   - Z1L/Z1H and Z3L/Z3H are defined but never referenced in the body;
//     Z1H shares V11 with TT0 and Z3L/Z3H share V23/V24 with T3L/T3H.
#define P3ptr R1
#define P1ptr R2
#define CPOOL R4

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
/*
 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
 * 	B  = 2Y₁
 * 	Z₃ = B×Z₁
 * 	C  = B²
 * 	D  = C×X₁
 * 	X₃ = A²-2D
 * 	Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 *       T1 = Z1²
 *       T2 = X1-T1
 *       T1 = X1+T1
 *       T2 = T2*T1
 *       T2 = 3*T2
 *       Y3 = 2*Y1
 *       Z3 = Y3*Z1
 *       Y3 = Y3²
 *       T3 = Y3*X1
 *       Y3 = Y3²
 *       Y3 = half*Y3
 *       X3 = T2²
 *       T1 = 2*T3
 *       X3 = X3-T1
 *       T1 = T3-X3
 *       T1 = T1*T2
 *       Y3 = T1-Y3
 */

TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(X<X1-T)            // T2 = X1-T1
	VL 16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL 0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L
	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T)            // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
	VL 48(P1ptr), Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VL 32(P1ptr), Y1L
	VPDI $0x4, Y1L, Y1L, Y1L
	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	VL 80(P1ptr), Y1 // Z1H
	VPDI $0x4, Y1, Y1, Y1
	VL 64(P1ptr), Y0 // Z1L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	// Z3 goes straight to res; TT0/TT1 are used as scratch for the swapped
	// copies so T1||T0 stays intact.
	VPDI $0x4, T1, T1, TT1
	VST TT1, 80(P3ptr)
	VPDI $0x4, T0, T0, TT0
	VST TT0, 64(P3ptr)

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
	VLR T0, X0
	VLR T1, X1
	VL 16(P1ptr), Y1
	VPDI $0x4, Y1, Y1, Y1
	VL 0(P1ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// HAL(Y3<T)              // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T-    // X3 = T2²
	VLR T2L, X0
	VLR T2H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256SqrInternal<>(SB)

	// ADD(T1<T3+T3)          // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
	VPDI $0x4, X3H, X3H, TT1
	VST TT1, 16(P3ptr)
	VPDI $0x4, X3L, X3L, TT0
	VST TT0, 0(P3ptr)

	// SUB(X<T3-X3)           // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	VPDI $0x4, Y3H, Y3H, Y3H
	VST Y3H, 48(P3ptr)
	VPDI $0x4, Y3L, Y3L, Y3L
	VST Y3L, 32(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// func p256PointAddAsm(res, in1, in2 *P256Point) int
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3
#define CPOOL R4
#define ISZERO R5
#define TRUE R6

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL V24
#define HH V25
#define RL V26
#define RH V27

// Temps for Sub and Add
#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
/*
 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A = X₁×Z₂²
 * B = Y₁×Z₂³
 * C = X₂×Z₁²-A
 * D = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H  = X2*T1
 * H  = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R  = Z1*T1
 * R  = Y2*R
 * R  = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	// SUB(H<H-T)            // H  = H-U1
	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	// SUB(R<T-S1)           // R  = R-S1
	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	// SUB(T<T-T2)           // X3 = X3-T2
	// ADD(X<U1+U1)          // T1 = 2*U1
	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	// SUB(Y<U1-T)           // Y3 = U1-X3
	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
 *
 * Implementation notes:
 *  - Coordinates are read at byte offsets 0 (X), 32 (Y) and 64 (Z) of in1 and
 *    in2 and written at the same offsets of res.
 *  - The int result (ret at 24(FP)) is 1 only when both H and R compare equal
 *    to 0 or to P, and 0 otherwise: it is first set from the H test and later
 *    AND-ed with the R test.
 *  - Z3 is stored before Y1/Y2 are read and X3 before Y3 is computed; at each
 *    of those points the overwritten coordinate is not read again.
 *  - Steps marked "X-" or "Y-" reuse the operand left in X1||X0 / Y1||Y0 by
 *    the previous step across a CALL.
 */
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, RL
	VLR T1, RH

	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	VL 16(P2ptr), X1 // X2H
	VPDI $0x4, X1, X1, X1
	VL 0(P2ptr), X0 // X2L
	VPDI $0x4, X0, X0, X0
	CALL p256MulInternal<>(SB)
	VLR T0, HL
	VLR T1, HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	VL 80(P2ptr), X1 // Z2H
	VPDI $0x4, X1, X1, X1
	VL 64(P2ptr), X0 // Z2L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	VL 16(P1ptr), X1 // X1H
	VPDI $0x4, X1, X1, X1
	VL 0(P1ptr), X0 // X1L
	VPDI $0x4, X0, X0, X0
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// SUB(H<H-T)            // H  = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	// (HL|HH and (HL^PL)|(HH^PH) are each compared with zero; MOVDEQ sets
	// ISZERO when the comparison reports equality)
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO HL, HH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX HL, PL, T1L
	VX HH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	MOVD ISZERO, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VL 80(P2ptr), Y1 // Z2H
	VPDI $0x4, Y1, Y1, Y1
	VL 64(P2ptr), Y0 // Z2L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VLR T0, X0
	VLR T1, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256MulInternal<>(SB)
	VPDI $0x4, T1, T1, TT1
	VST TT1, 80(P3ptr)
	VPDI $0x4, T0, T0, TT0
	VST TT0, 64(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	VL 48(P1ptr), X1
	VPDI $0x4, X1, X1, X1
	VL 32(P1ptr), X0
	VPDI $0x4, X0, X0, X0
	VLR S1L, Y0
	VLR S1H, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	VL 48(P2ptr), X1
	VPDI $0x4, X1, X1, X1
	VL 32(P2ptr), X0
	VPDI $0x4, X0, X0, X0
	VLR RL, Y0
	VLR RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R  = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// (same test as for H above; the outcome is AND-ed with the value stored
	// in ret by the H test)
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO RL, RH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX RL, PL, T1L
	VX RH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	AND ret+24(FP), ISZERO
	MOVD ISZERO, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VLR HL, X0
	VLR HH, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, T2L
	VLR T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VLR U1L, X0
	VLR U1H, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VLR RL, X0
	VLR RH, X1
	VLR RL, Y0
	VLR RH, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	VPDI $0x4, T1, T1, TT1
	VST TT1, 16(P3ptr)
	VPDI $0x4, T0, T0, TT0
	VST TT0, 0(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VLR RL, X0
	VLR RH, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VLR S1L, X0
	VLR S1H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	VPDI $0x4, T1, T1, T1
	VST T1, 48(P3ptr)
	VPDI $0x4, T0, T0, T0
	VST T0, 32(P3ptr)

	RET