// Source listing: github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/crypto/internal/nistec/p256_asm_s390x.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
#include "go_asm.h"

// Constant pools. All four symbols are file-local (<>) and are declared
// below with GLOBL flag 8 (RODATA in textflag.h).
//
//   p256ordK0 (4 bytes)   - word loaded into K0 by p256OrdMul.
//   p256ord   (32 bytes)  - loaded by p256OrdMul as M1 (offset 0) and M0 (offset 16).
//                           NOTE(review): presumably the P-256 group order; confirm.
//   p256      (96 bytes)  - P256 modulus, VPERM select masks, LE2BE permute mask.
//   p256mul   (160 bytes) - P256 modulus, VPERM select masks, (1*2^256)%P256.
DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
GLOBL p256ordK0<>(SB), 8, $4
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $96
GLOBL p256mul<>(SB), 8, $160

// The three entry points below tail-jump to p256BigToLittle: the same
// word-order reversal is used for both conversion directions and for both
// element types, so the arguments on the caller's frame are reused as is.

// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
	JMP ·p256BigToLittle(SB)

// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
	JMP ·p256BigToLittle(SB)

// ---------------------------------------
// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0
	JMP ·p256BigToLittle(SB)

// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Copies 32 bytes from in to res with the order of the four 8-byte words
// reversed: the two 16-byte halves are exchanged (T1H is loaded from offset 0
// and stored at offset 16, T1L the other way round) and VPDI $0x4 is applied
// to each half. Bytes inside each 8-byte word are not reordered here.
#define res_ptr R1
#define in_ptr R2
#define T1L V2
#define T1H V3

TEXT ·p256BigToLittle(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), in_ptr

	VL 0(in_ptr), T1H
	VL 16(in_ptr), T1L

	// VPDI with mask 4 and identical operands exchanges the two doublewords
	// of the register (see z/Architecture VECTOR PERMUTE DOUBLEWORD IMMEDIATE).
	VPDI $0x4, T1L, T1L, T1L
	VPDI $0x4, T1H, T1H, T1H

	VST T1L, 0(res_ptr)
	VST T1H, 16(res_ptr)
	RET

#undef res_ptr
#undef in_ptr
#undef T1L
#undef T1H

// ---------------------------------------
// iff cond == 1  val <- -val
// func p256NegCond(val *p256Element, cond int)
#define P1ptr R1
#define CPOOL R4

#define Y1L V0
#define Y1H V1
#define T1L V2
#define T1H V3

#define PL V30
#define PH V31

#define ZER V4
#define SEL1 V5
#define CAR1 V6
// p256NegCond body. val is updated in place: it is loaded, a second value
// T1H||T1L is computed from PH||PL and val with a borrow-propagating
// subtract chain (the negation announced in the header comment), and VSEL
// chooses between the original and the computed value under SEL1.
// SEL1 is the mask "cond == 0", so the original value is kept only for
// cond == 0; the selection involves no branch on cond.
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD val+0(FP), P1ptr

	// PL/PH come from offsets 16 and 0 of p256mul<>, the words tagged "P256".
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// Load val; VPDI $0x4 exchanges the two doublewords of each half.
	VL 16(P1ptr), Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VL 0(P1ptr), Y1L
	VPDI $0x4, Y1L, Y1L, Y1L

	// SEL1 <- per-doubleword mask of (cond == 0).
	VLREPG cond+8(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	// 256-bit subtraction split in two 128-bit steps; CAR1 carries the
	// borrow indication from the low half into the high half.
	VSCBIQ Y1L, PL, CAR1
	VSQ Y1L, PL, T1L
	VSBIQ PH, Y1H, CAR1, T1H

	VSEL Y1L, T1L, SEL1, Y1L
	VSEL Y1H, T1H, SEL1, Y1H

	// Undo the doubleword exchange and store back over val.
	VPDI $0x4, Y1H, Y1H, Y1H
	VST Y1H, 16(P1ptr)
	VPDI $0x4, Y1L, Y1L, Y1L
	VST Y1L, 0(P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef ZER
#undef SEL1
#undef CAR1

// ---------------------------------------
// if cond == 0 res <- b; else res <- a
// func p256MovCond(res, a, b *P256Point, cond int)
//
// Both 96-byte inputs (six 16-byte vectors each) are always loaded and all
// 96 bytes of res are always written; the choice is made by VSEL under the
// mask SEL1 = (cond == 0), without branching on cond.
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ZER V18
#define SEL1 V19
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	// SEL1 <- per-doubleword mask of (cond == 0).
	VLREPG cond+24(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	// a -> X1/Y1/Z1
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	VL 64(P1ptr), Z1H
	VL 80(P1ptr), Z1L

	// b -> X2/Y2/Z2
	VL 0(P2ptr), X2H
	VL 16(P2ptr), X2L
	VL 32(P2ptr), Y2H
	VL 48(P2ptr), Y2L
	VL 64(P2ptr), Z2H
	VL 80(P2ptr), Z2L

	// Per the header comment: b's bits where cond == 0, a's bits otherwise.
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ZER
#undef SEL1

// ---------------------------------------
// Constant time table access
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(res *P256Point, table *p256Table, idx int)
//
// NOTE(review): the "1 to 15" above looks stale; the loop below compares idx
// against the values 1..16 (COUNT starts at 1 and the loop exits at 17).
//
// Every one of the 16 entries (96 bytes each) is loaded on every call; the
// entry whose 1-based position equals idx is merged into the accumulators
// with VSEL. The accumulators start at zero, so when no position matches
// (e.g. idx == 0) res is written as all zero bytes.
#define P3ptr R1
#define P1ptr R2
#define COUNT R4

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	// Only the byte at frame offset 16+7 (the last byte of the 8-byte idx
	// argument) is read and replicated into every byte of IDX.
	VLREPB idx+(16+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // running position counter, every byte = 1
	MOVD $1, COUNT

	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	// Both operands hold one byte value replicated 16 times, so the
	// doubleword compare yields an all-or-nothing mask.
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VAB SEL2, ONE, SEL2 // position counter += 1 (bytewise)
	ADDW $1, COUNT
	ADD $96, P1ptr // next 96-byte table entry
	CMPW COUNT, $17
	BLT loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// ---------------------------------------

// func p256FromMont(res, in *p256Element)
//
// Structure of the body: load in (with the doubleword exchange used by the
// other routines in this file), run four identical reduction rounds, then
// subtract PH||PL once and use VSEL to keep either the subtracted or the
// unsubtracted value, and store to res.
// Each round builds RED2||RED1 from the low doubleword of T0 with the VPERM
// masks at p256<>+0x40 (SEL1) and p256<>+0x30 (SEL2), moves T2||T1||T0 down
// by 8 bytes with VSLDB $8, and adds RED2||RED1 with carry into T2.
// NOTE(review): presumably each round is one 64-bit Montgomery reduction
// step (four rounds = division by 2^256 mod P256); confirm against the Go
// caller's contract.
#define res_ptr R1
#define x_ptr R2
#define CPOOL R4

#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4

#define ZER V6
#define SEL1 V7
#define SEL2 V8
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL V13
#define PH V14

TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	VZERO T2
	VZERO ZER
	MOVD $p256<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH
	VL 48(CPOOL), SEL2
	VL 64(CPOOL), SEL1

	VL (0*16)(x_ptr), T0
	VPDI $0x4, T0, T0, T0
	VL (1*16)(x_ptr), T1
	VPDI $0x4, T1, T1, T1

	// First round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2

	// ---------------------------------------------------

	// Trial subtraction of PH||PL from T2||T1||T0; the borrow chain ends in
	// T2, which is then used as the VSEL mask.
	VSCBIQ PL, T0, CAR1
	VSQ PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ T1, PH, CAR1, TT1
	VSBIQ T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VPDI $0x4, T0, T0, TT0
	VST TT0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, TT1
	VST TT1, (1*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH

// Constant time table access
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256SelectBase(point *p256Point, table []p256Point, idx int)
// new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// NOTE(review): the "1 to 15" above looks stale; the loop below compares idx
// against the values 1..64 (COUNT starts at 1 and the loop exits at 65) and
// steps through the table in 64-byte entries, i.e. it reads 64*64 = 4096
// bytes starting at table. Confirm that p256AffineTable is at least that
// large.
// NOTE(review): LE2BE is loaded from p256<>+0x50 but not used by any later
// instruction in this function, and it is missing from the #undef list.
//
// As in p256Select, every entry is loaded on every call and the match is
// merged with VSEL into zero-initialised accumulators, so a non-matching idx
// (e.g. 0) produces an all-zero res.

#define P3ptr R1
#define P1ptr R2
#define COUNT R4
#define CPOOL R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define LE2BE V12

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21

TEXT ·p256SelectAffine(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $p256<>+0x00(SB), CPOOL
	// Only the last byte of the 8-byte idx argument is read and replicated.
	VLREPB idx+(16+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // running position counter, every byte = 1
	MOVD $1, COUNT
	VL 80(CPOOL), LE2BE

	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L

	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H

	VAB SEL2, ONE, SEL2 // position counter += 1 (bytewise)
	ADDW $1, COUNT
	ADD $64, P1ptr // next 64-byte table entry
	CMPW COUNT, $65
	BLT loop_select
	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
#undef CPOOL

// ---------------------------------------

// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// NOTE(review): the TEXT symbol below carries the <> suffix, which makes it
// file-local in the Go assembler; confirm whether any Go declaration is
// expected to link against it on this architecture.
//
// Structure of the body: X1||X0 <- in1, Y1||Y0 <- in2, M1||M0 <- p256ord<>,
// K0 <- p256ordK0<>. Eight rounds follow, one per 32-bit word of in2
// (VREPF $3..$0 of Y0, then $3..$0 of Y1). Each round
//   - multiplies X by the replicated word YDIG and adds the running T,
//   - derives MK0 from the low product word times K0,
//   - multiplies M by MK0 and adds it in,
//   - moves the result down by 12 bytes (VSLDB $12) and folds the high
//     product halves back in with carry, leaving T2||T1||T0.
// A final trial subtraction of M1||M0 with VSEL picks the stored result.
// NOTE(review): this has the shape of a word-serial Montgomery
// multiplication modulo the value stored in p256ord<>; confirm.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M0 V4
#define M1 V5
#define T0 V6
#define T1 V7
#define T2 V8
#define YDIG V9

#define ADD1 V16
#define ADD1H V17
#define ADD2 V18
#define ADD2H V19
#define RED1 V20
#define RED1H V21
#define RED2 V22
#define RED2H V23
#define CAR1 V24
#define CAR1M V25

#define MK0 V30
#define K0 V31
TEXT ·p256OrdMul<>(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	VZERO T2
	MOVD $p256ordK0<>+0x00(SB), R4

	// The next three directives are the hand-assembled bytes of the
	// instruction named in the comment below.
	// VLEF $3, 0(R4), K0
	WORD $0xE7F40000
	BYTE $0x38
	BYTE $0x03
	MOVD $p256ord<>+0x00(SB), R4
	VL 16(R4), M0
	VL 0(R4), M1

	VL (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1
	VL (0*16)(y_ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	VL (1*16)(y_ptr), Y1
	VPDI $0x4, Y1, Y1, Y1

	// ---------------------------------------------------------------------------/
	// Round 1 (word 3 of Y0). There is no running T yet, so the plain
	// multiplies VMLF/VMLHF are used instead of the multiply-and-add forms.
	VREPF $3, Y0, YDIG
	VMLF X0, YDIG, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMLF X1, YDIG, ADD2
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	/* *
	 * ---+--------+--------+
	 *  T2|   T1   |   T0   |
	 * ---+--------+--------+
	 *           *(add)*
	 *    +--------+--------+
	 *    |   X1   |   X0   |
	 *    +--------+--------+
	 *           *(mul)*
	 *    +--------+--------+
	 *    |  YDIG  |  YDIG  |
	 *    +--------+--------+
	 *           *(add)*
	 *    +--------+--------+
	 *    |   M1   |   M0   |
	 *    +--------+--------+
	 *           *(mul)*
	 *    +--------+--------+
	 *    |  MK0   |  MK0   |
	 *    +--------+--------+
	 *
	 *   ---------------------
	 *
	 *    +--------+--------+
	 *    |  ADD2  |  ADD1  |
	 *    +--------+--------+
	 *  +--------+--------+
	 *  | ADD2H  | ADD1H  |
	 *  +--------+--------+
	 *    +--------+--------+
	 *    |  RED2  |  RED1  |
	 *    +--------+--------+
	 *  +--------+--------+
	 *  | RED2H  | RED1H  |
	 *  +--------+--------+
	 */
	// Round 2 (word 2 of Y0). From here on every round is identical except
	// for the VREPF source word.
	VREPF $2, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 3 (word 1 of Y0)
	VREPF $1, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 4 (word 0 of Y0)
	VREPF $0, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 5 (word 3 of Y1)
	VREPF $3, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 6 (word 2 of Y1)
	VREPF $2, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 7 (word 1 of Y1)
	VREPF $1, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------
	// Round 8 (word 0 of Y1)
	VREPF $0, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0

	// << ready for next MK0

	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2

	// ---------------------------------------------------

	// Trial subtraction of M1||M0 from T2||T1||T0; RED1 is reused as a zero
	// register and the final borrow result in T2 becomes the VSEL mask.
	VZERO RED1
	VSCBIQ M0, T0, CAR1
	VSQ M0, T0, ADD1
	VSBCBIQ T1, M1, CAR1, CAR1M
	VSBIQ T1, M1, CAR1, ADD2
	VSBIQ T2, RED1, CAR1M, T2

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL T0, ADD1, T2, T0
	VSEL T1, ADD2, T2, T1

	VPDI $0x4, T0, T0, T0
	VST T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST T1, (1*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0

// ---------------------------------------
// p256MulInternal
// V0-V3,V30,V31 - Not Modified
// V4-V15 - Volatile
//
// Register aliases for p256MulInternal. Several names deliberately share a
// vector register ("overloaded"); the trailing comments list the other names
// mapped to the same register, as established by the #defines themselves.

#define CPOOL R4

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4
#define T1 V5
#define P0 V30 // Not modified
#define P1 V31 // Not modified

// Temporaries
#define YDIG V6 // Overloaded with CAR1, ZER, SEL4
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3 V9 // Overloaded with SEL2,SEL5
#define ADD4 V10 // Overloaded with SEL3,SEL6
#define RED1 V11 // Overloaded with CAR2
#define RED2 V12
#define RED3 V13 // Overloaded with SEL1
#define T2 V14
// Overloaded temporaries
#define ADD1 V4 // Overloaded with T0
#define ADD2 V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER V6 // Overloaded with YDIG, CAR1, SEL4
#define CAR1 V6 // Overloaded with YDIG, ZER, SEL4
#define CAR2 V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9 // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6 // Overloaded with YDIG,CAR1,ZER
#define SEL5 V9 // Overloaded with ADD3,SEL2
ADD3,SEL2 968 #define SEL6 V10 // Overloaded with ADD4,SEL3 969 970 /* * 971 * To follow the flow of bits, for your own sanity a stiff drink, need you shall. 972 * Of a single round, a 'helpful' picture, here is. Meaning, column position has. 973 * With you, SIMD be... 974 * 975 * +--------+--------+ 976 * +--------| RED2 | RED1 | 977 * | +--------+--------+ 978 * | ---+--------+--------+ 979 * | +---- T2| T1 | T0 |--+ 980 * | | ---+--------+--------+ | 981 * | | | 982 * | | ======================= | 983 * | | | 984 * | | +--------+--------+<-+ 985 * | +-------| ADD2 | ADD1 |--|-----+ 986 * | | +--------+--------+ | | 987 * | | +--------+--------+<---+ | 988 * | | | ADD2H | ADD1H |--+ | 989 * | | +--------+--------+ | | 990 * | | +--------+--------+<-+ | 991 * | | | ADD4 | ADD3 |--|-+ | 992 * | | +--------+--------+ | | | 993 * | | +--------+--------+<---+ | | 994 * | | | ADD4H | ADD3H |------|-+ |(+vzero) 995 * | | +--------+--------+ | | V 996 * | | ------------------------ | | +--------+ 997 * | | | | | RED3 | [d0 0 0 d0] 998 * | | | | +--------+ 999 * | +---->+--------+--------+ | | | 1000 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | | 1001 * | +--------+--------+ | | | 1002 * +---->---+--------+--------+ | | | 1003 * T2| T1 | T0 |----+ | | 1004 * ---+--------+--------+ | | | 1005 * ---+--------+--------+<---+ | | 1006 * +--- T2| T1 | T0 |----------+ 1007 * | ---+--------+--------+ | | 1008 * | +--------+--------+<-------------+ 1009 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 1010 * | +--------+--------+ | | | 1011 * | +--------+<----------------------+ 1012 * | | RED3 |--------------+ | [0 0 d1 d0] 1013 * | +--------+ | | 1014 * +--->+--------+--------+ | | 1015 * | T1 | T0 |--------+ 1016 * +--------+--------+ | | 1017 * --------------------------- | | 1018 * | | 1019 * +--------+--------+<----+ | 1020 * | RED2 | RED1 | | 1021 * +--------+--------+ | 1022 * ---+--------+--------+<-------+ 1023 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 
1024 * ---+--------+--------+ 1025 * 1026 * *Mi obra de arte de siglo XXI @vpaprots 1027 * 1028 * 1029 * First group is special, doesn't get the two inputs: 1030 * +--------+--------+<-+ 1031 * +-------| ADD2 | ADD1 |--|-----+ 1032 * | +--------+--------+ | | 1033 * | +--------+--------+<---+ | 1034 * | | ADD2H | ADD1H |--+ | 1035 * | +--------+--------+ | | 1036 * | +--------+--------+<-+ | 1037 * | | ADD4 | ADD3 |--|-+ | 1038 * | +--------+--------+ | | | 1039 * | +--------+--------+<---+ | | 1040 * | | ADD4H | ADD3H |------|-+ |(+vzero) 1041 * | +--------+--------+ | | V 1042 * | ------------------------ | | +--------+ 1043 * | | | | RED3 | [d0 0 0 d0] 1044 * | | | +--------+ 1045 * +---->+--------+--------+ | | | 1046 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | | 1047 * +--------+--------+ | | | 1048 * ---+--------+--------+<---+ | | 1049 * +--- T2| T1 | T0 |----------+ 1050 * | ---+--------+--------+ | | 1051 * | +--------+--------+<-------------+ 1052 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 1053 * | +--------+--------+ | | | 1054 * | +--------+<----------------------+ 1055 * | | RED3 |--------------+ | [0 0 d1 d0] 1056 * | +--------+ | | 1057 * +--->+--------+--------+ | | 1058 * | T1 | T0 |--------+ 1059 * +--------+--------+ | | 1060 * --------------------------- | | 1061 * | | 1062 * +--------+--------+<----+ | 1063 * | RED2 | RED1 | | 1064 * +--------+--------+ | 1065 * ---+--------+--------+<-------+ 1066 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 
1067 * ---+--------+--------+ 1068 * 1069 * Last 'group' needs to RED2||RED1 shifted less 1070 */ 1071 TEXT p256MulInternal<>(SB), NOSPLIT, $0-0 1072 VL 32(CPOOL), SEL1 1073 VL 48(CPOOL), SEL2 1074 VL 64(CPOOL), SEL3 1075 VL 80(CPOOL), SEL4 1076 1077 // --------------------------------------------------- 1078 1079 VREPF $3, Y0, YDIG 1080 VMLHF X0, YDIG, ADD1H 1081 VMLHF X1, YDIG, ADD2H 1082 VMLF X0, YDIG, ADD1 1083 VMLF X1, YDIG, ADD2 1084 1085 VREPF $2, Y0, YDIG 1086 VMALF X0, YDIG, ADD1H, ADD3 1087 VMALF X1, YDIG, ADD2H, ADD4 1088 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free 1089 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free 1090 1091 VZERO ZER 1092 VL 32(CPOOL), SEL1 1093 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1094 1095 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free 1096 VSLDB $12, ZER, ADD2, T1 // ADD2 Free 1097 1098 VACCQ T0, ADD3, CAR1 1099 VAQ T0, ADD3, T0 // ADD3 Free 1100 VACCCQ T1, ADD4, CAR1, T2 1101 VACQ T1, ADD4, CAR1, T1 // ADD4 Free 1102 1103 VL 48(CPOOL), SEL2 1104 VL 64(CPOOL), SEL3 1105 VL 80(CPOOL), SEL4 1106 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1107 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1108 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1109 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1110 1111 VSLDB $12, T1, T0, T0 1112 VSLDB $12, T2, T1, T1 1113 1114 VACCQ T0, ADD3H, CAR1 1115 VAQ T0, ADD3H, T0 1116 VACCCQ T1, ADD4H, CAR1, T2 1117 VACQ T1, ADD4H, CAR1, T1 1118 1119 // --------------------------------------------------- 1120 1121 VREPF $1, Y0, YDIG 1122 VMALHF X0, YDIG, T0, ADD1H 1123 VMALHF X1, YDIG, T1, ADD2H 1124 VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1 1125 VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2 1126 1127 VREPF $0, Y0, YDIG 1128 VMALF X0, YDIG, ADD1H, ADD3 1129 VMALF X1, YDIG, ADD2H, ADD4 1130 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H 1131 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER 1132 1133 VZERO ZER 1134 VL 32(CPOOL), SEL1 1135 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1136 1137 VSLDB $12, 
ADD2, ADD1, T0 // ADD1 Free->T0 1138 VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free 1139 1140 VACCQ T0, RED1, CAR1 1141 VAQ T0, RED1, T0 1142 VACCCQ T1, RED2, CAR1, T2 1143 VACQ T1, RED2, CAR1, T1 1144 1145 VACCQ T0, ADD3, CAR1 1146 VAQ T0, ADD3, T0 1147 VACCCQ T1, ADD4, CAR1, CAR2 1148 VACQ T1, ADD4, CAR1, T1 1149 VAQ T2, CAR2, T2 1150 1151 VL 48(CPOOL), SEL2 1152 VL 64(CPOOL), SEL3 1153 VL 80(CPOOL), SEL4 1154 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1155 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1156 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1157 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1158 1159 VSLDB $12, T1, T0, T0 1160 VSLDB $12, T2, T1, T1 1161 1162 VACCQ T0, ADD3H, CAR1 1163 VAQ T0, ADD3H, T0 1164 VACCCQ T1, ADD4H, CAR1, T2 1165 VACQ T1, ADD4H, CAR1, T1 1166 1167 // --------------------------------------------------- 1168 1169 VREPF $3, Y1, YDIG 1170 VMALHF X0, YDIG, T0, ADD1H 1171 VMALHF X1, YDIG, T1, ADD2H 1172 VMALF X0, YDIG, T0, ADD1 1173 VMALF X1, YDIG, T1, ADD2 1174 1175 VREPF $2, Y1, YDIG 1176 VMALF X0, YDIG, ADD1H, ADD3 1177 VMALF X1, YDIG, ADD2H, ADD4 1178 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free 1179 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free 1180 1181 VZERO ZER 1182 VL 32(CPOOL), SEL1 1183 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1184 1185 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free 1186 VSLDB $12, T2, ADD2, T1 // ADD2 Free 1187 1188 VACCQ T0, RED1, CAR1 1189 VAQ T0, RED1, T0 1190 VACCCQ T1, RED2, CAR1, T2 1191 VACQ T1, RED2, CAR1, T1 1192 1193 VACCQ T0, ADD3, CAR1 1194 VAQ T0, ADD3, T0 1195 VACCCQ T1, ADD4, CAR1, CAR2 1196 VACQ T1, ADD4, CAR1, T1 1197 VAQ T2, CAR2, T2 1198 1199 VL 48(CPOOL), SEL2 1200 VL 64(CPOOL), SEL3 1201 VL 80(CPOOL), SEL4 1202 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1203 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1204 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1205 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1206 1207 VSLDB $12, T1, T0, T0 1208 VSLDB $12, T2, T1, T1 1209 1210 VACCQ T0, 
ADD3H, CAR1 1211 VAQ T0, ADD3H, T0 1212 VACCCQ T1, ADD4H, CAR1, T2 1213 VACQ T1, ADD4H, CAR1, T1 1214 1215 // --------------------------------------------------- 1216 1217 VREPF $1, Y1, YDIG 1218 VMALHF X0, YDIG, T0, ADD1H 1219 VMALHF X1, YDIG, T1, ADD2H 1220 VMALF X0, YDIG, T0, ADD1 1221 VMALF X1, YDIG, T1, ADD2 1222 1223 VREPF $0, Y1, YDIG 1224 VMALF X0, YDIG, ADD1H, ADD3 1225 VMALF X1, YDIG, ADD2H, ADD4 1226 VMALHF X0, YDIG, ADD1H, ADD3H 1227 VMALHF X1, YDIG, ADD2H, ADD4H 1228 1229 VZERO ZER 1230 VL 32(CPOOL), SEL1 1231 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1232 1233 VSLDB $12, ADD2, ADD1, T0 1234 VSLDB $12, T2, ADD2, T1 1235 1236 VACCQ T0, RED1, CAR1 1237 VAQ T0, RED1, T0 1238 VACCCQ T1, RED2, CAR1, T2 1239 VACQ T1, RED2, CAR1, T1 1240 1241 VACCQ T0, ADD3, CAR1 1242 VAQ T0, ADD3, T0 1243 VACCCQ T1, ADD4, CAR1, CAR2 1244 VACQ T1, ADD4, CAR1, T1 1245 VAQ T2, CAR2, T2 1246 1247 VL 96(CPOOL), SEL5 1248 VL 112(CPOOL), SEL6 1249 VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0] 1250 VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0] 1251 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 1252 1253 VSLDB $12, T1, T0, T0 1254 VSLDB $12, T2, T1, T1 1255 1256 VACCQ T0, ADD3H, CAR1 1257 VAQ T0, ADD3H, T0 1258 VACCCQ T1, ADD4H, CAR1, T2 1259 VACQ T1, ADD4H, CAR1, T1 1260 1261 VACCQ T0, RED1, CAR1 1262 VAQ T0, RED1, T0 1263 VACCCQ T1, RED2, CAR1, CAR2 1264 VACQ T1, RED2, CAR1, T1 1265 VAQ T2, CAR2, T2 1266 1267 // --------------------------------------------------- 1268 1269 VZERO RED3 1270 VSCBIQ P0, T0, CAR1 1271 VSQ P0, T0, ADD1H 1272 VSBCBIQ T1, P1, CAR1, CAR2 1273 VSBIQ T1, P1, CAR1, ADD2H 1274 VSBIQ T2, RED3, CAR2, T2 1275 1276 // what output to use, ADD2H||ADD1H or T1||T0? 
VSEL T0, ADD1H, T2, T0
	VSEL T1, ADD2H, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

// ---------------------------------------

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3

// p256SqrInternal squares the value held in X1||X0: it copies X into the Y
// operand registers and then branches (BR, not CALL) to p256MulInternal, so
// the product is produced by p256MulInternal and its RET returns straight to
// this routine's caller. Callers in this file read the result from V4/V5
// (their T0/T1).
TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
	VLR X0, Y0
	VLR X1, Y1
	BR p256MulInternal<>(SB)

#undef X0
#undef X1
#undef Y0
#undef Y1

// p256SubInternal(T1, T0, X1, X0, Y1, Y0) computes
//	T1||T0 = (X1||X0) - (Y1||Y0)  modulo PH||PL.
// It forms the raw 256-bit difference, turns the final borrow indication into
// a full-width mask (SEL1 = 0 - indication), also forms difference + P in
// TT1||TT0, and lets VSEL keep the raw difference when no borrow occurred and
// difference + P otherwise.
// ZER, CAR1, SEL1, TT0, TT1, PL and PH are not parameters; they must be
// #defined by the function that expands the macro.
// NOTE(review): a fully reduced result presumably requires both inputs < P.
// The last line of the macro ends in a backslash and is terminated by the
// blank line that follows it; that blank line must be kept.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VZERO ZER \
	VSCBIQ Y0, X0, CAR1 \
	VSQ Y0, X0, T0 \
	VSBCBIQ X1, Y1, CAR1, SEL1 \
	VSBIQ X1, Y1, CAR1, T1 \
	VSQ SEL1, ZER, SEL1 \
	\
	VACCQ T0, PL, CAR1 \
	VAQ T0, PL, TT0 \
	VACQ T1, PH, CAR1, TT1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1 \

// p256AddInternal(T1, T0, X1, X0, Y1, Y0) computes
//	T1||T0 = (X1||X0) + (Y1||Y0)  modulo PH||PL.
// A 256-bit add leaves its carry-out in T2; a trial subtraction of P is then
// formed in TT1||TT0. SEL1 is derived from T2 and the trial subtraction's
// borrow indication (CAR2) and drives the VSEL between the sum and sum - P.
// ZER, CAR1, CAR2, SEL1, T2, TT0, TT1, PL and PH must be #defined by the
// function that expands the macro.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VACCQ X0, Y0, CAR1 \
	VAQ X0, Y0, T0 \
	VACCCQ X1, Y1, CAR1, T2 \
	VACQ X1, Y1, CAR1, T1 \
	\
	VZERO ZER \
	VSCBIQ PL, T0, CAR1 \
	VSQ PL, T0, TT0 \
	VSBCBIQ T1, PH, CAR1, CAR2 \
	VSBIQ T1, PH, CAR1, TT1 \
	VSBIQ T2, ZER, CAR2, SEL1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1

// p256HalfInternal(T1, T0, X1, X0) computes T1||T0 = (X1||X0) / 2 modulo
// PH||PL. SEL1 is derived from X0 (VSBIQ with X0 as the borrow operand) and
// selects between X and X + P (carry-out kept in T2) — presumably so that the
// value being halved is even. The selected value is then shifted right by one
// bit, with the bit shifted out of each higher part ORed into the top bit of
// the part below it.
// ZER, CAR1, SEL1, T2, TT0, TT1, PL and PH must be #defined by the function
// that expands the macro.
#define p256HalfInternal(T1, T0, X1, X0) \
	VZERO ZER \
	VSBIQ ZER, ZER, X0, SEL1 \
	\
	VACCQ X0, PL, CAR1 \
	VAQ X0, PL, T0 \
	VACCCQ X1, PH, CAR1, T2 \
	VACQ X1, PH, CAR1, T1 \
	\
	VSEL X0, T0, SEL1, T0 \
	VSEL X1, T1, SEL1, T1 \
VSEL ZER, T2, SEL1, T2 \
	\
	VSLDB $15, T2, ZER, TT1 \
	VSLDB $15, T1, ZER, TT0 \
	VREPIB $1, SEL1 \
	VSRL SEL1, T0, T0 \
	VSRL SEL1, T1, T1 \
	VREPIB $7, SEL1 \
	VSL SEL1, TT0, TT0 \
	VSL SEL1, TT1, TT1 \
	VO T0, TT0, T0 \
	VO T1, TT1, T1

// ---------------------------------------
// func p256Mul(res, in1, in2 *p256Element)
//
// p256Mul loads in1 and in2, multiplies them with p256MulInternal and stores
// the product to res. Every 16-byte half goes through VPDI $0x4 (swap of the
// two doublewords) after it is loaded and again before it is stored.
// NOTE(review): the swap presumably converts between the in-memory limb order
// of p256Element and the order p256MulInternal expects — confirm against the
// Go-side type.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define CPOOL R4

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31
TEXT ·p256Mul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	VL (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1
	VL (0*16)(y_ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	VL (1*16)(y_ptr), Y1
	VPDI $0x4, Y1, Y1, Y1

	// The modulus comes from the start of the p256mul<> pool:
	// P0 from bytes 16..31, P1 from bytes 0..15.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), P0
	VL 0(CPOOL), P1

	CALL p256MulInternal<>(SB)

	// Result is in T1||T0; swap back and store.
	VPDI $0x4, T0, T0, T0
	VST T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST T1, (1*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

// ---------------------------------------
// func p256Sqr(res, in *p256Element, n int)
//
// p256Sqr squares in repeatedly and stores the final value to res. The loop
// is bottom-tested: COUNT is incremented and compared with N only after a
// squaring has been done, so one squaring is performed even when n <= 0.
// NOTE(review): callers are presumably expected to pass n >= 1 — confirm.
// y_ptr is defined (and undefined) below but is not referenced by the body.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define CPOOL R4
#define COUNT R5
#define N R6

// Parameters
#define X0 V0
#define X1 V1
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31
TEXT ·p256Sqr(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	VL (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1

	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $0, COUNT
	MOVD n+16(FP), N
	VL 16(CPOOL), P0
	VL 0(CPOOL), P1

loop:
	CALL p256SqrInternal<>(SB)
	// Feed the square back in as the next input.
	VLR T0, X0
	VLR T1, X1
	// COUNT and N are updated/compared as 32-bit values (ADDW/CMPW).
	ADDW $1, COUNT
	CMPW COUNT, N
	BLT loop

	VPDI $0x4, T0, T0, T0
	VST T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST T1, (1*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef COUNT
#undef N

#undef X0
#undef X1
#undef T0
#undef T1
#undef P0
#undef P1

// Point add with P2 being affine point
// If sign == 1 -> P2 = -P2
// If sel == 0 -> P3 = P1
// if zero == 0 -> P3 = P2
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3
#define CPOOL R4

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects
// These alias registers already named above (X1L and X2L are both V0 = X0,
// Z1L and Z2L are both V4 = T0, X3/Y3 are T1/T3), so each name is only
// meaningful between its load and the next write to the underlying register.
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V28
#define Z3H V29

#define ZER V6
#define SEL1
V7
#define CAR1 V8
#define CAR2 V9
/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * T1 = Z1²
 * T2 = T1*Z1
 * T1 = T1*X2
 * T2 = T2*Y2
 * T1 = T1-X1
 * T2 = T2-Y1
 * Z3 = Z1*T1
 * T3 = T1²
 * T4 = T3*T1
 * T3 = T3*X1
 * T1 = 2*T3
 * X3 = T2²
 * X3 = X3-T1
 * X3 = X3-T4
 * T3 = T3-X3
 * T3 = T3*T2
 * T4 = T4*Y1
 * Y3 = T3-T4

 * Three operand formulas, but with MulInternal X,Y used to store temps
X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4

 */
// p256PointAddAffineAsm follows the register plan in the comment above. The
// sum is always computed; the sign, sel and zero arguments are applied with
// VSEL masks rather than branches, and res is written only at the very end,
// after the last load from in1 and in2.
// In the plan, "X-"/"Y-" mean the operand register is reused as left by the
// previous step.
// NOTE(review): that reuse relies on p256MulInternal leaving X0/X1 and Y0/Y1
// intact — its full body is outside this view, confirm there.
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// if (sign == 1) {
	// Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
	// }

	VL 48(P2ptr), Y2H
	VPDI $0x4, Y2H, Y2H, Y2H
	VL 32(P2ptr), Y2L
	VPDI $0x4, Y2L, Y2L, Y2L

	// SEL1 becomes all-ones when sign == 0 (VCEQG against zero), so the VSELs
	// below keep Y2 in that case and take P-Y2 for any non-zero sign.
	VLREPG sign+24(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	// T1H||T1L = P - Y2, computed unconditionally.
	// NOTE(review): the difference is not reduced afterwards, so Y2 == 0 would
	// give P rather than 0 as in the pseudo-code above — confirm this cannot
	// occur or is tolerated downstream.
	VSCBIQ Y2L, PL, CAR1
	VSQ Y2L, PL, T1L
	VSBIQ PH, Y2H, CAR1, T1H

	VSEL Y2L, T1L, SEL1, Y2L
	VSEL Y2H, T1H, SEL1, Y2H

	/* *
	 * Three operand formula:
	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
	 */
	// X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
	VLR T0, X0
	VLR T1, X1
	CALL p256MulInternal<>(SB)
	VLR T0, T2L
	VLR T1, T2H

	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
	VL 16(P2ptr), Y1 // X2H
	VPDI $0x4, Y1, Y1, Y1
	VL 0(P2ptr), Y0 // X2L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T1L
	VLR T1, T1H

	// X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
	VLR T2L, X0
	VLR T2H, X1
	VLR Y2L, Y0
	VLR Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
	VL 48(P1ptr), Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VL 32(P1ptr), Y1L
	VPDI $0x4, Y1L, Y1L, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
	VL 16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL 0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	CALL p256MulInternal<>(SB)

	// Z3 is kept in registers (not stored yet) so that the sel/zero selects
	// at the end can still replace it.
	// VST T1, 64(P3ptr)
	// VST T0, 80(P3ptr)
	VLR T0, Z3L
	VLR T1, Z3H

	// X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
	VLR Y0, X0
	VLR Y1, X1
	CALL p256SqrInternal<>(SB)
	VLR T0, X0
	VLR T1, X1

	// X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
	CALL p256MulInternal<>(SB)
	VLR T0, T4L
	VLR T1, T4H

	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
	VL 16(P1ptr), Y1 // X1H
	VPDI $0x4, Y1, Y1, Y1
	VL 0(P1ptr), Y0 // X1L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
	VLR T2L, X0
	VLR T2H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VLR T0, X3L
	VLR T1, X3H

	// SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
	VLR T4L, X0
	VLR T4H, X1
	VL 48(P1ptr), Y1 // Y1H
	VPDI $0x4, Y1, Y1, Y1
	VL 32(P1ptr), Y0 // Y1L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// if (sel == 0) {
	// copy(P3.x[:], X1)
	// copy(P3.y[:], Y1)
	// copy(P3.z[:], Z1)
	// }

	VL 16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL 0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L

	// Y1 already loaded, left over from addition
	// (Y1L/Y1H are V2/V3, i.e. the Y0/Y1 operand loaded for T4 = T4*Y1 above.)
	VL 80(P1ptr), Z1H
	VPDI $0x4, Z1H, Z1H, Z1H
	VL 64(P1ptr), Z1L
	VPDI $0x4, Z1L, Z1L, Z1L

	// Mask is all-ones when sel == 0: P1's coordinates replace the sum.
	VLREPG sel+32(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	VSEL X1L, X3L, SEL1, X3L
	VSEL X1H, X3H, SEL1, X3H
	VSEL Y1L, Y3L, SEL1, Y3L
	VSEL Y1H, Y3H, SEL1, Y3H
	VSEL Z1L, Z3L, SEL1, Z3L
	VSEL Z1H, Z3H, SEL1, Z3H

	// if (zero == 0) {
	// copy(P3.x[:], X2)
	// copy(P3.y[:], Y2)
	// copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	// 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
	// }
	VL 16(P2ptr), X2H
	VPDI $0x4, X2H, X2H, X2H
	VL 0(P2ptr), X2L
	VPDI $0x4, X2L, X2L, X2L

	// Y2 already loaded
	// (Y2L/Y2H = V15/V16 still hold the, possibly negated, Y2 from the top.)
	// Z2 is the pool constant at p256mul<>+0x80..0x9f, labelled
	// "(1*2^256)%P256" in the DATA section; it is loaded without a VPDI swap.
	VL 128(CPOOL), Z2H
	VL 144(CPOOL), Z2L

	// Mask is all-ones when zero == 0: P2 (with Z2 above) replaces the result.
	VLREPG zero+40(FP), SEL1
	VZERO ZER
	VCEQG SEL1, ZER, SEL1

	VSEL X2L, X3L, SEL1, X3L
	VSEL X2H, X3H, SEL1, X3H
	VSEL Y2L, Y3L, SEL1, Y3L
	VSEL Y2H, Y3H, SEL1, Y3H
	VSEL Z2L, Z3L, SEL1, Z3L
	VSEL Z2H, Z3H, SEL1, Z3H

	// All done, store out the result!!!
	VPDI $0x4, X3H, X3H, X3H
	VST X3H, 16(P3ptr)
	VPDI $0x4, X3L, X3L, X3L
	VST X3L, 0(P3ptr)
	VPDI $0x4, Y3H, Y3H, Y3H
	VST Y3H, 48(P3ptr)
	VPDI $0x4, Y3L, Y3L, Y3L
	VST Y3L, 32(P3ptr)
	VPDI $0x4, Z3H, Z3H, Z3H
	VST Z3H, 80(P3ptr)
	VPDI $0x4, Z3L, Z3L, Z3L
	VST Z3L, 64(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// func p256PointDoubleAsm(res, in *P256Point)
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
#define
P3ptr R1
#define P1ptr R2
#define CPOOL R4

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

// Z1L/Z1H are defined here but never referenced by the function body (Z1 is
// loaded straight into the X/Y operand registers), which is why Z1H sharing
// V11 with TT0 below is harmless.
#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Z3L/Z3H share V23/V24 with T3L/T3H and are likewise not referenced by the
// body; Z3 is stored directly from T1||T0 via TT1/TT0.
#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
/*
 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
 * B = 2Y₁
 * Z₃ = B×Z₁
 * C = B²
 * D = C×X₁
 * X₃ = A²-2D
 * Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 * T1 = Z1²
 * T2 = X1-T1
 * T1 = X1+T1
 * T2 = T2*T1
 * T2 = 3*T2
 * Y3 = 2*Y1
 * Z3 = Y3*Z1
 * Y3 = Y3²
 * T3 = Y3*X1
 * Y3 = Y3²
 * Y3 = half*Y3
 * X3 = T2²
 * T1 = 2*T3
 * X3 = X3-T1
 * T1 = T3-X3
 * T1 = T1*T2
 * Y3 = T1-Y3
 */

// p256PointDoubleAsm follows the three-operand formula above. In the step
// comments, "X-"/"Y-" mean the operand register is reused as left by the
// previous step.
// NOTE(review): that reuse relies on p256MulInternal preserving X0/X1 and
// Y0/Y1 — its full body is outside this view, confirm there.
// Z3 and X3 are stored to res as soon as they are available; in both cases
// the offsets written are not loaded from in again afterwards, and Y3 is
// stored last.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(X<X1-T) // T2 = X1-T1
	VL 16(P1ptr), X1H
	VPDI $0x4, X1H, X1H, X1H
	VL 0(P1ptr), X1L
	VPDI $0x4, X1L, X1L, X1L
	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T) // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X- ; Y- ; MUL; T- // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1) // Y3 = 2*Y1
	VL 48(P1ptr), Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VL 32(P1ptr), Y1L
	VPDI $0x4, Y1L, Y1L, Y1L
	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	VL 80(P1ptr), Y1 // Z1H
	VPDI $0x4, Y1, Y1, Y1
	VL 64(P1ptr), Y0 // Z1L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	// Swap into the scratch registers TT1/TT0 for the store so T1||T0 itself
	// is left unmodified.
	VPDI $0x4, T1, T1, TT1
	VST TT1, 80(P3ptr)
	VPDI $0x4, T0, T0, TT0
	VST TT0, 64(P3ptr)

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
	VLR T0, X0
	VLR T1, X1
	VL 16(P1ptr), Y1
	VPDI $0x4, Y1, Y1, Y1
	VL 0(P1ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// HAL(Y3<T) // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2²
	VLR T2L, X0
	VLR T2H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256SqrInternal<>(SB)

	// ADD(T1<T3+T3) // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
	// Store via TT1/TT0 so X3H||X3L stay unswapped for the subtraction below.
	VPDI $0x4, X3H, X3H, TT1
	VST TT1, 16(P3ptr)
	VPDI $0x4, X3L, X3L, TT0
	VST TT0, 0(P3ptr)

	// SUB(X<T3-X3) // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X- ; Y- ; MUL; T- // T1 = T1*T2
	// (Y0/Y1 were last set to T2 before the X3 = T2² squaring above.)
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3) // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	VPDI $0x4, Y3H, Y3H, Y3H
	VST Y3H, 48(P3ptr)
	VPDI $0x4, Y3L, Y3L, Y3L
	VST Y3L, 32(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// func p256PointAddAsm(res, in1, in2 *P256Point) int
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3
#define CPOOL R4
#define ISZERO R5
#define TRUE R6

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL V24
#define HH V25
#define RL V26
#define RH V27

// Temps for Sub and Add
#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
/*
 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A = X₁×Z₂²
 * B = Y₁×Z₂³
 * C = X₂×Z₁²-A
 * D = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H = X2*T1
 * H = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R = Z1*T1
 * R = Y2*R
 * R = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	// X=X2; Y- ; MUL; H=T // H = X2*T1
	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	// SUB(H<H-T) // H = H-U1
	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	// SUB(R<T-S1) // R = R-S1
	// X=H ; Y=H ; MUL; T- // T1 = H*H
	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	// X=R ; Y=R ; MUL; T- // X3 = R*R
	// SUB(T<T-T2) // X3 = X3-T2
	// ADD(X<U1+U1) // T1 = 2*U1
	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	// SUB(Y<U1-T) // Y3 = U1-X3
	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
	*/
// p256PointAddAsm follows the register plan in the comment above and writes
// X3, Y3 and Z3 to res. The int result (ret+24(FP)) is 1 when both H and R
// are 0 or equal to P, and 0 otherwise; ret+24(FP) doubles as scratch, being
// written first with the H test and later ANDed with the R test.
// In the step comments, "X-"/"Y-" mean the operand register is reused as left
// by the previous step.
// NOTE(review): that reuse relies on p256MulInternal preserving X0/X1 and
// Y0/Y1 — its full body is outside this view, confirm there.
// Z3 and X3 are stored as soon as they are available; in both cases the
// offsets written are not loaded from in1/in2 again afterwards.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, RL
	VLR T1, RH

	// X=X2; Y- ; MUL; H=T // H = X2*T1
	VL 16(P2ptr), X1 // X2H
	VPDI $0x4, X1, X1, X1
	VL 0(P2ptr), X0 // X2L
	VPDI $0x4, X0, X0, X0
	CALL p256MulInternal<>(SB)
	VLR T0, HL
	VLR T1, HH

	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	VL 80(P2ptr), X1 // Z2H
	VPDI $0x4, X1, X1, X1
	VL 64(P2ptr), X0 // Z2L
	VPDI $0x4, X0, X0, X0
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	VL 16(P1ptr), X1 // X1H
	VPDI $0x4, X1, X1, X1
	VL 0(P1ptr), X0 // X1L
	VPDI $0x4, X0, X0, X0
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// SUB(H<H-T) // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	// Both tests are evaluated with conditional moves (MOVDEQ) on the
	// condition code set by VCEQGS; there is no branch.
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO HL, HH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX HL, PL, T1L
	VX HH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	MOVD ISZERO, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	VL 80(P1ptr), X1 // Z1H
	VPDI $0x4, X1, X1, X1
	VL 64(P1ptr), X0 // Z1L
	VPDI $0x4, X0, X0, X0
	VL 80(P2ptr), Y1 // Z2H
	VPDI $0x4, Y1, Y1, Y1
	VL 64(P2ptr), Y0 // Z2L
	VPDI $0x4, Y0, Y0, Y0
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VLR T0, X0
	VLR T1, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256MulInternal<>(SB)
	// Z3 is stored here; this is the last point at which Z1/Z2 are needed.
	VPDI $0x4, T1, T1, TT1
	VST TT1, 80(P3ptr)
	VPDI $0x4, T0, T0, TT0
	VST TT0, 64(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	VL 48(P1ptr), X1
	VPDI $0x4, X1, X1, X1
	VL 32(P1ptr), X0
	VPDI $0x4, X0, X0, X0
	VLR S1L, Y0
	VLR S1H, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	VL 48(P2ptr), X1
	VPDI $0x4, X1, X1, X1
	VL 32(P2ptr), X0
	VPDI $0x4, X0, X0, X0
	VLR RL, Y0
	VLR RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1) // R = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// ISZERO is recomputed for R and then ANDed with the H result that was
	// parked in ret+24(FP) above.
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO RL, RH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX RL, PL, T1L
	VX RH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	AND ret+24(FP), ISZERO
	MOVD ISZERO, ret+24(FP)

	// X=H ; Y=H ; MUL; T- // T1 = H*H
	VLR HL, X0
	VLR HH, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256SqrInternal<>(SB)

	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, T2L
	VLR T1, T2H

	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	VLR U1L, X0
	VLR U1H, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=R ; Y=R ; MUL; T- // X3 = R*R
	VLR RL, X0
	VLR RH, X1
	VLR RL, Y0
	VLR RH, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(T<T-T2) // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1) // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	// Store via TT1/TT0 so T1||T0 (X3) stay unswapped for the next step.
	VPDI $0x4, T1, T1, TT1
	VST TT1, 16(P3ptr)
	VPDI $0x4, T0, T0, TT0
	VST TT0, 0(P3ptr)

	// SUB(Y<U1-T) // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	VLR RL, X0
	VLR RH, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	VLR S1L, X0
	VLR S1H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	VPDI $0x4, T1, T1, T1
	VST T1, 48(P3ptr)
	VPDI $0x4, T0, T0, T0
	VST T0, 32(P3ptr)

	RET